Package net.yacy.kelondro.data.meta

Examples of net.yacy.kelondro.data.meta.URIMetadataRow


    return this.document;
 

  public EnumMap<METADATA, String> getMetadata() {
    final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
        final URIMetadataRow urlEntry = this.indexSegment.segment(Segments.Process.PUBLIC).urlMetadata().load(this.uri.hash());
        if (urlEntry != null) {
          metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size()));
          metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate()));
          metadata.put(METADATA.LOADDATE, ISO8601Formatter.FORMATTER.format(urlEntry.loaddate()));
          metadata.put(METADATA.MODDATE, ISO8601Formatter.FORMATTER.format(urlEntry.moddate()));
          metadata.put(METADATA.SNIPPET, String.valueOf(urlEntry.snippet()));
          metadata.put(METADATA.WORDCOUNT, String.valueOf(urlEntry.wordCount()));
          metadata.put(METADATA.MIMETYPE, String.valueOf(urlEntry.doctype()));
          metadata.put(METADATA.LANGUAGE, UTF8.String(urlEntry.language()));

          final URIMetadataRow.Components meta = urlEntry.metadata();
          if (meta != null) {
            metadata.put(METADATA.TITLE, meta.dc_title());
            metadata.put(METADATA.CREATOR, meta.dc_creator());
            metadata.put(METADATA.KEYWORDS, meta.dc_subject());
            metadata.put(METADATA.PUBLISHER, meta.dc_publisher());
View Full Code Here


                    Reference iEntry;
                    while (wordIdxEntries.hasNext()) {
                        iEntry = wordIdxEntries.next();
                        final byte[] urlHash = iEntry.urlhash();
                        if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
                            final URIMetadataRow urlEntry = currentUrlDB.load(urlHash);
                            urlCounter++;
                            minimizedUrlDB.store(urlEntry);
                            if (urlCounter % 500 == 0) {
                                log.logInfo(urlCounter + " URLs found so far.");
                            }
View Full Code Here

     * @param args
     */
    public static void main(final String[] args) {
        try {
            final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/");
            final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0f, 0.0f, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0);
            final EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING;
            System.out.println("valid test:\n=======");
            // add
            stack(urlRef, urlRef.hash(), url.hash(), stackNo);
            // size
            System.out.println("size of stack:\t"+ getStackSize(stackNo));
        } catch (final MalformedURLException e) {
            Log.logException(e);
        }
View Full Code Here

        Date lastMod = entry.lastmod(null);
        if (lastMod != null) {
            final String dbocc = this.sb.urlExists(Segments.Process.LOCALCRAWLING, nexturlhash);
            if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
                // the url was already loaded. we need to check the date
                final URIMetadataRow oldEntry = this.sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(nexturlhash);
                if (oldEntry != null) {
                    final Date modDate = oldEntry.moddate();
                    // check if modDate is null
                    if (modDate.after(lastMod)) return;
                }
            }
        }
View Full Code Here

                // built urlCache
                final Iterator<WordReference> urlIter = index.entries();
                final TreeMap<byte[], URIMetadataRow> knownURLs = new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder);
                final HandleSet unknownURLEntries = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, index.size());
                Reference iEntry;
                URIMetadataRow lurl;
                while (urlIter.hasNext()) {
                    iEntry = urlIter.next();
                    lurl = segment.urlMetadata().load(iEntry.urlhash());
                    if (lurl == null) {
                        try {
                            unknownURLEntries.put(iEntry.urlhash());
                        } catch (final RowSpaceExceededException e) {
                            Log.logException(e);
                        }
                        urlIter.remove();
                    } else {
                        knownURLs.put(iEntry.urlhash(), lurl);
                    }
                }

                // make an indexContainerCache
                final ReferenceContainerCache<WordReference> icc = new ReferenceContainerCache<WordReference>(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength);
                try {
                    icc.add(index);
                } catch (final RowSpaceExceededException e) {
                    Log.logException(e);
                }

                // transport to other peer
                final boolean gzipBody = sb.getConfigBool("indexControl.gzipBody", false);
                final int timeout = (int) sb.getConfigLong("indexControl.timeout", 60000);
                final String error = Protocol.transferIndex(
                             seed,
                             icc,
                             knownURLs,
                             gzipBody,
                             timeout);
                prop.put("result", (error == null) ? ("Successfully transferred " + knownURLs.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds, " + unknownURLEntries.size() + " URL not found") : "error: " + error);
                index = null;
            } catch (final IOException e) {
                Log.logException(e);
            }

            // generate list
            if (post.containsKey("keyhashsimilar")) try {
                final Iterator<ReferenceContainer<WordReference>> containerIt = segment.termIndex().referenceContainer(keyhash, true, 256, false).iterator();
                    ReferenceContainer<WordReference> container;
                    i = 0;
                    int rows = 0, cols = 0;
                    prop.put("keyhashsimilar", "1");
                    while (containerIt.hasNext() && i < 256) {
                        container = containerIt.next();
                        prop.put("keyhashsimilar_rows_"+rows+"_cols_"+cols+"_wordHash", container.getTermHash());
                        cols++;
                        if (cols==8) {
                            prop.put("keyhashsimilar_rows_"+rows+"_cols", cols);
                            cols = 0;
                            rows++;
                        }
                        i++;
                    }
                    prop.put("keyhashsimilar_rows_"+rows+"_cols", cols);
                    prop.put("keyhashsimilar_rows", rows + 1);
                    prop.put("result", "");
            } catch (final IOException e) {
                Log.logException(e);
            }

            if (post.containsKey("blacklist")) {
                final String blacklist = post.get("blacklist", "");
                final HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, urlb.size());
                if (post.containsKey("blacklisturls")) {
                    PrintWriter pw;
                    try {
                        final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(",");
                        pw = new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true));
                        DigestURI url;
                        for (final byte[] b: urlb) {
                            try {
                                urlHashes.put(b);
                            } catch (final RowSpaceExceededException e) {
                                Log.logException(e);
                            }
                            final URIMetadataRow e = segment.urlMetadata().load(b);
                            segment.urlMetadata().remove(b);
                            if (e != null) {
                                url = e.metadata().url();
                                pw.println(url.getHost() + "/" + url.getFile());
                                for (final String supportedBlacklistType : supportedBlacklistTypes) {
                                    if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist)) {
                                        Switchboard.urlBlacklist.add(
                                                supportedBlacklistType,
                                                url.getHost(),
                                                url.getFile());
                                    }
                                }
                                SearchEventCache.cleanupEvents(true);
                            }
                        }
                        pw.close();
                    } catch (final IOException e) {
                    }
                }

                if (post.containsKey("blacklistdomains")) {
                    PrintWriter pw;
                    try {
                        final String[] supportedBlacklistTypes = Blacklist.BLACKLIST_TYPES_STRING.split(",");
                        pw = new PrintWriter(new FileWriter(new File(ListManager.listsPath, blacklist), true));
                        DigestURI url;
                        for (final byte[] b: urlb) {
                            try {
                                urlHashes.put(b);
                            } catch (final RowSpaceExceededException e) {
                                Log.logException(e);
                            }
                            final URIMetadataRow e = segment.urlMetadata().load(b);
                            segment.urlMetadata().remove(b);
                            if (e != null) {
                                url = e.metadata().url();
                                pw.println(url.getHost() + "/.*");
                                for (final String supportedBlacklistType : supportedBlacklistTypes) {
                                    if (ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist)) {
                                        Switchboard.urlBlacklist.add(
                                                supportedBlacklistType,
View Full Code Here

            prop.put("searchresult", 3);
            prop.put("genUrlList_flags", (flags == null) ? "" : flags.exportB64());
            prop.put("genUrlList_lines", maxlines);
            int i = 0;
            DigestURI url;
            URIMetadataRow entry;
            String us;
            long rn = -1;
            while (!ranked.isEmpty() && (entry = ranked.takeURL(false, 1000)) != null) {
                if ((entry == null) || (entry.metadata() == null)) continue;
                url = entry.metadata().url();
                if (url == null) continue;
                us = url.toNormalform(false, false);
                if (rn == -1) rn = entry.ranking();
                prop.put("genUrlList_urlList_"+i+"_urlExists", "1");
                prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxCount", i);
                prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlhxValue", entry.word().urlhash());
                prop.putHTML("genUrlList_urlList_"+i+"_urlExists_keyString", keystring);
                prop.put("genUrlList_urlList_"+i+"_urlExists_keyHash", keyhashs);
                prop.putHTML("genUrlList_urlList_"+i+"_urlExists_urlString", us);
                prop.put("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "<br>" + us.substring(2040) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "<br>" + us.substring(20)) : us));
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn));
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_domlength", DigestURI.domLengthEstimation(entry.hash()));
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", BlockRank.ranking(entry.hash()));
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_tf", 1000.0 * entry.word().termFrequency());
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_authority", (ranked.getOrder() == null) ? -1 : ranked.getOrder().authority(ASCII.String(entry.hash(), 6, 6)));
                prop.put("genUrlList_urlList_"+i+"_urlExists_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(entry.word().lastModified())));
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle());
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext());
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_phrasesintext", entry.word().phrasesintext());
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_llocal", entry.word().llocal());
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_lother", entry.word().lother());
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_hitcount", entry.word().hitcount());
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_worddistance", 0);
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_ybr", BlockRank.ranking(entry.hash()));
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_pos", entry.word().minposition());
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase());
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_posinphrase", entry.word().posinphrase());
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.word().urlcomps());
                prop.putNum("genUrlList_urlList_"+i+"_urlExists_urllength", entry.word().urllength());
                prop.put("genUrlList_urlList_"+i+"_urlExists_props",
                        ((entry.word().flags().get(Condenser.flag_cat_indexof)) ? "appears on index page, " : "") +
                        ((entry.word().flags().get(Condenser.flag_cat_hasimage)) ? "contains images, " : "") +
                        ((entry.word().flags().get(Condenser.flag_cat_hasaudio)) ? "contains audio, " : "") +
                        ((entry.word().flags().get(Condenser.flag_cat_hasvideo)) ? "contains video, " : "") +
                        ((entry.word().flags().get(Condenser.flag_cat_hasapp)) ? "contains applications, " : "") +
                        ((entry.word().flags().get(WordReferenceRow.flag_app_dc_identifier)) ? "appears in url, " : "") +
                        ((entry.word().flags().get(WordReferenceRow.flag_app_dc_title)) ? "appears in title, " : "") +
                        ((entry.word().flags().get(WordReferenceRow.flag_app_dc_creator)) ? "appears in author, " : "") +
                        ((entry.word().flags().get(WordReferenceRow.flag_app_dc_subject)) ? "appears in subject, " : "") +
                        ((entry.word().flags().get(WordReferenceRow.flag_app_dc_description)) ? "appears in description, " : "") +
                        ((entry.word().flags().get(WordReferenceRow.flag_app_emphasized)) ? "appears emphasized, " : "") +
                        ((DigestURI.probablyRootURL(entry.word().urlhash())) ? "probably root url" : "")
                );
                if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_DHT, url)) {
                    prop.put("genUrlList_urlList_"+i+"_urlExists_urlhxChecked", "1");
                }
                i++;
View Full Code Here

      long timeleft;
      while ((timeleft = timeout - System.currentTimeMillis()) > 0) {
          //System.out.println("timeleft = " + timeleft);
            final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi = takeRWI(skipDoubleDom, timeleft);
            if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element
            final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi);
            if (page == null) {
              try {
                    this.misses.putUnique(obrwi.getElement().urlhash());
                } catch (final RowSpaceExceededException e) {
                }
              continue;
            }

            // prepare values for constraint check
            final URIMetadataRow.Components metadata = page.metadata();

            // check errors
            if (metadata == null) {
                this.sortout++;
                continue; // rare case where the url is corrupted
            }

            if (!this.query.urlMask_isCatchall) {
                // check url mask
                if (!metadata.matches(this.query.urlMask)) {
                    this.sortout++;
                    continue;
                }

                // in case that we do not have e catchall filter for urls
                // we must also construct the domain navigator here
                //if (query.sitehash == null) {
                //    this.hostNavigator.inc(UTF8.String(urlhash, 6, 6));
                //    this.hostResolver.put(UTF8.String(urlhash, 6, 6), UTF8.String(urlhash));
                //}
            }

            // check for more errors
            if (metadata.url() == null) {
                this.sortout++;
                continue; // rare case where the url is corrupted
            }

            final String pageurl = metadata.url().toNormalform(true, true);
            final String pageauthor = metadata.dc_creator();
            final String pagetitle = metadata.dc_title().toLowerCase();

            // check exclusion
            if ((QueryParams.anymatch(pagetitle, this.query.excludeHashes)) ||
                (QueryParams.anymatch(pageurl.toLowerCase(), this.query.excludeHashes)) ||
                (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.excludeHashes))) {
                this.sortout++;
                continue;
            }

            // check index-of constraint
            if ((this.query.constraint != null) &&
                (this.query.constraint.get(Condenser.flag_cat_indexof)) &&
                (!(pagetitle.startsWith("index of")))) {
                final Iterator<byte[]> wi = this.query.queryHashes.iterator();
                while (wi.hasNext()) {
                    this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash());
                }
                this.sortout++;
                continue;
            }

            // check location constraint
            if ((this.query.constraint != null) &&
                (this.query.constraint.get(Condenser.flag_cat_haslocation)) &&
                (metadata.lat() == 0.0f || metadata.lon() == 0.0f)) {
                this.sortout++;
                continue;
            }

            // check content domain
            if ((this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) ||
                (this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) ||
                (this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0) ||
                (this.query.contentdom == ContentDomain.APP && page.lapp() == 0)) {
                this.sortout++;
              continue;
            }

            // evaluate information of metadata for navigation
View Full Code Here

    public ScoreMap<String> getHostNavigator() {
        final ScoreMap<String> result = new ConcurrentScoreMap<String>();
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts",0) < 0) return result;

        final Iterator<String> domhashs = this.hostNavigator.keys(false);
        URIMetadataRow row;
        byte[] urlhash;
        String hosthash, hostname;
        if (this.hostResolver != null) while (domhashs.hasNext() && result.sizeSmaller(30)) {
            hosthash = domhashs.next();
            if (hosthash == null) continue;
            urlhash = this.hostResolver.get(hosthash);
            row = urlhash == null ? null : this.query.getSegment().urlMetadata().load(urlhash);
            hostname = row == null ? null : row.metadata().url().getHost();
            if (hostname != null) {
                result.set(hostname, this.hostNavigator.get(hosthash));
            }
        }
        if (result.sizeSmaller(2)) result.clear(); // navigators with one entry are not useful
View Full Code Here

    }

    public DigestURI getURL(final Segments.Process process, final byte[] urlhash) {
        if (urlhash == null) return null;
        if (urlhash.length == 0) return null;
        final URIMetadataRow le = this.indexSegments.urlMetadata(process).load(urlhash);
        if (le != null) {
            final Components metadata = le.metadata();
            if (metadata == null) return null;
            return metadata.url();
        }
        return this.crawlQueues.getURL(urlhash);
    }
View Full Code Here

        // remove stopwords
        this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url);

        // STORE WORD INDEX
        URIMetadataRow newEntry = null;
        try {
            newEntry = this.indexSegments.segment(process).storeDocument(
                    url,
                    referrerURL,
                    queueEntry.lastModified(),
View Full Code Here

TOP

Related Classes of net.yacy.kelondro.data.meta.URIMetadataRow

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.