Examples of URIMetadataRow


Examples of net.yacy.kelondro.data.meta.URIMetadataRow

        }
    }

    public void store(final URIMetadataRow entry) throws IOException {
        // Check if there is a more recent Entry already in the DB
        URIMetadataRow oldEntry;
        if (urlIndexFile == null) return; // case may happen during shutdown or startup
        try {
            Row.Entry oe = urlIndexFile.get(entry.hash());
            oldEntry = (oe == null) ? null : new URIMetadataRow(oe, null, 0);
        } catch (final Exception e) {
            Log.logException(e);
            oldEntry = null;
        }
        if (oldEntry != null && entry.isOlder(oldEntry)) {
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

     */
    public TreeSet<String> domainNameCollector(int count, Map<String, URLHashCounter> domainSamples) {
        // collect hashes from all domains
       
        // fetch urls from the database to determine the host in clear text
        URIMetadataRow urlref;
        if (count < 0 || count > domainSamples.size()) count = domainSamples.size();
        statsDump = new ArrayList<HostStat>();
        TreeSet<String> set = new TreeSet<String>();
        for (URLHashCounter hs: domainSamples.values()) {
            if (hs == null) continue;
            urlref = this.load(hs.urlhashb);
            if (urlref == null || urlref.metadata() == null || urlref.metadata().url() == null || urlref.metadata().url().getHost() == null) continue;
            set.add(urlref.metadata().url().getHost());
            count--;
            if (count == 0) break;
        }
        return set;
    }
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

     * @param domainSamples a map from domain hashes to hash statistics
     * @return a map from domain hashes to host stats including domain names
     */
    public Map<String, HostStat> domainHashResolver(Map<String, URLHashCounter> domainSamples) {
        HashMap<String, HostStat> hostMap = new HashMap<String, HostStat>();
        URIMetadataRow urlref;
       
        ScoreMap<String> hosthashScore = new ConcurrentScoreMap<String>();
        for (Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
            hosthashScore.inc(ASCII.String(e.getValue().urlhashb, 6, 6), e.getValue().count);
        }
        URIMetadataRow.Components comps;
        DigestURI url;
        for (Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
            urlref = this.load(e.getValue().urlhashb);
            comps = urlref.metadata();
            url = comps.url();
            hostMap.put(e.getKey(), new HostStat(url.getHost(), url.getPort(), e.getKey(), hosthashScore.get(e.getKey())));
        }
        return hostMap;
    }
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

        // prevent too heavy IO.
        if (statsDump != null && count <= statsDump.size()) return statsDump.iterator();
   
        // fetch urls from the database to determine the host in clear text
        Iterator<String> j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first)
        URIMetadataRow urlref;
        String urlhash;
        count += 10; // make some more to prevent that we have to do this again after deletions too soon.
        if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size();
        statsDump = new ArrayList<HostStat>();
        URIMetadataRow.Components comps;
        DigestURI url;
        while (j.hasNext()) {
            urlhash = j.next();
            if (urlhash == null) continue;
            urlref = this.load(ASCII.getBytes(urlhash));
            if (urlref == null || urlref.metadata() == null || urlref.metadata().url() == null || urlref.metadata().url().getHost() == null) continue;
            if (statsDump == null) return new ArrayList<HostStat>().iterator(); // some other operation has destroyed the object
            comps = urlref.metadata();
            url = comps.url();
            statsDump.add(new HostStat(url.getHost(), url.getPort(), urlhash.substring(6), domainScore.get(urlhash)));
            count--;
            if (count == 0) break;
        }
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

        public final URIMetadataRow next() {
            Row.Entry e = null;
            if (this.iter == null) { return null; }
            if (this.iter.hasNext()) { e = this.iter.next(); }
            if (e == null) { return null; }
            return new URIMetadataRow(e, null, 0);
        }
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

                                this.run = false;
                                return;
                            }
                        }
                    }
                    final URIMetadataRow entry = eiter.next();
                    if (entry == null) {
                        if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null");
                    } else if (entry.hash() == null) {
                        if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + "hash == null");
                    } else {
                        final URIMetadataRow.Components metadata = entry.metadata();
                        totalSearchedUrls++;
                        if (metadata == null) {
                            if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "corrupted entry for hash = " + ASCII.String(entry.hash()));
                            remove(entry.hash());
                            continue;
                        }
                        if (metadata.url() == null) {
                            if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + ASCII.String(entry.hash()) + "URL == null");
                            remove(entry.hash());
                            continue;
                        }
                        if (blacklist.isListed(Blacklist.BLACKLIST_CRAWLER, metadata.url()) ||
                            blacklist.isListed(Blacklist.BLACKLIST_DHT, metadata.url()) ||
                            (crawlStacker.urlInAcceptedDomain(metadata.url()) != null)) {
                            lastBlacklistedUrl = metadata.url().toNormalform(true, true);
                            lastBlacklistedHash = ASCII.String(entry.hash());
                            if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + ASCII.String(entry.hash()) + " " + metadata.url().toNormalform(false, true));
                            remove(entry.hash());
                            if (blacklistedUrls % 100 == 0) {
                                Log.logInfo("URLDBCLEANER", "Deleted " + blacklistedUrls + " URLs until now. Last deleted URL-Hash: " + lastBlacklistedUrl);
                            }
                        }
                        lastUrl = metadata.url().toNormalform(true, true);
                        lastHash = ASCII.String(entry.hash());
                    }
                }
            } catch (final RuntimeException e) {
                if (e.getMessage() != null && e.getMessage().indexOf("not found in LURL") != -1) {
                    Log.logWarning("URLDBCLEANER", "urlHash not found in LURL", e);
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

            }
        }

        // create a new loaded URL db entry
        if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
        final URIMetadataRow newEntry = new URIMetadataRow(
                url,                                       // URL
                dc_title,                                  // document description
                document.dc_creator(),                     // author
                document.dc_subject(' '),                  // tags
                document.dc_publisher(),                   // publisher (may be important to get location data)
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

     */
    public int removeAllUrlReferences(final byte[] urlhash, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) {

        if (urlhash == null) return 0;
        // determine the url string
        final URIMetadataRow entry = urlMetadata().load(urlhash);
        if (entry == null) return 0;
        final URIMetadataRow.Components metadata = entry.metadata();
        if (metadata == null || metadata.url() == null) return 0;

        try {
            // parse the resource
            final Document document = Document.mergeDocuments(metadata.url(), null, loader.loadDocuments(loader.request(metadata.url(), true, false), cacheStrategy, 10000, Long.MAX_VALUE));
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

                    while (containerIterator.hasNext() && this.run) {
                        waiter();
                        entry = new WordReferenceVars(containerIterator.next());
                        // System.out.println("Wordhash: "+wordHash+" UrlHash:
                        // "+entry.getUrlHash());
                        final URIMetadataRow ue = Segment.this.urlMetadata.load(entry.urlhash());
                        if (ue == null) {
                            urlHashs.put(entry.urlhash());
                        } else {
                            url = ue.metadata().url();
                            if (url == null || Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) {
                                urlHashs.put(entry.urlhash());
                            }
                        }
                    }
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

    }
  }

  public EnumMap<METADATA, String> getMetadata() {
    final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
        final URIMetadataRow urlEntry = this.indexSegment.segment(Segments.Process.PUBLIC).urlMetadata().load(this.uri.hash());
        if (urlEntry != null) {
          metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size()));
          metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate()));
          metadata.put(METADATA.LOADDATE, ISO8601Formatter.FORMATTER.format(urlEntry.loaddate()));
          metadata.put(METADATA.MODDATE, ISO8601Formatter.FORMATTER.format(urlEntry.moddate()));
          metadata.put(METADATA.SNIPPET, String.valueOf(urlEntry.snippet()));
          metadata.put(METADATA.WORDCOUNT, String.valueOf(urlEntry.wordCount()));
          metadata.put(METADATA.MIMETYPE, String.valueOf(urlEntry.doctype()));
          metadata.put(METADATA.LANGUAGE, UTF8.String(urlEntry.language()));

          final URIMetadataRow.Components meta = urlEntry.metadata();
          if (meta != null) {
            metadata.put(METADATA.TITLE, meta.dc_title());
            metadata.put(METADATA.CREATOR, meta.dc_creator());
            metadata.put(METADATA.KEYWORDS, meta.dc_subject());
            metadata.put(METADATA.PUBLISHER, meta.dc_publisher());
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.