Package net.yacy.kelondro.data.meta

Examples of net.yacy.kelondro.data.meta.URIMetadataRow


                    while (containerIterator.hasNext() && this.run) {
                        waiter();
                        entry = new WordReferenceVars(containerIterator.next());
                        // System.out.println("Wordhash: "+wordHash+" UrlHash:
                        // "+entry.getUrlHash());
                        final URIMetadataRow ue = Segment.this.urlMetadata.load(entry.urlhash());
                        if (ue == null) {
                            urlHashs.put(entry.urlhash());
                        } else {
                            url = ue.metadata().url();
                            if (url == null || Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) {
                                urlHashs.put(entry.urlhash());
                            }
                        }
                    }
View Full Code Here


        final byte[] urlHash = obrwi.getElement().urlhash();
        if (urlHash == null) return null;
        try {
            final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
            if (entry == null) return null;
            return new URIMetadataRow(entry, obrwi.getElement(), obrwi.getWeight());
        } catch (final IOException e) {
            return null;
        }
    }
View Full Code Here

        if (this.urlIndexFile == null) return null;
        if (urlHash == null) return null;
        try {
            final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
            if (entry == null) return null;
            return new URIMetadataRow(entry, null, 0);
        } catch (final IOException e) {
            return null;
        }
    }
View Full Code Here

        }
    }

    public void store(final URIMetadataRow entry) throws IOException {
        // Check if there is a more recent Entry already in the DB
        URIMetadataRow oldEntry;
        if (this.urlIndexFile == null) return; // case may happen during shutdown or startup
        try {
            final Row.Entry oe = this.urlIndexFile.get(entry.hash(), false);
            oldEntry = (oe == null) ? null : new URIMetadataRow(oe, null, 0);
        } catch (final Exception e) {
            Log.logException(e);
            oldEntry = null;
        }
        if (oldEntry != null && entry.isOlder(oldEntry)) {
View Full Code Here

     */
    public TreeSet<String> domainNameCollector(int count, final Map<String, URLHashCounter> domainSamples) {
        // collect hashes from all domains

        // fetch urls from the database to determine the host in clear text
        URIMetadataRow urlref;
        if (count < 0 || count > domainSamples.size()) count = domainSamples.size();
        this.statsDump = new ArrayList<HostStat>();
        final TreeSet<String> set = new TreeSet<String>();
        for (final URLHashCounter hs: domainSamples.values()) {
            if (hs == null) continue;
            urlref = this.load(hs.urlhashb);
            if (urlref == null || urlref.metadata() == null || urlref.metadata().url() == null || urlref.metadata().url().getHost() == null) continue;
            set.add(urlref.metadata().url().getHost());
            count--;
            if (count == 0) break;
        }
        return set;
    }
View Full Code Here

     * @param domainSamples a map from domain hashes to hash statistics
     * @return a map from domain hashes to host stats including domain names
     */
    public Map<String, HostStat> domainHashResolver(final Map<String, URLHashCounter> domainSamples) {
        final HashMap<String, HostStat> hostMap = new HashMap<String, HostStat>();
        URIMetadataRow urlref;

        final ScoreMap<String> hosthashScore = new ConcurrentScoreMap<String>();
        for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
            hosthashScore.inc(ASCII.String(e.getValue().urlhashb, 6, 6), e.getValue().count);
        }
        URIMetadataRow.Components comps;
        DigestURI url;
        for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
            urlref = this.load(e.getValue().urlhashb);
            comps = urlref.metadata();
            url = comps.url();
            hostMap.put(e.getKey(), new HostStat(url.getHost(), url.getPort(), e.getKey(), hosthashScore.get(e.getKey())));
        }
        return hostMap;
    }
View Full Code Here

        // prevent too heavy IO.
        if (this.statsDump != null && count <= this.statsDump.size()) return this.statsDump.iterator();

        // fetch urls from the database to determine the host in clear text
        final Iterator<String> j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first)
        URIMetadataRow urlref;
        String urlhash;
        count += 10; // make some more to prevent that we have to do this again after deletions too soon.
        if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size();
        this.statsDump = new ArrayList<HostStat>();
        URIMetadataRow.Components comps;
        DigestURI url;
        while (j.hasNext()) {
            urlhash = j.next();
            if (urlhash == null) continue;
            urlref = this.load(ASCII.getBytes(urlhash));
            if (urlref == null || urlref.metadata() == null || urlref.metadata().url() == null || urlref.metadata().url().getHost() == null) continue;
            if (this.statsDump == null) return new ArrayList<HostStat>().iterator(); // some other operation has destroyed the object
            comps = urlref.metadata();
            url = comps.url();
            this.statsDump.add(new HostStat(url.getHost(), url.getPort(), urlhash.substring(6), domainScore.get(urlhash)));
            count--;
            if (count == 0) break;
        }
View Full Code Here

        public final URIMetadataRow next() {
            Row.Entry e = null;
            if (this.iter == null) { return null; }
            if (this.iter.hasNext()) { e = this.iter.next(); }
            if (e == null) { return null; }
            return new URIMetadataRow(e, null, 0);
        }
View Full Code Here

                                this.run = false;
                                return;
                            }
                        }
                    }
                    final URIMetadataRow entry = eiter.next();
                    if (entry == null) {
                        if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null");
                    } else if (entry.hash() == null) {
                        if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + "hash == null");
                    } else {
                        final URIMetadataRow.Components metadata = entry.metadata();
                        this.totalSearchedUrls++;
                        if (metadata == null) {
                            if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "corrupted entry for hash = " + ASCII.String(entry.hash()));
                            remove(entry.hash());
                            continue;
                        }
                        if (metadata.url() == null) {
                            if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + ASCII.String(entry.hash()) + "URL == null");
                            remove(entry.hash());
                            continue;
                        }
                        if (this.blacklist.isListed(Blacklist.BLACKLIST_CRAWLER, metadata.url()) ||
                            this.blacklist.isListed(Blacklist.BLACKLIST_DHT, metadata.url()) ||
                            (this.crawlStacker.urlInAcceptedDomain(metadata.url()) != null)) {
                            this.lastBlacklistedUrl = metadata.url().toNormalform(true, true);
                            this.lastBlacklistedHash = ASCII.String(entry.hash());
                            if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + ASCII.String(entry.hash()) + " " + metadata.url().toNormalform(false, true));
                            remove(entry.hash());
                            if (this.blacklistedUrls % 100 == 0) {
                                Log.logInfo("URLDBCLEANER", "Deleted " + this.blacklistedUrls + " URLs until now. Last deleted URL-Hash: " + this.lastBlacklistedUrl);
                            }
                        }
                        this.lastUrl = metadata.url().toNormalform(true, true);
                        this.lastHash = ASCII.String(entry.hash());
                    }
                }
            } catch (final RuntimeException e) {
                if (e.getMessage() != null && e.getMessage().indexOf("not found in LURL",0) != -1) {
                    Log.logWarning("URLDBCLEANER", "urlHash not found in LURL", e);
View Full Code Here

                if (this.references.containsKey(e.urlhash())) continue;
                if (this.badReferences.has(e.urlhash())) {
                    notFoundx.add(e.urlhash());
                    continue;
                }
                final URIMetadataRow r = Transmission.this.segment.urlMetadata().load(e.urlhash());
                if (r == null) {
                    notFoundx.add(e.urlhash());
                    this.badReferences.put(e.urlhash());
                } else {
                    this.references.put(e.urlhash(), r);
View Full Code Here

TOP

Related Classes of net.yacy.kelondro.data.meta.URIMetadataRow

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.