Examples of URIMetadataRow


Examples of net.yacy.kelondro.data.meta.URIMetadataRow

    }

    public DigestURI getURL(final Segments.Process process, final byte[] urlhash) {
        if (urlhash == null) return null;
        if (urlhash.length == 0) return null;
        final URIMetadataRow le = this.indexSegments.urlMetadata(process).load(urlhash);
        if (le != null) {
            final Components metadata = le.metadata();
            if (metadata == null) return null;
            return metadata.url();
        }
        return this.crawlQueues.getURL(urlhash);
    }
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

        // remove stopwords
        this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url());

        // STORE WORD INDEX
        URIMetadataRow newEntry = null;
        try {
            newEntry = this.indexSegments.segment(process).storeDocument(
                    queueEntry.url(),
                    referrerURL,
                    queueEntry.lastModified(),
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

        @Override
        public void run() {

            // start fetching urls and snippets
            URIMetadataRow page;
            //final int fetchAhead = snippetMode == 0 ? 0 : 10;
            final boolean nav_topics = ResultFetcher.this.query.navigators.equals("all") || ResultFetcher.this.query.navigators.indexOf("topics") >= 0;
            try {
                //System.out.println("DEPLOYED WORKER " + id + " FOR " + this.neededResults + " RESULTS, timeoutd = " + (this.timeout - System.currentTimeMillis()));
                int loops = 0;
                while (this.shallrun && System.currentTimeMillis() < this.timeout) {
                    this.lastLifeSign = System.currentTimeMillis();

                    // check if we have enough
                    if (ResultFetcher.this.result.sizeAvailable() >= this.neededResults) {
                        //Log.logWarning("ResultFetcher", ResultFetcher.this.result.sizeAvailable() + " = result.sizeAvailable() >= this.neededResults = " + this.neededResults);
                        break;
                    }

                    // check if we can succeed if we try to take another url
                    if (ResultFetcher.this.rankingProcess.feedingIsFinished() && ResultFetcher.this.rankingProcess.sizeQueue() == 0) {
                        //Log.logWarning("ResultFetcher", "rankingProcess.feedingIsFinished() && rankingProcess.sizeQueue() == 0");
                        break;
                    }

                    // get next entry
                    page = ResultFetcher.this.rankingProcess.takeURL(true, Math.min(100, this.timeout - System.currentTimeMillis()));
                    //if (page == null) page = rankedCache.takeURL(false, this.timeout - System.currentTimeMillis());
                    if (page == null) {
                        //System.out.println("page == null");
                        break; // no more available
                    }
                    if (ResultFetcher.this.query.filterfailurls && ResultFetcher.this.workTables.failURLsContains(page.hash())) continue;

                    loops++;
                    final ResultEntry resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
                    if (resultEntry == null) continue; // the entry had some problems, cannot be used
                    final String rawLine = resultEntry.textSnippet() == null ? null : resultEntry.textSnippet().getLineRaw();
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

     * @param args
     */
    public static void main(final String[] args) {
        try {
            final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/");
            final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0f, 0.0f, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0);
            EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING;
            System.out.println("valid test:\n=======");
            // add
            stack(urlRef, urlRef.hash(), url.hash(), stackNo);
            // size
            System.out.println("size of stack:\t"+ getStackSize(stackNo));
        } catch (final MalformedURLException e) {
            Log.logException(e);
        }
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

        Date lastMod = entry.lastmod(null);
        if (lastMod != null) {
            final String dbocc = this.sb.urlExists(Segments.Process.LOCALCRAWLING, nexturlhash);
            if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
                // the url was already loaded. we need to check the date
                final URIMetadataRow oldEntry = this.sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(nexturlhash);
                if (oldEntry != null) {
                    final Date modDate = oldEntry.moddate();
                    // check if modDate is null
                    if (modDate.after(lastMod)) return;
                }
            }
        }
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

            return "post url not allowed";
        }

        // check if the url is double registered
        final String dbocc = nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
        URIMetadataRow oldEntry = indexSegment.urlMetadata().load(url.hash());
        if (oldEntry == null) {
            if (dbocc != null) {
                // do double-check
                if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'.");
                if (dbocc.equals("errors")) {
                    ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
                    return "double in: errors (" + errorEntry.anycause() + ")";
                } else {
                    return "double in: " + dbocc;
                }
            }
        } else {
            final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime();
            if (recrawl) {
                if (this.log.isInfo())
                    this.log.logInfo("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " +
                        ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
            } else {
                if (dbocc == null) {
                    return "double in: LURL-DB";
                } else {
                    if (this.log.isInfo()) this.log.logInfo("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

                    Reference iEntry;
                    while (wordIdxEntries.hasNext()) {
                        iEntry = wordIdxEntries.next();
                        final byte[] urlHash = iEntry.urlhash();
                        if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
                            final URIMetadataRow urlEntry = currentUrlDB.load(urlHash);
                            urlCounter++;
                            minimizedUrlDB.store(urlEntry);
                            if (urlCounter % 500 == 0) {
                                log.logInfo(urlCounter + " URLs found so far.");
                            }
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

                    prop.put("mode_feed", "0");
                } else {
                    final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash);
                    if (bookmark == null) {
                        // try to get the bookmark from the LURL database
                        final URIMetadataRow urlentry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(ASCII.getBytes(urlHash));
                        if (urlentry != null) try {
                            final URIMetadataRow.Components metadata = urlentry.metadata();
                            final Document document = Document.mergeDocuments(metadata.url(), null, sb.loader.loadDocuments(sb.loader.request(metadata.url(), true, false), CacheStrategy.IFEXIST, 5000, Long.MAX_VALUE));
                            prop.put("mode_edit", "0"); // create mode
                            prop.put("mode_url", metadata.url().toNormalform(false, true));
                            prop.putHTML("mode_title", metadata.dc_title());
                            prop.putHTML("mode_description", (document == null) ? metadata.dc_title(): document.dc_title());
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

                Log.logException(e);
            }
        }
        if (urlhash == null || urlhash.length() == 0) return prop;

        final URIMetadataRow entry = segment.urlMetadata().load(urlhash.getBytes());
        if (entry == null) return prop;

        final URIMetadataRow.Components metadata = entry.metadata();
        if (metadata.url() == null) {
            return prop;
        }
        final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());

        prop.putXML("dc_title", metadata.dc_title());
        prop.putXML("dc_creator", metadata.dc_creator());
        prop.putXML("dc_description", ""); // this is the fulltext part in the surrogate
        prop.putXML("dc_subject", metadata.dc_subject());
        prop.putXML("dc_publisher", metadata.dc_publisher());
        prop.putXML("dc_contributor", "");
        prop.putXML("dc_date", ISO8601Formatter.FORMATTER.format(entry.moddate()));
        prop.putXML("dc_type", String.valueOf(entry.doctype()));
        prop.putXML("dc_identifier", metadata.url().toNormalform(false, true));
        prop.putXML("dc_language", UTF8.String(entry.language()));
        prop.put("geo_lat", metadata.lat());
        prop.put("geo_long", metadata.lon());

        prop.put("yacy_urlhash", metadata.url().hash());
        prop.putXML("yacy_loaddate", entry.loaddate().toString());
        prop.putXML("yacy_referrer_hash", (le == null) ? "" : ASCII.String(le.hash()));
        prop.putXML("yacy_referrer_url", (le == null) ? "" : le.metadata().url().toNormalform(false, true));
        prop.put("yacy_size", entry.size());
        prop.put("yacy_words",entry.wordCount());

        // return rewrite properties
        return prop;
View Full Code Here

Examples of net.yacy.kelondro.data.meta.URIMetadataRow

          prop.put("delay", "9999");
          return prop;
      }

        // generating a new loaded URL entry
        final URIMetadataRow entry = URIMetadataRow.importEntry(propStr);
        if (entry == null) {
            if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr);
            prop.put("delay", "3600");
            return prop;
        }

        final URIMetadataRow.Components metadata = entry.metadata();
        if (metadata.url() == null) {
            if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + ASCII.String(entry.hash()) + " from peer " + iam + "\n\tURL properties: "+ propStr);
            prop.put("delay", "3600");
            return prop;
        }

        // check if the entry is in our network domain
        final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(metadata.url());
        if (urlRejectReason != null) {
            if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (" + urlRejectReason + ") for hash " + ASCII.String(entry.hash()) + " from peer " + iam + "\n\tURL properties: "+ propStr);
            prop.put("delay", "9999");
            return prop;
        }

        if ("fill".equals(result)) try {
            // put new entry into database
            sb.indexSegments.urlMetadata(Segments.Process.RECEIPTS).store(entry);
            ResultURLs.stack(entry, youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS);
            sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done
            if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + ASCII.String(entry.hash()) + ":" + metadata.url().toNormalform(false, true));

            // ready for more
            prop.put("delay", "10");
            return prop;
        } catch (final IOException e) {
            Log.logException(e);
            prop.put("delay", "3600");
            return prop;
        }

        sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case
        sb.crawlQueues.errorURL.push(
                entry.toBalancerEntry(iam),
                youare.getBytes(),
                null,
                0,
                FailCategory.FINAL_LOAD_CONTEXT,
                result + ":" + reason, -1);
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.