Package de.anomic.crawler.retrieval

Examples of de.anomic.crawler.retrieval.Request

The excerpts below come from the YaCy peer-to-peer search engine, where Request represents a single crawl queue entry. They show how requests are deserialized from database rows, rebuilt from stored metadata, stacked for crawling, and recorded when a fetch fails.


            // deserialize an error-URL record from a database row;
            // column 5 holds the original crawl Request in its own row format
            assert (entry != null);
            this.executor = entry.getColBytes(1, true);
            this.workdate = new Date(entry.getColLong(2));
            this.workcount = (int) entry.getColLong(3);
            this.anycause = entry.getColUTF8(4);
            this.bentry = new Request(Request.rowdef.newEntry(entry.getColBytes(5, false)));
            // the row's primary key must be the URL hash of the embedded Request
            assert (Base64Order.enhancedCoder.equal(entry.getPrimaryKeyBytes(), this.bentry.url().hash()));
            this.stored = true;
            return;
        }
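
The excerpt rebuilds a stored Request from column 5 of a database row and asserts that the row's primary key equals the URL hash of the embedded request. A minimal sketch of that invariant, with hypothetical names standing in for the kelondro row API:

    import java.util.Arrays;

    // Hypothetical stand-in for a database row that embeds a serialized request.
    final class FailureRow {
        final byte[] primaryKey;        // plays the role of entry.getPrimaryKeyBytes()
        final byte[] serializedRequest; // plays the role of entry.getColBytes(5, false)

        FailureRow(byte[] primaryKey, byte[] serializedRequest) {
            this.primaryKey = primaryKey;
            this.serializedRequest = serializedRequest;
        }

        // the consistency check the excerpt expresses with an assert: the row
        // must be keyed by the URL hash of the request it embeds
        boolean isConsistent(byte[] requestUrlHash) {
            return Arrays.equals(this.primaryKey, requestUrlHash);
        }
    }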


        return core.toString();
        //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
    }

    // convert this stored response/metadata object back into a crawl queue entry
    public Request toBalancerEntry(final String initiatorHash) {
        return new Request(
                ASCII.getBytes(initiatorHash),   // the initiator is passed in as a hash string
                metadata().url(),
                referrerHash(),
                metadata().dc_title(),           // the document title becomes the request name
                moddate(),
                // ... remaining constructor arguments truncated in this excerpt
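
All of the excerpts on this page are cut off inside a Request constructor call, but the leading parameters are consistent across them. The annotated skeleton below summarizes only what is visible here; the trailing arguments never appear on this page and are left elided rather than guessed:

    // Leading Request constructor parameters, as visible in these excerpts:
    //
    //   new Request(
    //           initiator,     // byte[]    - hash of the peer that caused the crawl
    //           url,           // DigestURI - the URL to be fetched
    //           referrerHash,  // byte[]    - hash of the referring page, null for root crawls
    //           name,          // String    - link name or document title
    //           appdate,       // Date      - when the request was created
    //           ...            // further arguments truncated in every excerpt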

                return prop;
            }

            // stack the start URL: stackCrawl returns null if the URL was accepted,
            // otherwise a string describing why it was rejected
            final String reasonString = sb.crawlStacker.stackCrawl(new Request(
                    sb.peers.mySeed().hash.getBytes(),     // this peer initiates the crawl
                    crawlingStartURL,
                    null,                                  // a root crawl has no referrer
                    (title == null) ? "CRAWLING-ROOT" : title,
                    new Date(),
                    // ... remaining constructor arguments truncated in this excerpt
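
Note the return convention: judging from the variable name here and from checkAcceptance further down, a null return from stackCrawl means the URL was accepted, and any non-null string is the rejection reason. A sketch of that error-as-return-value convention (illustrative names, not YaCy API):

    // Illustrative only: the "null means success" convention used above.
    final class UrlAcceptor {
        // returns null if accepted, otherwise a human-readable rejection reason
        String check(final String url) {
            if (url == null || url.isEmpty()) return "empty url";
            if (!url.startsWith("http://") && !url.startsWith("https://")) return "unsupported protocol";
            return null; // accepted
        }
    }

    // usage:
    //   final String reason = acceptor.check(url);
    //   if (reason != null) log.warn("rejected: " + reason);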

                continue;
            }

            // create a queue entry for a surrogate (imported) document;
            // the local peer is the initiator and there is no referrer or link name
            final Document document = surrogate.document();
            final Request request = new Request(
                    ASCII.getBytes(this.peers.mySeed().hash),
                    surrogate.getIdentifier(true),
                    null,
                    "",
                    surrogate.getDate(),
                    // ... remaining constructor arguments truncated in this excerpt
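
Here each imported surrogate record becomes one queue entry: the local peer acts as initiator, and referrer and link name are empty because the document did not come from a crawl. A self-contained sketch of that loop shape (all types are hypothetical):

    import java.util.Date;
    import java.util.List;

    // Hypothetical types mirroring the surrogate-import loop above.
    record ImportedRecord(String url, Date date) {}
    record QueueEntry(byte[] initiator, String url, String referrer, String name, Date date) {}

    final class SurrogateImport {
        static void enqueueAll(final List<ImportedRecord> records, final byte[] localPeerHash,
                               final List<QueueEntry> queue) {
            for (final ImportedRecord r : records) {
                // no referrer and no link name: the record was imported, not crawled
                queue.add(new QueueEntry(localPeerHash, r.url(), null, "", r.date()));
            }
        }
    }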

                nextUrl = nextEntry.getKey();
                final String u = nextUrl.toNormalform(true, true, false, true);
                // follow only links with a supported protocol
                if (!(u.startsWith("http://") || u.startsWith("https://") || u.startsWith("ftp://") || u.startsWith("smb://") || u.startsWith("file://"))) continue;
                // enqueue the hyperlink into the pre-notice-url db
                try {
                    this.crawlStacker.enqueueEntry(new Request(
                            response.initiator(),      // inherit the initiator of the parent document
                            new DigestURI(u),
                            response.url().hash(),     // the parent document is the referrer
                            nextEntry.getValue(),      // the anchor text becomes the request name
                            new Date(),
                            // ... remaining constructor arguments truncated in this excerpt
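
The prefix chain that filters protocols can also be expressed with a scheme whitelist; a compact sketch (not YaCy code, requires Java 9+ for Set.of):

    import java.net.URI;
    import java.util.Set;

    final class SchemeFilter {
        // the five protocols accepted in the excerpt above
        private static final Set<String> ACCEPTED =
                Set.of("http", "https", "ftp", "smb", "file");

        static boolean accepted(final String url) {
            try {
                final String scheme = URI.create(url).getScheme();
                return scheme != null && ACCEPTED.contains(scheme.toLowerCase());
            } catch (final IllegalArgumentException e) {
                return false; // malformed URL
            }
        }
    }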

        final Segments.Process process = Segments.Process.LOCALCRAWLING;
        if (searchEvent != null) {
            searchEvent.addHeuristic(url.hash(), heuristicName, true);
        }
        // skip URLs that are already in the index
        if (this.indexSegments.segment(process).exists(url.hash())) return; // don't do double-work
        // build a loader request and verify that the crawl profile accepts the URL;
        // checkAcceptance returns null when the URL is accepted
        final Request request = this.loader.request(url, true, true);
        final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
        final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
        if (acceptedError != null) {
            this.log.logWarning("addToIndex: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
            return;
        }
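
addToIndex guards itself twice before doing any work: it skips URLs whose hash is already indexed, and it skips URLs the crawl profile rejects. The control flow, reduced to a self-contained skeleton with illustrative names:

    import java.util.Set;

    final class Indexer {
        private final Set<String> index;

        Indexer(final Set<String> index) { this.index = index; }

        // null means accepted; any other value is the rejection reason
        private String acceptance(final String urlHash) {
            return (urlHash == null) ? "no url hash" : null;
        }

        void addToIndex(final String urlHash) {
            if (this.index.contains(urlHash)) return; // don't do double-work
            final String error = acceptance(urlHash);
            if (error != null) {
                System.err.println("addToIndex: cannot load " + urlHash + ": " + error);
                return;
            }
            this.index.add(urlHash); // ...load and index the document here
        }
    }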

            final FailCategory failCategory,
            final String failreason
    ) {
        // assert initiator != null; // null == proxy
        // create a new errorURL DB entry; the Request preserves what was requested
        // so the failure can be reported later
        final Request bentry = new Request(
                initiator,
                url,
                referrerHash,
                (name == null) ? "" : name,   // guard against a missing link name
                new Date(),
                // ... remaining constructor arguments truncated in this excerpt
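
The failure path stores a full Request together with a fail category and reason, so a failed URL can be reported and inspected later. A minimal, non-YaCy sketch of such a failure log:

    import java.util.Date;
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    // Illustrative failure log mirroring the errorURL pattern above.
    final class FailureLog {
        record Failure(String category, String reason, Date when) {}

        private final Map<String, Failure> failures = new ConcurrentHashMap<>();

        void record(final String urlHash, final String category, final String reason) {
            this.failures.put(urlHash, new Failure(category, reason, new Date()));
        }

        Failure lookup(final String urlHash) {
            return this.failures.get(urlHash);
        }
    }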

        // if the snippet is not in the cache, we can try to get it from the htcache
        Response response = null;
        try {
            // first try to get the snippet from the document metadata; the assignments
            // inside the conditions keep whichever field matched in `loc`
            String loc;
            final Request request = loader.request(url, true, reindexing);
            final boolean inCache = de.anomic.http.client.Cache.has(comp.url());
            final boolean noCacheUsage = url.isFile() || url.isSMB() || cacheStrategy == null;
            if (containsAllHashes(loc = comp.dc_title(), queryhashes) ||
                containsAllHashes(loc = comp.dc_creator(), queryhashes) ||
                containsAllHashes(loc = comp.dc_subject(), queryhashes) ||
                // ... further conditions truncated in this excerpt
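
The chained conditions assign inside the condition so that loc keeps whichever metadata field matched first. The same logic written as an explicit loop, with a stub standing in for containsAllHashes:

    import java.util.List;

    final class MetadataMatch {
        // stub standing in for containsAllHashes(text, queryhashes)
        static boolean containsAllWords(final String text, final List<String> words) {
            if (text == null) return false;
            for (final String w : words) if (!text.contains(w)) return false;
            return true;
        }

        // try title, creator, subject in order; return the first field that matches
        static String firstMatching(final String title, final String creator,
                                    final String subject, final List<String> words) {
            for (final String candidate : new String[] { title, creator, subject }) {
                if (containsAllWords(candidate, words)) return candidate;
            }
            return null;
        }
    }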
