Package de.anomic.crawler.retrieval

Examples of de.anomic.crawler.retrieval.Request


                            cq.noticeURL.removeByURLHash(urlhash);
                            cq.errorURL.remove(urlhash);
                        }

                        // put entry on crawl stack
                        enqueueEntry(new Request(
                                initiator,
                                url,
                                null,
                                MultiProtocolURI.unescape(entry.name),
                                entry.date,
View Full Code Here


     * @param url
     * @return null if successful, a reason string if not successful
     */
    public String stackSimpleCrawl(final DigestURI url) {
      final CrawlProfile pe = this.crawler.defaultSurrogateProfile;
      return stackCrawl(new Request(
                this.peers.mySeed().hash.getBytes(),
                url,
                null,
                "CRAWLING-ROOT",
                new Date(),
View Full Code Here

            return "error pushing onto the crawl stack: " + er.getMessage();
        }
    }

    public Request get(final byte[] urlhash) {
        Request entry = null;
        try {if ((entry = this.noloadStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
        try {if ((entry = this.coreStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
        try {if ((entry = this.limitStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
        try {if ((entry = this.remoteStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
        return null;
View Full Code Here

        }
    }

    public void shift(final StackType fromStack, final StackType toStack, final CrawlSwitchboard cs) {
        try {
            final Request entry = pop(fromStack, false, cs);
            if (entry != null) {
                final String warning = push(toStack, entry);
                if (warning != null) {
                    Log.logWarning("NoticedURL", "shift from " + fromStack + " to " + toStack + ": " + warning);
                }
View Full Code Here

    }

    private Request pop(final Balancer balancer, final boolean delay, final CrawlSwitchboard cs) throws IOException {
        // this is a filo - pop
        int s;
        Request entry;
        int errors = 0;
        synchronized (balancer) {
            while ((s = balancer.size()) > 0) {
                entry = balancer.pop(delay, cs);
                if (entry == null) {
View Full Code Here

            // case 1 and case 3
            if (cachedResponseHeader == null) {
                if (log.isFinest()) log.logFinest(reqID + " page not in cache: fulfill request from web");
                    fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond);
            } else {
              final Request request = new Request(
                  null,
                        url,
                        requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(),
                        "",
                        cachedResponseHeader.lastModified(),
View Full Code Here

                    Cache.delete(url);
                    conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS");
                }

                // reserve cache entry
                final Request request = new Request(
                  null,
                        url,
                        requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(),
                        "",
                        responseHeader.lastModified(),
View Full Code Here

            prop.put("crawler-queue", "0");
        } else {
            prop.put("crawler-queue", "1");
            final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.REMOTE, showLimit);
           
            Request urle;
            boolean dark = true;
            Seed initiator;
            String profileHandle;
            CrawlProfile profileEntry;
            int i, showNum = 0;
            for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
                urle = crawlerList.get(i);
                if (urle != null && urle.url() != null) {
                    initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
                    profileHandle = urle.profileHandle();
                    profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
                    prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");
                    prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
                    prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
                    prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth());
                    prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.appdate()) );
                    prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name());
                    prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString());
                    prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash());
                    dark = !dark;
                    showNum++;
                } else {
                    stackSize--;
                }
View Full Code Here

    public Request request(
            final DigestURI url,
            final boolean forText,
            final boolean global
                    ) {
        return new Request(
                ASCII.getBytes(this.sb.peers.mySeed().hash),
                    url,
                    null,
                    "",
                    new Date(),
View Full Code Here

    public Request get(final byte[] urlhash) throws IOException {
        assert urlhash != null;
        if (this.urlFileIndex == null) return null; // case occurs during shutdown
        final Row.Entry entry = this.urlFileIndex.get(urlhash, false);
        if (entry == null) return null;
        return new Request(entry);
    }
View Full Code Here

TOP

Related Classes of de.anomic.crawler.retrieval.Request

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.