Package de.anomic.crawler.retrieval

Examples of de.anomic.crawler.retrieval.Request
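de.anomic.crawler.retrieval.Request is the crawl-queue entry of the YaCy crawler: it bundles a target URL with the initiating peer, the referrer's URL hash, the anchor name, a date, and a crawl-profile handle, and it can be serialized to and from a kelondro Row.Entry. The excerpts below show how a Request is typically constructed, stored, and consumed. Each excerpt is cut from a longer method, so surrounding declarations and some trailing constructor arguments are omitted.

The first excerpt normalizes the hyperlinks extracted from a fetched document, keeps only the supported URL schemes, and enqueues each link as a new Request (the constructor call breaks off after the Date argument):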


                nextUrl = nextEntry.getKey();
                final String u = nextUrl.toNormalform(true, true, false, true);
                if (!(u.startsWith("http://") || u.startsWith("https://") || u.startsWith("ftp://") || u.startsWith("smb://") || u.startsWith("file://"))) continue;
                // enqueue the hyperlink into the pre-notice-url db
                try {
                    this.crawlStacker.enqueueEntry(new Request(
                            response.initiator(),
                            new DigestURI(u),
                            response.url().hash(),
                            nextEntry.getValue(),
                            new Date(),
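
The construction above is truncated. For orientation, here is a hedged sketch of how such a call might continue; every argument after new Date(), and the response accessors used for them, are assumptions rather than code from the excerpt:

                // Hedged sketch only: parameters past new Date() are assumed,
                // not taken from the truncated excerpt above.
                this.crawlStacker.enqueueEntry(new Request(
                        response.initiator(),            // peer that initiated the crawl
                        new DigestURI(u),                // normalized target URL
                        response.url().hash(),           // hash of the referring document
                        nextEntry.getValue(),            // anchor text, used as the link name
                        new Date(),                      // appearance date
                        response.profile().handle(),     // crawl profile handle (assumed accessor)
                        response.depth() + 1,            // crawl depth (assumed accessor)
                        0, 0));                          // anchor / fork counters (assumed)

The next excerpt records a heuristic hit and prepares a URL for local crawling, skipping it when it is already indexed or rejected by the crawl stacker's acceptance check: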


        final Segments.Process process = Segments.Process.LOCALCRAWLING;
        if (searchEvent != null) {
            searchEvent.addHeuristic(url.hash(), heuristicName, true);
        }
        if (this.indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
        final Request request = this.loader.request(url, true, true);
        final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
        final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
        if (acceptedError != null) {
            this.log.logWarning("addToIndex: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
            return;
        }
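
This fragment is the start of the routine that records a failed crawl: it wraps the URL in a fresh Request (again truncated) before writing it to the errorURL database: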

            final FailCategory failCategory,
            final String failreason
    ) {
        // assert initiator != null; // null == proxy
        // create a new errorURL DB entry
        final Request bentry = new Request(
                initiator,
                url,
                referrerHash,
                (name == null) ? "" : name,
                new Date(),
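
Here the balancer refills its per-host domain stacks from the on-disk URL index; each Row.Entry is deserialized into a Request just to recover the host name behind the handle: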

        this.lastDomainStackFill = System.currentTimeMillis();
        final HandleSet handles = this.urlFileIndex.keysFromBuffer(objectIndexBufferSize / 2);
        final CloneableIterator<byte[]> i = handles.keys(true, null);
        byte[] handle;
        String host;
        Request request;
        while (i.hasNext()) {
            handle = i.next();
            final Row.Entry entry = this.urlFileIndex.get(handle);
            if (entry == null) continue;
            request = new Request(entry);
            host = request.url().getHost();
            try {
                pushHashToDomainStacks(host, handle);
            } catch (final RowSpaceExceededException e) {
                break;
            }
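
The next excerpt assembles a list of crawl entries fairly: it first drains an array of pre-selected handles, then walks all domain stacks round-robin, one depth level per pass, and finally tops the list up from the URL index if needed: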

        for (final byte[] n : ta) {
            if (n == null) break;
            try {
                final Row.Entry rowEntry = this.urlFileIndex.get(n);
                if (rowEntry == null) continue;
                final Request crawlEntry = new Request(rowEntry);
                cel.add(crawlEntry);
                count--;
                if (count <= 0) break;
            } catch (final IOException e) {
                // skip handles whose rows cannot be read
            }
        }

        int depth = 0;
        loop: while (count > 0) {
            // iterate over the domain stacks
            final int celsize = cel.size();
            ll: for (final HandleSet list : this.domainStacks.values()) {
                if (list.size() <= depth) continue ll;
                final byte[] n = list.getOne(depth);
                if (n == null) continue ll;
                try {
                    final Row.Entry rowEntry = this.urlFileIndex.get(n);
                    if (rowEntry == null) continue;
                    final Request crawlEntry = new Request(rowEntry);
                    cel.add(crawlEntry);
                    count--;
                    if (count <= 0) break loop;
                } catch (final IOException e) {
                    // skip handles whose rows cannot be read
                }
            }
            if (cel.size() == celsize) break loop;
            depth++;
        }

        if (cel.size() < count) try {
            final List<Row.Entry> list = this.urlFileIndex.top(count - cel.size());
            for (final Row.Entry entry : list) cel.add(new Request(entry));
        } catch (final IOException e) {
            // the index cannot deliver more entries; return what we have
        }
        return cel;
    }
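
The round-robin selection is easier to see in isolation. A minimal, self-contained sketch of the same idea, with hypothetical names rather than YaCy types, looks like this:

    import java.util.*;

    final class RoundRobinPick {
        // Take at most 'count' items, visiting every host queue once per
        // depth level so no single host dominates the result.
        static <T> List<T> pick(final Map<String, List<T>> hostQueues, int count) {
            final List<T> out = new ArrayList<T>();
            int depth = 0;
            loop: while (count > 0) {
                final int before = out.size();
                for (final List<T> queue : hostQueues.values()) {
                    if (queue.size() <= depth) continue; // host exhausted at this depth
                    out.add(queue.get(depth));
                    if (--count <= 0) break loop;
                }
                if (out.size() == before) break; // no queue had another element
                depth++;
            }
            return out;
        }
    }

Request also supports iteration over a queue: the iterator below wraps a row iterator and deserializes each entry on the fly, returning null and dropping the iterator when a row cannot be read: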

        }

        public Request next() {
            final Row.Entry entry = this.rowIterator.next();
            try {
                return (entry == null) ? null : new Request(entry);
            } catch (final IOException e) {
                Log.logException(e);
                this.rowIterator = null;
                return null;
            }
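
A stored failure record is decoded field by field from its Row.Entry; column 5 holds the embedded, serialized Request, and the assertion verifies that the record's primary key equals the URL hash of that Request: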

            assert (entry != null);
            this.executor = entry.getColBytes(1, true);
            this.workdate = new Date(entry.getColLong(2));
            this.workcount = (int) entry.getColLong(3);
            this.anycause = entry.getColString(4);
            this.bentry = new Request(Request.rowdef.newEntry(entry.getColBytes(5, false)));
            assert (Base64Order.enhancedCoder.equal(entry.getPrimaryKeyBytes(), bentry.url().hash()));
            this.stored = true;
            return;
        }
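
The sitemap importer enqueues each sitemap entry the same way, with the local peer's hash as initiator and the entry's last-modified date: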

                }
            }
        }

        // URL needs to be crawled
        this.sb.crawlStacker.enqueueEntry(new Request(
                ASCII.getBytes(this.sb.peers.mySeed().hash),
                url,
                null, // this.siteMapURL.toString(),
                entry.url(),
                entry.lastmod(new Date()),
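
To map a URL hash back to a URL, the crawler checks the error queue, then the requests held by the currently active workers, and finally the notice queue: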

        ee = this.errorURL.get(urlhash);
        if (ee != null) return ee.url();
        for (final Loader w: this.workers.values()) {
            if (Base64Order.enhancedCoder.equal(w.request.url().hash(), urlhash)) return w.request.url();
        }
        final Request ne = this.noticeURL.get(urlhash);
        if (ne != null) return ne.url();
        return null;
    }
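
Finally, the local-crawl dequeue loop: NOLOAD entries are pushed straight onto the indexing queue, while CORE entries are resolved to their crawl profile and handed to the loader: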

            if (this.log.isFine()) this.log.logFine("omitting de-queue/local: paused");
            return false;
        }

        // do a local crawl
        Request urlEntry;
        while (this.noticeURL.stackSize(NoticedURL.StackType.CORE) > 0 || this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
            final String stats = "LOCALCRAWL[" +
                this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) + ", " +
                this.noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " +
                this.noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " +
                this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) +
                ", " + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
            try {
                if (this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
                    // get one entry that will not be loaded, just indexed
                    urlEntry = this.noticeURL.pop(NoticedURL.StackType.NOLOAD, true, this.sb.crawler);
                    if (urlEntry == null) continue;
                    final String profileHandle = urlEntry.profileHandle();
                    if (profileHandle == null) {
                        this.log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                        return true;
                    }
                    final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(profileHandle));
                    if (profile == null) {
                        this.log.logSevere(stats + ": NO ACTIVE PROFILE FOR HANDLE '" + profileHandle + "' for URL " + urlEntry.url());
                        return true;
                    }
                    try {
                        this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(Segments.Process.LOCALCRAWLING, new Response(urlEntry, profile), null, null));
                        Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false));
                    } catch (final InterruptedException e) {
                        Log.logException(e);
                    }
                    return true;
                }

                urlEntry = this.noticeURL.pop(NoticedURL.StackType.CORE, true, this.sb.crawler);
                if (urlEntry == null) continue;
                final String profileHandle = urlEntry.profileHandle();
                // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
                // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
                if (profileHandle == null) {
                    this.log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                    return true;
                }
                load(urlEntry, stats, profileHandle);
                return true;
            } catch (final IOException e) {
