Package de.anomic.crawler.retrieval

Examples of de.anomic.crawler.retrieval.Request
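The excerpts below come from the YaCy crawler (Balancer, CrawlQueues and related classes) and show the two usual ways a Request is obtained: reconstructed from a serialized Row.Entry of the URL file index, or created with the multi-argument constructor when a new URL is stacked for crawling. A minimal sketch of the first pattern, with import paths assumed from the YaCy source tree these snippets belong to (the Request(Row.Entry) constructor can throw IOException on a corrupt row, as the iterator example further down shows):

import java.io.IOException;

import de.anomic.crawler.retrieval.Request;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.logging.Log;

final class RequestRowExample {

    /**
     * Rebuild a Request from a row of the URL file index and return the hash of its URL,
     * or null if the row cannot be decoded.
     */
    static byte[] urlHashOf(final Row.Entry rowEntry) {
        if (rowEntry == null) return null;
        try {
            final Request crawlEntry = new Request(rowEntry);
            // profileHandle() ties the entry to its crawl profile,
            // url() yields the URL whose hash() is used as primary key in the crawler queues
            return crawlEntry.url().hash();
        } catch (final IOException e) {
            Log.logException(e);
            return null;
        }
    }
}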


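        // collect the hashes of all queued URLs that belong to the given crawl profile (bounded by an optional timeout)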
        final HandleSet urlHashes = new HandleSet(this.urlFileIndex.row().primaryKeyLength, Base64Order.enhancedCoder, 100);
        final long terminate = (timeout > 0) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE;
        synchronized (this) {
            final Iterator<Row.Entry> i = this.urlFileIndex.rows();
            Row.Entry rowEntry;
            Request crawlEntry;
            while (i.hasNext() && (System.currentTimeMillis() < terminate)) {
                rowEntry = i.next();
                crawlEntry = new Request(rowEntry);
                if (crawlEntry.profileHandle().equals(profileHandle)) {
                    urlHashes.put(crawlEntry.url().hash());
                }
            }
        }

        // then delete all these urls from the queues and the file index
View Full Code Here


            filltop(delay, 6000, true);
            filltop(delay, Long.MAX_VALUE, true);
        } catch (final RowSpaceExceededException e) {}

        long sleeptime = 0;
        Request crawlEntry = null;
        synchronized (this) {
            byte[] failhash = null;
            while (!this.urlFileIndex.isEmpty()) {
                // first simply take one of the entries in the top list, that should be one without any delay
                byte[] nexthash = nextFromDelayed();
                //System.out.println("*** nextFromDelayed=" + nexthash);
                if (nexthash == null && !this.top.isEmpty()) {
                    nexthash = this.top.remove();
                    //System.out.println("*** top.remove()=" + nexthash);
                }
                if (nexthash == null) {
                    nexthash = anyFromDelayed();
                }

                // check minimumDelta and if necessary force a sleep
                //final int s = urlFileIndex.size();
                Row.Entry rowEntry = (nexthash == null) ? null : this.urlFileIndex.remove(nexthash);
                if (rowEntry == null) {
                    //System.out.println("*** rowEntry=null, nexthash=" + UTF8.String(nexthash));
                    rowEntry = this.urlFileIndex.removeOne();
                    if (rowEntry == null) {
                        nexthash = null;
                    } else {
                        nexthash = rowEntry.getPrimaryKeyBytes();
                        //System.out.println("*** rowEntry.getPrimaryKeyBytes()=" + UTF8.String(nexthash));
                    }
                }
                if (rowEntry == null) {
                    Log.logWarning("Balancer", "removeOne() failed - size = " + size());
                    return null;
                }
                //assert urlFileIndex.size() + 1 == s : "urlFileIndex.size() = " + urlFileIndex.size() + ", s = " + s + ", result = " + result;

                crawlEntry = new Request(rowEntry);
                //Log.logInfo("Balancer", "fetched next url: " + crawlEntry.url().toNormalform(true, false));

                // at this point we must check if the crawlEntry is still relevant, i.e. whether its crawl profile still exists
                // if not: return null. A calling method must handle the null value and try again
                final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
                if (profileEntry == null) {
                    Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
                    return null;
                }
                // depending on the caching policy we need sleep time to avoid DoS-like situations
                sleeptime = (
                        profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
                        (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
                        ) ? 0 : Latency.waitingRemaining(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server

                assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
                assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());

                if (failhash != null && Base64Order.enhancedCoder.equal(failhash, nexthash)) break; // prevent endless loops

                if (delay && sleeptime > 0 && this.domStackInitSize > 1) {
                    //System.out.println("*** putback: nexthash=" + nexthash + ", failhash="+failhash);
                    // put that entry back so that we do not have to wait here
                    if (!ByteBuffer.contains(this.delayed.values(), nexthash)) {
                        //System.out.println("*** delayed +=" + nexthash);
                        this.delayed.put(Long.valueOf(System.currentTimeMillis() + sleeptime + 1), nexthash);
                    }
                    try {
                        this.urlFileIndex.put(rowEntry);
                        String host = crawlEntry.url().getHost();
                        if (host == null) host = localhost;
                        this.domainStacks.remove(host);
                        failhash = nexthash;
                    } catch (final RowSpaceExceededException e) {
                        Log.logException(e);
                    }
                    continue;
                }
                break;
            }
            if (crawlEntry != null) {
                if (this.ddc.size() > 10000 || MemoryControl.shortStatus()) this.ddc.clear();
                try { this.ddc.put(crawlEntry.url().hash()); } catch (final RowSpaceExceededException e) {}
            }
        }
        if (crawlEntry == null) return null;

        if (delay && sleeptime > 0) {
            // force a busy waiting here
            // in best case, this should never happen if the balancer works properly
            // this is only a protection against the worst case, where the crawler could
            // behave in a DoS-manner
            Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta) + ", top.size() = " + this.top.size() + ", delayed.size() = " + this.delayed.size() + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
            long loops = sleeptime / 1000;
            long rest = sleeptime % 1000;
            if (loops < 3) {
                rest = rest + 1000 * loops;
                loops = 0;
            }
            if (rest > 0) { try { this.wait(rest); } catch (final InterruptedException e) {} }
            for (int i = 0; i < loops; i++) {
                Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
                try { this.wait(1000); } catch (final InterruptedException e) {}
            }
        }
        this.ddc.remove(crawlEntry.url().hash());
        Latency.update(crawlEntry.url());
        return crawlEntry;
    }
View Full Code Here
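The tail of the pop() excerpt above enforces the computed crawl delay by splitting sleeptime into whole-second wait slices plus a millisecond remainder, and folds short delays (fewer than three full seconds) into a single wait with no per-second reporting. A standalone sketch of that splitting, independent of the YaCy classes (the monitor object and the System.out call are stand-ins for the balancer's own lock and Log calls):

final class CrawlDelaySketch {

    /** Wait for 'sleeptime' milliseconds, reporting progress once per remaining second. */
    static void sleepWithProgress(final Object monitor, final long sleeptime) throws InterruptedException {
        long loops = sleeptime / 1000;   // whole seconds to wait
        long rest  = sleeptime % 1000;   // remaining milliseconds
        if (loops < 3) {                 // short delays: one single wait, no per-second reporting
            rest = rest + 1000 * loops;
            loops = 0;
        }
        synchronized (monitor) {
            if (rest > 0) monitor.wait(rest);
            for (int i = 0; i < loops; i++) {
                System.out.println("waiting: " + (loops - i) + " seconds remaining...");
                monitor.wait(1000);
            }
        }
    }
}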

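        // refill the per-host domain stacks with URL hashes taken from the URL file index buffer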
        this.lastDomainStackFill = System.currentTimeMillis();
        final HandleSet handles = this.urlFileIndex.keysFromBuffer(objectIndexBufferSize / 2);
        final CloneableIterator<byte[]> i = handles.keys(true, null);
        byte[] handle;
        String host;
        Request request;
        while (i.hasNext()) {
            handle = i.next();
            final Row.Entry entry = this.urlFileIndex.get(handle, false);
            if (entry == null) continue;
            request = new Request(entry);
            host = request.url().getHost();
            try {
                pushHashToDomainStacks(host, handle);
            } catch (final RowSpaceExceededException e) {
                break;
            }
View Full Code Here

        for (final byte[] n: ta) {
            if (n == null) break;
            try {
                final Row.Entry rowEntry = this.urlFileIndex.get(n, false);
                if (rowEntry == null) continue;
                final Request crawlEntry = new Request(rowEntry);
                cel.add(crawlEntry);
                count--;
                if (count <= 0) break;
            } catch (final IOException e) {}
        }

        int depth = 0;
        loop: while (count > 0) {
            // iterate over the domain stacks
            final int celsize = cel.size();
            ll: for (final HandleSet list: this.domainStacks.values()) {
                if (list.size() <= depth) continue ll;
                final byte[] n = list.getOne(depth);
                if (n == null) continue ll;
                try {
                    final Row.Entry rowEntry = this.urlFileIndex.get(n, false);
                    if (rowEntry == null) continue;
                    final Request crawlEntry = new Request(rowEntry);
                    cel.add(crawlEntry);
                    count--;
                    if (count <= 0) break loop;
                } catch (final IOException e) {}
            }
            if (cel.size() == celsize) break loop;
            depth++;
        }

        if (cel.size() < count) try {
            final List<Row.Entry> list = this.urlFileIndex.top(count - cel.size());
            for (final Row.Entry entry: list) cel.add(new Request(entry));
        } catch (final IOException e) {}
        return cel;
    }
View Full Code Here
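The depth loop above visits every domain stack once per round and takes at most one entry per host at each depth, so no single domain can dominate the candidate list; it stops as soon as a full round adds nothing. The same selection order over plain Java collections (the Map of Lists is a simplified stand-in for the per-host HandleSet stacks):

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

final class DomainRoundRobinSketch {

    /** Pick up to 'count' entries, one per host and round, mirroring the depth loop above. */
    static List<String> pick(final Map<String, List<String>> domainStacks, int count) {
        final List<String> result = new ArrayList<String>();
        int depth = 0;
        loop: while (count > 0) {
            final int before = result.size();
            for (final List<String> stack : domainStacks.values()) {
                if (stack.size() <= depth) continue;  // this host has nothing left at this depth
                result.add(stack.get(depth));
                if (--count <= 0) break loop;
            }
            if (result.size() == before) break;       // a full round added nothing: all stacks exhausted
            depth++;
        }
        return result;
    }
}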

        }

        public Request next() {
            final Row.Entry entry = this.rowIterator.next();
            try {
                return (entry == null) ? null : new Request(entry);
            } catch (final IOException e) {
                Log.logException(e);
                this.rowIterator = null;
                return null;
            }
View Full Code Here

                }
            }
        }

        // URL needs to be crawled
        this.sb.crawlStacker.enqueueEntry(new Request(
                ASCII.getBytes(this.sb.peers.mySeed().hash),
                url,
                null, // this.siteMapURL.toString(),
                entry.url(),
                entry.lastmod(new Date()),
View Full Code Here

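        // resolve a URL hash: check the error cache first, then the requests of the active workers, then the notice queue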
        ee = this.errorURL.get(urlhash);
        if (ee != null) return ee.url();
        for (final Loader w: this.workers.values()) {
            if (Base64Order.enhancedCoder.equal(w.request.url().hash(), urlhash)) return w.request.url();
        }
        final Request ne = this.noticeURL.get(urlhash);
        if (ne != null) return ne.url();
        return null;
    }
View Full Code Here

            if (this.log.isFine()) this.log.logFine("omitting de-queue/local: paused");
            return false;
        }

        // do a local crawl
        Request urlEntry;
        while (this.noticeURL.stackSize(NoticedURL.StackType.CORE) > 0 || this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
            final String stats = "LOCALCRAWL[" +
                this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) + ", " +
                this.noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " +
                this.noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " +
                this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) +
                ", " + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
            try {
                if (this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
                    // get one entry that will not be loaded, just indexed
                    urlEntry = this.noticeURL.pop(NoticedURL.StackType.NOLOAD, true, this.sb.crawler);
                    if (urlEntry == null) continue;
                    final String profileHandle = urlEntry.profileHandle();
                    if (profileHandle == null) {
                        this.log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                        return true;
                    }
                    final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(profileHandle));
                    if (profile == null) {
                        this.log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                        return true;
                    }
                    try {
                        this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(PROCESS, new Response(urlEntry, profile), null, null));
                        Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false));
                    } catch (final InterruptedException e) {
                        Log.logException(e);
                    }
                    return true;
                }

                urlEntry = this.noticeURL.pop(NoticedURL.StackType.CORE, true, this.sb.crawler);
                if (urlEntry == null) continue;
                final String profileHandle = urlEntry.profileHandle();
                // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
                // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
                if (profileHandle == null) {
                    this.log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                    return true;
                }
                load(urlEntry, stats, profileHandle);
                return true;
            } catch (final IOException e) {
View Full Code Here

            loaddate = item.getPubDate();
            final String urlRejectReason = this.sb.crawlStacker.urlInAcceptedDomain(url);
            if (urlRejectReason == null) {
                // stack url
                if (this.sb.getLog().isFinest()) this.sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
                this.sb.crawlStacker.enqueueEntry(new Request(
                        ASCII.getBytes(hash),
                        url,
                        (referrer == null) ? null : referrer.hash(),
                        item.getDescription(),
                        loaddate,
View Full Code Here

        // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
        final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", "
                        + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
        try {
            final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler);
            final String profileHandle = urlEntry.profileHandle();
            // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
            // profileHandle = " + profileHandle + ", urlEntry.url = " +
            // urlEntry.url());
            load(urlEntry, stats, profileHandle);
            return true;
View Full Code Here


