Package de.anomic.crawler.retrieval

Examples of de.anomic.crawler.retrieval.Request
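
The snippets below are taken from the YaCy crawler and show how Request objects are created, queued, and looked up. A Request represents one entry on the crawl stack: every call site passes the hash of the initiating peer, the target URL, an optional referrer hash, a display name, and a date, followed by further parameters that the excerpts cut off.

As orientation, here is a minimal sketch of a complete construction. The first five arguments match the call sites below; the trailing arguments (profile handle, crawl depth, anchor count, forkfactor) are assumptions inferred from the truncated snippets, not confirmed signatures.

    // Sketch only: the trailing arguments are assumptions, not confirmed by the snippets.
    // 'url' is a DigestURI and 'profileHandle' a String from the surrounding code.
    final Request request = new Request(
            peers.mySeed().hash.getBytes(), // hash of the initiating peer
            url,                            // DigestURI of the document to fetch
            null,                           // referrer hash; null if unknown
            "CRAWLING-ROOT",                // human-readable name for the entry
            new Date(),                     // load/application date
            profileHandle,                  // handle of the governing CrawlProfile (assumed)
            0,                              // crawl depth (assumed)
            0,                              // anchor count (assumed)
            0);                             // forkfactor (assumed)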


            loaddate = item.getPubDate();
            final String urlRejectReason = this.sb.crawlStacker.urlInAcceptedDomain(url);
            if (urlRejectReason == null) {
                // stack url
                if (this.sb.getLog().isFinest()) this.sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
                this.sb.crawlStacker.enqueueEntry(new Request(
                        ASCII.getBytes(hash),
                        url,
                        (referrer == null) ? null : referrer.hash(),
                        item.getDescription(),
                        loaddate,
                        // ...
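
Here the Request is built from a feed item: the item's publication date becomes the load date, its description becomes the entry name, and the URL is only stacked after it passes the accepted-domain check of the crawl stacker.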


        // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
        final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", "
                        + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
        try {
            final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler);
            final String profileHandle = urlEntry.profileHandle();
            // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
            // profileHandle = " + profileHandle + ", urlEntry.url = " +
            // urlEntry.url());
            load(urlEntry, stats, profileHandle);
            return true;
            // ...
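
Processing a remotely triggered crawl is therefore just a pop from the REMOTE stack followed by a load. The same pattern applies to the other stacks; an illustrative variant for the local core stack (the stats label here is made up):

    // Illustrative: pull the next locally stacked URL and hand it to the loader.
    final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.CORE, true, this.sb.crawler);
    if (urlEntry != null) {
        load(urlEntry, "LOCALCRAWL", urlEntry.profileHandle());
    }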

            if (url.getProtocol().equals("ftp")) {
                // put the whole ftp site on the crawl stack
                enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), replace);
            } else {
                // put entry on crawl stack
                enqueueEntry(new Request(
                        initiator,
                        url,
                        null,
                        e.getValue().getProperty("name", ""),
                        new Date(),
                        // ...
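
The branch matters because an FTP URL is not stacked as a single document: enqueueEntriesFTP expands the whole FTP site onto the crawl stack, while every other protocol yields exactly one Request.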

                            cq.noticeURL.removeByURLHash(urlhash);
                            cq.errorURL.remove(urlhash);
                        }
                       
                        // put entry on crawl stack
                        enqueueEntry(new Request(
                                initiator,
                                url,
                                null,
                                MultiProtocolURI.unescape(entry.name),
                                entry.date,
                                // ...
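
Before the fresh Request is enqueued, any previous state for the same URL hash is purged from both the notice queue and the error queue, so a re-crawl starts from a clean slate.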

     * @param url
     * @return null if successful, a reason string if not successful
     */
    public String stackSimpleCrawl(final DigestURI url) {
        final CrawlProfile pe = this.crawler.defaultSurrogateProfile;
        return stackCrawl(new Request(
                peers.mySeed().hash.getBytes(),
                url,
                null,
                "CRAWLING-ROOT",
                new Date(),
                // ...
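
A null return signals success, so a caller checks the string for a reject reason. A hypothetical caller (the crawlStacker variable and the log name are assumptions; the DigestURI(String) constructor may throw MalformedURLException):

    // Hypothetical: stack a start URL and log the reject reason, if any.
    final String rejectReason = crawlStacker.stackSimpleCrawl(new DigestURI("http://example.net/"));
    if (rejectReason != null) {
        Log.logWarning("CRAWLER", "url not stacked: " + rejectReason);
    }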

            return "error pushing onto the crawl stack: " + er.getMessage();
        }
    }

    public Request get(final byte[] urlhash) {
        Request entry = null;
        try {if ((entry = noloadStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
        try {if ((entry = coreStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
        try {if ((entry = limitStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
        try {if ((entry = remoteStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
        return null;
    }
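
The lookup probes the no-load, core, limit, and remote stacks in order and returns the first hit, swallowing per-stack IO errors. A hypothetical caller:

    // Hypothetical: check whether a URL is still queued on any stack.
    final Request queued = noticedURL.get(url.hash());
    if (queued != null) {
        System.out.println("still queued under profile " + queued.profileHandle());
    }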

        }
    }

    public void shift(final StackType fromStack, final StackType toStack, CrawlSwitchboard cs) {
        try {
            final Request entry = pop(fromStack, false, cs);
            if (entry != null) {
                String warning = push(toStack, entry);
                if (warning != null) {
                    Log.logWarning("NoticedURL", "shift from " + fromStack + " to " + toStack + ": " + warning);
                }
            // ...
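
A shift is thus a pop from one stack followed by a push onto another, with push warnings logged rather than propagated. A hypothetical use, moving a remotely triggered entry back into the local queue:

    // Hypothetical: drain one entry from the remote stack into the core stack.
    noticedURL.shift(NoticedURL.StackType.REMOTE, NoticedURL.StackType.CORE, crawlSwitchboard);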

    }
   
    private Request pop(final Balancer balancer, final boolean delay, CrawlSwitchboard cs) throws IOException {
        // this is a filo - pop
        int s;
        Request entry;
        int errors = 0;
        synchronized (balancer) {
            while ((s = balancer.size()) > 0) {
                entry = balancer.pop(delay, cs);
                if (entry == null) {
                    // ...
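
This private pop backs the public pop(StackType, ...) seen in the second snippet, with the StackType mapped to the matching Balancer. The loop keeps polling while the balancer reports entries and counts a null pop as an error; the truncated branch presumably retries a bounded number of times before giving up rather than aborting the drain immediately.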

    public Request get(final byte[] urlhash) throws IOException {
        assert urlhash != null;
        if (this.urlFileIndex == null) return null; // case occurs during shutdown
        final Row.Entry entry = this.urlFileIndex.get(urlhash);
        if (entry == null) return null;
        return new Request(entry);
    }
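
This is the deserializing counterpart of the constructor calls above: a Request is rebuilt from its Row.Entry representation in the on-disk index. Assuming a matching serializer named toRow() (the method name, and put() accepting a Row.Entry, are assumptions), the round trip would look like:

    // Hypothetical round trip through the row representation.
    this.urlFileIndex.put(request.toRow()); // toRow() is an assumed serializer
    final Request restored = new Request(this.urlFileIndex.get(request.url().hash()));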

        final HandleSet urlHashes = Base64Order.enhancedCoder.getHandleSet(this.urlFileIndex.row().primaryKeyLength, 100);
        final long terminate = (timeout > 0) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE;
        synchronized (this) {
            final Iterator<Row.Entry> i = this.urlFileIndex.rows();
            Row.Entry rowEntry;
            Request crawlEntry;
            while (i.hasNext() && (System.currentTimeMillis() < terminate)) {
                rowEntry = i.next();
                crawlEntry = new Request(rowEntry);
                if (crawlEntry.profileHandle().equals(profileHandle)) {
                    urlHashes.put(crawlEntry.url().hash());
                }
            }
        }

        // then delete all these urls from the queues and the file index
        // ...
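
The deletion pass itself is cut off, but conceptually it walks the collected hashes and removes each one from the queues and the file index. A sketch (iterating a HandleSet as byte[] keys and the remove call are assumptions about the surrounding API):

    // Assumed continuation: drop every collected hash from the file index.
    for (final byte[] urlhash : urlHashes) {
        this.urlFileIndex.remove(urlhash);
    }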
