Package de.anomic.crawler.retrieval

Examples of de.anomic.crawler.retrieval.Request
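The snippets below come from the YaCy source tree and show how Request objects are created and consumed by the crawler balancer, the crawl stacker and several servlets. As a quick orientation, the constructor call pattern that recurs further down is annotated here; the comments and the local variable name are inferred from those calls and are not an authoritative description of the Request API.

        // Sketch only: mirrors the stackCrawl(...) call shown further down.
        final Request rootRequest = new Request(
                sb.peers.mySeed().hash.getBytes(), // initiator: hash of the own peer
                startURL,                          // the DigestURI to be crawled
                null,                              // referrer hash (none for a root URL)
                "CRAWLING-ROOT",                   // name / anchor text of the request
                new Date(),                        // appearance date
                pe.handle(),                       // handle of the CrawlProfile to use
                0, 0, 0, 0);                       // depth and further counters, not explained in these excerpts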


            filltop(delay, 6000, true);
            filltop(delay, Long.MAX_VALUE, true);
        } catch (final RowSpaceExceededException e) {}

      long sleeptime = 0;
      Request crawlEntry = null;
      synchronized (this) {
          byte[] failhash = null;
        while (!this.urlFileIndex.isEmpty()) {
            // first simply take one of the entries in the top list, which should be one without any delay
            byte[] nexthash = nextFromDelayed();
            //System.out.println("*** nextFromDelayed=" + nexthash);
            if (nexthash == null && !this.top.isEmpty()) {
                nexthash = this.top.remove();
                //System.out.println("*** top.remove()=" + nexthash);
            }
            if (nexthash == null) {
                nexthash = anyFromDelayed();
            }

            // check minimumDelta and if necessary force a sleep
            //final int s = urlFileIndex.size();
            Row.Entry rowEntry = (nexthash == null) ? null : this.urlFileIndex.remove(nexthash);
            if (rowEntry == null) {
                //System.out.println("*** rowEntry=null, nexthash=" + UTF8.String(nexthash));
                rowEntry = this.urlFileIndex.removeOne();
                if (rowEntry == null) {
                    nexthash = null;
                } else {
                    nexthash = rowEntry.getPrimaryKeyBytes();
                    //System.out.println("*** rowEntry.getPrimaryKeyBytes()=" + UTF8.String(nexthash));
                }
            }
            if (rowEntry == null) {
                Log.logWarning("Balancer", "removeOne() failed - size = " + size());
                return null;
            }
            //assert urlFileIndex.size() + 1 == s : "urlFileIndex.size() = " + urlFileIndex.size() + ", s = " + s + ", result = " + result;

            crawlEntry = new Request(rowEntry);
            //Log.logInfo("Balancer", "fetched next url: " + crawlEntry.url().toNormalform(true, false));

            // at this point we must check whether the crawlEntry is still relevant, i.e. whether its crawl profile still exists;
            // if not, return null. A calling method must handle the null value and try again.
            final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
            if (profileEntry == null) {
                Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
                return null;
            }
            // depending on the caching policy we need sleep time to avoid DoS-like situations
            sleeptime = (
                    profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
                    (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
                    ) ? 0 : Latency.waitingRemaining(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server

            assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
            assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());

            if (failhash != null && Base64Order.enhancedCoder.equal(failhash, nexthash)) break; // prevent endless loops

            if (delay && sleeptime > 0 && this.domStackInitSize > 1) {
                //System.out.println("*** putback: nexthash=" + nexthash + ", failhash="+failhash);
                // put that thing back to omit a delay here
                if (!ByteBuffer.contains(this.delayed.values(), nexthash)) {
                    //System.out.println("*** delayed +=" + nexthash);
                    this.delayed.put(Long.valueOf(System.currentTimeMillis() + sleeptime + 1), nexthash);
                }
                try {
                    this.urlFileIndex.put(rowEntry);
                    String host = crawlEntry.url().getHost();
                    if (host == null) host = localhost;
                    this.domainStacks.remove(host);
                    failhash = nexthash;
                } catch (final RowSpaceExceededException e) {
                    Log.logException(e);
                }
                continue;
            }
            break;
        }
        if (crawlEntry != null) {
                if (this.ddc.size() > 10000 || MemoryControl.shortStatus()) this.ddc.clear();
                try { this.ddc.put(crawlEntry.url().hash()); } catch (final RowSpaceExceededException e) {}
        }
      }
      if (crawlEntry == null) return null;

        if (delay && sleeptime > 0) {
            // force busy waiting here
            // in the best case this should never happen if the balancer works properly;
            // it is only a protection against the worst case, in which the crawler could
            // behave in a DoS-like manner
            Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta) + ", top.size() = " + this.top.size() + ", delayed.size() = " + this.delayed.size() + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
            long loops = sleeptime / 1000;
            long rest = sleeptime % 1000;
            if (loops < 3) {
                rest = rest + 1000 * loops;
                loops = 0;
            }
            if (rest > 0) {try {this.wait(rest); } catch (final InterruptedException e) {}}
            for (int i = 0; i < loops; i++) {
                Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
                try {this.wait(1000); } catch (final InterruptedException e) {}
            }
        }
        this.ddc.remove(crawlEntry.url().hash());
        Latency.update(crawlEntry.url());
        return crawlEntry;
    }
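The pop() excerpt above ends with a forced crawl delay: when a delay could not be avoided, the thread waits for the computed sleeptime and logs once per second so long waits stay visible. A minimal standalone sketch of that wait-splitting rule, with method and parameter names of my own choosing:

    // Delays shorter than three seconds are waited in one piece,
    // longer delays in one-second steps (one log line per step in the original).
    static void waitInSteps(final Object monitor, final long sleeptime) throws InterruptedException {
        long loops = sleeptime / 1000;
        long rest = sleeptime % 1000;
        if (loops < 3) {
            rest = rest + 1000 * loops;
            loops = 0;
        }
        synchronized (monitor) {
            if (rest > 0) monitor.wait(rest);
            for (long i = 0; i < loops; i++) monitor.wait(1000);
        }
    }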


    public Request request(
            final DigestURI url,
            final boolean forText,
            final boolean global
            ) {
        return new Request(
                ASCII.getBytes(this.sb.peers.mySeed().hash),
                url,
                null,
                "",
                new Date(),
                // ... (excerpt truncated)

            prop.put("crawler-queue", "0");
        } else {
            prop.put("crawler-queue", "1");
            final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.REMOTE, showLimit);
           
            Request urle;
            boolean dark = true;
            yacySeed initiator;
            String profileHandle;
            CrawlProfile profileEntry;
            int i, showNum = 0;
            for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
                urle = crawlerList.get(i);
                if (urle != null && urle.url() != null) {
                    initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
                    profileHandle = urle.profileHandle();
                    profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
                    prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");
                    prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
                    prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
                    prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth());
                    prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.appdate()) );
                    prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name());
                    prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString());
                    prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash());
                    dark = !dark;
                    showNum++;
                } else {
                    stackSize--;
                }
                // ... (excerpt truncated)
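The queue-listing servlets in this listing all read the same small set of accessors from each Request. A compact, read-only summary of that usage; only accessors that actually appear in these excerpts are used, and the helper itself is hypothetical:

    // Hypothetical helper: summarizes a queued Request with the accessors used above.
    static String describe(final Request urle) {
        final String initiator = (urle.initiator() == null || urle.initiator().length == 0)
                ? "proxy" : ASCII.String(urle.initiator());
        return "depth=" + urle.depth()
                + " initiator=" + initiator
                + " profile=" + urle.profileHandle()
                + " modified=" + urle.appdate()
                + " anchor=" + urle.name()
                + " url=" + urle.url();
    }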

                        final DigestURI referrer = null; // referrer needed!
                        final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(url);
                        if (urlRejectReason == null) {
                            // stack url
                            if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
                            sb.crawlStacker.enqueueEntry(new Request(
                                    peerhash.getBytes(),
                                    url,
                                    (referrer == null) ? null : referrer.hash(),
                                    "REMOTE-CRAWLING",
                                    loaddate,
                                    // ... (excerpt truncated)

                "", depth, medialink,
                CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, crawlingQ,
                true, true, true, false, true, true, true,
                CacheStrategy.IFFRESH);
        sb.crawler.putActive(pe.handle().getBytes(), pe);
        return sb.crawlStacker.stackCrawl(new Request(
                sb.peers.mySeed().hash.getBytes(),
                startURL,
                null,
                "CRAWLING-ROOT",
                new Date(),
                // ... (excerpt truncated)

            final NoticedURL.StackType stackType = NoticedURL.StackType.LIMIT;
            int maxCount = Math.min(100, post.getInt("count", 10));
            final long maxTime = Math.min(20000, Math.max(1000, post.getInt("time", 10000)));
            final long timeout = System.currentTimeMillis() + maxTime;
            int c = 0;
            Request entry;
            DigestURI referrer;
            while ((maxCount > 0) &&
                   (System.currentTimeMillis() < timeout) &&
                   (sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) {
                try {
                    entry = sb.crawlQueues.noticeURL.pop(stackType, false, sb.crawler);
                } catch (final IOException e) {
                    break;
                }
                if (entry == null) break;

                // find referrer, if there is one
                referrer = sb.getURL(Segments.Process.PUBLIC, entry.referrerhash());

                // place url to notice-url db
                sb.crawlQueues.delegatedURL.push(
                                entry,
                                sb.peers.mySeed().hash.getBytes(),
                                new Date(),
                                0,
                                FailCategory.FINAL_PROCESS_CONTEXT,
                                "client=____________",
                                -1);

                // create RSS entry
                prop.put("item_" + c + "_title", "");
                prop.putXML("item_" + c + "_link", entry.url().toNormalform(true, false));
                prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true, false));
                prop.putXML("item_" + c + "_description", entry.name());
                prop.put("item_" + c + "_author", "");
                prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.appdate()));
                prop.put("item_" + c + "_guid", entry.url().hash());
                c++;
                maxCount--;
            }
            prop.put("item", c);
            prop.putXML("response", "ok");
        }

        if (post.get("call", "").equals("urlhashlist")) {
            // retrieve a list of urls from the LURL-db by a given list of url hashes
            final String urlhashes = post.get("hashes", "");
            if (urlhashes.length() % 12 != 0) return prop;
            final int count = urlhashes.length() / 12;
            int c = 0;
            URIMetadataRow entry;
            URIMetadataRow.Components metadata;
            DigestURI referrer;
            for (int i = 0; i < count; i++) {
                entry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));
                if (entry == null) continue;
                // find referrer, if there is one
                referrer = sb.getURL(Segments.Process.PUBLIC, entry.referrerHash());
                // create RSS entry
                metadata = entry.metadata();
                prop.put("item_" + c + "_title", metadata.dc_title());
                prop.putXML("item_" + c + "_link", metadata.url().toNormalform(true, false));
                prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true, false));
                prop.putXML("item_" + c + "_description", metadata.dc_title());
                prop.put("item_" + c + "_author", metadata.dc_creator());
                prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate()));
                prop.put("item_" + c + "_guid", ASCII.String(entry.hash()));
                c++;
            }
            prop.put("item", c);
            prop.putXML("response", "ok");
        }
        // ... (excerpt truncated)
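In the urlhashlist branch above, the hashes parameter is a plain concatenation of fixed-width 12-character URL hashes. The slicing used there is simple enough to show on its own; the helper name is mine:

    // Splits a concatenated hash string into 12-character pieces, exactly as the
    // loop above does with urlhashes.substring(12 * i, 12 * (i + 1)).
    static java.util.List<String> splitHashes(final String urlhashes) {
        final java.util.List<String> hashes = new java.util.ArrayList<String>();
        if (urlhashes.length() % 12 != 0) return hashes; // malformed input: return an empty list
        for (int i = 0; i < urlhashes.length() / 12; i++) {
            hashes.add(urlhashes.substring(12 * i, 12 * (i + 1)));
        }
        return hashes;
    }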

            prop.put("crawler-queue", "0");
        } else {
            prop.put("crawler-queue", "1");
            final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, showLimit);
           
            Request urle;
            boolean dark = true;
            Seed initiator;
            String profileHandle;
            CrawlProfile profileEntry;
            int i, showNum = 0;
            for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
                urle = crawlerList.get(i);
                if (urle != null && urle.url() != null) {
                    initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
                    profileHandle = urle.profileHandle();
                    profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
                    prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
                    prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
                    prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
                    prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
                    prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
                    prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
                    prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
                    prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
                    dark = !dark;
                    showNum++;
                } else {
                    stackSize--;
                }
                // ... (excerpt truncated)

                                if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes());
                            }
                        } else {
                            // iterating through the list of URLs
                            final Iterator<Request> iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.StackType.CORE);
                            Request entry;
                            final List<byte[]> removehashes = new ArrayList<byte[]>();
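                            // matching hashes are collected first and removed only after the
                            // iteration, so the queue is not modified while it is being read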
                            while (iter.hasNext()) {
                                if ((entry = iter.next()) == null) continue;
                                String value = null;
                               
                                location: switch (option) {
                                    case URL:       value = (entry.url() == null) ? null : entry.url().toString(); break location;
                                    case ANCHOR:    value = entry.name(); break location;
                                    case DEPTH:     value = Integer.toString(entry.depth()); break location;
                                    case INITIATOR:
                                        value = (entry.initiator() == null || entry.initiator().length == 0) ? "proxy" : ASCII.String(entry.initiator());
                                        break location;
                                    case MODIFIED:  value = daydate(entry.appdate()); break location;
                                    default: value = null; break location;
                                }
                               
                                if (value != null && compiledPattern.matcher(value).matches()) removehashes.add(entry.url().hash());
                            }
                            Log.logInfo("IndexCreateWWWLocalQueue", "created a remove list with " + removehashes.size() + " entries for pattern '" + pattern + "'");
                            for (final byte[] b: removehashes) {
                                sb.crawlQueues.noticeURL.removeByURLHash(b);
                            }
                        }
                    } catch (final PatternSyntaxException e) {
                        Log.logException(e);
                    }
                }
               
                prop.put("info", "3");//crawling queue cleared
                prop.putNum("info_numEntries", c);
            } else if (post.containsKey("deleteEntry")) {
                final String urlHash = post.get("deleteEntry");
                sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
                prop.put("LOCATION","");
                return prop;
            }
        }

        int showNum = 0, stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
        if (stackSize == 0) {
            prop.put("crawler-queue", "0");
        } else {
            prop.put("crawler-queue", "1");
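            // roughly 20% more entries than will be shown are fetched below; entries whose
            // URL turns out to be null are skipped in the loop and decrease stackSize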
            final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, (int) (showLimit * 1.20));

            Request urle;
            boolean dark = true;
            Seed initiator;
            String profileHandle;
            CrawlProfile profileEntry;
            int i;
            for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
                urle = crawlerList.get(i);
                if ((urle != null)&&(urle.url()!=null)) {
                    initiator = sb.peers.getConnected(urle.initiator() == null ? "" : ASCII.String(urle.initiator()));
                    profileHandle = urle.profileHandle();
                    profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
                    prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
                    prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
                    prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
                    prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
                    prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
                    prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
                    prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
                    prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
                    dark = !dark;
                    showNum++;
                } else {
                    stackSize--;
                }
                // ... (excerpt truncated)

                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final String reasonString = sb.crawlStacker.stackCrawl(new Request(
                                sb.peers.mySeed().hash.getBytes(),
                                url,
                                null,
                                "CRAWLING-ROOT",
                                new Date(),
                                pe.handle(),
                                0,
                                0,
                                0,
                                0
                                ));

                        if (reasonString == null) {
                            // create a bookmark from crawl start url
                            //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
                            final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
                            tags.add("crawlStart");
                            final String[] keywords = scraper.getKeywords();
                            if (keywords != null) {
                                for (final String k: keywords) {
                                    final String kk = BookmarkHelper.cleanTagsString(k);
                                    if (kk.length() > 0) tags.add(kk);
                                }
                            }
                            String tagStr = tags.toString();
                            if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 2);

                            // a bookmark is always created here; it is used to track crawled hosts
                            final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
                            if (bookmark != null) {
                                bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
                                bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
                                bookmark.setOwner("admin");
                                bookmark.setPublic(false);
                                bookmark.setTags(tags, true);
                                sb.bookmarksDB.saveBookmark(bookmark);
                            }

                            // do the same for ymarks
                            // TODO: could a non admin user add crawls?
                            sb.tables.bookmarks.createBookmark(sb.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");

                            // liftoff!
                            prop.put("info", "8");//start msg
                            prop.putHTML("info_crawlingURL", post.get("crawlingURL"));

                            // generate a YaCyNews if the global flag was set
                            if (!sb.isRobinsonMode() && crawlOrder) {
                                final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
                                m.remove("specificDepth");
                                m.remove("indexText");
                                m.remove("indexMedia");
                                m.remove("remoteIndexing");
                                m.remove("xsstopw");
                                m.remove("xpstopw");
                                m.remove("xdstopw");
                                m.remove("storeTXCache");
                                m.remove("storeHTCache");
                                m.remove("generalFilter");
                                m.remove("specificFilter");
                                m.put("intention", post.get("intention", "").replace(',', '/'));
                                sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), NewsPool.CATEGORY_CRAWL_START, m);
                            }
                        } else {
                            prop.put("info", "5"); //Crawling failed
                            prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
                            prop.putHTML("info_reasonString", reasonString);

                            sb.crawlQueues.errorURL.push(
                                new Request(
                                        sb.peers.mySeed().hash.getBytes(),
                                        crawlingStartURL,
                                        null,
                                        "",
                                        new Date(),
                                        pe.handle(),
                                        0,
                                        0,
                                        0,
                                        0),
                                sb.peers.mySeed().hash.getBytes(),
                                new Date(),
                                1,
                                FailCategory.FINAL_LOAD_CONTEXT,
                                reasonString, -1);
                        }
                    } catch (final PatternSyntaxException e) {
                        prop.put("info", "4"); // crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_error", e.getMessage());
                    } catch (final Exception e) {
                        // mist
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", crawlingStart);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }

                } else if ("file".equals(crawlingMode)) {
                    if (post.containsKey("crawlingFile")) {
                        final String crawlingFileContent = post.get("crawlingFile$file", "");
                        try {
                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingMustMatch);
                            final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile));
                            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                            if (crawlingFile != null && crawlingFile.exists()) {
                                FileUtils.copy(new FileInputStream(crawlingFile), writer);
                            } else {
                                FileUtils.copy(crawlingFileContent, writer);
                            }
                            writer.close();

                            // get links and generate filter
                            final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                            if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                            final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
                            final CrawlProfile profile = new CrawlProfile(
                                    crawlingFileName,
                                    crawlURL,
                                    newcrawlingMustMatch,
                                    CrawlProfile.MATCH_NEVER_STRING,
                                    ipMustMatch,
                                    ipMustNotMatch,
                                    countryMustMatch,
                                    newcrawlingdepth,
                                    false,
                                    crawlingIfOlder,
                                    crawlingDomMaxPages,
                                    crawlingQ,
                                    indexText,
                                    indexMedia,
                                    storeHTCache,
                                    crawlOrder,
                                    xsstopw,
                                    xdstopw,
                                    xpstopw,
                                    cachePolicy);
                            sb.crawler.putActive(profile.handle().getBytes(), profile);
                            sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                            sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true);
                        } catch (final PatternSyntaxException e) {
                            prop.put("info", "4"); // crawlfilter does not match url
                            prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                            prop.putHTML("info_error", e.getMessage());
                        } catch (final Exception e) {
                            // mist
                            prop.put("info", "7"); // Error with file
                            prop.putHTML("info_crawlingStart", crawlingFileName);
                            prop.putHTML("info_error", e.getMessage());
                            Log.logException(e);
                        }
                        sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    }
                } else if ("sitemap".equals(crawlingMode)) {
                    final String sitemapURLStr = post.get("sitemapURL","");
                  try {
                    final DigestURI sitemapURL = new DigestURI(sitemapURLStr);
                    final CrawlProfile pe = new CrawlProfile(
                        sitemapURLStr,
                        sitemapURL,
                        CrawlProfile.MATCH_ALL_STRING,
                        CrawlProfile.MATCH_NEVER_STRING,
                        ipMustMatch,
                        ipMustNotMatch,
                        countryMustMatch,
                        0,
                        false,
                        crawlingIfOlder,
                        crawlingDomMaxPages,
                        true,
                        indexText,
                        indexMedia,
                        storeHTCache,
                        crawlOrder,
                        xsstopw,
                        xdstopw,
                        xpstopw,
                        cachePolicy);
                    sb.crawler.putActive(pe.handle().getBytes(), pe);
                    final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
                    importer.start();
                  } catch (final Exception e) {
                    // mist
                    prop.put("info", "6");//Error with url
                    prop.putHTML("info_crawlingStart", sitemapURLStr);
                    prop.putHTML("info_error", e.getMessage());
                    Log.logException(e);
                  }
                } else if ("sitelist".equals(crawlingMode)) {
                    try {
                        final DigestURI sitelistURL = new DigestURI(crawlingStart);
                        // download document
                        ContentScraper scraper = null;
                        scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
                        // String title = scraper.getTitle();
                        // String description = scraper.getDescription();

                        // get links and generate filter
                        final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                        if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                        // put links onto crawl queue
                        final CrawlProfile profile = new CrawlProfile(
                                sitelistURL.getHost(),
                                sitelistURL,
                                newcrawlingMustMatch,
                                CrawlProfile.MATCH_NEVER_STRING,
                                ipMustMatch,
                                ipMustNotMatch,
                                countryMustMatch,
                                newcrawlingdepth,
                                directDocByURL,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(profile.handle().getBytes(), profile);
                        sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                        final Iterator<Map.Entry<MultiProtocolURI, Properties>> linkiterator = hyperlinks.entrySet().iterator();
                        DigestURI nexturl;
                        while (linkiterator.hasNext()) {
                            final Map.Entry<MultiProtocolURI, Properties> e = linkiterator.next();
                            if (e.getKey() == null) continue;
                            nexturl = new DigestURI(e.getKey());
                            // remove the url from the database to be prepared to crawl them again
                            final byte[] urlhash = nexturl.hash();
                            indexSegment.urlMetadata().remove(urlhash);
                            sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                            sb.crawlQueues.errorURL.remove(urlhash);
                            sb.crawlStacker.enqueueEntry(new Request(
                                    sb.peers.mySeed().hash.getBytes(),
                                    nexturl,
                                    null,
                                    e.getValue().getProperty("name", ""),
                                    new Date(),
                                    // ... (excerpt truncated)
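Before each link from the site list is handed back to the crawl stacker as a new Request, the loop above first clears every trace of the URL from the index and the queues so that it can be crawled again. The same per-link preparation, isolated into a hypothetical helper (not a complete, runnable unit on its own):

    // Hypothetical helper wrapping the per-link steps shown in the excerpt above.
    private void prepareRecrawl(final DigestURI nexturl) {
        final byte[] urlhash = nexturl.hash();
        indexSegment.urlMetadata().remove(urlhash);        // forget the stored metadata entry
        sb.crawlQueues.noticeURL.removeByURLHash(urlhash); // drop it from the crawl queues
        sb.crawlQueues.errorURL.remove(urlhash);           // clear a previous error entry, if any
        // the URL is then enqueued again as a fresh Request (see the excerpt above)
    }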

            if (url.getProtocol().equals("ftp")) {
                // put the whole ftp site on the crawl stack
                enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), replace);
            } else {
                // put entry on crawl stack
                enqueueEntry(new Request(
                        initiator,
                        url,
                        null,
                        e.getValue().getProperty("name", ""),
                        new Date(),
                        // ... (excerpt truncated)
