Package de.anomic.crawler

Examples of de.anomic.crawler.CrawlProfile
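
The first example (apparently from YaCy's Crawler_p servlet) dispatches a crawl start by mode. Each branch ("ftp", "url", "file", "sitemap", "sitelist") builds a CrawlProfile from the submitted filters, depth, recrawl age and cache policy, registers it with sb.crawler.putActive(...), and hands the start URL(s) to the crawlStacker; pattern errors and other failures are reported through the prop template object.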


                final String crawlingMode = post.get("crawlingMode","url");
                if (crawlingStart != null && crawlingStart.startsWith("ftp")) {
                    try {
                        // check if the crawl filter works correctly
                        Pattern.compile(newcrawlingMustMatch);
                        final CrawlProfile profile = new CrawlProfile(
                                crawlingStart,
                                crawlingStartURL,
                                newcrawlingMustMatch,
                                CrawlProfile.MATCH_NEVER,
                                newcrawlingdepth,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(profile.handle().getBytes(), profile);
                        sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                        final DigestURI url = crawlingStartURL;
                        sb.crawlStacker.enqueueEntriesFTP(sb.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
                    } catch (final PatternSyntaxException e) {
                        prop.put("info", "4"); // crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_error", e.getMessage());
                    } catch (final Exception e) {
                        // unexpected error
                        prop.put("info", "7"); // Error with file
                        prop.putHTML("info_crawlingStart", crawlingStart);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }
                    sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                } else if ("url".equals(crawlingMode)) {

                    // check if pattern matches
                    if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
                        // print error message
                        prop.put("info", "4"); //crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_crawlingStart", crawlingStart);
                    } else try {

                        // check if the crawl filter works correctly
                        Pattern.compile(newcrawlingMustMatch);

                        // stack request
                        // first delete old entry, if exists
                        final DigestURI url = new DigestURI(crawlingStart);
                        final byte[] urlhash = url.hash();
                        indexSegment.urlMetadata().remove(urlhash);
                        sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                        sb.crawlQueues.errorURL.remove(urlhash);

                        // stack url
                        sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
                        final CrawlProfile pe = new CrawlProfile(
                                (crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(),
                                crawlingStartURL,
                                newcrawlingMustMatch,
                                newcrawlingMustNotMatch,
                                newcrawlingdepth,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText, indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final String reasonString = sb.crawlStacker.stackCrawl(new Request(
                                sb.peers.mySeed().hash.getBytes(),
                                url,
                                null,
                                "CRAWLING-ROOT",
                                new Date(),
                                pe.handle(),
                                0,
                                0,
                                0,
                                0
                                ));

                        if (reasonString == null) {
                            // create a bookmark from crawl start url
                            final Set<String> tags = ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
                            tags.add("crawlStart");
                            if ("on".equals(post.get("createBookmark","off"))) {
                                final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
                                if (bookmark != null) {
                                    bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
                                    bookmark.setOwner("admin");
                                    bookmark.setPublic(false);
                                    bookmark.setTags(tags, true);
                                    sb.bookmarksDB.saveBookmark(bookmark);
                                }
                            }
                            // liftoff!
                            prop.put("info", "8");//start msg
                            prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));

                            // generate a YaCyNews if the global flag was set
                            if (!sb.isRobinsonMode() && crawlOrder) {
                                final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
                                m.remove("specificDepth");
                                m.remove("indexText");
                                m.remove("indexMedia");
                                m.remove("remoteIndexing");
                                m.remove("xsstopw");
                                m.remove("xpstopw");
                                m.remove("xdstopw");
                                m.remove("storeTXCache");
                                m.remove("storeHTCache");
                                m.remove("generalFilter");
                                m.remove("specificFilter");
                                m.put("intention", post.get("intention", "").replace(',', '/'));
                                sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
                            }
                        } else {
                            prop.put("info", "5"); //Crawling failed
                            prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
                            prop.putHTML("info_reasonString", reasonString);

                            sb.crawlQueues.errorURL.push(
                                new Request(
                                        sb.peers.mySeed().hash.getBytes(),
                                        crawlingStartURL,
                                        null,
                                        "",
                                        new Date(),
                                        pe.handle(),
                                        0,
                                        0,
                                        0,
                                        0),
                                sb.peers.mySeed().hash.getBytes(),
                                new Date(),
                                1,
                                FailCategory.FINAL_LOAD_CONTEXT,
                                reasonString, -1);
                        }
                    } catch (final PatternSyntaxException e) {
                        prop.put("info", "4"); // crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_error", e.getMessage());
                    } catch (final Exception e) {
                        // unexpected error
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", crawlingStart);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }

                } else if ("file".equals(crawlingMode)) {
                    if (post.containsKey("crawlingFile")) {
                        final String crawlingFileContent = post.get("crawlingFile$file", "");
                        try {
                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingMustMatch);
                            final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile));
                            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                            if (crawlingFile != null && crawlingFile.exists()) {
                                FileUtils.copy(new FileInputStream(crawlingFile), writer);
                            } else {
                                FileUtils.copy(crawlingFileContent, writer);
                            }
                            writer.close();

                            // get links and generate filter
                            final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                            if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                            final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
                            final CrawlProfile profile = new CrawlProfile(
                                    crawlingFileName,
                                    crawlURL,
                                    newcrawlingMustMatch,
                                    CrawlProfile.MATCH_NEVER,
                                    newcrawlingdepth,
                                    crawlingIfOlder,
                                    crawlingDomMaxPages,
                                    crawlingQ,
                                    indexText,
                                    indexMedia,
                                    storeHTCache,
                                    crawlOrder,
                                    xsstopw,
                                    xdstopw,
                                    xpstopw,
                                    cachePolicy);
                            sb.crawler.putActive(profile.handle().getBytes(), profile);
                            sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                            sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true);
                        } catch (final PatternSyntaxException e) {
                            prop.put("info", "4"); // crawlfilter does not match url
                            prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                            prop.putHTML("info_error", e.getMessage());
                        } catch (final Exception e) {
                            // unexpected error
                            prop.put("info", "7"); // Error with file
                            prop.putHTML("info_crawlingStart", crawlingFileName);
                            prop.putHTML("info_error", e.getMessage());
                            Log.logException(e);
                        }
                        sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    }
                } else if ("sitemap".equals(crawlingMode)) {
                    final String sitemapURLStr = post.get("sitemapURL","");
                    try {
                        final DigestURI sitemapURL = new DigestURI(sitemapURLStr);
                        final CrawlProfile pe = new CrawlProfile(
                                sitemapURLStr,
                                sitemapURL,
                                CrawlProfile.MATCH_ALL,
                                CrawlProfile.MATCH_NEVER,
                                0,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                true,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
                        importer.start();
                    } catch (final Exception e) {
                        // unexpected error
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", sitemapURLStr);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }
                } else if ("sitelist".equals(crawlingMode)) {
                    try {
                        final DigestURI sitelistURL = new DigestURI(crawlingStart);
                        // download document
                        ContentScraper scraper = null;
                        scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
                        // String title = scraper.getTitle();
                        // String description = scraper.getDescription();

                        // get links and generate filter
                        final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                        if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                        // put links onto crawl queue
                        final CrawlProfile profile = new CrawlProfile(
                                sitelistURL.getHost(),
                                sitelistURL,
                                newcrawlingMustMatch,
                                CrawlProfile.MATCH_NEVER,
                                newcrawlingdepth,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(profile.handle().getBytes(), profile);
                        sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                        final Iterator<Map.Entry<MultiProtocolURI, Properties>> linkiterator = hyperlinks.entrySet().iterator();
                        DigestURI nexturl;
                        while (linkiterator.hasNext()) {
                            final Map.Entry<MultiProtocolURI, Properties> e = linkiterator.next();
                            if (e.getKey() == null) continue;
                            nexturl = new DigestURI(e.getKey());
                            // remove the url from the database to be prepared to crawl them again
                            final byte[] urlhash = nexturl.hash();
                            indexSegment.urlMetadata().remove(urlhash);
                            sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                            sb.crawlQueues.errorURL.remove(urlhash);
                            sb.crawlStacker.enqueueEntry(new Request(
                                    sb.peers.mySeed().hash.getBytes(),
                                    nexturl,
                                    null,
                                    e.getValue().getProperty("name", ""),
                                    new Date(),
                                    profile.handle(),
                                    0,
                                    0,
                                    0,
                                    0
                                    ));


           
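The next fragment renders entries of the local crawler queue: for each pending Request it looks up the initiating peer and the active CrawlProfile by its handle, then fills the crawler-queue_list_* template properties (a nearly identical loop appears again further below):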
            Request urle;
            boolean dark = true;
            yacySeed initiator;
            String profileHandle;
            CrawlProfile profileEntry;
            int i, showNum = 0;
            for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
                urle = crawlerList.get(i);
                if (urle != null && urle.url() != null) {
                    initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
                    profileHandle = urle.profileHandle();
                    profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
                    prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
                    prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
                    prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
                    prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
                    prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
                    prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
                    prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
                    prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());

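Deleting queue entries by regular expression (apparently from one of the IndexCreateWWW*Queue servlets): when the pattern targets the profile name, matching profiles are removed directly, skipping the built-in default profiles; otherwise the notice-URL queue is scanned and matching URL hashes are collected for removal.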
                        final Pattern compiledPattern = Pattern.compile(pattern);
                       
                        if (option == PROFILE) {
                            // search and delete the crawl profile (_much_ faster, independent of queue size)
                            // XXX: what to do about the annoying LOST PROFILE messages in the log?
                            CrawlProfile entry;
                            for (final byte[] handle: sb.crawler.getActive()) {
                                entry = sb.crawler.getActive(handle);
                                final String name = entry.name();
                                if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
                                        name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
                                        name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
                                        name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
                                        name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
                                        name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
                                        name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
                                    continue;
                                if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes());
                            }
                        } else {
                            // iterating through the list of URLs
                            final Iterator<Request> iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.StackType.CORE);
                            Request entry;
                            final List<byte[]> removehashes = new ArrayList<byte[]>();
                            while (iter.hasNext()) {
                                if ((entry = iter.next()) == null) continue;
                                String value = null;
                               
                                location: switch (option) {
                                    case URL:       value = (entry.url() == null) ? null : entry.url().toString(); break location;
                                    case ANCHOR:    value = entry.name(); break location;
                                    case DEPTH:     value = Integer.toString(entry.depth()); break location;
                                    case INITIATOR:
                                        value = (entry.initiator() == null || entry.initiator().length == 0) ? "proxy" : ASCII.String(entry.initiator());
                                        break location;
                                    case MODIFIED:  value = daydate(entry.appdate()); break location;
                                    default: value = null; break location;
                                }
                               
                                if (value != null && compiledPattern.matcher(value).matches()) removehashes.add(entry.url().hash());
                            }
                            Log.logInfo("IndexCreateWWWLocalQueue", "created a remove list with " + removehashes.size() + " entries for pattern '" + pattern + "'");
                            for (final byte[] b: removehashes) {
                                sb.crawlQueues.noticeURL.removeByURLHash(b);
                            }
                        }
                    } catch (final PatternSyntaxException e) {
                        Log.logException(e);
                    }
                }
               
                prop.put("info", "3");//crawling queue cleared
                prop.putNum("info_numEntries", c);
            } else if (post.containsKey("deleteEntry")) {
                final String urlHash = post.get("deleteEntry");
                sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
                prop.put("LOCATION","");
                return prop;
            }
        }

        int showNum = 0, stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
        if (stackSize == 0) {
            prop.put("crawler-queue", "0");
        } else {
            prop.put("crawler-queue", "1");
            final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, (int) (showLimit * 1.20));

            Request urle;
            boolean dark = true;
            yacySeed initiator;
            String profileHandle;
            CrawlProfile profileEntry;
            int i;
            for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
                urle = crawlerList.get(i);
                if (urle != null && urle.url() != null) {
                    initiator = sb.peers.getConnected(urle.initiator() == null ? "" : ASCII.String(urle.initiator()));
                    profileHandle = urle.profileHandle();
                    profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
                    prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
                    prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
                    prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
                    prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
                    prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
                    prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
                    prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
                    prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());

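Crawl profile management (apparently CrawlProfileEditor_p): terminating a crawl moves its profile from the active to the passive store and removes its queue entries; the code then builds a sorted handle list for the pop-up menu, applies submitted edits after validating the must-match and must-not-match patterns, and renders the profile table.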
        // read post for handle
        final String handle = (post == null) ? "" : post.get("handle", "");
        if (post != null) {
            if (post.containsKey("terminate")) try {
                // termination of a crawl: shift the crawl from active to passive
                final CrawlProfile p = sb.crawler.getActive(handle.getBytes());
                if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
                // delete all crawl queue entries that belong to this profile
                sb.crawler.removeActive(handle.getBytes());
                sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
            } catch (RowSpaceExceededException e) {
                Log.logException(e);
            }
            if (post.containsKey("delete")) {
                // deletion of a terminated crawl profile
                sb.crawler.removePassive(handle.getBytes());
            }
            if (post.containsKey("deleteTerminatedProfiles")) {
                for (final byte[] h: sb.crawler.getPassive()) {
                    sb.crawler.removePassive(h);
                }
            }
        }
       
        // generate handle list: first sort by handle name
        CrawlProfile selentry;
        final Map<String, String> orderdHandles = new TreeMap<String, String>();
        for (final byte[] h : sb.crawler.getActive()) {
            selentry = sb.crawler.getActive(h);
            if (selentry != null && !ignoreNames.contains(selentry.name())) {
                orderdHandles.put(selentry.name(), selentry.handle());
            }
        }
       
        // then write into pop-up menu list
        int count = 0;
        for (final Map.Entry<String, String> NameHandle: orderdHandles.entrySet()) {
            prop.put("profiles_" + count + "_name", NameHandle.getKey());
            prop.put("profiles_" + count + "_handle", NameHandle.getValue());
            if (handle.equals(NameHandle.getValue())) {
                prop.put("profiles_" + count + "_selected", "1");
            }
            count++;
        }
        prop.put("profiles", count);
        selentry = sb.crawler.getActive(handle.getBytes());
        assert selentry == null || selentry.handle() != null;
        // read post for change submit
        if ((post != null) && (selentry != null)) {
            if (post.containsKey("submit")) {
                try {
                    Pattern.compile(post.get(CrawlProfile.FILTER_MUSTMATCH, CrawlProfile.MATCH_ALL));
                    Pattern.compile(post.get(CrawlProfile.FILTER_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER));
                    final Iterator<eentry> lit = labels.iterator();
                    eentry tee;
                    while (lit.hasNext()) {
                        tee = lit.next();
                        final String cval = selentry.get(tee.name);
                        final String val = (tee.type == eentry.BOOLEAN) ? Boolean.toString(post.containsKey(tee.name)) : post.get(tee.name, cval);
                        if (!cval.equals(val)) {
                            selentry.put(tee.name, val);
                            sb.crawler.putActive(selentry.handle().getBytes(), selentry);
                        }
                    }
                } catch (final Exception ex) {
                    Log.logException(ex);
                    prop.put("error", "1");
                    prop.putHTML("error_message", ex.getMessage());
                }
            }
        }
       
        // generate crawl profile table
        count = 0;
        boolean dark = true;
        final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
        CrawlProfile profile;
        // put active crawls into list
        for (final byte[] h: sb.crawler.getActive()) {
            profile = sb.crawler.getActive(h);
            putProfileEntry(prop, sb.crawlStacker, profile, true, dark, count, domlistlength);
            dark = !dark;

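A quick crawl start (apparently QuickCrawlLink_p): stale index, queue and error entries for the start URL are removed, a CrawlProfile with a one-month recrawl age and no page limit is registered, and the URL is stacked as CRAWLING-ROOT.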
            indexSegment.urlMetadata().remove(urlhash);
            sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
            sb.crawlQueues.errorURL.remove(urlhash);

            // create crawling profile
            CrawlProfile pe = null;
            try {
                pe = new CrawlProfile(
                        crawlingStartURL.getHost(),
                        crawlingStartURL,
                        crawlingMustMatch,
                        crawlingMustNotMatch,
                        CrawlingDepth,
                        60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
                        -1, // domMaxPages, if negative: no count restriction
                        crawlDynamic,
                        indexText,
                        indexMedia,
                        storeHTCache,
                        remoteIndexing,
                        xsstopw,
                        xdstopw,
                        xpstopw,
                        CacheStrategy.IFFRESH);
                sb.crawler.putActive(pe.handle().getBytes(), pe);
            } catch (final Exception e) {
                // unexpected error
                prop.put("mode_status", "2");//Error with url
                prop.put("mode_code", "2");
                prop.putHTML("mode_status_error", e.getMessage());
                return prop;
            }

            // stack URL
            String reasonString = null;
            reasonString = sb.crawlStacker.stackCrawl(new Request(
                    sb.peers.mySeed().hash.getBytes(),
                    crawlingStartURL,
                    null,
                    (title==null)?"CRAWLING-ROOT":title,
                    new Date(),
                    pe.handle(),
                    0,
                    0,
                    0,
                    0
                ));

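This fragment picks a host for display: when the host parameter is "auto", the first user-defined crawl profile is chosen by skipping the built-in defaults.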
            color_lineend = post.get("colorlineend", color_lineend);
        }
       
        if (host.equals("auto")) {
            // try to find the host from the crawl profiles
            CrawlProfile e;
            for (byte[] handle: sb.crawler.getActive()) {
                e = sb.crawler.getActive(handle);
                if (e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
                    e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
                    e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
                    e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
                    e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
                    e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
                    e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
                    continue;
                host = e.name();
                break; // take the first one
            }
        }
       
        // find start point

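During startup cleanup (apparently in Switchboard), the recrawl dates of the built-in profiles are refreshed: for each default profile the RECRAWL_IF_OLDER value is recomputed from its recrawl cycle and the profile is written back to the active store.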
                setConfig("adminAccount", "");
            }

            // refresh recrawl dates
            try {
                CrawlProfile selentry;
                for (final byte[] handle: this.crawler.getActive()) {
                    selentry = this.crawler.getActive(handle);
                    assert selentry.handle() != null : "profile.name = " + selentry.name();
                    if (selentry.handle() == null) {
                        this.crawler.removeActive(handle);
                        continue;
                    }
                    boolean insert = false;
                    if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY)) {
                        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                                Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE)));
                        insert = true;
                    }
                    if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) {
                        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                                Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE)));
                        insert = true;
                    }
                    if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) {
                        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                                Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE)));
                        insert = true;
                    }
                    if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) {
                        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                                Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE)));
                        insert = true;
                    }
                    if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) {
                        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                                Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
                        insert = true;
                    }
                    if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) {
                        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                                Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE)));
                        insert = true;
                    }
                    if (insert) this.crawler.putActive(UTF8.getBytes(selentry.handle()), selentry);
                }
            } catch (final Exception e) {
                Log.logException(e);
            }

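During document condensing, the queue entry's CrawlProfile controls what gets indexed and whether images are registered for the result list: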
            condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib);

            // update image result list statistics
            // its good to do this concurrently here, because it needs a DNS lookup
            // to compute a URL hash which is necessary for a double-check
            final CrawlProfile profile = in.queueEntry.profile();
            ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing());
        }
        return new indexingQueueEntry(in.process, in.queueEntry, in.documents, condenser);
    }

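Before a heuristic result is added to the index, the URL is checked for acceptance against the crawl profile: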
        if (searchEvent != null) {
            searchEvent.addHeuristic(url.hash(), heuristicName, true);
        }
        if (this.indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
        final Request request = this.loader.request(url, true, true);
        final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
        final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
        if (acceptedError != null) {
            this.log.logWarning("addToIndex: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
            return;
        }

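Finally, the loader (apparently LoaderDispatcher) uses the request's CrawlProfile to decide the cache strategy: cached content is returned directly for IFEXIST and CACHEONLY, after a freshness test for IFFRESH; otherwise the resource is loaded by protocol and written to the HTCache only if the profile's storeHTCache flag allows it.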
        if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
        final String protocol = url.getProtocol();
        final String host = url.getHost();

        // check if we have the page in the cache
        final CrawlProfile crawlProfile = this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
        if (crawlProfile != null && cacheStrategy != CacheStrategy.NOCACHE) {
            // we have passed a first test if caching is allowed
            // now see if there is a cache entry

            final ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
            final byte[] content = (cachedResponse == null) ? null : Cache.getContent(url.hash());
            if (cachedResponse != null && content != null) {
                // yes we have the content

                // create request header values and a response object because we need that
                // in case that we want to return the cached content in the next step
                final RequestHeader requestHeader = new RequestHeader();
                requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
                DigestURI refererURL = null;
                if (request.referrerhash() != null) refererURL = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
                if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
                final Response response = new Response(
                        request,
                        requestHeader,
                        cachedResponse,
                        "200",
                        crawlProfile,
                        content);

                // check which caching strategy shall be used
                if (cacheStrategy == CacheStrategy.IFEXIST || cacheStrategy == CacheStrategy.CACHEONLY) {
                    // well, just take the cache and don't care about freshness of the content
                    this.log.logInfo("cache hit/useall for: " + url.toNormalform(true, false));
                    return response;
                }

                // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
                assert cacheStrategy == CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
                if (response.isFreshForProxy()) {
                    this.log.logInfo("cache hit/fresh for: " + url.toNormalform(true, false));
                    return response;
                } else {
                    this.log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
                }
            } else if (cachedResponse != null) {
                this.log.logWarning("HTCACHE contained response header, but not content for url " + url.toNormalform(true, false));
            } else if (content != null) {
                this.log.logWarning("HTCACHE contained content, but not response header for url " + url.toNormalform(true, false));
            }
        }

        // check case where we want results from the cache exclusively, and never from the internet (offline mode)
        if (cacheStrategy == CacheStrategy.CACHEONLY) {
            // we had a chance to get the content from the cache; it's over, we don't have it
            throw new IOException("cache only strategy");
        }

        // now forget about the cache, nothing there. Try to load the content from the internet

        // check access time: this is a double-check (we checked possibly already in the balancer)
        // to make sure that we don't DoS the target by mistake
        if (!url.isLocal()) {
            final Long lastAccess = accessTime.get(host);
            long wait = 0;
            if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis());
            if (wait > 0) {
                // force a sleep here. Instead of just sleeping, we clean up the accessTime map
                final long untilTime = System.currentTimeMillis() + wait;
                cleanupAccessTimeTable(untilTime);
                if (System.currentTimeMillis() < untilTime)
                    try {Thread.sleep(untilTime - System.currentTimeMillis());} catch (final InterruptedException ee) {}
            }
        }

        // now it's for sure that we will access the target. Remember the access time
        if (host != null) accessTime.put(host, System.currentTimeMillis());

        // load resource from the internet
        Response response = null;
        if ((protocol.equals("http") || (protocol.equals("https")))) response = this.httpLoader.load(request, maxFileSize, checkBlacklist);
        if (protocol.equals("ftp")) response = this.ftpLoader.load(request, true);
        if (protocol.equals("smb")) response = this.smbLoader.load(request, true);
        if (protocol.equals("file")) response = this.fileLoader.load(request, true);
        if (response != null && response.getContent() != null) {
            // we got something. Now check if we want to store that to the cache
            // the first check asks whether the profile wants the content stored in the cache
            if (!crawlProfile.storeHTCache()) {
                // no caching wanted. That's ok, do not write any message
                return response;
            }
            // the second check tells us whether the protocol says something about caching
            final String storeError = response.shallStoreCacheForCrawler();
