Package de.anomic.crawler.retrieval

Examples of de.anomic.crawler.retrieval.Request
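All of the snippets below come from the YaCy source tree. A Request object bundles everything the crawler needs to fetch one document: the hash of the initiating peer, the target URL, an optional referrer hash, an anchor name, a date, the handle of the governing CrawlProfile, and a few numeric counters; it is then handed to the crawl stacker or pushed onto one of the crawl queues. As orientation before reading the examples, here is a minimal sketch of that call shape, inferred from the argument order visible below; the names sb, url and pe are borrowed from the snippets, and the four trailing integers (depth and counters) are simply zero for a fresh crawl start, as in every example shown:

    // Illustrative sketch only - argument order and semantics inferred from the examples below.
    final Request request = new Request(
            sb.peers.mySeed().hash.getBytes(), // hash of the initiating (local) peer
            url,                               // DigestURI of the document to crawl
            null,                              // referrer hash; null for a crawl root
            "CRAWLING-ROOT",                   // anchor / entry name
            new Date(),                        // application date
            pe.handle(),                       // handle of the CrawlProfile governing this crawl
            0, 0, 0, 0);                       // depth and counters, zero for a fresh start

    // The Request is then either stacked synchronously ...
    final String reasonString = sb.crawlStacker.stackCrawl(request);
    // ... where a non-null reasonString means the URL was rejected,
    // or enqueued via sb.crawlStacker.enqueueEntry(request), as in the "sitelist" example.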


                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final String reasonString = sb.crawlStacker.stackCrawl(new Request(
                                sb.peers.mySeed().hash.getBytes(),
                                url,
                                null,
                                "CRAWLING-ROOT",
                                new Date(),
                                pe.handle(),
                                0,
                                0,
                                0,
                                0
                                ));

                        if (reasonString == null) {
                            // create a bookmark from crawl start url
                            final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
                            tags.add("crawlStart");
                            if ("on".equals(post.get("createBookmark","off"))) {
                                final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
                                if (bookmark != null) {
                                    bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
                                    bookmark.setOwner("admin");
                                    bookmark.setPublic(false);
                                    bookmark.setTags(tags, true);
                                    sb.bookmarksDB.saveBookmark(bookmark);
                                }
                            }
                            // liftoff!
                            prop.put("info", "8");//start msg
                            prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));

                            // generate a YaCyNews if the global flag was set
                            if (!sb.isRobinsonMode() && crawlOrder) {
                                final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
                                m.remove("specificDepth");
                                m.remove("indexText");
                                m.remove("indexMedia");
                                m.remove("remoteIndexing");
                                m.remove("xsstopw");
                                m.remove("xpstopw");
                                m.remove("xdstopw");
                                m.remove("storeTXCache");
                                m.remove("storeHTCache");
                                m.remove("generalFilter");
                                m.remove("specificFilter");
                                m.put("intention", post.get("intention", "").replace(',', '/'));
                                sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
                            }
                        } else {
                            prop.put("info", "5"); //Crawling failed
                            prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
                            prop.putHTML("info_reasonString", reasonString);

                            sb.crawlQueues.errorURL.push(
                                new Request(
                                        sb.peers.mySeed().hash.getBytes(),
                                        crawlingStartURL,
                                        null,
                                        "",
                                        new Date(),
                                        pe.handle(),
                                        0,
                                        0,
                                        0,
                                        0),
                                sb.peers.mySeed().hash.getBytes(),
                                new Date(),
                                1,
                                FailCategory.FINAL_LOAD_CONTEXT,
                                reasonString, -1);
                        }
                    } catch (final PatternSyntaxException e) {
                        prop.put("info", "4"); // crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_error", e.getMessage());
                    } catch (final Exception e) {
                        // unexpected error
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", crawlingStart);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }

                } else if ("file".equals(crawlingMode)) {
                    if (post.containsKey("crawlingFile")) {
                        final String crawlingFileContent = post.get("crawlingFile$file", "");
                        try {
                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingMustMatch);
                            final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile));
                            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                            if (crawlingFile != null && crawlingFile.exists()) {
                                FileUtils.copy(new FileInputStream(crawlingFile), writer);
                            } else {
                                FileUtils.copy(crawlingFileContent, writer);
                            }
                            writer.close();

                            // get links and generate filter
                            final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                            if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                            final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
                            final CrawlProfile profile = new CrawlProfile(
                                    crawlingFileName,
                                    crawlURL,
                                    newcrawlingMustMatch,
                                    CrawlProfile.MATCH_NEVER,
                                    newcrawlingdepth,
                                    crawlingIfOlder,
                                    crawlingDomMaxPages,
                                    crawlingQ,
                                    indexText,
                                    indexMedia,
                                    storeHTCache,
                                    crawlOrder,
                                    xsstopw,
                                    xdstopw,
                                    xpstopw,
                                    cachePolicy);
                            sb.crawler.putActive(profile.handle().getBytes(), profile);
                            sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                            sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true);
                        } catch (final PatternSyntaxException e) {
                            prop.put("info", "4"); // crawlfilter does not match url
                            prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                            prop.putHTML("info_error", e.getMessage());
                        } catch (final Exception e) {
                            // unexpected error
                            prop.put("info", "7"); // Error with file
                            prop.putHTML("info_crawlingStart", crawlingFileName);
                            prop.putHTML("info_error", e.getMessage());
                            Log.logException(e);
                        }
                        sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    }
                } else if ("sitemap".equals(crawlingMode)) {
                    final String sitemapURLStr = post.get("sitemapURL","");
                    try {
                        final DigestURI sitemapURL = new DigestURI(sitemapURLStr);
                        final CrawlProfile pe = new CrawlProfile(
                                sitemapURLStr,
                                sitemapURL,
                                CrawlProfile.MATCH_ALL,
                                CrawlProfile.MATCH_NEVER,
                                0,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                true,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
                        importer.start();
                    } catch (final Exception e) {
                        // unexpected error
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", sitemapURLStr);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }
                } else if ("sitelist".equals(crawlingMode)) {
                    try {
                        final DigestURI sitelistURL = new DigestURI(crawlingStart);
                        // download document
                        final ContentScraper scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
                        // String title = scraper.getTitle();
                        // String description = scraper.getDescription();

                        // get links and generate filter
                        final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                        if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                        // put links onto crawl queue
                        final CrawlProfile profile = new CrawlProfile(
                                sitelistURL.getHost(),
                                sitelistURL,
                                newcrawlingMustMatch,
                                CrawlProfile.MATCH_NEVER,
                                newcrawlingdepth,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(profile.handle().getBytes(), profile);
                        sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                        final Iterator<Map.Entry<MultiProtocolURI, Properties>> linkiterator = hyperlinks.entrySet().iterator();
                        DigestURI nexturl;
                        while (linkiterator.hasNext()) {
                            final Map.Entry<MultiProtocolURI, Properties> e = linkiterator.next();
                            if (e.getKey() == null) continue;
                            nexturl = new DigestURI(e.getKey());
                            // remove the url from the database to be prepared to crawl them again
                            final byte[] urlhash = nexturl.hash();
                            indexSegment.urlMetadata().remove(urlhash);
                            sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                            sb.crawlQueues.errorURL.remove(urlhash);
                            sb.crawlStacker.enqueueEntry(new Request(
                                    sb.peers.mySeed().hash.getBytes(),
                                    nexturl,
                                    null,
                                    e.getValue().getProperty("name", ""),
                                    new Date(),
View Full Code Here


            prop.put("crawler-queue", "0");
        } else {
            prop.put("crawler-queue", "1");
            final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, showLimit);
           
            Request urle;
            boolean dark = true;
            yacySeed initiator;
            String profileHandle;
            CrawlProfile profileEntry;
            int i, showNum = 0;
            for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
                urle = crawlerList.get(i);
                if (urle != null && urle.url() != null) {
                    initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
                    profileHandle = urle.profileHandle();
                    profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
                    prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
                    prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
                    prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
                    prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
                    prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
                    prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
                    prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
                    prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
                    dark = !dark;
                    showNum++;
                } else {
                    stackSize--;
                }
View Full Code Here

                                if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes());
                            }
                        } else {
                            // iterating through the list of URLs
                            final Iterator<Request> iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.StackType.CORE);
                            Request entry;
                            final List<byte[]> removehashes = new ArrayList<byte[]>();
                            while (iter.hasNext()) {
                                if ((entry = iter.next()) == null) continue;
                                String value = null;
                               
                                location: switch (option) {
                                    case URL:       value = (entry.url() == null) ? null : entry.url().toString(); break location;
                                    case ANCHOR:    value = entry.name(); break location;
                                    case DEPTH:     value = Integer.toString(entry.depth()); break location;
                                    case INITIATOR:
                                        value = (entry.initiator() == null || entry.initiator().length == 0) ? "proxy" : ASCII.String(entry.initiator());
                                        break location;
                                    case MODIFIED:  value = daydate(entry.appdate()); break location;
                                    default: value = null; break location;
                                }
                               
                                if (value != null && compiledPattern.matcher(value).matches()) removehashes.add(entry.url().hash());
                            }
                            Log.logInfo("IndexCreateWWWLocalQueue", "created a remove list with " + removehashes.size() + " entries for pattern '" + pattern + "'");
                            for (final byte[] b: removehashes) {
                                sb.crawlQueues.noticeURL.removeByURLHash(b);
                            }
                        }
                    } catch (final PatternSyntaxException e) {
                        Log.logException(e);
                    }
                }
               
                prop.put("info", "3");//crawling queue cleared
                prop.putNum("info_numEntries", c);
            } else if (post.containsKey("deleteEntry")) {
                final String urlHash = post.get("deleteEntry");
                sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
                prop.put("LOCATION","");
                return prop;
            }
        }

        int showNum = 0, stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
        if (stackSize == 0) {
            prop.put("crawler-queue", "0");
        } else {
            prop.put("crawler-queue", "1");
            final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, (int) (showLimit * 1.20));

            Request urle;
            boolean dark = true;
            yacySeed initiator;
            String profileHandle;
            CrawlProfile profileEntry;
            int i;
            for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
                urle = crawlerList.get(i);
                if (urle != null && urle.url() != null) {
                    initiator = sb.peers.getConnected(urle.initiator() == null ? "" : ASCII.String(urle.initiator()));
                    profileHandle = urle.profileHandle();
                    profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
                    prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
                    prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
                    prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
                    prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
                    prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
                    prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
                    prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
                    prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
                    dark = !dark;
                    showNum++;
                } else {
                    stackSize--;
                }
View Full Code Here

                        final DigestURI referrer = null; // referrer needed!
                        final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(url);
                        if (urlRejectReason == null) {
                            // stack url
                            if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
                            sb.crawlStacker.enqueueEntry(new Request(
                                    peerhash.getBytes(),
                                    url,
                                    (referrer == null) ? null : referrer.hash(),
                                    "REMOTE-CRAWLING",
                                    loaddate,
View Full Code Here

            final NoticedURL.StackType stackType = NoticedURL.StackType.LIMIT;
            int maxCount = Math.min(100, post.getInt("count", 10));
            long maxTime = Math.min(20000, Math.max(1000, post.getInt("time", 10000)));
            long timeout = System.currentTimeMillis() + maxTime;
            int c = 0;
            Request entry;
            DigestURI referrer;
            while ((maxCount > 0) &&
                   (System.currentTimeMillis() < timeout) &&
                   (sb.crawlQueues.noticeURL.stackSize(stackType) > 0)) {
                try {
                    entry = sb.crawlQueues.noticeURL.pop(stackType, false, sb.crawler);
                } catch (final IOException e) {
                    break;
                }
                if (entry == null) break;
               
                // find referrer, if there is one
                referrer = sb.getURL(Segments.Process.PUBLIC, entry.referrerhash());
               
                // move the url into the delegated-url db
                sb.crawlQueues.delegatedURL.push(
                                entry,
                                sb.peers.mySeed().hash.getBytes(),
                                new Date(),
                                0,
                                FailCategory.FINAL_PROCESS_CONTEXT,
                                "client=____________",
                                -1);
               
                // create RSS entry
                prop.put("item_" + c + "_title", "");
                prop.putXML("item_" + c + "_link", entry.url().toNormalform(true, false));
                prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true, false));
                prop.putXML("item_" + c + "_description", entry.name());
                prop.put("item_" + c + "_author", "");
                prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.appdate()));
                prop.put("item_" + c + "_guid", entry.url().hash());
                c++;
                maxCount--;
            }
            prop.put("item", c);
            prop.putXML("response", "ok");
        }
       
        if (post.get("call", "").equals("urlhashlist")) {
            // retrieve a list of urls from the LURL-db by a given list of url hashes
            final String urlhashes = post.get("hashes", "");
            if (urlhashes.length() % 12 != 0) return prop;
            final int count = urlhashes.length() / 12;
            int c = 0;
            URIMetadataRow entry;
            URIMetadataRow.Components metadata;
            DigestURI referrer;
            for (int i = 0; i < count; i++) {
                entry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));
                if (entry == null) continue;
                // find referrer, if there is one
                referrer = sb.getURL(Segments.Process.PUBLIC, entry.referrerHash());
                // create RSS entry
                metadata = entry.metadata();
                prop.put("item_" + c + "_title", metadata.dc_title());
                prop.putXML("item_" + c + "_link", metadata.url().toNormalform(true, false));
                prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true, false));
                prop.putXML("item_" + c + "_description", metadata.dc_title());
                prop.put("item_" + c + "_author", metadata.dc_creator());
                prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate()));
                prop.put("item_" + c + "_guid", ASCII.String(entry.hash()));
                c++;
            }
            prop.put("item", c);
            prop.putXML("response", "ok");
        }
View Full Code Here

                return prop;
            }

            // stack URL
            String reasonString = sb.crawlStacker.stackCrawl(new Request(
                    sb.peers.mySeed().hash.getBytes(),
                    crawlingStartURL,
                    null,
                    (title==null)?"CRAWLING-ROOT":title,
                    new Date(),
View Full Code Here

            // case 1 and case 3
            if (cachedResponseHeader == null) {
                if (log.isFinest()) log.logFinest(reqID + " page not in cache: fulfill request from web");
                fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond);
            } else {
                final Request request = new Request(
                        null,
                        url,
                        requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(),
                        "",
                        cachedResponseHeader.lastModified(),
View Full Code Here

                    Cache.delete(url);
                    conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS");
                }

                // reserve cache entry
                final Request request = new Request(
                        null,
                        url,
                        requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(),
                        "",
                        responseHeader.lastModified(),
View Full Code Here

        return core.toString();
        //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
    }

    public Request toBalancerEntry(final String initiatorHash) {
        return new Request(
                ASCII.getBytes(initiatorHash),
                metadata().url(),
                referrerHash(),
                metadata().dc_title(),
                moddate(),
View Full Code Here

                continue;
            }

            // create a queue entry
            final Document document = surrogate.document();
            final Request request = new Request(
                    ASCII.getBytes(this.peers.mySeed().hash),
                    surrogate.getIdentifier(true),
                    null,
                    "",
                    surrogate.getDate(),
View Full Code Here
