Package de.anomic.crawler

Examples of de.anomic.crawler.CrawlProfile
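
The excerpts below come from YaCy servlet and loader classes that create and use CrawlProfile instances. A pattern they share is validating the user-supplied crawl filter with Pattern.compile() inside a try block, so that a malformed regular expression surfaces as a PatternSyntaxException before a profile is created. The following standalone sketch illustrates only that validation step; the class name CrawlFilterCheck, the helper validateCrawlFilter and the sample filter strings are illustrative assumptions and not part of the YaCy API.

    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    // Minimal sketch of the crawl filter validation used throughout the excerpts below.
    // Class and method names are illustrative only.
    public class CrawlFilterCheck {

        // Returns null if the filter compiles, otherwise the parser's error message.
        static String validateCrawlFilter(final String mustMatch) {
            try {
                Pattern.compile(mustMatch);
                return null;
            } catch (final PatternSyntaxException e) {
                return e.getMessage();
            }
        }

        public static void main(final String[] args) {
            System.out.println(validateCrawlFilter("https?://example\\.org/.*")); // null: filter is valid
            System.out.println(validateCrawlFilter("https?://[unclosed"));        // prints the syntax error
        }
    }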


                final String crawlingMode = post.get("crawlingMode","url");
                if (crawlingStart != null && crawlingStart.startsWith("ftp")) {
                    try {
                        // check if the crawl filter works correctly
                        Pattern.compile(newcrawlingMustMatch);
                        final CrawlProfile profile = new CrawlProfile(
                                crawlingStart,
                                crawlingStartURL,
                                newcrawlingMustMatch,
                                newcrawlingMustNotMatch,
                                ipMustMatch,
                                ipMustNotMatch,
                                countryMustMatch,
                                newcrawlingdepth,
                                directDocByURL,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(profile.handle().getBytes(), profile);
                        sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                        final DigestURI url = crawlingStartURL;
                        sb.crawlStacker.enqueueEntriesFTP(sb.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
                    } catch (final PatternSyntaxException e) {
                        prop.put("info", "4"); // crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_error", e.getMessage());
                    } catch (final Exception e) {
                        // something went wrong
                        prop.put("info", "7"); // Error with file
                        prop.putHTML("info_crawlingStart", crawlingStart);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }
                    sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                } else if ("url".equals(crawlingMode)) {

                    // check if pattern matches
                    if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
                        // print error message
                        prop.put("info", "4"); //crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_crawlingStart", crawlingStart);
                    } else try {


                        // check if the crawl filter works correctly
                        Pattern.compile(newcrawlingMustMatch);

                        // stack request
                        // first delete the old entry, if it exists
                        final DigestURI url = new DigestURI(crawlingStart);
                        final byte[] urlhash = url.hash();
                        indexSegment.urlMetadata().remove(urlhash);
                        sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                        sb.crawlQueues.errorURL.remove(urlhash);

                        // get a scraper to get the title
                        final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
                        final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
                        final String description = scraper == null ? "" : scraper.getDescription();

                        // stack url
                        sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
                        final CrawlProfile pe = new CrawlProfile(
                                (crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(),
                                crawlingStartURL,
                                newcrawlingMustMatch,
                                newcrawlingMustNotMatch,
                                ipMustMatch,
                                ipMustNotMatch,
                                countryMustMatch,
                                newcrawlingdepth,
                                directDocByURL,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText, indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final String reasonString = sb.crawlStacker.stackCrawl(new Request(
                                sb.peers.mySeed().hash.getBytes(),
                                url,
                                null,
                                "CRAWLING-ROOT",
                                new Date(),
                                pe.handle(),
                                0,
                                0,
                                0,
                                0
                                ));

                        if (reasonString == null) {
                            // create a bookmark from crawl start url
                            //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
                            final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
                            tags.add("crawlStart");
                            final String[] keywords = scraper == null ? null : scraper.getKeywords();
                            if (keywords != null) {
                                for (final String k: keywords) {
                                    final String kk = BookmarkHelper.cleanTagsString(k);
                                    if (kk.length() > 0) tags.add(kk);
                                }
                            }
                            String tagStr = tags.toString();
                            // strip the surrounding "[" and "]" from the Set's toString() form
                            if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 1);

                            // we always create a bookmark so it can be used to track crawled hosts
                            final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
                            if (bookmark != null) {
                                bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
                                bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
                                bookmark.setOwner("admin");
                                bookmark.setPublic(false);
                                bookmark.setTags(tags, true);
                                sb.bookmarksDB.saveBookmark(bookmark);
                            }

                            // do the same for ymarks
                            // TODO: could a non admin user add crawls?
                            sb.tables.bookmarks.createBookmark(sb.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");

                            // liftoff!
                            prop.put("info", "8");//start msg
                            prop.putHTML("info_crawlingURL", post.get("crawlingURL"));

                            // generate a YaCyNews if the global flag was set
                            if (!sb.isRobinsonMode() && crawlOrder) {
                                final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
                                m.remove("specificDepth");
                                m.remove("indexText");
                                m.remove("indexMedia");
                                m.remove("remoteIndexing");
                                m.remove("xsstopw");
                                m.remove("xpstopw");
                                m.remove("xdstopw");
                                m.remove("storeTXCache");
                                m.remove("storeHTCache");
                                m.remove("generalFilter");
                                m.remove("specificFilter");
                                m.put("intention", post.get("intention", "").replace(',', '/'));
                                sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), NewsPool.CATEGORY_CRAWL_START, m);
                            }
                        } else {
                            prop.put("info", "5"); //Crawling failed
                            prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
                            prop.putHTML("info_reasonString", reasonString);

                            sb.crawlQueues.errorURL.push(
                                new Request(
                                        sb.peers.mySeed().hash.getBytes(),
                                        crawlingStartURL,
                                        null,
                                        "",
                                        new Date(),
                                        pe.handle(),
                                        0,
                                        0,
                                        0,
                                        0),
                                sb.peers.mySeed().hash.getBytes(),
                                new Date(),
                                1,
                                FailCategory.FINAL_LOAD_CONTEXT,
                                reasonString, -1);
                        }
                    } catch (final PatternSyntaxException e) {
                        prop.put("info", "4"); // crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_error", e.getMessage());
                    } catch (final Exception e) {
                        // something went wrong
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", crawlingStart);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }

                } else if ("file".equals(crawlingMode)) {
                    if (post.containsKey("crawlingFile")) {
                        final String crawlingFileContent = post.get("crawlingFile$file", "");
                        try {
                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingMustMatch);
                            final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile));
                            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                            if (crawlingFile != null && crawlingFile.exists()) {
                                FileUtils.copy(new FileInputStream(crawlingFile), writer);
                            } else {
                                FileUtils.copy(crawlingFileContent, writer);
                            }
                            writer.close();

                            // get links and generate filter
                            final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                            if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                            final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
                            final CrawlProfile profile = new CrawlProfile(
                                    crawlingFileName,
                                    crawlURL,
                                    newcrawlingMustMatch,
                                    CrawlProfile.MATCH_NEVER_STRING,
                                    ipMustMatch,
                                    ipMustNotMatch,
                                    countryMustMatch,
                                    newcrawlingdepth,
                                    false,
                                    crawlingIfOlder,
                                    crawlingDomMaxPages,
                                    crawlingQ,
                                    indexText,
                                    indexMedia,
                                    storeHTCache,
                                    crawlOrder,
                                    xsstopw,
                                    xdstopw,
                                    xpstopw,
                                    cachePolicy);
                            sb.crawler.putActive(profile.handle().getBytes(), profile);
                            sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                            sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true);
                        } catch (final PatternSyntaxException e) {
                            prop.put("info", "4"); // crawlfilter does not match url
                            prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                            prop.putHTML("info_error", e.getMessage());
                        } catch (final Exception e) {
                            // something went wrong
                            prop.put("info", "7"); // Error with file
                            prop.putHTML("info_crawlingStart", crawlingFileName);
                            prop.putHTML("info_error", e.getMessage());
                            Log.logException(e);
                        }
                        sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    }
                } else if ("sitemap".equals(crawlingMode)) {
                    final String sitemapURLStr = post.get("sitemapURL","");
                    try {
                        final DigestURI sitemapURL = new DigestURI(sitemapURLStr);
                        final CrawlProfile pe = new CrawlProfile(
                                sitemapURLStr,
                                sitemapURL,
                                CrawlProfile.MATCH_ALL_STRING,
                                CrawlProfile.MATCH_NEVER_STRING,
                                ipMustMatch,
                                ipMustNotMatch,
                                countryMustMatch,
                                0,
                                false,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                true,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
                        importer.start();
                    } catch (final Exception e) {
                        // something went wrong
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", sitemapURLStr);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }
                } else if ("sitelist".equals(crawlingMode)) {
                    try {
                        final DigestURI sitelistURL = new DigestURI(crawlingStart);
                        // download document
                        final ContentScraper scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
                        // String title = scraper.getTitle();
                        // String description = scraper.getDescription();

                        // get links and generate filter
                        final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                        if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                        // put links onto crawl queue
                        final CrawlProfile profile = new CrawlProfile(
                                sitelistURL.getHost(),
                                sitelistURL,
                                newcrawlingMustMatch,
                                CrawlProfile.MATCH_NEVER_STRING,
                                ipMustMatch,
                                ipMustNotMatch,
                                countryMustMatch,
                                newcrawlingdepth,
                                directDocByURL,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(profile.handle().getBytes(), profile);
                        sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                        final Iterator<Map.Entry<MultiProtocolURI, Properties>> linkiterator = hyperlinks.entrySet().iterator();
                        DigestURI nexturl;
                        while (linkiterator.hasNext()) {
                            final Map.Entry<MultiProtocolURI, Properties> e = linkiterator.next();
                            if (e.getKey() == null) continue;
                            nexturl = new DigestURI(e.getKey());
                            // remove the url from the database so it can be crawled again
                            final byte[] urlhash = nexturl.hash();
                            indexSegment.urlMetadata().remove(urlhash);
                            sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                            sb.crawlQueues.errorURL.remove(urlhash);
                            sb.crawlStacker.enqueueEntry(new Request(
                                    sb.peers.mySeed().hash.getBytes(),
                                    nexturl,
                                    null,
                                    e.getValue().getProperty("name", ""),
                                    new Date(),
                                    profile.handle(),
                                    0,
                                    0,
                                    0,
                                    0
                                    ));
                            // ... (excerpt truncated)


           
            Request urle;
            boolean dark = true;
            Seed initiator;
            String profileHandle;
            CrawlProfile profileEntry;
            int i, showNum = 0;
            for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
                urle = crawlerList.get(i);
                if (urle != null && urle.url() != null) {
                    initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
                    profileHandle = urle.profileHandle();
                    profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
                    prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");
                    prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
                    prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
                    prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth());
                    prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.appdate()) );
                    prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name());
                    prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString());
                    prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash());
                    // ... (excerpt truncated)

 
  public boolean isRunning(final CrawlSwitchboard crawler) {
    final Iterator<byte[]> iter = crawler.getActive().iterator();
    while(iter.hasNext()) {
      final byte[] key = iter.next();
      final CrawlProfile crawl = crawler.getActive(key);
      if (crawl.startURL().equals(this.url)) {
        return true;       
      }
    }
    return false;
  }
  // ... (excerpt truncated)

        if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
        final String protocol = url.getProtocol();
        final String host = url.getHost();

        // check if we have the page in the cache
        final CrawlProfile crawlProfile = this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
        if (crawlProfile != null && cacheStrategy != CacheStrategy.NOCACHE) {
            // we have passed a first test if caching is allowed
            // now see if there is a cache entry

            final ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
            final byte[] content = (cachedResponse == null) ? null : Cache.getContent(url.hash());
            if (cachedResponse != null && content != null) {
                // yes we have the content

                // create request header values and a response object because we need them
                // in case we want to return the cached content in the next step
                final RequestHeader requestHeader = new RequestHeader();
                requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
                DigestURI refererURL = null;
                if (request.referrerhash() != null) refererURL = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
                if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
                final Response response = new Response(
                        request,
                        requestHeader,
                        cachedResponse,
                        "200",
                        crawlProfile,
                        content);

                // check which caching strategy shall be used
                if (cacheStrategy == CacheStrategy.IFEXIST || cacheStrategy == CacheStrategy.CACHEONLY) {
                    // well, just take the cache and don't care about freshness of the content
                    this.log.logInfo("cache hit/useall for: " + url.toNormalform(true, false));
                    return response;
                }

                // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
                assert cacheStrategy == CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
                if (response.isFreshForProxy()) {
                    this.log.logInfo("cache hit/fresh for: " + url.toNormalform(true, false));
                    return response;
                } else {
                    this.log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
                }
            } else if (cachedResponse != null) {
                this.log.logWarning("HTCACHE contained response header, but not content for url " + url.toNormalform(true, false));
            } else if (content != null) {
                this.log.logWarning("HTCACHE contained content, but not response header for url " + url.toNormalform(true, false));
            }
        }

        // check case where we want results from the cache exclusively, and never from the internet (offline mode)
        if (cacheStrategy == CacheStrategy.CACHEONLY) {
            // we had a chance to get the content from the cache... it's over, we don't have it.
            throw new IOException("cache only strategy");
        }

        // now forget about the cache, nothing there. Try to load the content from the internet

        // check access time: this is a double-check (we checked possibly already in the balancer)
        // to make sure that we don't DoS the target by mistake
        if (!url.isLocal()) {
            final Long lastAccess = accessTime.get(host);
            long wait = 0;
            if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis());
            if (wait > 0) {
                // force a sleep here; instead of just sleeping, we clean up the accessTime map in the meantime
                final long untilTime = System.currentTimeMillis() + wait;
                cleanupAccessTimeTable(untilTime);
                if (System.currentTimeMillis() < untilTime)
                    try {Thread.sleep(untilTime - System.currentTimeMillis());} catch (final InterruptedException ee) {}
            }
        }

        // now it's for sure that we will access the target. Remember the access time
        if (host != null) accessTime.put(host, System.currentTimeMillis());

        // load resource from the internet
        Response response = null;
        if ((protocol.equals("http") || (protocol.equals("https")))) response = this.httpLoader.load(request, maxFileSize, checkBlacklist);
        if (protocol.equals("ftp")) response = this.ftpLoader.load(request, true);
        if (protocol.equals("smb")) response = this.smbLoader.load(request, true);
        if (protocol.equals("file")) response = this.fileLoader.load(request, true);
        if (response != null && response.getContent() != null) {
            // we got something. Now check whether we want to store it in the cache
            // the first check determines whether we want to store the content in the cache at all
            if (crawlProfile == null || !crawlProfile.storeHTCache()) {
                // no caching wanted. That's ok, do not write any message
                return response;
            }
            // the second check tells us whether the protocol says something about caching
            final String storeError = response.shallStoreCacheForCrawler();
            // ... (excerpt truncated)
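
Earlier in the same loader excerpt, access to a remote host is throttled: the last access time per host is looked up and, if less than minDelay has elapsed, the thread sleeps before the request is made and the new access time is recorded. The sketch below reproduces only that delay computation with plain JDK types; the class name PolitenessDelay, the map field and the 500 ms default are illustrative assumptions, not YaCy code.

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    // Minimal sketch of the per-host politeness delay shown in the loader excerpt above.
    // Names and the 500 ms default are illustrative; YaCy's implementation differs in detail.
    public class PolitenessDelay {

        private final Map<String, Long> accessTime = new ConcurrentHashMap<String, Long>();
        private final long minDelay;

        public PolitenessDelay(final long minDelayMillis) {
            this.minDelay = minDelayMillis;
        }

        // Blocks until at least minDelay has passed since the last recorded access to the host,
        // then records the new access time.
        public void awaitTurn(final String host) {
            final Long lastAccess = this.accessTime.get(host);
            long wait = 0;
            if (lastAccess != null) wait = Math.max(0, this.minDelay + lastAccess.longValue() - System.currentTimeMillis());
            if (wait > 0) {
                try { Thread.sleep(wait); } catch (final InterruptedException e) { Thread.currentThread().interrupt(); }
            }
            this.accessTime.put(host, System.currentTimeMillis());
        }

        public static void main(final String[] args) {
            final PolitenessDelay delay = new PolitenessDelay(500); // at most one request per host every 500 ms
            delay.awaitTurn("example.org"); // returns immediately
            delay.awaitTurn("example.org"); // sleeps roughly 500 ms
        }
    }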

            StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
           
            ResponseHeader responseHeader = new ResponseHeader();
            responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    content.toString().getBytes());
           
            return response;
        }
       
        // create response header
        String mime = Classification.ext2mime(url.getFileExtension());
        ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
       
        // check mime type and availability of parsers
        // and also check resource size and limitation of the size
        long size;
        try {
            size = url.length();
        } catch (Exception e) {
            size = -1;
        }
        String parserError = null;
        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
            (size > maxFileSize && maxFileSize >= 0)) {
            // we know that we cannot process that file before loading
            // only the metadata is returned
           
            if (parserError != null) {
                log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
            } else {
                log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }
           
            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    url.toTokens().getBytes());
            return response;
        }
       
        // load the resource
        InputStream is = url.getInputStream(null, -1);
        byte[] b = FileUtils.read(is);
        is.close();
       
        // create response with loaded content
        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
        Response response = new Response(
                request,
                requestHeader,
                responseHeader,
                "200",
                // ... (excerpt truncated)

            StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
           
            ResponseHeader responseHeader = new ResponseHeader();
            responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    content.toString().getBytes());
           
            return response;
        }
       
        // create response header
        String mime = Classification.ext2mime(url.getFileExtension());
        ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
       
        // check mime type and availability of parsers
        // and also check resource size and limitation of the size
        long size;
        try {
            size = url.length();
        } catch (Exception e) {
            size = -1;
        }
        String parserError = null;
        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
            (size > maxFileSize && maxFileSize >= 0)) {
            // we know that we cannot process that file before loading
            // only the metadata is returned
           
            if (parserError != null) {
                log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
            } else {
                log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }
           
            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    url.toTokens().getBytes());
            return response;
        }
       
        // load the resource
        InputStream is = url.getInputStream(null, -1);
        byte[] b = FileUtils.read(is);
        is.close();
       
        // create response with loaded content
        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
        Response response = new Response(
                request,
                requestHeader,
                responseHeader,
                "200",
View Full Code Here

                    response = null;
                } else {
                    final ResponseHeader responseHeader = new ResponseHeader();
                    responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
                    responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
                    final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
                    response = new Response(
                            request,
                            requestHeader,
                            responseHeader,
                            "200",
                            // ... (excerpt truncated)

                this.log.logInfo("Too big file in FTP crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }

            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
            final Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    null);
            return response;
        }

        // download the remote file
        final byte[] b = ftpClient.get(path);

        // create a response
        final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
        final Response response = new Response(
                request,
                requestHeader,
                responseHeader,
                "200",
                // ... (excerpt truncated)

                  this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", code);
                  throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
                }

                // create a new cache entry
                final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
                response = new Response(
                        request,
                        requestHeader,
                        header,
                        Integer.toString(code),
                        // ... (excerpt truncated)

            indexSegment.urlMetadata().remove(urlhash);
            sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
            sb.crawlQueues.errorURL.remove(urlhash);

            // create crawling profile
            CrawlProfile pe = null;
            try {
                pe = new CrawlProfile(
                        crawlingStartURL.getHost(),
                        crawlingStartURL,
                        crawlingMustMatch,
                        crawlingMustNotMatch,
                        CrawlProfile.MATCH_ALL_STRING,   // ipMustMatch
                        CrawlProfile.MATCH_NEVER_STRING, // ipMustNotMatch
                        "",                              // countryMustMatch
                        CrawlingDepth,
                        true,
                        60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
                        -1, // domMaxPages, if negative: no count restriction
                        crawlDynamic,
                        indexText,
                        indexMedia,
                        storeHTCache,
                        remoteIndexing,
                        xsstopw,
                        xdstopw,
                        xpstopw,
                        CacheStrategy.IFFRESH);
                sb.crawler.putActive(pe.handle().getBytes(), pe);
            } catch (final Exception e) {
                // something went wrong
                prop.put("mode_status", "2"); // Error with url
                prop.put("mode_code", "2");
                prop.putHTML("mode_status_error", e.getMessage());
                return prop;
            }

            // stack URL
            final String reasonString = sb.crawlStacker.stackCrawl(new Request(
                    sb.peers.mySeed().hash.getBytes(),
                    crawlingStartURL,
                    null,
                    (title==null)?"CRAWLING-ROOT":title,
                    new Date(),
                    pe.handle(),
                    0,
                    0,
                    0,
                    0
                ));
                // ... (excerpt truncated)
