Package net.yacy.document.parser.html

Examples of net.yacy.document.parser.html.ContentScraper
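
All of these snippets come from the YaCy source tree and share one pattern: a ContentScraper collects title, anchors, and metadata while a TransformerWriter streams the HTML past it. A minimal, self-contained sketch of that pattern follows; the import paths are assumed from the YaCy source layout of this era, and the URL and markup are placeholders.

    import java.io.Writer;
    import java.util.Map;
    import java.util.Properties;

    import net.yacy.cora.document.MultiProtocolURI;
    import net.yacy.document.parser.html.ContentScraper;
    import net.yacy.document.parser.html.TransformerWriter;
    import net.yacy.kelondro.data.meta.DigestURI;
    import net.yacy.kelondro.util.FileUtils;

    public class ScraperSketch {
        public static void main(final String[] args) throws Exception {
            // the base URI is used to resolve relative links found in the page
            final ContentScraper scraper = new ContentScraper(new DigestURI("http://example.org/"));
            // the writer parses the HTML and reports each tag to the scraper as it passes through
            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
            FileUtils.copy("<html><head><title>demo</title></head>"
                    + "<body><a href=\"sub/page.html\">link</a></body></html>", writer);
            writer.close();
            // harvest whatever the scraper collected while the document streamed past
            final Map<MultiProtocolURI, Properties> anchors = scraper.getAnchors();
            System.out.println(scraper.getTitle() + " -> " + anchors.keySet());
        }
    }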


                    if (post.containsKey("crawlingFile")) {
                        final String crawlingFileContent = post.get("crawlingFile$file", "");
                        try {
                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingMustMatch);
                            // note: crawlingFile also provides the scraper's base URI, so it must be
                            // non-null here even when the posted file content is used below
                            final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile));
                            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                            if (crawlingFile != null && crawlingFile.exists()) {
                                FileUtils.copy(new FileInputStream(crawlingFile), writer);
                            } else {
                                FileUtils.copy(crawlingFileContent, writer);
                            }
                            writer.close();

                            // get links and generate filter
                            final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                            if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                            final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
                            final CrawlProfile profile = new CrawlProfile(
                                    crawlingFileName,
                                    crawlURL,
                                    newcrawlingMustMatch,
                                    CrawlProfile.MATCH_NEVER,
                                    newcrawlingdepth,
                                    crawlingIfOlder,
                                    crawlingDomMaxPages,
                                    crawlingQ,
                                    indexText,
                                    indexMedia,
                                    storeHTCache,
                                    crawlOrder,
                                    xsstopw,
                                    xdstopw,
                                    xpstopw,
                                    cachePolicy);
                            sb.crawler.putActive(profile.handle().getBytes(), profile);
                            sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                            sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true);
                        } catch (final PatternSyntaxException e) {
                            prop.put("info", "4"); // crawl filter is not a valid regular expression
                            prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                            prop.putHTML("info_error", e.getMessage());
                        } catch (final Exception e) {
                            // generic failure
                            prop.put("info", "7"); // Error with file
                            prop.putHTML("info_crawlingStart", crawlingFileName);
                            prop.putHTML("info_error", e.getMessage());
                            Log.logException(e);
                        }
                        sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    }
                } else if ("sitemap".equals(crawlingMode)) {
                    final String sitemapURLStr = post.get("sitemapURL","");
                    try {
                        final DigestURI sitemapURL = new DigestURI(sitemapURLStr);
                        final CrawlProfile pe = new CrawlProfile(
                                sitemapURLStr,
                                sitemapURL,
                                CrawlProfile.MATCH_ALL,
                                CrawlProfile.MATCH_NEVER,
                                0,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                true,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
                        importer.start();
                    } catch (final Exception e) {
                        // generic failure
                        prop.put("info", "6"); // error with URL
                        prop.putHTML("info_crawlingStart", sitemapURLStr);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }
                } else if ("sitelist".equals(crawlingMode)) {
                    try {
                        final DigestURI sitelistURL = new DigestURI(crawlingStart);
                        // download document
                        final ContentScraper scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
                        // String title = scraper.getTitle();
                        // String description = scraper.getDescription();

                        // get links and generate filter
                        final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                        if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                        // put links onto crawl queue
                        final CrawlProfile profile = new CrawlProfile(
                                sitelistURL.getHost(),
                                // … (snippet truncated)
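
The siteFilter(hyperlinks.keySet()) call above builds the must-match pattern from the harvested links, but its body is not part of this listing. The following is a hypothetical sketch of such a filter, restricting a crawl to the hosts seen in the links; the method body and the exact pattern shape are assumptions, not YaCy's implementation.

    import java.util.Set;
    import java.util.TreeSet;
    import java.util.regex.Pattern;

    import net.yacy.cora.document.MultiProtocolURI;

    public class SiteFilterSketch {
        // hypothetical: collect the distinct hosts and OR them into one must-match regex
        static String siteFilter(final Set<MultiProtocolURI> uris) {
            final Set<String> hosts = new TreeSet<String>();
            for (final MultiProtocolURI uri : uris) {
                if (uri.getHost() != null) hosts.add(Pattern.quote(uri.getHost()));
            }
            if (hosts.isEmpty()) return ".*";
            final StringBuilder filter = new StringBuilder("https?://(");
            String sep = "";
            for (final String host : hosts) {
                filter.append(sep).append(host);
                sep = "|";
            }
            return filter.append(")/.*").toString();
        }
    }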


                try {
                    u = new DigestURI(url);
                } catch (final MalformedURLException e) {
                    // fail, do nothing
                }
                ContentScraper scraper = null;
                if (u != null) try {
                    scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST);
                } catch (final IOException e) {
                    // now that's a fail, do nothing
                }
                if (scraper != null) {
                    // put the document title
                    prop.putXML("title", scraper.getTitle());

                    // put the favicon that belongs to the document
                    prop.put("favicon", (scraper.getFavicon() == null) ? "" : scraper.getFavicon().toString());

                    // put keywords
                    final String[] list = scraper.getKeywords();
                    int count = 0;
                    for (final String tag : list) {
                        if (!tag.isEmpty()) {
                            prop.putXML("tags_" + count + "_tag", tag);
                            count++;
                        }
                    }
                    prop.put("tags", count);
                    // put description
                    prop.putXML("desc", scraper.getDescription());
                    // put language
                    final Set<String> languages = scraper.getContentLanguages();
                    prop.putXML("lang", (languages == null || languages.isEmpty()) ? "unknown" : languages.iterator().next());

                    // get links and put them into a semicolon-separated list
                    final Set<MultiProtocolURI> uris = scraper.getAnchors().keySet();
                    final StringBuilder links = new StringBuilder(uris.size() * 80);
                    final StringBuilder filter = new StringBuilder(uris.size() * 40);
                    count = 0;
                    for (final MultiProtocolURI uri: uris) {
                        links.append(';').append(uri.toNormalform(true, false));
                        // … (snippet truncated)

                    File f;
                    String size;
                    long sz;
                    String headline, author, description, publisher;
                    int images, links;
                    ContentScraper scraper;
                    for (final String element : list) {
                        f = new File(targetFile, element);
                        if (f.isDirectory()) {
                            aBuffer.append("    <li><a href=\"" + path + element + "/\">" + element + "/</a><br/></li>\n");
                        } else {
                            if (element.endsWith("html") || (element.endsWith("htm"))) {
                                scraper = ContentScraper.parseResource(f);
                                headline = scraper.getTitle();
                                author = scraper.getAuthor();
                                publisher = scraper.getPublisher();
                                description = scraper.getDescription();
                                images = scraper.getImages().size();
                                links = scraper.getAnchors().size();
                            } else {
                                headline = null;
                                author = null;
                                publisher = null;
                                description = null;
                                // … (snippet truncated)
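
The directory-listing snippet above leans on the static ContentScraper.parseResource(File) helper. A minimal sketch of that call in isolation, using only the accessors that appear above (pass the path of a local HTML file as the first argument):

    import java.io.File;

    import net.yacy.document.parser.html.ContentScraper;

    public class FileMetaSketch {
        public static void main(final String[] args) throws Exception {
            final ContentScraper scraper = ContentScraper.parseResource(new File(args[0]));
            System.out.println("title:       " + scraper.getTitle());
            System.out.println("author:      " + scraper.getAuthor());
            System.out.println("publisher:   " + scraper.getPublisher());
            System.out.println("description: " + scraper.getDescription());
            System.out.println("images:      " + scraper.getImages().size());
            System.out.println("links:       " + scraper.getAnchors().size());
        }
    }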

            addSolr(solrdoc, "lat_coordinate", yacydoc.lat());
        }
        addSolr(solrdoc, "httpstatus_i", 200);
        final Object parser = yacydoc.getParserObject();
        if (parser instanceof ContentScraper) {
            final ContentScraper html = (ContentScraper) parser;

            // header tags
            int h = 0;
            int f = 1;
            for (int i = 1; i <= 6; i++) {
                final String[] hs = html.getHeadlines(i);
                h = h | (hs.length > 0 ? f : 0);
                f = f * 2;
                addSolr(solrdoc, "attr_h" + i, hs);
            }
            addSolr(solrdoc, "htags_i", h);

            // canonical tag
            if (html.getCanonical() != null) addSolr(solrdoc, "canonical_s", html.getCanonical().toNormalform(false, false));

            // meta tags
            final Map<String, String> metas = html.getMetas();
            final String robots = metas.get("robots");
            if (robots != null) addSolr(solrdoc, "metarobots_t", robots);
            final String generator = metas.get("generator");
            if (generator != null) addSolr(solrdoc, "metagenerator_t", generator);

            // bold, italic
            final String[] bold = html.getBold();
            addSolr(solrdoc, "boldcount_i", bold.length);
            if (bold.length > 0) {
                addSolr(solrdoc, "attr_bold", bold);
                if (isEmpty() || contains("attr_boldcount")) {
                    addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold));
                }
            }
            final String[] italic = html.getItalic();
            addSolr(solrdoc, "italiccount_i", italic.length);
            if (italic.length > 0) {
                addSolr(solrdoc, "attr_italic", italic);
                if (isEmpty() || contains("attr_italiccount")) {
                    addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic));
                }
            }
            final String[] li = html.getLi();
            addSolr(solrdoc, "licount_i", li.length);
            if (li.length > 0) addSolr(solrdoc, "attr_li", li);

            // images
            if (isEmpty() || contains("attr_images")) {
                final Collection<ImageEntry> imagesc = html.getImages().values();
                final String[] images = new String[imagesc.size()];
                c = 0; // c: int counter declared earlier in the full method (outside this excerpt)
                for (final ImageEntry ie: imagesc) images[c++] = ie.toString();
                addSolr(solrdoc, "imagescount_i", images.length);
                if (images.length > 0) addSolr(solrdoc, "attr_images", images);
            }

            // style sheets
            if (isEmpty() || contains("attr_css")) {
                final Map<MultiProtocolURI, String> csss = html.getCSS();
                final String[] css = new String[csss.size()];
                c = 0;
                for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
                    css[c++] =
                        "<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
                        " href=\""+ entry.getKey().toNormalform(false, false, false, false) + "\" />";
                }
                addSolr(solrdoc, "csscount_i", css.length);
                if (css.length > 0) addSolr(solrdoc, "attr_css", css);
            }

            // Scripts
            if (isEmpty() || contains("attr_scripts")) {
                final Set<MultiProtocolURI> scriptss = html.getScript();
                final String[] scripts = new String[scriptss.size()];
                c = 0;
                for (final MultiProtocolURI url: scriptss) {
                    scripts[c++] = url.toNormalform(false, false, false, false);
                }
                addSolr(solrdoc, "scriptscount_i", scripts.length);
                if (scripts.length > 0) addSolr(solrdoc, "attr_scripts", scripts);
            }

            // Frames
            if (isEmpty() || contains("attr_frames")) {
                final Set<MultiProtocolURI> framess = html.getFrames();
                final String[] frames = new String[framess.size()];
                c = 0;
                for (final MultiProtocolURI entry: framess) {
                    frames[c++] = entry.toNormalform(false, false, false, false);
                }
                addSolr(solrdoc, "framesscount_i", frames.length);
                if (frames.length > 0) addSolr(solrdoc, "attr_frames", frames);
            }

            // IFrames
            if (isEmpty() || contains("attr_iframes")) {
                final Set<MultiProtocolURI> iframess = html.getIFrames();
                final String[] iframes = new String[iframess.size()];
                c = 0;
                for (final MultiProtocolURI entry: iframess) {
                    iframes[c++] = entry.toNormalform(false, false, false, false);
                }
                addSolr(solrdoc, "iframesscount_i", iframes.length);
                if (iframes.length > 0) addSolr(solrdoc, "attr_iframes", iframes);
            }

            // flash embedded
            addSolr(solrdoc, "flash_b", html.containsFlash());

            // generic evaluation pattern
            for (final String model: html.getEvaluationModelNames()) {
                if (isEmpty() || contains("attr_" + model)) {
                    final String[] scorenames = html.getEvaluationModelScoreNames(model);
                    if (scorenames.length > 0) {
                        addSolr(solrdoc, "attr_" + model, scorenames);
                        addSolr(solrdoc, "attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames));
                    }
                }
            }

            // response time
            // … (snippet truncated)
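
The htags_i field built above packs the presence of <h1> through <h6> headings into a single integer: bit i-1 is set when at least one heading of level i occurs. A worked example of the same loop, with hypothetical heading counts standing in for a scraped document:

    public class HtagsBitmaskSketch {
        public static void main(final String[] args) {
            // hypothetical counts: the document has h1 and h3 headings only
            final int[] headlineCounts = {2, 0, 1, 0, 0, 0};
            int h = 0;
            int f = 1;
            for (int i = 1; i <= 6; i++) {
                h = h | (headlineCounts[i - 1] > 0 ? f : 0);
                f = f * 2;
            }
            System.out.println(h); // bits 0 and 2 are set: prints 5
        }
    }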

    private static DevAndMainVersions allReleaseFrom(final yacyUpdateLocation location) {
        // retrieves the latest info about releases
        // this is done by contacting a release location,
        // parsing the content and filtering+parsing links
        // returns the version info if successful, null otherwise
        ContentScraper scraper;
        try {
            scraper = Switchboard.getSwitchboard().loader.parseResource(location.getLocationURL(), CacheStrategy.NOCACHE);
        } catch (final IOException e) {
            return null;
        }

        // analyze links in scraper resource, and find link to latest release in it
        final Map<MultiProtocolURI, Properties> anchors = scraper.getAnchors(); // an anchor URL -> link attributes relation
        final TreeSet<yacyRelease> mainReleases = new TreeSet<yacyRelease>();
        final TreeSet<yacyRelease> devReleases = new TreeSet<yacyRelease>();
        for (final MultiProtocolURI url : anchors.keySet()) {
            try {
                final yacyRelease release = new yacyRelease(url, location.getPublicKey());
                // … (snippet truncated)
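
The loop above turns every anchor into a yacyRelease candidate and relies on the constructor to reject links that are not releases. An alternative, hypothetical pre-filter on the normalized URL string is sketched below; the release-name pattern is an assumption, not YaCy's actual naming scheme.

    import java.util.Map;
    import java.util.Properties;
    import java.util.regex.Pattern;

    import net.yacy.cora.document.MultiProtocolURI;

    public class ReleaseLinkFilterSketch {
        // hypothetical pattern for release archive links
        private static final Pattern RELEASE = Pattern.compile(".*yacy.*\\.tar\\.gz");

        static void scan(final Map<MultiProtocolURI, Properties> anchors) {
            for (final MultiProtocolURI url : anchors.keySet()) {
                if (RELEASE.matcher(url.toNormalform(true, false)).matches()) {
                    System.out.println("release candidate: " + url);
                }
            }
        }
    }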

        } catch (final UnsupportedCharsetException e) {
            c = Charset.defaultCharset();
        }

        // parsing the content
        final ContentScraper scraper = new ContentScraper(location);
        final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
        try {
            FileUtils.copy(sourceStream, writer, c);
            writer.close();
        } catch (IOException e) {
            // … (snippet truncated)
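
The catch block above falls back to the platform default charset when the declared encoding is unsupported. Sketched in isolation, the complete guard looks like this; note that Charset.forName can also throw IllegalCharsetNameException for malformed names, which the fragment above does not handle separately.

    import java.nio.charset.Charset;

    public class CharsetFallbackSketch {
        // resolve a declared charset name, falling back to the platform default
        static Charset safeCharset(final String name) {
            try {
                return Charset.forName(name);
            } catch (final IllegalArgumentException e) {
                // covers both illegal charset names and unsupported charsets
                return Charset.defaultCharset();
            }
        }

        public static void main(final String[] args) {
            System.out.println(safeCharset("UTF-8"));
            System.out.println(safeCharset("no-such-charset"));
        }
    }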

        MultiProtocolURI url;
        Bookmark bm;
        final Set<String> tags = ListManager.string2set(tag); // this allows multiple default tags
        try {
            // load the links
            final ContentScraper scraper = new ContentScraper(baseURL);
            // OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
            FileUtils.copy(input, writer);
            writer.close();
            links = scraper.getAnchors();
        } catch (final IOException e) {
            Log.logWarning("BOOKMARKS", "error during load of links: " + e.getClass() + " " + e.getMessage());
        }
        for (final Entry<MultiProtocolURI, Properties> link: links.entrySet()) {
            url = link.getKey();
            title = link.getValue().getProperty("name", "");
            Log.logInfo("BOOKMARKS", "links.get(url)");
            // … (snippet truncated)


                try {
                    u = new DigestURI(url);
                } catch (final MalformedURLException e) {
                    Log.logException(e);
                }
                ContentScraper scraper = null;
                if (u != null) try {
                    scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST);
                } catch (final IOException e) {
                    Log.logException(e);
                    // bad things are possible, e.g. the server responds with "403 Bad Behavior";
                    // that should not affect the robots.txt validity
                }
                if (scraper != null) {
                    // put the document title
                    prop.putXML("title", scraper.getTitle());

                    // put the favicon that belongs to the document
                    prop.put("favicon", (scraper.getFavicon() == null) ? "" : scraper.getFavicon().toString());

                    // put keywords
                    final String[] list = scraper.getKeywords();
                    int count = 0;
                    for (final String tag : list) {
                        if (!tag.isEmpty()) {
                            prop.putXML("tags_" + count + "_tag", tag);
                            count++;
                        }
                    }
                    prop.put("tags", count);
                    // put description
                    prop.putXML("desc", scraper.getDescription());
                    // put language
                    final Set<String> languages = scraper.getContentLanguages();
                    prop.putXML("lang", (languages == null || languages.isEmpty()) ? "unknown" : languages.iterator().next());

                    // get links and put them into a semicolon-separated list
                    final Set<MultiProtocolURI> uris = scraper.getAnchors().keySet();
                    final StringBuilder links = new StringBuilder(uris.size() * 80);
                    final StringBuilder filter = new StringBuilder(uris.size() * 40);
                    count = 0;
                    for (final MultiProtocolURI uri: uris) {
                        if (uri == null) continue;
                        // … (snippet truncated)

                        indexSegment.urlMetadata().remove(urlhash);
                        sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                        sb.crawlQueues.errorURL.remove(urlhash);

                        // get a scraper to get the title
                        final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
                        final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
                        final String description = scraper == null ? "" : scraper.getDescription(); // guard against null, consistent with the title above

                        // stack url
                        sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
                        final CrawlProfile pe = new CrawlProfile(
                                (crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(),
                                crawlingStartURL,
                                newcrawlingMustMatch,
                                newcrawlingMustNotMatch,
                                ipMustMatch,
                                ipMustNotMatch,
                                countryMustMatch,
                                newcrawlingdepth,
                                directDocByURL,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText, indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final String reasonString = sb.crawlStacker.stackCrawl(new Request(
                                sb.peers.mySeed().hash.getBytes(),
                                url,
                                null,
                                "CRAWLING-ROOT",
                                new Date(),
                                pe.handle(),
                                0,
                                0,
                                0,
                                0
                                ));

                        if (reasonString == null) {
                            // create a bookmark from crawl start url
                            //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
                            final Set<String> tags = ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
                            tags.add("crawlStart");
                            final String[] keywords = scraper.getKeywords();
                            if (keywords != null) {
                                for (final String k: keywords) {
                                    final String kk = BookmarkHelper.cleanTagsString(k);
                                    if (kk.length() > 0) tags.add(kk);
                                }
                            }
                            String tagStr = tags.toString();
                            if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 1); // strip only the surrounding brackets

                            // we will create always a bookmark to use this to track crawled hosts
                            final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
                            if (bookmark != null) {
                                bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
                                bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
                                bookmark.setOwner("admin");
                                bookmark.setPublic(false);
                                bookmark.setTags(tags, true);
                                sb.bookmarksDB.saveBookmark(bookmark);
                            }

                            // do the same for ymarks
                            // TODO: could a non admin user add crawls?
                            sb.tables.bookmarks.createBookmark(sb.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");

                            // liftoff!
                            prop.put("info", "8"); // start msg
                            prop.putHTML("info_crawlingURL", post.get("crawlingURL"));

                            // generate a YaCyNews if the global flag was set
                            if (!sb.isRobinsonMode() && crawlOrder) {
                                final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
                                m.remove("specificDepth");
                                m.remove("indexText");
                                m.remove("indexMedia");
                                m.remove("remoteIndexing");
                                m.remove("xsstopw");
                                m.remove("xpstopw");
                                m.remove("xdstopw");
                                m.remove("storeTXCache");
                                m.remove("storeHTCache");
                                m.remove("generalFilter");
                                m.remove("specificFilter");
                                m.put("intention", post.get("intention", "").replace(',', '/'));
                                sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), NewsPool.CATEGORY_CRAWL_START, m);
                            }
                        } else {
                            prop.put("info", "5"); //Crawling failed
                            prop.putHTML("info_crawlingURL", post.get("crawlingURL"));
                            prop.putHTML("info_reasonString", reasonString);

                            sb.crawlQueues.errorURL.push(
                                new Request(
                                        sb.peers.mySeed().hash.getBytes(),
                                        crawlingStartURL,
                                        null,
                                        "",
                                        new Date(),
                                        pe.handle(),
                                        0,
                                        0,
                                        0,
                                        0),
                                sb.peers.mySeed().hash.getBytes(),
                                new Date(),
                                1,
                                FailCategory.FINAL_LOAD_CONTEXT,
                                reasonString, -1);
                        }
                    } catch (final PatternSyntaxException e) {
                        prop.put("info", "4"); // crawl filter is not a valid regular expression
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_error", e.getMessage());
                    } catch (final Exception e) {
                        // generic failure
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", crawlingStart);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }

                } else if ("file".equals(crawlingMode)) {
                    if (post.containsKey("crawlingFile")) {
                        final String crawlingFileContent = post.get("crawlingFile$file", "");
                        try {
                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingMustMatch);
                            final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile));
                            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                            if (crawlingFile != null && crawlingFile.exists()) {
                                FileUtils.copy(new FileInputStream(crawlingFile), writer);
                            } else {
                                FileUtils.copy(crawlingFileContent, writer);
                            }
                            writer.close();

                            // get links and generate filter
                            final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                            if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                            final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
                            final CrawlProfile profile = new CrawlProfile(
                                    crawlingFileName,
                                    crawlURL,
                                    newcrawlingMustMatch,
                                    CrawlProfile.MATCH_NEVER_STRING,
                                    ipMustMatch,
                                    ipMustNotMatch,
                                    countryMustMatch,
                                    newcrawlingdepth,
                                    false,
                                    crawlingIfOlder,
                                    crawlingDomMaxPages,
                                    crawlingQ,
                                    indexText,
                                    indexMedia,
                                    storeHTCache,
                                    crawlOrder,
                                    xsstopw,
                                    xdstopw,
                                    xpstopw,
                                    cachePolicy);
                            sb.crawler.putActive(profile.handle().getBytes(), profile);
                            sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                            sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true);
                        } catch (final PatternSyntaxException e) {
                            prop.put("info", "4"); // crawl filter is not a valid regular expression
                            prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                            prop.putHTML("info_error", e.getMessage());
                        } catch (final Exception e) {
                            // generic failure
                            prop.put("info", "7"); // Error with file
                            prop.putHTML("info_crawlingStart", crawlingFileName);
                            prop.putHTML("info_error", e.getMessage());
                            Log.logException(e);
                        }
                        sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    }
                } else if ("sitemap".equals(crawlingMode)) {
                    final String sitemapURLStr = post.get("sitemapURL","");
                    try {
                        final DigestURI sitemapURL = new DigestURI(sitemapURLStr);
                        final CrawlProfile pe = new CrawlProfile(
                                sitemapURLStr,
                                sitemapURL,
                                CrawlProfile.MATCH_ALL_STRING,
                                CrawlProfile.MATCH_NEVER_STRING,
                                ipMustMatch,
                                ipMustNotMatch,
                                countryMustMatch,
                                0,
                                false,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                true,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
                        importer.start();
                    } catch (final Exception e) {
                        // generic failure
                        prop.put("info", "6"); // error with URL
                        prop.putHTML("info_crawlingStart", sitemapURLStr);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }
                } else if ("sitelist".equals(crawlingMode)) {
                    try {
                        final DigestURI sitelistURL = new DigestURI(crawlingStart);
                        // download document
                        final ContentScraper scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
                        // String title = scraper.getTitle();
                        // String description = scraper.getDescription();

                        // get links and generate filter
                        final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                        if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                        // put links onto crawl queue
                        final CrawlProfile profile = new CrawlProfile(
                                sitelistURL.getHost(),
                                // … (snippet truncated)
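
Note that this listing mixes two generations of the CrawlProfile constructor: the earlier snippets pass CrawlProfile.MATCH_ALL/MATCH_NEVER and a shorter argument list, while the later ones use MATCH_ALL_STRING/MATCH_NEVER_STRING and add ipMustMatch, ipMustNotMatch, countryMustMatch, and a directDocByURL flag. Which signature applies depends on the YaCy version in use.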
