Examples of net.yacy.document.parser.html.ContentScraper

net.yacy.document.parser.html.ContentScraper

            final String documentCharset,
            final InputStream sourceStream) throws Parser.Failure, InterruptedException {


        try {
            // first get a document from the parsed html
            final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream);
            final Document document = transformScraper(location, mimeType, documentCharset, scraper);


            return new Document[]{document};
        } catch (final IOException e) {
      throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);

View Full Code Here

        } catch (final UnsupportedCharsetException e) {
            c = Charset.defaultCharset();
        }


        // parsing the content
        final ContentScraper scraper = new ContentScraper(location);
        final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false);
        try {
            FileUtils.copy(sourceStream, writer, c);
        } catch (final IOException e) {
            throw new Parser.Failure("IO error:" + e.getMessage(), location);

View Full Code Here

    private static DevAndMainVersions allReleaseFrom(final yacyUpdateLocation location) {
        // retrieves the latest info about releases
        // this is done by contacting a release location,
        // parsing the content and filtering+parsing links
        // returns the version info if successful, null otherwise
        ContentScraper scraper;
        try {
            final DigestURI uri = location.getLocationURL();
            Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
            scraper = Switchboard.getSwitchboard().loader.parseResource(uri, CacheStrategy.NOCACHE);
        } catch (final IOException e) {
            return null;
        }


        // analyze links in scraper resource, and find link to latest release in it
        final Map<MultiProtocolURI, Properties> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
        final TreeSet<yacyRelease> mainReleases = new TreeSet<yacyRelease>();
        final TreeSet<yacyRelease> devReleases = new TreeSet<yacyRelease>();
        for (final MultiProtocolURI url : anchors.keySet()) {
            try {
                final yacyRelease release = new yacyRelease(url, location.getPublicKey());

View Full Code Here

        MultiProtocolURI url;
        Bookmark bm;
        final Set<String> tags=ListManager.string2set(tag); //this allow multiple default tags
        try {
            //load the links
            final ContentScraper scraper = new ContentScraper(baseURL);
            //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
            final Writer writer= new TransformerWriter(null,null,scraper, null, false);
            FileUtils.copy(input,writer);
            writer.close();
            links = scraper.getAnchors();
        } catch (final IOException e) { Log.logWarning("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());}
        for (final Entry<MultiProtocolURI, Properties> link: links.entrySet()) {
            url = link.getKey();
            title = link.getValue().getProperty("name", "");
            Log.logInfo("BOOKMARKS", "links.get(url)");

View Full Code Here

            addSolr(solrdoc, "lat_coordinate", yacydoc.lat());
        }
        addSolr(solrdoc, "httpstatus_i", 200);
        final Object parser = yacydoc.getParserObject();
        if (parser instanceof ContentScraper) {
            final ContentScraper html = (ContentScraper) parser;


            // header tags
            int h = 0;
            int f = 1;
            for (int i = 1; i <= 6; i++) {
                final String[] hs = html.getHeadlines(i);
                h = h | (hs.length > 0 ? f : 0);
                f = f * 2;
                addSolr(solrdoc, "attr_h" + i, hs);
            }
            addSolr(solrdoc, "htags_i", h);


            // canonical tag
            if (html.getCanonical() != null) addSolr(solrdoc, "canonical_s", html.getCanonical().toNormalform(false, false));


            // noindex and nofollow attributes
            // from HTML (meta-tag in HTML header: robots)
            // and HTTP header (x-robots property)
            // coded as binary value:
            // bit  0: "all" contained in html header meta
            // bit  1: "index" contained in html header meta
            // bit  2: "noindex" contained in html header meta
            // bit  3: "nofollow" contained in html header meta
            // bit  8: "noarchive" contained in http header properties
            // bit  9: "nosnippet" contained in http header properties
            // bit 10: "noindex" contained in http header properties
            // bit 11: "nofollow" contained in http header properties
            // bit 12: "unavailable_after" contained in http header properties
            int b = 0;
            final String robots_meta = html.getMetas().get("robots");
            // this tag may have values: all, index, noindex, nofollow
            if (robots_meta != null) {
                if (robots_meta.indexOf("all",0) >= 0) b += 1;      // set bit 0
                if (robots_meta.indexOf("index",0) == 0 || robots_meta.indexOf(" index",0) >= 0 || robots_meta.indexOf(",index",0) >= 0 ) b += 2; // set bit 1
                if (robots_meta.indexOf("noindex",0) >= 0) b += 4;  // set bit 2
                if (robots_meta.indexOf("nofollow",0) >= 0) b += 8; // set bit 3
            }
            String x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, "");
            if (x_robots_tag.length() == 0) x_robots_tag = header.get(HeaderFramework.X_ROBOTS, "");
            // this tag may have values: noarchive, nosnippet, noindex, nofollow, unavailable_after
            if (x_robots_tag.length() > 0) {
                if (x_robots_tag.indexOf("noarchive",0) >= 0) b += 256;         // set bit 8
                if (x_robots_tag.indexOf("nosnippet",0) >= 0) b += 512;         // set bit 9
                if (x_robots_tag.indexOf("noindex",0) >= 0) b += 1024;          // set bit 10
                if (x_robots_tag.indexOf("nofollow",0) >= 0) b += 2048;         // set bit 11
                if (x_robots_tag.indexOf("unavailable_after",0) >=0) b += 4096; // set bit 12
            }
            addSolr(solrdoc, "robots_i", b);


            // meta tags: generator
            final String generator = html.getMetas().get("generator");
            if (generator != null) addSolr(solrdoc, "metagenerator_t", generator);


            // bold, italic
            final String[] bold = html.getBold();
            addSolr(solrdoc, "boldcount_i", bold.length);
            if (bold.length > 0) {
                addSolr(solrdoc, "attr_bold", bold);
                if (isEmpty() || contains("attr_boldcount")) {
                    addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold));
                }
            }
            final String[] italic = html.getItalic();
            addSolr(solrdoc, "italiccount_i", italic.length);
            if (italic.length > 0) {
                addSolr(solrdoc, "attr_italic", italic);
                if (isEmpty() || contains("attr_italiccount")) {
                    addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic));
                }
            }
            final String[] li = html.getLi();
            addSolr(solrdoc, "licount_i", li.length);
            if (li.length > 0) addSolr(solrdoc, "attr_li", li);


            // images
            final Collection<ImageEntry> imagesc = html.getImages().values();
            final String[] imgtags  = new String[imagesc.size()];
            final String[] imgprots = new String[imagesc.size()];
            final String[] imgstubs = new String[imagesc.size()];
            final String[] imgalts  = new String[imagesc.size()];
            c = 0;
            for (final ImageEntry ie: imagesc) {
                final MultiProtocolURI uri = ie.url();
                imgtags[c] = ie.toString();
                imgprots[c] = uri.getProtocol();
                imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3);
                imgalts[c] = ie.alt();
                c++;
            }
            addSolr(solrdoc, "imagescount_i", imgtags.length);
            if (isEmpty() || contains("attr_images_tag")) addSolr(solrdoc, "attr_images_tag", imgtags);
            if (isEmpty() || contains("attr_images_protocol")) addSolr(solrdoc, "attr_images_protocol", imgprots);
            if (isEmpty() || contains("attr_images_urlstub")) addSolr(solrdoc, "attr_images_urlstub", imgstubs);
            if (isEmpty() || contains("attr_images_alt")) addSolr(solrdoc, "attr_images_alt", imgalts);


            // style sheets
            if (isEmpty() || contains("attr_css")) {
                final Map<MultiProtocolURI, String> csss = html.getCSS();
                final String[] css_tag = new String[csss.size()];
                final String[] css_url = new String[csss.size()];
                c = 0;
                for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
                    final String url = entry.getKey().toNormalform(false, false, false, false);
                    css_tag[c] =
                        "<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
                        " href=\""+ url + "\" />";
                    css_url[c] = url;
                    c++;
                }
                addSolr(solrdoc, "csscount_i", css_tag.length);
                if (css_tag.length > 0) addSolr(solrdoc, "attr_css_tag", css_tag);
                if (css_url.length > 0) addSolr(solrdoc, "attr_css_url", css_url);
            }


            // Scripts
            if (isEmpty() || contains("attr_scripts")) {
                final Set<MultiProtocolURI> scriptss = html.getScript();
                final String[] scripts = new String[scriptss.size()];
                c = 0;
                for (final MultiProtocolURI url: scriptss) {
                    scripts[c++] = url.toNormalform(false, false, false, false);
                }
                addSolr(solrdoc, "scriptscount_i", scripts.length);
                if (scripts.length > 0) addSolr(solrdoc, "attr_scripts", scripts);
            }


            // Frames
            if (isEmpty() || contains("attr_frames")) {
                final Set<MultiProtocolURI> framess = html.getFrames();
                final String[] frames = new String[framess.size()];
                c = 0;
                for (final MultiProtocolURI entry: framess) {
                    frames[c++] = entry.toNormalform(false, false, false, false);
                }
                addSolr(solrdoc, "framesscount_i", frames.length);
                if (frames.length > 0) addSolr(solrdoc, "attr_frames", frames);
            }


            // IFrames
            if (isEmpty() || contains("attr_iframes")) {
                final Set<MultiProtocolURI> iframess = html.getIFrames();
                final String[] iframes = new String[iframess.size()];
                c = 0;
                for (final MultiProtocolURI entry: iframess) {
                    iframes[c++] = entry.toNormalform(false, false, false, false);
                }
                addSolr(solrdoc, "iframesscount_i", iframes.length);
                if (iframes.length > 0) addSolr(solrdoc, "attr_iframes", iframes);
            }


            // flash embedded
            addSolr(solrdoc, "flash_b", html.containsFlash());


            // generic evaluation pattern
            for (final String model: html.getEvaluationModelNames()) {
                if (isEmpty() || contains("attr_" + model)) {
                    final String[] scorenames = html.getEvaluationModelScoreNames(model);
                    if (scorenames.length > 0) {
                        addSolr(solrdoc, "attr_" + model, scorenames);
                        addSolr(solrdoc, "attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames));
                    }
                }
            }


            // response time

View Full Code Here

0 1

TOP

Related Classes of net.yacy.document.parser.html.ContentScraper

Crawler_p

de.anomic.data.BookmarkHelper

de.anomic.http.server.HTTPDFileHandler

de.anomic.yacy.yacyRelease

getpageinfo_p

net.yacy.cora.document.MultiProtocolURI

net.yacy.cora.services.federated.solr.SolrScheme

net.yacy.document.parser.htmlParser

net.yacy.kelondro.io.CharBuffer

net.yacy.peers.operation.yacyRelease

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle Inc. Contact: coftware#gmail.com.