Package net.yacy.document.parser.html

Examples of net.yacy.document.parser.html.TransformerWriter

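All of the examples share the same pattern: a ContentScraper is created for the document's base URI, a TransformerWriter is wrapped around it with no output stream, no charset and no transformer, the HTML source is pumped through the writer, and once the writer is closed the extracted anchors are read back from the scraper. The sketch below condenses that pattern into a standalone program; the import paths, the example URL and the direct use of MultiProtocolURI with the ContentScraper constructor are assumptions based on the YaCy code base of this era and may need adjustment for other versions.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.Writer;
import java.util.Map;
import java.util.Properties;

import net.yacy.cora.document.MultiProtocolURI;        // assumed package path
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils;               // assumed package path

public class TransformerWriterExample {

    public static void main(final String[] args) throws IOException {
        // base URI the scraper uses to resolve relative links (hypothetical URL)
        final MultiProtocolURI base = new MultiProtocolURI("http://example.org/index.html");
        final ContentScraper scraper = new ContentScraper(base);

        // no output stream, no charset, no transformer: the writer only feeds the scraper
        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
        try {
            // args[0]: path to a local HTML file
            FileUtils.copy(new FileInputStream(args[0]), writer);
        } finally {
            writer.close();
        }

        // the scraper now holds the hyperlinks found in the document
        final Map<MultiProtocolURI, Properties> anchors = scraper.getAnchors();
        for (final Map.Entry<MultiProtocolURI, Properties> anchor : anchors.entrySet()) {
            System.out.println(anchor.getKey() + " -> " + anchor.getValue().getProperty("name", ""));
        }
    }
}

In every excerpt below the output stream and the transformer are null, so the writer acts purely as a sink that feeds the scraper; nothing is written back out.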

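The first excerpt feeds an uploaded crawl-start file (or, if the file does not exist locally, the posted file content) through a TransformerWriter into a ContentScraper, then uses the extracted anchors to build the crawl filter.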
                        final String crawlingFileContent = post.get("crawlingFile$file", "");
                        try {
                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingMustMatch);
                            final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile));
                            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                            if (crawlingFile != null && crawlingFile.exists()) {
                                FileUtils.copy(new FileInputStream(crawlingFile), writer);
                            } else {
                                FileUtils.copy(crawlingFileContent, writer);
                            }
                            writer.close();

                            // get links and generate filter
                            final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                            if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());


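The second excerpt is the core of an HTML parser method: the source stream is copied into the writer with an explicit charset, binarySuspect() is used to reject resources that look like binary data, and the filled scraper is returned to the caller.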
            c = Charset.defaultCharset();
        }

        // parsing the content
        final ContentScraper scraper = new ContentScraper(location);
        final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
        try {
            FileUtils.copy(sourceStream, writer, c);
            writer.close();
        } catch (IOException e) {
            throw new Parser.Failure("IO error:" + e.getMessage(), location);
        }
        //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);           
        //serverFileUtils.copy(sourceFile, hfos);
        //hfos.close();
        if (writer.binarySuspect()) {
            final String errorMsg = "Binary data found in resource";
            throw new Parser.Failure(errorMsg, location);   
        }
        return scraper;
    }

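The third excerpt comes from bookmark handling (note the BOOKMARKS log category): the anchors collected by the scraper are iterated, and each link's URL and its "name" property are read.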
        final Set<String> tags = ListManager.string2set(tag); // this allows multiple default tags
        try {
            // load the links
            final ContentScraper scraper = new ContentScraper(baseURL);
            //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
            FileUtils.copy(input, writer);
            writer.close();
            links = scraper.getAnchors();
        } catch (final IOException e) {
            Log.logWarning("BOOKMARKS", "error during load of links: " + e.getClass() + " " + e.getMessage());
        }
        for (final Entry<MultiProtocolURI, Properties> link: links.entrySet()) {
            url = link.getKey();
            title = link.getValue().getProperty("name", "");

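The last excerpt is a variant of the parser code shown above; it differs only in that the source stream and the writer are closed in a finally block rather than after a successful copy.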
            c = Charset.defaultCharset();
        }

        // parsing the content
        final ContentScraper scraper = new ContentScraper(location);
        final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
        try {
            FileUtils.copy(sourceStream, writer, c);
        } catch (final IOException e) {
            throw new Parser.Failure("IO error:" + e.getMessage(), location);
        } finally {
            sourceStream.close();
            writer.close();
        }
        //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
        //serverFileUtils.copy(sourceFile, hfos);
        //hfos.close();
        if (writer.binarySuspect()) {
            final String errorMsg = "Binary data found in resource";
            throw new Parser.Failure(errorMsg, location);
        }
        return scraper;
    }
