Package net.yacy.document.parser.html

Examples of net.yacy.document.parser.html.TransformerWriter

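All of the examples share the same pattern: a ContentScraper is created for the document's base URI, a TransformerWriter is wrapped around it with no output stream, no charset and no transformer, the HTML source is pumped through the writer, and once the writer is closed the extracted anchors are read back from the scraper. The sketch below condenses that pattern into a standalone program; the import paths, the example URL and the direct use of MultiProtocolURI with the ContentScraper constructor are assumptions based on the YaCy code base of this era and may need adjustment for other versions.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.Writer;
import java.util.Map;
import java.util.Properties;

import net.yacy.cora.document.MultiProtocolURI;        // assumed package path
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils;               // assumed package path

public class TransformerWriterExample {

    public static void main(final String[] args) throws IOException {
        // base URI the scraper uses to resolve relative links (hypothetical URL)
        final MultiProtocolURI base = new MultiProtocolURI("http://example.org/index.html");
        final ContentScraper scraper = new ContentScraper(base);

        // no output stream, no charset, no transformer: the writer only feeds the scraper
        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
        try {
            // args[0]: path to a local HTML file
            FileUtils.copy(new FileInputStream(args[0]), writer);
        } finally {
            writer.close();
        }

        // the scraper now holds the hyperlinks found in the document
        final Map<MultiProtocolURI, Properties> anchors = scraper.getAnchors();
        for (final Map.Entry<MultiProtocolURI, Properties> anchor : anchors.entrySet()) {
            System.out.println(anchor.getKey() + " -> " + anchor.getValue().getProperty("name", ""));
        }
    }
}

In every excerpt below the output stream and the transformer are null, so the writer acts purely as a sink that feeds the scraper; nothing is written back out.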

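The first excerpt feeds an uploaded crawl-start file (or, if the file does not exist locally, the posted file content) through a TransformerWriter into a ContentScraper, then uses the extracted anchors to build the crawl filter.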
                        final String crawlingFileContent = post.get("crawlingFile$file", "");
                        try {
                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingMustMatch);
                            final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile));
                            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                            if (crawlingFile != null && crawlingFile.exists()) {
                                FileUtils.copy(new FileInputStream(crawlingFile), writer);
                            } else {
                                FileUtils.copy(crawlingFileContent, writer);
                            }
                            writer.close();

                            // get links and generate filter
                            final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                            if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());


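The second excerpt is the core of an HTML parser method: the source stream is copied into the writer with an explicit charset, binarySuspect() is used to reject resources that look like binary data, and the filled scraper is returned to the caller.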
            c = Charset.defaultCharset();
        }

        // parsing the content
        final ContentScraper scraper = new ContentScraper(location);
        final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
        try {
            FileUtils.copy(sourceStream, writer, c);
            writer.close();
        } catch (IOException e) {
            throw new Parser.Failure("IO error:" + e.getMessage(), location);
        }
        //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);           
        //serverFileUtils.copy(sourceFile, hfos);
        //hfos.close();
        if (writer.binarySuspect()) {
            final String errorMsg = "Binary data found in resource";
            throw new Parser.Failure(errorMsg, location);   
        }
        return scraper;
    }

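The third excerpt comes from bookmark handling (note the BOOKMARKS log category): the anchors collected by the scraper are iterated, and each link's URL and its "name" property are read.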
        final Set<String> tags = ListManager.string2set(tag); // this allows multiple default tags
        try {
            // load the links
            final ContentScraper scraper = new ContentScraper(baseURL);
            //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
            FileUtils.copy(input, writer);
            writer.close();
            links = scraper.getAnchors();
        } catch (final IOException e) {
            Log.logWarning("BOOKMARKS", "error during load of links: " + e.getClass() + " " + e.getMessage());
        }
        for (final Entry<MultiProtocolURI, Properties> link: links.entrySet()) {
            url = link.getKey();
            title = link.getValue().getProperty("name", "");

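The last excerpt is a variant of the parser code shown above; it differs only in that the source stream and the writer are closed in a finally block rather than after a successful copy.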
            c = Charset.defaultCharset();
        }

        // parsing the content
        final ContentScraper scraper = new ContentScraper(location);
        final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
        try {
            FileUtils.copy(sourceStream, writer, c);
        } catch (final IOException e) {
            throw new Parser.Failure("IO error:" + e.getMessage(), location);
        } finally {
            sourceStream.close();
            writer.close();
        }
        //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
        //serverFileUtils.copy(sourceFile, hfos);
        //hfos.close();
        if (writer.binarySuspect()) {
            final String errorMsg = "Binary data found in resource";
            throw new Parser.Failure(errorMsg, location);
        }
        return scraper;
    }
