Package bixo.parser

Examples of bixo.parser.SimpleParser


        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);
       
        // Take content and split it into content output plus parse to extract URLs.
        SimpleParser parser;
        if (options.isUseBoilerpipe()) {
            parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
        } else if (options.isGenerateHTML()) {
            parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
        } else {
            parser = new SimpleParser();
        }
       
        parser.setExtractLanguage(false);
        ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

       
        // Create the output map that connects each tail pipe to the appropriate sink, and the
        // list of tail pipes.
View Full Code Here


        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(1, policy, new FirefoxUserAgent());
        fetcher.setMaxRetryCount(options.getMaxRetries());
       
        // Give a long timeout for parsing
        ParserPolicy parserPolicy = new ParserPolicy(MAX_PARSE_DURATION);
        SimpleParser parser = new SimpleParser(parserPolicy);

        SimpleParser rawParser = new SimpleParser(parserPolicy, true);
       
        // Create Boilerpipe content extractor
        SimpleParser bpParser = new SimpleParser(new BoilerpipeContentExtractor(), new NullLinkExtractor(), parserPolicy);
       
        if (options.isTraceLogging()) {
            Logger.getRootLogger().setLevel(Level.TRACE);
            System.setProperty("bixo.root.level", "TRACE");
        }
       
        String urls[] = options.getUrls() == null ? null : options.getUrls().split(",");
        boolean interactive = (urls == null);
        int index = 0;
       
        while (interactive || (index < urls.length)) {
          String url;
         
          try {
              if (interactive) {
                System.out.print("URL to fetch: ");
                url = readInputLine();
                if (url.length() == 0) {
                  System.exit(0);
                }
              } else {
                url = args[index++];
              }

              System.out.println("Fetching " + url);
            FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
            System.out.println(String.format("Fetched %s: headers = %s", result.getUrl(), result.getHeaders()));
            System.out.flush();
           
            // System.out.println("Result = " + result.toString());
            ParsedDatum parsed = parser.parse(result);
            System.out.println(String.format("Parsed %s: lang = %s, size = %d", parsed.getUrl(),
                            parsed.getLanguage(), parsed.getParsedText().length()));
           
            ParsedDatum bpParsed = bpParser.parse(result);
            ParsedDatum rawParsed = rawParser.parse(result);
           
            if (interactive) {
                while (true) {
                    System.out.print("Next action - (d)ump regular, dump (b)oilerpipe, dump (r)aw, (e)xit: ");
View Full Code Here

        BixoPlatform platform = new BixoPlatform(ParsePipeTest.class, Platform.Local);
       

        Pipe pipe = new Pipe("parse_source");
        ParsePipe parserPipe = new ParsePipe(pipe, new SimpleParser());
        BasePath inputPath = platform.makePath("build/test/ParserPipeTest/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), inputPath);
        BasePath outputPath = platform.makePath("build/test/ParserPipeTest/out");
        Tap out = platform.makeTap(platform.makeBinaryScheme(ParsedDatum.FIELDS), outputPath, SinkMode.REPLACE);
View Full Code Here

        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);

        // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
        SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
        ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);
       
        Pipe analyzerPipe = new Pipe("analyzer pipe");
        analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());
       
View Full Code Here

            super.cleanup(flowProcess, operationCall);
        }
    }

    /**
     * Creates a ParsePipe for the given pipe by delegating to the two-argument
     * constructor with a newly created, default-constructed {@link SimpleParser}.
     *
     * @param fetcherPipe the pipe forwarded as the first argument of the two-argument constructor
     */
    public ParsePipe(Pipe fetcherPipe) {
        this(fetcherPipe, new SimpleParser());
    }
View Full Code Here

TOP

Related Classes of bixo.parser.SimpleParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.