Package cascading.tuple

Examples of cascading.tuple.Fields
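
These snippets appear to come from the Bixo web-crawler codebase. They show the two main roles of cascading.tuple.Fields: naming the argument fields an operation reads (e.g. new Fields("line")), and selecting what flows downstream (Fields.RESULTS keeps only an operation's declared results; Fields.REPLACE swaps the results in over the argument fields; Fields.ALL keeps everything). A minimal orientation sketch, independent of any of the code below:

    import cascading.tuple.Fields;

    // Declare named fields; positions follow declaration order.
    Fields parsed = new Fields("url", "parsed_text");

    // Select a grouping key by name.
    Fields groupKey = new Fields("url");

    // Append another declaration; duplicate field names fail fast.
    Fields withScore = parsed.append(new Fields("score"));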


    public void importUrls(boolean debug) throws Exception {


        try {
            Tap urlSource = _platform.makeTap(_platform.makeTextScheme(), _inputFilePath);
            Pipe importPipe = new Each("url importer", new Fields("line"), new CreateUrlFromTextFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));

            Tap urlSink = _platform.makeTap(_platform.makeBinaryScheme(CrawlDbDatum.FIELDS), _destDirPath, SinkMode.REPLACE);

            FlowConnector flowConnector = _platform.makeFlowConnector();
            Flow flow = flowConnector.connect(urlSource, urlSink, importPipe);


        BasePath resultsPath = platform.makePath(workingDirPath, "results");
        Tap resultSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath, SinkMode.REPLACE);

        Pipe resultsPipe = new GroupBy("results pipe", Pipe.pipes(fetchedPipe, unfetchedPipe),
                        new Fields(UrlDatum.URL_FN));
        resultsPipe = new Every(resultsPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);


        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(sources, resultSink, resultsPipe);
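
In the snippet above, the Every applies LatestUrlDatumBuffer to each URL group, and Fields.RESULTS keeps only the fields the buffer declares and emits. For orientation, here is a minimal Buffer in the same shape; the class name, output field, and keep-the-last logic are illustrative stand-ins, not Bixo's implementation:

    import java.util.Iterator;

    import cascading.flow.FlowProcess;
    import cascading.operation.BaseOperation;
    import cascading.operation.Buffer;
    import cascading.operation.BufferCall;
    import cascading.tuple.Fields;
    import cascading.tuple.Tuple;
    import cascading.tuple.TupleEntry;

    // Hypothetical buffer: emit one tuple per group, keeping the last entry seen.
    @SuppressWarnings({"serial", "rawtypes"})
    public class KeepLastBuffer extends BaseOperation<Void> implements Buffer<Void> {

        public KeepLastBuffer() {
            // Declare the single output field this buffer emits per group.
            super(new Fields("url"));
        }

        @Override
        public void operate(FlowProcess flowProcess, BufferCall<Void> bufferCall) {
            Iterator<TupleEntry> iter = bufferCall.getArgumentsIterator();
            TupleEntry last = null;
            while (iter.hasNext()) {
                last = iter.next(); // Cascading reuses this entry; it holds the final values after the loop.
            }
            if (last != null) {
                bufferCall.getOutputCollector().add(new Tuple(last.getString("url")));
            }
        }
    }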

        List<Pipe> tailPipes = new ArrayList<Pipe>();
       
        if (options.isGenerateHTML()) {
            // Let's write out the parse as text:
            Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
            textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN), new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true), Fields.REPLACE);
            textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN), new Identity());
            BasePath textParsePath = platform.makePath(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
            Tap textParseTap = platform.makeTap(platform.makeTextScheme(), textParsePath, SinkMode.REPLACE);
            sinkMap.put(textParsePipe.getName(), textParseTap);
            tailPipes.add(textParsePipe);
        }
       
        // Let's output a WritableSequenceFile as an example - this file can
        // then be used as input when working with Mahout.
        // For now we only do this when running in Hadoop mode.
        Tap writableSeqFileSink = null;
        Pipe writableSeqFileDataPipe = null;
        if (!options.isLocalPlatformMode()) {
            writableSeqFileDataPipe = new Pipe("writable seqfile data", new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));
            BasePath writableSeqFileDataPath = platform.makePath(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
            WritableSequenceFile writableSeqScheme = new WritableSequenceFile(new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN), Text.class, Text.class);
            writableSeqFileSink = platform.makeTap(writableSeqScheme, writableSeqFileDataPath, SinkMode.REPLACE);
        }
       
        Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
        if (urlFilter != null) {
            urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
        }
       
        urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

        // Take the status stream and extract URLs from it.
        Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
        urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
        urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

        // Finally, join the URLs we get from parsing content with the URLs we got
        // from the status output, and the URLs we didn't process from the db, so that
        // we have a unified stream of all known URLs for the crawldb.
        Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
        finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

        // NOTE: Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
        // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
        // The reason this isn't done here is that we share LatestUrlDatumBuffer() with JDBCCrawlTool.
        Pipe crawlDbPipe = new GroupBy("crawldb pipe", Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
                        new Fields(UrlDatum.URL_FN));
        crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);
       
        Pipe outputPipe = new Pipe("output pipe");
        outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());
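
For reference, here's a sketch of the CoGroup alternative the NOTE mentions. The join key comes from the snippet; the declared output names and the choice of joiner are assumptions (CoGroup needs declared field names that cover and disambiguate every incoming field):

    // Illustrative only - the declared names and OuterJoin are assumptions.
    Fields urlKey = new Fields(UrlDatum.URL_FN);
    Fields declared = new Fields("fetch_url", "fetch_status", "outlink_url", "outlink_score");
    Pipe joined = new CoGroup(urlFromFetchPipe, urlKey,
                              urlFromOutlinksPipe, urlKey,
                              declared, new OuterJoin());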
       

                analyzedDatum = new AnalyzedDatum(analyzedTuple);
                url = analyzedDatum.getUrl();
            }

            // We could have either a status + link tuple entry, or just a link tuple entry.
            if (entry.getString(new Fields(LinkDatum.URL_FN)) != null) {
                LinkDatum linkDatum = new LinkDatum(TupleEntry.select(LinkDatum.FIELDS, entry));
               
                pageScore = linkDatum.getPageScore();
                // Add up the link scores
                linkScore += linkDatum.getLinkScore();
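
The select call above pulls a subset of fields out of a TupleEntry by name, in field order. A self-contained sketch of that mechanism (the field names and values are invented for illustration):

    import cascading.tuple.Fields;
    import cascading.tuple.Tuple;
    import cascading.tuple.TupleEntry;

    // Hypothetical entry with three fields; select two of them by name.
    TupleEntry entry = new TupleEntry(new Fields("url", "score", "status"),
                                      new Tuple("http://example.com", 0.5, "FETCHED"));
    Tuple selected = TupleEntry.select(new Fields("url", "score"), entry);
    // selected => ["http://example.com", 0.5]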

            BasePath inputPath = platform.makePath(inputFileName);
            Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath);
           
            // Create the sub-assembly that runs the fetch job
            UserAgent userAgent = new UserAgent(options.getAgentName(), EMAIL_ADDRESS, WEB_ADDRESS);
            Pipe importPipe = new Each("url importer", new Fields("line"), new LoadUrlFunction());
           
            BaseScoreGenerator scorer = new FixedScoreGenerator();
           
            BaseFetcher fetcher = new SimpleHttpFetcher(MAX_THREADS, userAgent);
            FetchPipe fetchPagePipe = new FetchPipe(importPipe, scorer, fetcher, NUM_REDUCERS);
           
            // Here's the pipe that will output UrlDatum tuples, by extracting URLs from the mod_mbox-generated page.
            Pipe mboxPagePipe = new Each(fetchPagePipe.getContentTailPipe(), new ParseModMboxPageFunction(), Fields.RESULTS);

            // Create a named pipe for the status of the mod_mbox-generated pages.
            Pipe mboxPageStatusPipe = new Pipe(MBOX_PAGE_STATUS_PIPE_NAME, fetchPagePipe.getStatusTailPipe());

            // Set up an appropriate FetcherPolicy, where we increase the max content size (since
            // mailbox files can be big, e.g. 4MB).
            FetcherPolicy defaultPolicy = new FetcherPolicy();
            defaultPolicy.setMaxContentSize(MAX_CONTENT_SIZE);
            fetcher = new SimpleHttpFetcher(MAX_THREADS, defaultPolicy, userAgent);
           
            // We can create the fetch pipe, and set up our Mbox splitter to run on content.
            FetchPipe fetchMboxPipe = new FetchPipe(mboxPagePipe, scorer, fetcher, NUM_REDUCERS);
            SplitEmails splitterPipe = new SplitEmails(fetchMboxPipe);
           
            // Now create the pipe that's going to analyze the emails we get after splitting them up.
            Pipe analysisPipe = new Pipe(ANALYZER_PIPE_NAME, splitterPipe.getTails()[0]);
            analysisPipe = new Each(analysisPipe, new ParseEmailFunction());
           
            // We'll get output that has ANALYZED_EMAIL_FIELDS in it. We want to group by
            // the message-id field, and then aggregate the scores within each group.
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.MESSAGE_ID));
            analysisPipe = new Every(analysisPipe, new CalcMessageScoreBuffer(), Fields.RESULTS);

            // Now we want to sum the scores for each user, which is another grouping/summing.
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.EMAIL_ADDRESS));
            analysisPipe = new Every(analysisPipe, new SumScoresBuffer(), Fields.RESULTS);
           
            // Let's filter out anybody with an uninteresting score.
            ExpressionFilter filter = new ExpressionFilter(String.format("%s <= 0.0", FieldNames.SUMMED_SCORE), Double.class);
            analysisPipe = new Each(analysisPipe, filter);
           
            // And let's sort in reverse order (high to low score)
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.SUMMED_SCORE), true);

            // Create the sink taps
            BasePath outputPath = platform.makePath(outputDirName);
            Tap pageStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "page-status"), SinkMode.REPLACE);
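
One note on the reverse-order GroupBy above: the boolean flag reverses the grouping order, which is what produces the high-to-low listing. When you want to keep one grouping key but order the tuples within each group, GroupBy also takes a separate sort-fields argument; a hypothetical variant (this pairing of keys is illustrative, not what the tool does):

    // Hypothetical: group by email address, sort within each group by score, descending.
    analysisPipe = new GroupBy(analysisPipe,
                               new Fields(FieldNames.EMAIL_ADDRESS),  // grouping key
                               new Fields(FieldNames.SUMMED_SCORE),   // in-group sort key
                               true);                                 // reverse sort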

@SuppressWarnings({"serial", "rawtypes"})
public class CreateWritableSeqFileData extends BaseOperation<NullContext> implements Function<NullContext> {
    private static final Logger LOGGER = LoggerFactory.getLogger(CreateWritableSeqFileData.class);

    public CreateWritableSeqFileData() {
        super(new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN));
    }
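
The constructor above declares the function's output fields by handing a Fields instance to BaseOperation. A minimal, self-contained sketch of the same pattern - the class name, output field, and uppercasing logic are invented for illustration:

    import cascading.flow.FlowProcess;
    import cascading.operation.BaseOperation;
    import cascading.operation.Function;
    import cascading.operation.FunctionCall;
    import cascading.tuple.Fields;
    import cascading.tuple.Tuple;

    @SuppressWarnings({"serial", "rawtypes"})
    public class UppercaseLineFunction extends BaseOperation<Void> implements Function<Void> {

        public UppercaseLineFunction() {
            // Declare the single output field this function emits.
            super(new Fields("upper_line"));
        }

        @Override
        public void operate(FlowProcess flowProcess, FunctionCall<Void> functionCall) {
            String line = functionCall.getArguments().getString(0);
            functionCall.getOutputCollector().add(new Tuple(line.toUpperCase()));
        }
    }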

            pipe = new Each(pipe, new UrlLengthener(fetcher));
            pipe = new Each(pipe, new Debug());

            BixoPlatform platform = new BixoPlatform(LengthenUrlsTool.class, Platform.Local);
            BasePath filePath = platform.makePath(filename);
            TextLine textLineLocalScheme = new TextLine(new Fields("url"));
            Tap sourceTap = platform.makeTap(textLineLocalScheme, filePath, SinkMode.KEEP);
            SinkTap sinkTap = new NullSinkTap(new Fields("url"));
           
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(sourceTap, sinkTap, pipe);

            flow.complete();

            BixoPlatform platform = new BixoPlatform(RunFakeFetchPipe.class, Platform.Local);
           
            BasePath inputPath = platform.makePath(path.getFile());
            Tap in = platform.makeTap(platform.makeTextScheme(), inputPath);

            Pipe importPipe = new Each("url importer", new Fields("line"), new CreateUrlFunction());

            BaseScoreGenerator scorer = new FixedScoreGenerator();
            BaseFetcher fetcher = new FakeHttpFetcher(true, 10);
            FetchPipe fetchPipe = new FetchPipe(importPipe, scorer, fetcher, 1);

@SuppressWarnings("serial")
public class CreateResultsFunction extends BaseOperation<NullContext> implements Function<NullContext> {
    private static final Logger LOGGER = LoggerFactory.getLogger(CreateResultsFunction.class);

    public CreateResultsFunction() {
        super(new Fields("line"));
    }

            Pipe pipe = new Pipe("Email Analyzer");
            pipe = new Each(pipe, new ParseEmailFunction());
           
            // We'll get output that has ANALYZED_EMAIL_FIELDS in it. We want to group by
            // the message-id field, and then aggregate the scores within each group.
            pipe = new GroupBy(pipe, new Fields(FieldNames.MESSAGE_ID));
            pipe = new Every(pipe, new CalcMessageScoreBuffer(), Fields.RESULTS);

            // Now we want to sum the scores for each user, which is another grouping/summing.
            pipe = new GroupBy(pipe, new Fields(FieldNames.EMAIL_ADDRESS));
            pipe = new Every(pipe, new SumScoresBuffer(), Fields.RESULTS);
           
            // Let's filter out anybody with an uninteresting score.
            ExpressionFilter filter = new ExpressionFilter(String.format("%s <= 0.0", FieldNames.SUMMED_SCORE), Double.class);
            pipe = new Each(pipe, filter);
           
            // And let's sort in reverse order (high to low score)
            pipe = new GroupBy(pipe, new Fields(FieldNames.SUMMED_SCORE), true);

            // Create the output (sink tap)
            Tap sinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputDirName), SinkMode.REPLACE);
           
