Package cascading.tap

Examples of cascading.tap.Tap
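A Tap binds a Cascading pipe assembly to a concrete data source or sink: it pairs a Scheme (how records are read and written) with a resource path, and is handed to a FlowConnector when the Flow is wired up. The snippets below, mostly from the Bixo crawler and related projects, show Taps being created, written to, read from, and connected to flows. As a minimal, self-contained sketch of the basic pattern (class name and paths are placeholders; it uses the stock Hfs tap and TextLine scheme rather than anything from the snippets):

import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.Pipe;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;

public class TapCopyExample {
    public static void main(String[] args) {
        // A source tap: a scheme (how to read/write records) plus a path (where they live).
        Tap in = new Hfs(new TextLine(), "input/urls.txt");

        // A sink tap: same idea, with REPLACE so re-runs overwrite any old output.
        Tap out = new Hfs(new TextLine(), "output/urls-copy", SinkMode.REPLACE);

        // A pass-through pipe assembly; the taps are bound to it when the flow is connected.
        Pipe copy = new Pipe("copy");

        FlowConnector connector = new HadoopFlowConnector();
        Flow flow = connector.connect(in, out, copy);
        flow.complete();
    }
}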


        }
    }
   
    protected void testDurationLimitSimple(BixoPlatform platform) throws Exception {
        // Pretend that we have 10 URLs from the same domain
        Tap in = makeInputData(platform, "testDurationLimitSimple", 1, 10);

        // Create the fetch pipe we'll use to process these fake URLs
        Pipe pipe = new Pipe("urlSource");
       
        // This will force all URLs to get skipped because of the crawl end time limit.
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlEndTime(0);
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1, defaultPolicy);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy(defaultPolicy);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);

        // Create the output
        BasePath outputPath = makeOutputPath(platform, "testDurationLimitSimple");
        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap statusSink = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(statusSink, contentSink), fetchPipe);
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertFalse(tupleEntryIterator.hasNext());
       
        validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
       
        int numEntries = 0;
        while (tupleEntryIterator.hasNext()) {
            numEntries += 1;
            TupleEntry entry = tupleEntryIterator.next();
View Full Code Here


    }
   
    protected void testMaxUrlsPerServer(BixoPlatform platform) throws Exception {
        // Pretend that we have 2 URLs from the same domain
        final int sourceUrls = 2;
        Tap in = makeInputData(platform, "testMaxUrlsPerServer", 1, sourceUrls);

        // Create the fetch pipe we'll use to process these fake URLs
        Pipe pipe = new Pipe("urlSource");
       
        // This will limit us to one URL.
        final int maxUrls = 1;
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1, defaultPolicy);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy(defaultPolicy.getMaxRequestsPerConnection(), maxUrls, BaseFetchJobPolicy.DEFAULT_CRAWL_DELAY);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);

        // Create the output
        BasePath outputPath = makeOutputPath(platform, "testMaxUrlsPerServer");
        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap statusSink = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(statusSink, contentSink), fetchPipe);
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertTrue(tupleEntryIterator.hasNext());
        tupleEntryIterator.next();
        Assert.assertFalse(tupleEntryIterator.hasNext());

        validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
       
        int numSkippedEntries = 0;
        int numFetchedEntries = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
View Full Code Here

    }
   
    private Tap makeInputData(BasePlatform platform, String testname, int numDomains, int numPages, Payload payload) throws Exception {
        String platformName = platform.getClass().getSimpleName();
        BasePath defaultPath = platform.makePath(BASE_INPUT_PATH + testname + "/" + platformName + "/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), defaultPath, SinkMode.REPLACE);
        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());
        for (int i = 0; i < numDomains; i++) {
            for (int j = 0; j < numPages; j++) {
                // Use special domain name pattern so code deep inside of operations "knows" not
                // to try to resolve host names to IP addresses.
                write.add(makeTuple("bixo-test-domain-" + i + ".com", j, payload));
View Full Code Here

    }
   
    private Tap makeInputData(BasePlatform platform, String testname, String domain, int numPages, Payload payload) throws Exception {
        String platformName = platform.getClass().getSimpleName();
        BasePath defaultPath = platform.makePath(BASE_INPUT_PATH + testname + "/" + platformName + "/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), defaultPath, SinkMode.REPLACE);
        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());
        for (int j = 0; j < numPages; j++) {
            write.add(makeTuple(domain, j, payload));
        }

        write.close();
View Full Code Here

            }

            BixoPlatform platform = new BixoPlatform(RunFakeFetchPipe.class, Platform.Local);
           
            BasePath inputPath = platform.makePath(path.getFile());
            Tap in = platform.makeTap(platform.makeTextScheme(), inputPath);

            Pipe importPipe = new Each("url importer", new Fields("line"), new CreateUrlFunction());

            BaseScoreGenerator scorer = new FixedScoreGenerator();
            BaseFetcher fetcher = new FakeHttpFetcher(true, 10);
            FetchPipe fetchPipe = new FetchPipe(importPipe, scorer, fetcher, 1);

            // Create the output, which is a dual file sink tap.
            String output = "build/test/RunFakeFetchPipe/dual";
            BasePath outputPath = platform.makePath(output);
            BasePath statusPath = platform.makePath(outputPath, "status");
            Tap status = platform.makeTap(platform.makeTextScheme(), statusPath, SinkMode.REPLACE);

            BasePath contentPath = platform.makePath(outputPath, "content");
            Tap content = platform.makeTap(platform.makeTextScheme(), contentPath, SinkMode.REPLACE);
           
            // Finally we can run it.
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
            flow.complete();
View Full Code Here

       

        Pipe pipe = new Pipe("parse_source");
        ParsePipe parserPipe = new ParsePipe(pipe, new SimpleParser());
        BasePath inputPath = platform.makePath("build/test/ParserPipeTest/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), inputPath);
        BasePath outputPath = platform.makePath("build/test/ParserPipeTest/out");
        Tap out = platform.makeTap(platform.makeBinaryScheme(ParsedDatum.FIELDS), outputPath, SinkMode.REPLACE);

        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());

        ArchiveReader archiveReader = ArchiveReaderFactory.get("src/test/resources/someHtml.arc");
        Iterator<ArchiveRecord> iterator = archiveReader.iterator();
View Full Code Here
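The snippet above stops before the archive records are written into the input tap. Once the FetchedDatum tuples have been collected and the collector closed, running the parse assembly between the two taps would typically look like the following sketch (assumed, not the original test code):

        // Sketch only: close the collector, then run in -> parserPipe -> out.
        write.close();

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, out, parserPipe);
        flow.complete();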

            BixoPlatform platform = new BixoPlatform(AnalyzeMbox.class, options.getPlatformMode());
            // Create the input (source tap), which is just a sequence file reader. We assume
            // that the file already has the results of splitting the mbox file into emails.
            BasePath inputPath = platform.makePath(inputFileName);
            platform.assertPathExists(inputPath, "input file");
            Tap sourceTap = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), inputPath);
           
            Pipe pipe = new Pipe("Email Analyzer");
            pipe = new Each(pipe, new ParseEmailFunction());
           
            // We'll get output that has ANALYZED_EMAIL_FIELDS in it. We want to group by
            // the message-id field, and then do an aggregation on that of the scores.
            pipe = new GroupBy(pipe, new Fields(FieldNames.MESSAGE_ID));
            pipe = new Every(pipe, new CalcMessageScoreBuffer(), Fields.RESULTS);

            // Now we want to sum the scores for each user, which is another grouping/summing.
            pipe = new GroupBy(pipe, new Fields(FieldNames.EMAIL_ADDRESS));
            pipe = new Every(pipe, new SumScoresBuffer(), Fields.RESULTS);
           
            // Let's filter out anybody with an uninteresting score.
            ExpressionFilter filter = new ExpressionFilter(String.format("%s <= 0.0", FieldNames.SUMMED_SCORE), Double.class);
            pipe = new Each(pipe, filter);
           
            // And let's sort in reverse order (high to low score)
            pipe = new GroupBy(pipe, new Fields(FieldNames.SUMMED_SCORE), true);

            // Create the output (sink tap)
            Tap sinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputDirName), SinkMode.REPLACE);
           
            // Finally we can run it.
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(sourceTap, sinkTap, pipe);
View Full Code Here

        SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
       
        InputStream is = null;
        TupleEntryCollector writer = null;
        try {
            Tap urlSink = platform.makeTap(platform.makeTextScheme(), crawlDbPath, SinkMode.REPLACE);
            writer = urlSink.openForWrite(platform.makeFlowProcess());

            is = DemoWebMiningWorkflow.class.getResourceAsStream(fileName);
            if (is == null) {
                throw new FileNotFoundException("The seed urls file doesn't exist");
            }
View Full Code Here
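The seed-import snippet is cut off before the file is actually read. A plausible continuation, assuming one URL per line and a single-field text tuple (a sketch, not the project's code; java.io.BufferedReader/InputStreamReader and cascading.tuple.Tuple imports assumed):

            // Sketch only: read seed URLs line by line, skip blanks and comments,
            // normalize each URL, and emit one single-field tuple per URL.
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty() || line.startsWith("#")) {
                    continue;
                }

                writer.add(new Tuple(normalizer.normalize(line)));
            }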

        // Input : the crawldb
        platform.assertPathExists(crawlDbPath, "CrawlDb");

// TODO VMa - figure out types       Tap inputSource = platform.makeTap(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES), crawlDbPath);
        Tap inputSource = platform.makeTap(platform.makeTextScheme(), crawlDbPath);
        Pipe importPipe = new Pipe("import pipe");
        // Apply a regex to extract the relevant fields
        RegexParser crawlDbParser = new RegexParser(CrawlDbDatum.FIELDS,
                                                        "^(.*?)\t(.*?)\t(.*?)\t(.*?)\t(.*)");
        importPipe = new Each(importPipe, new Fields("line"), crawlDbParser);

        // Split into the tuples that still need to be fetched and those that have already been fetched
        SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

        Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
        Pipe urlsToFetchPipe = splitter.getLHSPipe();

        // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster,
        // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
        // from high to low by links score.
        // TODO add unit test
        urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
        long maxToFetch = isLocal ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
        urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

        BaseScoreGenerator scorer = new LinkScoreGenerator();

        // Create the sub-assembly that runs the fetch job
        int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL :  CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
        fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
        fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
        fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

        FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, platform.getNumReduceTasks());
        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);

        // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
        SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
        ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);
       
        Pipe analyzerPipe = new Pipe("analyzer pipe");
        analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());
       
        Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
        outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());

        Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
        resultsPipe = new Each(resultsPipe, new CreateResultsFunction());
       
        // Group the finished datums, the skipped datums, status, outlinks
        Pipe updatePipe = new CoGroup("update pipe", Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
                        Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                                        new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)), null, new OuterJoin());
        updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

       
        // output : loop dir specific crawldb
        BasePath outCrawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        Tap crawlDbSink = platform.makeTap(platform.makeTextScheme(), outCrawlDbPath, SinkMode.REPLACE);
        // Status,
        BasePath statusDirPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
        Tap statusSink = platform.makeTap(platform.makeTextScheme(), statusDirPath);
        // Content
        BasePath contentDirPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
        Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentDirPath);
       
        // PageResults
        BasePath resultsDirPath = platform.makePath(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
        Tap resultsSink = platform.makeTap(platform.makeTextScheme(), resultsDirPath);

        // Create the output map that connects each tail pipe to the appropriate sink.
        Map<String, Tap> sinkMap = new HashMap<String, Tap>();
        sinkMap.put(updatePipe.getName(), crawlDbSink);
        sinkMap.put(statusPipe.getName(), statusSink);
View Full Code Here
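The workflow snippet breaks off while the sink map is being filled in. A hedged sketch of how such a map is typically completed and handed to the FlowConnector, reusing the names above (the real workflow may wire things differently):

        // Sketch only: register the remaining tail pipes, then connect the single
        // source tap and the name -> Tap sink map to all tails in one flow.
        sinkMap.put(contentPipe.getName(), contentSink);
        sinkMap.put(resultsPipe.getName(), resultsSink);

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(inputSource, sinkMap,
                        Pipe.pipes(updatePipe, statusPipe, contentPipe, resultsPipe));
        flow.complete();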

        groupByItemIDPipe.getStepConfigDef().setProperty("itemIndexPath", itemIndexPath.toString());
        // for these matrices the group by key is the id from the Mahout row key
        groupByItemIDPipe.getStepConfigDef().setProperty("rowIndexPath", iDIndexPath.toString());
        groupByItemIDPipe.getStepConfigDef().setProperty("joining", "true");

        Tap groupedOutputSink = new Hfs(new TextDelimited(true,","), groupedCSVOutputPath.toString());

        FlowDef flowDef = new FlowDef()
            .setName("group-DRMs-by-key")
            .addSource(lhs, dRM1Source)
            .addSource(rhs, dRM2Source)
View Full Code Here
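The FlowDef above is truncated before its tail is attached. One common way to finish it, assuming groupByItemIDPipe is the tail that feeds groupedOutputSink and a Hadoop platform (a sketch, not the original code):

        // Sketch only: attach the tail pipe to its sink tap, then build and run the flow.
        flowDef.addTailSink(groupByItemIDPipe, groupedOutputSink);

        Flow flow = new HadoopFlowConnector().connect(flowDef);
        flow.complete();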
