Package cascading.flow

Examples of cascading.flow.Flow


        options.setLocalPlatformMode(true);
        UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
        Server server = null;
        try {
            server = startServer(new FakeWebSiteHandler(), 8089);
            Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
            flow.complete();

            // Update the crawlDb path
            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            // Now we should have an output/1-<timestamp>/ directory, where the
            // /urls dir has 11 entries with
            // one being previously crawled, and the other 10 being pending.

            Tap crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
            TupleEntryIterator iter = crawldbTap.openForRead(platform.makeFlowProcess());

            int numFetched = 0;
            int numPending = 0;
            while (iter.hasNext()) {
                CrawlDbDatum datum = new CrawlDbDatum(iter.next());
                UrlStatus status = datum.getLastStatus();
                int crawlDepth = datum.getCrawlDepth();
                if (datum.getLastFetched() != 0) {
                    numFetched += 1;

                    assertEquals(UrlStatus.FETCHED, status);
                    assertEquals(0, crawlDepth);
                } else {
                    numPending += 1;
                    assertEquals(UrlStatus.UNFETCHED, status);
                    assertEquals(1, crawlDepth);
                }
            }

            assertEquals(1, numFetched);
            assertEquals(10, numPending);

            // Do it one more time, to verify status gets propagated forward.
            curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, baseDirPath, 2);

            flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
            flow.complete();
            // Update crawldb path
            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
            iter = crawldbTap.openForRead(platform.makeFlowProcess());
View Full Code Here


            TextLine textLineLocalScheme = new TextLine(new Fields("url"));
            Tap sourceTap = platform.makeTap(textLineLocalScheme, filePath, SinkMode.KEEP);
            SinkTap sinkTap = new NullSinkTap(new Fields("url"));
           
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(sourceTap, sinkTap, pipe);

            flow.complete();
        } catch (Exception e) {
            System.err.println("Exception running tool: " + e.getMessage());
            e.printStackTrace(System.err);
            System.exit(-1);
        }
View Full Code Here

        try {
            server = startServer(new DirectoryResponseHandler("src/test/resources/test-pages"), 8089);
           
            BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, 1);

            Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
            flow.complete();
       
            // validate
            BasePath statusPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
            validateEntryCount(platform, statusPath, null, 1, "status", true);
   
            BasePath contentPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
            validateEntryCount(platform, contentPath, FetchedDatum.FIELDS, 1, "content", false);

            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            validateEntryCount(platform, crawlDbPath, null, 3, "crawldb", true);
           
            // run the second loop
            curLoopDirPath =  CrawlDirUtils.makeLoopDir(platform, workingDirPath, 2);
            flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
            flow.complete();
           
            // validate
            statusPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
            validateEntryCount(platform, statusPath, null, 2, "status", true);
   
View Full Code Here

        flowDef.addSource(pipe, in);
        flowDef.addTailSink(fetchPipe.getStatusTailPipe(), status);
        flowDef.addTailSink(fetchPipe.getContentTailPipe(), new NullSinkTap());
       
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(flowDef);
        flow.writeDOT("build/test/FetchPipeLRTest/testHeadersInStatus/flow.dot");
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertTrue(tupleEntryIterator.hasNext());
        StatusDatum sd = new StatusDatum(tupleEntryIterator.next());
View Full Code Here

        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        TestWebServer webServer = null;
       
        try {
            webServer = new TestWebServer(new NoRobotsResponseHandler(), port);
            flow.complete();
        } finally {
            webServer.stop();
        }
       
        // Verify numPages fetched and numPages status entries were saved.
View Full Code Here

        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        TestWebServer webServer = null;
       
        try {
            webServer = new TestWebServer(new RedirectResponseHandler(true), port);
            flow.complete();
        } finally {
            webServer.stop();
        }
       
        // Verify numPages fetched and numPages status entries were saved.
View Full Code Here

        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);

        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, status, fetchPipe.getStatusTailPipe());
        TestWebServer webServer = null;
       
        try {
            final int numBytes = 10000;
           
            // Pick a time way longer than the FetcherPolicy.getRequestTimeout().
            final long numMilliseconds = 100 * 1000L;
            webServer = new TestWebServer(new NoRobotsResponseHandler(numBytes, numMilliseconds), port);
            flow.complete();
        } finally {
            webServer.stop();
        }
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
View Full Code Here

        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
       
        int totalEntries = 0;
View Full Code Here

        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);
       
        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(null, content), fetchPipe);
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertTrue(tupleEntryIterator.hasNext());
        TupleEntry te = tupleEntryIterator.next();
View Full Code Here

        Tap statusSink = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(statusSink, contentSink), fetchPipe);
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertFalse(tupleEntryIterator.hasNext());
       
View Full Code Here

TOP

Related Classes of cascading.flow.Flow

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.