Package com.scaleunlimited.cascading

Examples of com.scaleunlimited.cascading.BasePath
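
BasePath is the platform-independent path abstraction from Scale Unlimited's cascading-utils library: a BasePlatform (local or Hadoop) creates paths, and the same code can then check, nest, and delete them regardless of the underlying filesystem. Here is a minimal sketch of the API as exercised by the snippets below; the LocalPlatform class and its package are assumptions of this sketch:

import com.scaleunlimited.cascading.BasePath;
import com.scaleunlimited.cascading.BasePlatform;
import com.scaleunlimited.cascading.local.LocalPlatform;

public class BasePathExample {
    public static void main(String[] args) throws Exception {
        // Assumed: LocalPlatform is the local-filesystem implementation of BasePlatform.
        BasePlatform platform = new LocalPlatform(BasePathExample.class);

        BasePath workingPath = platform.makePath("build/test/working");    // from a String
        BasePath contentPath = platform.makePath(workingPath, "content");  // nested under a parent

        if (contentPath.exists()) {
            contentPath.delete(true);  // recursive delete, as used in the tests below
        }
        System.out.println(contentPath.getAbsolutePath());
    }
}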


        // (end of a preceding test; its opening lines are truncated in this excerpt)
                            CrawlDirUtils.findNextLoopDir(_platform, _outputPath, 4).toString());
    }

    @Test
    public void testExtractLoopNumber() throws Exception {
        BasePath path0 = CrawlDirUtils.makeLoopDir(_platform, _outputPath, 0);
        BasePath path1 = CrawlDirUtils.makeLoopDir(_platform, _outputPath, 1);
        BasePath path3 = CrawlDirUtils.makeLoopDir(_platform, _outputPath, 3);
        BasePath path7 = CrawlDirUtils.makeLoopDir(_platform, _outputPath, 7);
        BasePath path11 = CrawlDirUtils.makeLoopDir(_platform, _outputPath, 11);
        Assert.assertEquals(0, CrawlDirUtils.extractLoopNumber(path0));
        Assert.assertEquals(1, CrawlDirUtils.extractLoopNumber(path1));
        Assert.assertEquals(3, CrawlDirUtils.extractLoopNumber(path3));
        Assert.assertEquals(7, CrawlDirUtils.extractLoopNumber(path7));
        Assert.assertEquals(11, CrawlDirUtils.extractLoopNumber(path11));
    }
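
CrawlDirUtils encodes the loop number in the loop directory's name, which is how extractLoopNumber can recover it from a BasePath alone. A hypothetical re-implementation, assuming a "<loopNumber>-<timestamp>" naming convention (illustrative, not the library's actual source):

    // Hypothetical: parse the loop number back out of the directory name.
    private static int extractLoopNumberSketch(BasePath loopDir) {
        String name = loopDir.getName();
        int dashIndex = name.indexOf('-');
        String prefix = (dashIndex == -1) ? name : name.substring(0, dashIndex);
        return Integer.parseInt(prefix);
    }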


    private String makeCrawlDb(String workingFolder, String input) throws Exception {

        BixoPlatform platform = new BixoPlatform(FetcherTest.class, Platform.Local);
       
        // We don't want to regenerate this DB all the time.
        BasePath workingPath = platform.makePath(workingFolder);
        BasePath crawlDBPath = platform.makePath(workingPath, URL_DB_NAME);
        if (!crawlDBPath.exists()) {
            Pipe importPipe = new Pipe("import URLs");
            importPipe = new Each(importPipe, new LoadUrlsFunction());
           
            BasePath inputPath = platform.makePath(input);
            Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath);
            Tap sinkTap = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), crawlDBPath, SinkMode.REPLACE);
           
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(sourceTap, sinkTap, importPipe);
            flow.complete();
        }

        // Return value inferred from the callers below, which pass it straight
        // back into platform.makePath():
        return crawlDBPath.getAbsolutePath();
    }
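
The crawlDBPath.exists() guard above is the point of this snippet: the URL import flow only runs when the binary crawl DB is missing, so repeated test runs against the same working folder skip the regeneration.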

        String workingFolder = "build/it/FetcherTest/testStaleConnection/working";
        String input = makeCrawlDb(workingFolder, "src/it/resources/apple-pages.txt");
       
        BixoPlatform platform = new BixoPlatform(FetcherTest.class, Platform.Local);
        BasePath inputPath = platform.makePath(input);

        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), inputPath);
        String outputDir = "build/it/FetcherTest/testStaleConnection/out";
        BasePath outputPath = platform.makePath(outputDir);

        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);

       
        Pipe pipe = new Pipe("urlSource");
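
The excerpt stops at the source pipe. A hedged sketch of how such a test typically continues, reusing in, content, and status from above; the fetcher and scorer settings are illustrative, and FetchPipe.makeSinkMap(status, content) is assumed to pair the two sinks with FetchPipe's status and content tails:

        // Hedged continuation (fetcher/scorer parameters are illustrative):
        UserAgent userAgent = new UserAgent("test-agent", "test@domain.com", "http://test.domain.com");
        BaseFetcher fetcher = new SimpleHttpFetcher(10, userAgent);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        flow.complete();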

        System.setProperty("bixo.root.level", "TRACE");
       
        String workingFolder = "build/test-it/FetcherTest/testRunFetcher";
        String input = makeCrawlDb(workingFolder, "src/it/resources/top10urls.txt");
        BixoPlatform platform = new BixoPlatform(FetcherTest.class, Platform.Local);
        BasePath workingPath = platform.makePath(workingFolder);
        BasePath inputPath = platform.makePath(input);
        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), inputPath);
       
        BasePath contentPath = platform.makePath(workingPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);
        BasePath statusPath = platform.makePath(workingPath, "status");
        Tap status = platform.makeTap(platform.makeTextScheme(), statusPath, SinkMode.REPLACE);

       
        Pipe pipe = new Pipe("urlSource");
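
Note the difference from the previous test: here the status sink uses platform.makeTextScheme() rather than a binary scheme, so the per-URL fetch status lands in the working directory as human-readable text.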

        String outputDirName = options.getOutputDir();

        try {
            BixoPlatform platform = new BixoPlatform(AnalyzeEmail.class, options.getPlatformMode());
            // Create the input (source tap), which is just a text file reader
            BasePath inputPath = platform.makePath(inputFileName);
            Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath);
           
            // Create the sub-assembly that runs the fetch job
            UserAgent userAgent = new UserAgent(options.getAgentName(), EMAIL_ADDRESS, WEB_ADDRESS);
            Pipe importPipe = new Each("url importer", new Fields("line"), new LoadUrlFunction());
           
            BaseScoreGenerator scorer = new FixedScoreGenerator();
           
            BaseFetcher fetcher = new SimpleHttpFetcher(MAX_THREADS, userAgent);
            FetchPipe fetchPagePipe = new FetchPipe(importPipe, scorer, fetcher, NUM_REDUCERS);
           
            // Here's the pipe that will output UrlDatum tuples, by extracting URLs from the mod_mbox-generated page.
            Pipe mboxPagePipe = new Each(fetchPagePipe.getContentTailPipe(), new ParseModMboxPageFunction(), Fields.RESULTS);

            // Create a named pipe for the status of the mod_mbox-generated pages.
            Pipe mboxPageStatusPipe = new Pipe(MBOX_PAGE_STATUS_PIPE_NAME, fetchPagePipe.getStatusTailPipe());

            // Set up appropriate FetcherPolicy, where we increase the max content size (since mailbox files
            // can be big, e.g. 4MB).
            FetcherPolicy defaultPolicy = new FetcherPolicy();
            defaultPolicy.setMaxContentSize(MAX_CONTENT_SIZE);
            fetcher = new SimpleHttpFetcher(MAX_THREADS, defaultPolicy, userAgent);
           
            // We can create the fetch pipe, and set up our Mbox splitter to run on content.
            FetchPipe fetchMboxPipe = new FetchPipe(mboxPagePipe, scorer, fetcher, NUM_REDUCERS);
            SplitEmails splitterPipe = new SplitEmails(fetchMboxPipe);
           
            // Now create the pipe that's going to analyze the emails we get after splitting them up.
            Pipe analysisPipe = new Pipe(ANALYZER_PIPE_NAME, splitterPipe.getTails()[0]);
            analysisPipe = new Each(analysisPipe, new ParseEmailFunction());
           
            // We'll get output that has ANALYZED_EMAIL_FIELDS in it. We want to group by
            // the message-id field, and then do an aggregation on that of the scores.
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.MESSAGE_ID));
            analysisPipe = new Every(analysisPipe, new CalcMessageScoreBuffer(), Fields.RESULTS);

            // Now we want to sum the scores for each user, which is another grouping/summing.
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.EMAIL_ADDRESS));
            analysisPipe = new Every(analysisPipe, new SumScoresBuffer(), Fields.RESULTS);
           
            // Let's filter out anybody with an uninteresting score.
            ExpressionFilter filter = new ExpressionFilter(String.format("%s <= 0.0", FieldNames.SUMMED_SCORE), Double.class);
            analysisPipe = new Each(analysisPipe, filter);
           
            // And let's sort in reverse order (high to low score)
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.SUMMED_SCORE), true);

            // Create the sink taps
            BasePath outputPath = platform.makePath(outputDirName);
            Tap pageStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "page-status"), SinkMode.REPLACE);
            Tap mboxStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "mbox-status"), SinkMode.REPLACE);
            Tap contentSinkTap = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS),
                            platform.makePath(outputPath, "content"), SinkMode.REPLACE);
            // ("content" is an assumed subdirectory name, following the pattern of
            // the page-status and mbox-status sinks above; the rest of the method
            // is truncated in this excerpt.)
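
With several named tails, the usual Cascading pattern is to map each tail pipe's name to its sink tap and connect everything in one flow. A hedged sketch (java.util.Map/HashMap assumed imported; analysisSinkTap is a hypothetical fourth sink, and the page-status and content sinks would be registered the same way):

            Map<String, Tap> sinkMap = new HashMap<String, Tap>();
            sinkMap.put(MBOX_PAGE_STATUS_PIPE_NAME, mboxStatusSinkTap);
            sinkMap.put(analysisPipe.getName(), analysisSinkTap);  // hypothetical sink

            Flow flow = platform.makeFlowConnector().connect(sourceTap, sinkMap,
                            analysisPipe, mboxPageStatusPipe);
            flow.complete();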

        long fetchTime = System.currentTimeMillis();
        PartitioningKey groupingKey = new PartitioningKey("key", 1);
        FetchSetDatum pfd = new FetchSetDatum(urls, fetchTime, 1000, groupingKey.getValue(), groupingKey.getRef());
       
        BixoPlatform platform = new BixoPlatform(ScoredUrlDatumTest.class, platformMode);
        BasePath path = platform.makePath("build/test/ScoredUrlDatumTest/testCascadingSerialization/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(FetchSetDatum.FIELDS), path, SinkMode.REPLACE);
        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());
        write.add(pfd.getTuple());
        write.close();
    }
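
The test above only exercises the write half. A hedged sketch of the matching read-back, using the same Tap API (openForRead mirrors the openForWrite call above):

        // Hedged sketch: read the tuple back and compare it to what was written.
        TupleEntryIterator iter = in.openForRead(platform.makeFlowProcess());
        try {
            Assert.assertTrue(iter.hasNext());
            Assert.assertEquals(pfd.getTuple(), iter.next().getTuple());
            Assert.assertFalse(iter.hasNext());
        } finally {
            iter.close();
        }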

    public void testNotLosingFetchedUrls() throws Throwable {
        String baseDirName = "build/test/DemoCrawlWorkflowLRTest/output";
       
        BixoPlatform platform = new BixoPlatform(DemoCrawlWorkflowLRTest.class, Platform.Local);
       
        BasePath baseDirPath = platform.makePath(baseDirName);
        baseDirPath.delete(true);
        BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, baseDirPath, 0);
        BasePath crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        DemoCrawlTool.importOneDomain(platform, "localhost:8089", crawlDbPath);
        curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, baseDirPath, 1);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
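
CrawlDirUtils.makeLoopDir appears to both name and create the numbered loop directory, which is why the test can immediately hang a crawl DB path off it; the recursive baseDirPath.delete(true) beforehand ensures the loop numbering starts cleanly at 0.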

            Pipe pipe = new Pipe("urls");
            pipe = new Each(pipe, new UrlLengthener(fetcher));
            pipe = new Each(pipe, new Debug());

            BixoPlatform platform = new BixoPlatform(LengthenUrlsTool.class, Platform.Local);
            BasePath filePath = platform.makePath(filename);
            TextLine textLineLocalScheme = new TextLine(new Fields("url"));
            Tap sourceTap = platform.makeTap(textLineLocalScheme, filePath, SinkMode.KEEP);
            SinkTap sinkTap = new NullSinkTap(new Fields("url"));
           
            FlowConnector flowConnector = platform.makeFlowConnector();
            // Likely continuation (assumed; the original is truncated here):
            flowConnector.connect(sourceTap, sinkTap, pipe).complete();
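
Pairing a Debug() step with a NullSinkTap is a common pattern for a tool like this: the sink satisfies Cascading's requirement that every tail pipe has a tap, while the lengthened URLs are simply printed rather than stored.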

        options.setWorkingDir(WORKING_DIR);
        options.setAgentName("test-agent");
        options.setLocalPlatformMode(true);
       
        BixoPlatform platform = new BixoPlatform(DemoWebMiningWorkflowTest.class, options.getPlatformMode());
        BasePath workingDirPath = platform.makePath(WORKING_DIR);
        DemoWebMiningTool.setupWorkingDir(platform, workingDirPath, "/test-seedurls.txt");
       
        BasePath latestDirPath = CrawlDirUtils.findLatestLoopDir(platform, workingDirPath);
        BasePath crawlDbPath = platform.makePath(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
       
        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);
        Set<String> validMimeTypes = new HashSet<String>();
        validMimeTypes.add("text/plain");
        validMimeTypes.add("text/html");
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);

        Server server = null;
        try {
            server = startServer(new DirectoryResponseHandler("src/test/resources/test-pages"), 8089);
           
            BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, 1);

            Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
            flow.complete();
       
            // validate
            BasePath statusPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
            validateEntryCount(platform, statusPath, null, 1, "status", true);
   
            BasePath contentPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
            validateEntryCount(platform, contentPath, FetchedDatum.FIELDS, 1, "content", false);

            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            validateEntryCount(platform, crawlDbPath, null, 3, "crawldb", true);
           
            // run the second loop
            curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, 2);
            flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
            flow.complete();
           
            // validate
            statusPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
            validateEntryCount(platform, statusPath, null, 2, "status", true);
   
            contentPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
            validateEntryCount(platform, contentPath, FetchedDatum.FIELDS, 2, "content", false);

            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            validateEntryCount(platform, crawlDbPath, null, 8, "crawldb", true);
            assertTrue(validatePageScores(platform, crawlDbPath));
           
            BasePath resultsPath = platform.makePath(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
            validateEntryCount(platform, resultsPath, null, 3, "page results", true);
        } finally {
            if (server != null) {
                server.stop();
            }
        }
    }
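
validateEntryCount and validatePageScores are helpers in this test class; judging by the call sites, validateEntryCount opens the tap at the given path (text or binary, per the final flag) and asserts how many tuple entries it holds.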

        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
       
        BasePath outputPath = makeOutputPath(platform, "testHeadersInStatus");
        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
       
        // Finally we can run it.
        FlowDef flowDef = new FlowDef();
        flowDef.setName("testHeadersInStatus");
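
The excerpt ends right after naming the FlowDef. A hedged sketch of the usual FlowDef-based completion, with the source tap in assumed from the truncated part of the test:

        // Hedged continuation: bind source and sink, then run.
        flowDef.addSource(pipe, in);
        flowDef.addTailSink(fetchPipe.getStatusTailPipe(), status);
        platform.makeFlowConnector().connect(flowDef).complete();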
