Package bixo.config

Examples of bixo.config.BixoPlatform
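BixoPlatform is Bixo's wrapper around the Cascading platform layer: the constructor takes the application class plus a platform mode, and everything else (paths, schemes, taps, flow connectors) is created through it, so the same workflow assembly runs either locally or on Hadoop. A minimal sketch of that pattern, using only calls that appear in the snippets below (the location of the nested Platform enum, and the existence of a Platform.Local value, are assumptions):

import bixo.config.BixoPlatform;
import bixo.config.BixoPlatform.Platform; // assumption: Platform is a nested enum of BixoPlatform

import cascading.flow.FlowConnector;
import cascading.tap.SinkMode;
import cascading.tap.Tap;

public class BixoPlatformSketch {

    public static void main(String[] args) throws Exception {
        // Two-argument constructor: application class plus execution mode.
        // Platform.Hadoop is taken from the examples below; a Platform.Local
        // value for in-process runs is assumed to exist as well.
        BixoPlatform platform = new BixoPlatform(BixoPlatformSketch.class, Platform.Hadoop);

        // Paths, schemes and taps are all created through the platform, so the
        // same assembly code works regardless of the mode chosen above.
        Tap sinkTap = platform.makeTap(platform.makeTextScheme(),
                        platform.makePath("output"), SinkMode.REPLACE);

        // The platform also hands back a FlowConnector appropriate for the mode.
        FlowConnector flowConnector = platform.makeFlowConnector();
    }
}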


    @Test
    public void testDurationLimitSimple() throws Exception {
        testDurationLimitSimple(new BixoPlatform(FetchPipeHadoopTest.class, Platform.Hadoop));
    }
   
    @Test
    public void testMaxUrlsPerServer() throws Exception {
        testMaxUrlsPerServer(new BixoPlatform(FetchPipeHadoopTest.class, Platform.Hadoop));
    }
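The two tests above come from a Hadoop-mode test class; each one simply hands a Hadoop-mode BixoPlatform to a shared test helper. The same helper could be driven in local mode by swapping the platform argument; a hedged fragment in the same style (both the FetchPipeLocalTest class name and the Platform.Local value are assumptions, not taken from the source):

    @Test
    public void testMaxUrlsPerServerLocalMode() throws Exception {
        // Hypothetical local-mode counterpart of the Hadoop-mode test above.
        testMaxUrlsPerServer(new BixoPlatform(FetchPipeLocalTest.class, Platform.Local));
    }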



        // Build and run the flow.
       
        try {
            BixoPlatform platform = new BixoPlatform(DemoWebMiningTool.class, options.getPlatformMode());
            BasePath workingDirPath = platform.makePath(options.getWorkingDir());

            setupWorkingDir(platform, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);
            BasePath latestDirPath = CrawlDirUtils.findLatestLoopDir(platform, workingDirPath);
            if (latestDirPath == null) {
                error("No previous cycle output dirs exist in " + workingDirPath, parser);
            }
           
            BasePath crawlDbPath = platform.makePath(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
           
            UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);
           
            FetcherPolicy fetcherPolicy = new FetcherPolicy();
            fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
            fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
            fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);
           
            // We only care about mime types that the Tika HTML parser can handle,
            // so restrict the fetcher to that same set.
            Set<String> validMimeTypes = new HashSet<String>();
            Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
            for (MediaType supportedType : supportedTypes) {
                validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
            }
            fetcherPolicy.setValidMimeTypes(validMimeTypes);

            // Let's limit our crawl to two loops
            for (int curLoop = 1; curLoop <= 2; curLoop++) {
                BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, curLoop);
                Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
                flow.complete();

                // Update crawlDbPath to point to the latest crawl db
                crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            }
           
        } catch (Exception e) {
            System.err.println("Exception running job: " + e.getMessage());
            e.printStackTrace(System.err);
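The MIME-type whitelist built above depends on Tika's HtmlParser advertising the types it can handle. A small standalone sketch, using only the Tika calls already present in the snippet, that prints what ends up in that whitelist:

import java.util.Set;

import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;

public class ListHtmlMimeTypes {

    public static void main(String[] args) {
        // These are the MediaType values that the workflow above turns into
        // "type/subtype" strings for fetcherPolicy.setValidMimeTypes().
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            System.out.println(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
    }
}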

        String inputFileName = options.getInputFile();
        String outputDirName = options.getOutputDir();
       
        try {
            BixoPlatform platform = new BixoPlatform(AnalyzeMbox.class, options.getPlatformMode());
            // Create the input (source tap), which is just a sequence file reader. We assume
            // that the file already has the results of splitting the mbox file into emails.
            BasePath inputPath = platform.makePath(inputFileName);
            platform.assertPathExists(inputPath, "input file");
            Tap sourceTap = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), inputPath);
           
            Pipe pipe = new Pipe("Email Analyzer");
            pipe = new Each(pipe, new ParseEmailFunction());
           
            // We'll get output that has ANALYZED_EMAIL_FIELDS in it. We want to group by
            // the message-id field, and then aggregate the scores for each message.
            pipe = new GroupBy(pipe, new Fields(FieldNames.MESSAGE_ID));
            pipe = new Every(pipe, new CalcMessageScoreBuffer(), Fields.RESULTS);

            // Now we want to sum the scores for each user, which is another grouping/summing.
            pipe = new GroupBy(pipe, new Fields(FieldNames.EMAIL_ADDRESS));
            pipe = new Every(pipe, new SumScoresBuffer(), Fields.RESULTS);
           
            // Filter out anybody whose summed score is zero or negative (i.e. uninteresting).
            ExpressionFilter filter = new ExpressionFilter(String.format("%s <= 0.0", FieldNames.SUMMED_SCORE), Double.class);
            pipe = new Each(pipe, filter);
           
            // And let's sort in reverse order (high to low score)
            pipe = new GroupBy(pipe, new Fields(FieldNames.SUMMED_SCORE), true);

            // Create the output (sink tap)
            Tap sinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputDirName), SinkMode.REPLACE);
           
            // Finally we can run it.
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(sourceTap, sinkTap, pipe);
            flow.complete();
        } catch (Throwable t) {
            System.err.println("Exception running AnalyzeMbox: " + t.getMessage());
            t.printStackTrace(System.err);
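One subtlety in the assembly above: Cascading's ExpressionFilter removes tuples for which the expression evaluates to true, so the summed-score expression keeps only positive scores. A minimal sketch of the same idiom, with a hypothetical "score" field name:

import cascading.operation.expression.ExpressionFilter;
import cascading.pipe.Each;
import cascading.pipe.Pipe;

public class ScoreFilterSketch {

    // Hypothetical helper: drop tuples whose "score" field is zero or negative.
    // ExpressionFilter discards tuples for which its expression evaluates to true.
    public static Pipe keepPositiveScores(Pipe pipe) {
        return new Each(pipe, new ExpressionFilter("score <= 0.0", Double.class));
    }
}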
