Package com.scaleunlimited.cascading

Examples of com.scaleunlimited.cascading.BasePath
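All of the snippets below create and manipulate BasePath instances through a BasePlatform (here BixoPlatform). As a quick orientation, this is a minimal sketch of the calls they rely on (makePath, mkdirs, exists, isDirectory, getName, delete); the class literal and paths are placeholders, not code from the project:

        BixoPlatform platform = new BixoPlatform(BasePathExamples.class, Platform.Local);  // placeholder class
        BasePath dir = platform.makePath("build/test/BasePathExamples");
        dir.mkdirs();                                  // create the directory (and any parents)

        BasePath file = platform.makePath(dir, "part-00000");
        System.out.println(file.exists());             // false until something writes it
        System.out.println(dir.isDirectory());         // true
        System.out.println(dir.getName());             // "BasePathExamples"

        dir.delete(true);                              // recursive delete, as in the snippets below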


        // Run the test against the local Cascading platform.
        BixoPlatform platform = new BixoPlatform(ParsePipeTest.class, Platform.Local);

        // Build the parse assembly.
        Pipe pipe = new Pipe("parse_source");
        ParsePipe parserPipe = new ParsePipe(pipe, new SimpleParser());

        // Source and sink taps are created from BasePath instances made by the platform.
        BasePath inputPath = platform.makePath("build/test/ParserPipeTest/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), inputPath);
        BasePath outputPath = platform.makePath("build/test/ParserPipeTest/out");
        Tap out = platform.makeTap(platform.makeBinaryScheme(ParsedDatum.FIELDS), outputPath, SinkMode.REPLACE);

        // Write test FetchedDatum tuples directly into the source tap.
        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());

        ArchiveReader archiveReader = ArchiveReaderFactory.get("src/test/resources/someHtml.arc");



    static void setupWorkingDir(BasePlatform platform, BasePath workingDirPath, String seedUrlsfileName) throws Exception {
       
        // Check if we already have a crawldb
        BasePath crawlDbPath = null;
        BasePath loopDirPath = CrawlDirUtils.findLatestLoopDir(platform, workingDirPath);
        if (loopDirPath != null) {
                // Clear out any previous loop directories, so we're always starting from scratch
            LOGGER.info("deleting existing working dir");
            while (loopDirPath != null) {
                loopDirPath.delete(true);
                loopDirPath = CrawlDirUtils.findLatestLoopDir(platform, workingDirPath);
            }
        }

        // Create a "0-<timestamp>" loop sub-directory and import the seed urls
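The snippet is cut off at that comment; a hedged sketch of what the elided step typically involves, reusing the CrawlDirUtils and platform calls shown elsewhere on this page (importSeedUrls is a hypothetical stand-in for the actual seed-import flow):

        // Create the initial "0-<timestamp>" loop sub-directory and its crawl DB path.
        BasePath loop0DirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, 0);
        crawlDbPath = platform.makePath(loop0DirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Import the seed URLs into the new crawl DB. importSeedUrls() is a hypothetical
        // helper here; the real import logic is elided from the snippet above.
        BasePath seedUrlsPath = platform.makePath(seedUrlsfileName);
        importSeedUrls(platform, seedUrlsPath, crawlDbPath);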

        // Build and run the flow.
       
        try {
            BixoPlatform platform = new BixoPlatform(DemoWebMiningTool.class, options.getPlatformMode());
            BasePath workingDirPath = platform.makePath(options.getWorkingDir());

            setupWorkingDir(platform, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);
            BasePath latestDirPath = CrawlDirUtils.findLatestLoopDir(platform, workingDirPath);
            if (latestDirPath == null) {
                error("No previous cycle output dirs exist in " + workingDirPath, parser);
            }
           
            BasePath crawlDbPath = platform.makePath(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
           
            UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);
           
            FetcherPolicy fetcherPolicy = new FetcherPolicy();
            fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
            fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
            fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);
           
            // We only care about mime types that the Tika HTML parser can handle,
            // so restrict the fetcher to those types.
            Set<String> validMimeTypes = new HashSet<String>();
            Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
            for (MediaType supportedType : supportedTypes) {
                validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
            }
            fetcherPolicy.setValidMimeTypes(validMimeTypes);

            // Let's limit our crawl to two loops
            for (int curLoop = 1; curLoop <= 2; curLoop++) {
                BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, curLoop);
                Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
                flow.complete();

                // Update crawlDbPath to point to the latest crawl db
                crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

       
        try {
            BixoPlatform platform = new BixoPlatform(AnalyzeMbox.class, options.getPlatformMode());
            // Create the input (source tap), which is just a sequence file reader. We assume
            // that the file already has the results of splitting the mbox file into emails.
            BasePath inputPath = platform.makePath(inputFileName);
            platform.assertPathExists(inputPath, "input file");
            Tap sourceTap = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), inputPath);
           
            Pipe pipe = new Pipe("Email Analyzer");
            pipe = new Each(pipe, new ParseEmailFunction());
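The AnalyzeMbox snippet is cut off before the flow is wired up and run; a minimal sketch of the usual pattern on this platform, assuming BasePlatform's makeFlowConnector() helper (the output path and text scheme are assumptions, not the original code):

            // Sink tap for the analysis output (illustrative path and scheme).
            BasePath outputPath = platform.makePath("build/test/AnalyzeMbox/out");
            Tap sinkTap = platform.makeTap(platform.makeTextScheme(), outputPath, SinkMode.REPLACE);

            // Connect source -> pipe -> sink and run the flow to completion.
            Flow flow = platform.makeFlowConnector().connect(sourceTap, sinkTap, pipe);
            flow.complete();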

                                        new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)), null, new OuterJoin());
        updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

       
        // Output: the loop-dir-specific crawl DB
        BasePath outCrawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        Tap crawlDbSink = platform.makeTap(platform.makeTextScheme(), outCrawlDbPath, SinkMode.REPLACE);
        // Status
        BasePath statusDirPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
        Tap statusSink = platform.makeTap(platform.makeTextScheme(), statusDirPath);
        // Content
        BasePath contentDirPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
        Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentDirPath);
       
        // PageResults
        BasePath resultsDirPath = platform.makePath(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
        Tap resultsSink = platform.makeTap(platform.makeTextScheme(), resultsDirPath);

        // Create the output map that connects each tail pipe to the appropriate sink.
        Map<String, Tap> sinkMap = new HashMap<String, Tap>();
        sinkMap.put(updatePipe.getName(), crawlDbSink);
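The remaining sink-map entries and the connect call are cut off above; a hedged sketch of how such a workflow is typically completed, assuming Cascading's FlowConnector.connect(source, sinks, tails...) overload (the other tail-pipe variables and the crawl DB source tap are hypothetical names):

        sinkMap.put(statusPipe.getName(), statusSink);      // hypothetical tail pipe
        sinkMap.put(contentPipe.getName(), contentSink);    // hypothetical tail pipe
        sinkMap.put(resultsPipe.getName(), resultsSink);    // hypothetical tail pipe

        // One source (the crawl DB tap, assumed to be named crawlDbSource here),
        // multiple sinks keyed by tail-pipe name.
        return platform.makeFlowConnector().connect(crawlDbSource, sinkMap,
                updatePipe, statusPipe, contentPipe, resultsPipe);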

   */
  public static BasePath makeLoopDir(BasePlatform platform, BasePath outputDir, int loopNumber)
      throws Exception {
    String timestamp = new SimpleDateFormat("yyyyMMdd'T'HHmmss")
        .format(new Date());
    BasePath loopDir = platform.makePath(outputDir, "" + loopNumber + "-" + timestamp);
    loopDir.mkdirs();
    return loopDir;
  }

   * @throws Exception
   */
  public static BasePath findLatestLoopDir(BasePlatform platform, BasePath outputPath)
      throws Exception {
    int bestLoop = -1;
    BasePath result = null;

    BasePath[] paths = outputPath.list();
    for (BasePath path : paths) {
      if (!path.isDirectory()) {
        continue;
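findLatestLoopDir is cut off before the loop-number parsing. Given the "<loopNumber>-<timestamp>" naming used by makeLoopDir above, a hypothetical parsing helper might look like the following (this is a sketch, not the library's actual extractLoopNumber; it uses java.security.InvalidParameterException, as in the last snippet on this page):

  // Hypothetical sketch: parse the loop number out of a "<loopNumber>-<timestamp>"
  // directory name, rejecting anything that doesn't match that pattern.
  private static int parseLoopNumber(String dirName) {
    int dashIndex = dirName.indexOf('-');
    if (dashIndex <= 0) {
      throw new InvalidParameterException("Not a loop directory: " + dirName);
    }

    try {
      return Integer.parseInt(dirName.substring(0, dashIndex));
    } catch (NumberFormatException e) {
      throw new InvalidParameterException("Not a loop directory: " + dirName);
    }
  }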

   * @throws Exception
   */
  public static BasePath findNextLoopDir(BasePlatform platform, BasePath outputPath,
      int loopNumber) throws Exception {
    int bestLoop = Integer.MAX_VALUE;
    BasePath result = null;

    BasePath[] paths = outputPath.list();
    for (BasePath path : paths) {
      if (!path.isDirectory()) {
        continue;

            try {
                // Verify crawl dir name is valid.
                extractLoopNumber(path);

                BasePath subdirPath = platform.makePath(path, subdirName);
                if (subdirPath.exists() && subdirPath.isDirectory()) {
                    result.add(subdirPath);
                }
            } catch (InvalidParameterException e) {
                LOGGER.debug("Ignoring directory: " + path.getName());
            }


