Package com.scaleunlimited.cascading

Examples of com.scaleunlimited.cascading.BasePath
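
The snippets below come from the Bixo crawler project and show BasePath being created and manipulated through a BasePlatform. As a quick orientation, here is a minimal hedged sketch of the typical lifecycle (the directory names and the already-configured platform instance are assumptions, not part of the examples below):

    // Minimal sketch, assuming an already-configured BasePlatform named "platform".
    BasePath workingDirPath = platform.makePath("working");                // path from a String
    BasePath statusDirPath = platform.makePath(workingDirPath, "status");  // child path
    if (!statusDirPath.exists()) {
        statusDirPath.mkdirs();                                            // create the directory tree
    }
    System.out.println("Created " + statusDirPath.getAbsolutePath());
    statusDirPath.delete(true);                                            // recursive delete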


    @Test
    public void testFindAllSubdirs() throws Exception {
        // Make a loop dir with a subdir
        String subdirName = "bogus";
        BasePath path0 = CrawlDirUtils.makeLoopDir(_platform, _outputPath, 0);
        BasePath subdirPath0 = _platform.makePath(path0, subdirName);
        subdirPath0.mkdirs();
       
        // And another one without the subdir
        BasePath path1 = CrawlDirUtils.makeLoopDir(_platform, _outputPath, 1);

        BasePath[] allSubdirPathsArr = CrawlDirUtils.findAllSubdirs(_platform, _outputPath, subdirName);
        Assert.assertEquals(1, allSubdirPathsArr.length);

        // Now add a subdir to path1 as well
        BasePath subdirPath1 = _platform.makePath(path1, subdirName);
        subdirPath1.mkdirs();
        BasePath[] strictSubdirPathsArr = CrawlDirUtils.findAllSubdirs(_platform, _outputPath, subdirName);
        Assert.assertEquals(2, strictSubdirPathsArr.length);
    }


        System.exit(-1);
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    private static void processStatus(BasePlatform platform, BasePath curDirPath) throws Exception {
        BasePath statusPath = platform.makePath(curDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
        Tap statusTap = platform.makeTap(platform.makeTextScheme(), statusPath);
       
        TupleEntryIterator iter = statusTap.openForRead(platform.makeFlowProcess());
       
        LOGGER.info("Analyzing: " +  CrawlConfig.STATUS_SUBDIR_NAME);

    @SuppressWarnings({ "rawtypes", "unchecked" })
    private static void processCrawlDb(BixoPlatform platform, BasePath latestCrawlDirPath, boolean exportDb) throws Exception {
        TupleEntryIterator iter;
        int totalEntries;
        BasePath crawlDbPath = platform.makePath(latestCrawlDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        Tap crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
        iter = crawldbTap.openForRead(platform.makeFlowProcess());
        totalEntries = 0;
        int fetchedUrls = 0;
        int unfetchedUrls = 0;
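
The counters are declared, but the snippet stops before the loop that fills them. A hedged sketch of that loop, assuming a non-zero CrawlDbDatum.LAST_FETCHED_FIELD value marks an already-fetched URL (that interpretation is an assumption):

        // Hedged continuation sketch - not the original code:
        while (iter.hasNext()) {
            TupleEntry entry = iter.next();
            totalEntries++;
            if (entry.getLong(CrawlDbDatum.LAST_FETCHED_FIELD) > 0) {
                fetchedUrls++;      // assumed: non-zero last-fetched time means the URL was fetched
            } else {
                unfetchedUrls++;
            }
        }
        iter.close();
        LOGGER.info("Total URLs: " + totalEntries + " (fetched " + fetchedUrls + ", unfetched " + unfetchedUrls + ")");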

        String crawlDirName = options.getWorkingDir();

        try {
            BixoPlatform platform = new BixoPlatform(DemoStatusTool.class, options.getPlatformMode());
            BasePath crawlDirPath = platform.makePath(crawlDirName);

            platform.assertPathExists(crawlDirPath, "Prior crawl output directory does not exist");

            // Skip Hadoop/Cascading DEBUG messages.
            org.apache.log4j.Logger.getRootLogger().setLevel(Level.INFO);

            boolean exportDb = options.isExportDb();
            if (exportDb) {
                BasePath latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(platform, crawlDirPath);
                processCrawlDb(platform, latestCrawlDirPath, exportDb);
            } else {
                int prevLoop = -1;
                BasePath curDirPath = null;
                while ((curDirPath = CrawlDirUtils.findNextLoopDir(platform, crawlDirPath, prevLoop)) != null) {
                    String curDirName = curDirPath.getAbsolutePath();
                    LOGGER.info("");
                    LOGGER.info("================================================================");
                    LOGGER.info("Processing " + curDirName);
                    LOGGER.info("================================================================");

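As shown, this excerpt of the loop body never advances prevLoop; presumably it extracts the loop number from the current directory and then analyzes that directory, for example with CrawlDirUtils.extractLoopNumber and the processStatus helper shown earlier. A hedged sketch of that continuation (not the original code):

                    // Hedged sketch - one plausible continuation of the loop body:
                    prevLoop = CrawlDirUtils.extractLoopNumber(curDirPath);
                    processStatus(platform, curDirPath);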

            logsDir = logsDir + "/";
        }
       
        try {
            BixoPlatform platform = new BixoPlatform(DemoCrawlTool.class, options.getPlatformMode());
            BasePath outputPath = platform.makePath(outputDirName);

            // First check if the user wants to clean the output directory
            if (options.isCleanOutputDir()) {
                if (outputPath.exists()) {
                    outputPath.delete(true);
                }
            }
           
            // If the user is starting from scratch (no prior output), set up the
            // output directory and create an initial urls subdir.
            if (!outputPath.exists()) {
                outputPath.mkdirs();

                // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
                // In the /crawldb dir the input file will have a single URL for the target domain.

                BasePath curLoopDir = CrawlDirUtils.makeLoopDir(platform, outputPath, 0);
                String curLoopDirName = curLoopDir.getName();
                setLoopLoggerFile(logsDir + curLoopDirName, 0);
                BasePath crawlDbPath = platform.makePath(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
               
                if (domain != null) {
                    importOneDomain(platform, domain, crawlDbPath);
                } else {
                    BasePath urlsPath = platform.makePath(urlsFile);
                    UrlImporter urlImporter = new UrlImporter(platform, urlsPath, crawlDbPath);
                    urlImporter.importUrls(false);
                }
            }
           
            BasePath latestDirPath = CrawlDirUtils.findLatestLoopDir(platform, outputPath);

            if (latestDirPath == null) {
                System.err.println("No previous cycle output dirs exist in " + outputDirName);
                printUsageAndExit(parser);
            }

            BasePath crawlDbPath = platform.makePath(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            // Set up the start and end loop counts.
            int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
            int endLoop = startLoop + options.getNumLoops();

            // Set up the UserAgent for the fetcher.
            UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);

            // You also get to customize the FetcherPolicy
            FetcherPolicy defaultPolicy = new FetcherPolicy();
            defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
            defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
            defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);
           
            // It is a good idea to set a crawl duration when running long crawls, as the fetch may
            // slow down due to a 'long tail'; by specifying a crawl duration you know exactly when
            // the crawl will end.
            int crawlDurationInMinutes = options.getCrawlDuration();
            boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
            long targetEndTime = hasEndTime ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE) :
                FetcherPolicy.NO_CRAWL_END_TIME;

            // By setting up a URL filter we only deal with the URLs that we want to process,
            // instead of all the URLs that we extract.
            BaseUrlFilter urlFilter = null;
            List<String> patterns = null;
            String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
            if (regexUrlFiltersFile != null) {
                patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
            } else {
                patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
                if (domain != null) {
                    String domainPatternStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                    patterns.add(domainPatternStr);
                } else {
                    String protocolPatternStr = "+(?i)^(http|https)://*";
                    patterns.add(protocolPatternStr);
                    LOGGER.warn("Defaulting to basic url regex filtering (just suffix and protocol)");
                }
            }
            urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));
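            // Illustrative note (an assumption about the pattern syntax, not stated in this code):
            // these filters appear to follow the Nutch-style convention where a leading '+' includes
            // matching URLs and a leading '-' excludes them, so the domain pattern added above would
            // accept http and https URLs on the target domain and its subdomains.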

            // OK, now we're ready to start looping, since we've got our current
            // settings
            for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

                // Adjust target end time, if appropriate.
                if (hasEndTime) {
                    int remainingLoops = (endLoop - curLoop) + 1;
                    long now = System.currentTimeMillis();
                    long perLoopTime = (targetEndTime - now) / remainingLoops;
                    defaultPolicy.setCrawlEndTime(now + perLoopTime);
                }
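                // Illustrative arithmetic (not from the original source): with 60 minutes left until
                // targetEndTime and 3 remaining loops, perLoopTime is about 20 minutes, so this loop's
                // crawl end time is pushed out to now + 20 minutes; later loops re-divide the remainder.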

                BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, outputPath, curLoop);
                String curLoopDirName = curLoopDirPath.getName();
                setLoopLoggerFile(logsDir+curLoopDirName, curLoop);

                Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
                flow.complete();
               

    public void testOperateWithGroupBy() throws Exception {
       
        BixoPlatform platform = new BixoPlatform(LatestUrlDatumBufferTest.class, Platform.Local);
       
        // Create a temp file with a fetched url
        BasePath workingDirPath = platform.makePath(WORKINGDIR);
        BasePath fetchedDatumsPath = platform.makePath(workingDirPath, "fetched");
        ArrayList<UrlDatum> fetchedDatums = new ArrayList<UrlDatum>();
        UrlDatum fetchedDatum1 = new UrlDatum("http://foo.com");
        fetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 2L);
        fetchedDatums.add(fetchedDatum1);
        createDataFile(platform, fetchedDatumsPath, fetchedDatums);
       
        // And another with unfetched urls
        BasePath unfetchedDatumsPath = platform.makePath(workingDirPath, "unfetched");
        ArrayList<UrlDatum> unfetchedDatums = new ArrayList<UrlDatum>();
        UrlDatum unfetchedDatum1 = new UrlDatum("http://foo.com");
        unfetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        unfetchedDatums.add(unfetchedDatum1);
        UrlDatum unfetchedDatum2 = new UrlDatum("http://foo.com");
        unfetchedDatum2.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        unfetchedDatums.add(unfetchedDatum2);
       
        createDataFile(platform, unfetchedDatumsPath, unfetchedDatums);

       
        // create a workflow
        Tap inputSource1 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), fetchedDatumsPath);
        Pipe fetchedPipe = new Pipe("fetched");
        Tap inputSource2 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), unfetchedDatumsPath);
        Pipe unfetchedPipe = new Pipe("unfetched");

        Map<String, Tap> sources = new HashMap<String, Tap>();
        sources.put(fetchedPipe.getName(), inputSource1);
        sources.put(unfetchedPipe.getName(), inputSource2);

        BasePath resultsPath = platform.makePath(workingDirPath, "results");
        Tap resultSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath, SinkMode.REPLACE);

        Pipe resultsPipe = new GroupBy("results pipe", Pipe.pipes(fetchedPipe, unfetchedPipe),
                        new Fields(UrlDatum.URL_FN));
        resultsPipe = new Every(resultsPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);
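
The snippet stops after the Every; a hedged sketch of how the test would presumably connect and run the flow (the FlowConnector wiring here is an assumption, not the original test code):

        // Hedged sketch - assumes the platform can supply a FlowConnector:
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(sources, resultSink, resultsPipe);
        flow.complete();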

        // Create the output sinks:
        //      crawldb
        //      content
        //      parse
        //      status
        BasePath outCrawlDbPath = platform.makePath(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        Tap loopCrawldbSink = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), outCrawlDbPath, SinkMode.REPLACE);

        BasePath contentDirPath = platform.makePath(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
        Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentDirPath, SinkMode.REPLACE);

        BasePath parseDirPath = platform.makePath(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
        Tap parseSink = platform.makeTap(platform.makeBinaryScheme(ParsedDatum.FIELDS), parseDirPath, SinkMode.REPLACE);

        BasePath statusDirPath = platform.makePath(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
        Tap statusSink = platform.makeTap(platform.makeTextScheme(), statusDirPath, SinkMode.REPLACE);

        // Create the sub-assembly that runs the fetch job
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
        fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
        fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
        fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

        // You can also provide a set of MIME types to restrict which content types you
        // want to deal with - for now keep it simple.
        Set<String> validMimeTypes = new HashSet<String>();
        validMimeTypes.add("text/plain");
        validMimeTypes.add("text/html");
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // The scorer is used by the FetchPipe to assign a score to every URL that passes the
        // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
        // are fetched first. If URLs have to be skipped for any reason, the lower scoring ones are skipped.
        BaseScoreGenerator scorer = new FixedScoreGenerator();

        FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, platform.getNumReduceTasks());
        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);
       
        // Take content and split it into content output plus parse to extract URLs.
        SimpleParser parser;
        if (options.isUseBoilerpipe()) {
            parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
        } else if (options.isGenerateHTML()) {
            parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
        } else {
            parser = new SimpleParser();
        }
       
        parser.setExtractLanguage(false);
        ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

       
        // Create the output map that connects each tail pipe to the appropriate sink, and the
        // list of tail pipes.
        Map<String, Tap> sinkMap = new HashMap<String, Tap>();
        List<Pipe> tailPipes = new ArrayList<Pipe>();
       
        if (options.isGenerateHTML()) {
            // Let's write out the parse as text:
            Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
            textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN), new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true), Fields.REPLACE);
            textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN), new Identity());
            BasePath textParsePath = platform.makePath(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
            Tap textParseTap = platform.makeTap(platform.makeTextScheme(), textParsePath, SinkMode.REPLACE);
            sinkMap.put(textParsePipe.getName(), textParseTap);
            tailPipes.add(textParsePipe);
        }
       
        // Let's output a WritableSequenceFile as an example - this file can
        // then be used as input when working with Mahout.
        // For now we only do it when we are running in Hadoop mode.
        Tap writableSeqFileSink = null;
        Pipe writableSeqFileDataPipe = null;
        if (!options.isLocalPlatformMode()) {
            writableSeqFileDataPipe = new Pipe("writable seqfile data", new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));
            BasePath writableSeqFileDataPath = platform.makePath(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
            WritableSequenceFile writableSeqScheme = new WritableSequenceFile(new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN), Text.class, Text.class);
            writableSeqFileSink = platform.makeTap(writableSeqScheme, writableSeqFileDataPath, SinkMode.REPLACE);
        }
       
        Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
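
After the outlink processing (not shown in this excerpt), the workflow presumably registers the remaining tail pipes with their sinks and connects everything into a single Flow. A hedged sketch of that wiring (the "sources" map and the exact set of tails are assumptions; the crawldb tail that feeds loopCrawldbSink is not visible here):

        // Hedged sketch - not the original workflow code:
        sinkMap.put(statusPipe.getName(), statusSink);
        tailPipes.add(statusPipe);
        sinkMap.put(contentPipe.getName(), contentSink);
        tailPipes.add(contentPipe);

        // Assumes a Map<String, Tap> named "sources" was built earlier in the method.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(sources, sinkMap, tailPipes.toArray(new Pipe[tailPipes.size()]));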

        _outputPath.delete(true);
    }

    @Test
    public void testMakeLoopDir() throws Exception {
        BasePath loopPath = CrawlDirUtils.makeLoopDir(_platform, _outputPath, 3);
        Assert.assertTrue(loopPath.toString().startsWith(_outputPath.toString() + "/3-"));
        Assert.assertTrue(loopPath.exists());
    }
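
The assertion reflects the loop-directory naming convention noted earlier ("0-<timestamp>"): the loop number, a dash, then a timestamp. CrawlDirUtils.extractLoopNumber, used by the crawl tool above, presumably recovers the number from such a path. A hedged sketch of an illustrative companion check (not part of the original suite):

    // Hedged sketch - an illustrative companion test, not from the original suite:
    @Test
    public void testLoopNumberRoundTrip() throws Exception {
        BasePath loopPath = CrawlDirUtils.makeLoopDir(_platform, _outputPath, 5);
        Assert.assertEquals(5, CrawlDirUtils.extractLoopNumber(loopPath));
    }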

    public void testFindLatestLoopDir() throws Exception {
        CrawlDirUtils.makeLoopDir(_platform, _outputPath, 0);
        CrawlDirUtils.makeLoopDir(_platform, _outputPath, 1);
        CrawlDirUtils.makeLoopDir(_platform, _outputPath, 3);
        CrawlDirUtils.makeLoopDir(_platform, _outputPath, 7);
        BasePath expectedPath =
            CrawlDirUtils.makeLoopDir(_platform, _outputPath, 11);
        Assert.assertEquals(expectedPath.toString(),
                            CrawlDirUtils.findLatestLoopDir(_platform, _outputPath).toString());
    }

    }
   
    @Test
    public void testFindNextLoopDir() throws Exception {
        CrawlDirUtils.makeLoopDir(_platform, _outputPath, 0);
        BasePath path1 =
            CrawlDirUtils.makeLoopDir(_platform, _outputPath, 1);
        BasePath path3 =
            CrawlDirUtils.makeLoopDir(_platform, _outputPath, 3);
        BasePath path7 =
            CrawlDirUtils.makeLoopDir(_platform, _outputPath, 7);
        CrawlDirUtils.makeLoopDir(_platform, _outputPath, 11);
        Assert.assertEquals(path1.toString(),
                            CrawlDirUtils.findNextLoopDir(_platform, _outputPath, 0).toString());
        Assert.assertEquals(path3.toString(),
                            CrawlDirUtils.findNextLoopDir(_platform, _outputPath, 1).toString());
        Assert.assertEquals(path7.toString(),
                            CrawlDirUtils.findNextLoopDir(_platform, _outputPath, 4).toString());
    }