Package cascading.tuple

Examples of cascading.tuple.Fields
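A Fields instance names and orders the values in a Tuple, and is used both to declare what a pipe emits and to select arguments from the stream. A minimal sketch of the basics (the field names are arbitrary placeholders):

        import cascading.tuple.Fields;
        import cascading.tuple.Tuple;
        import cascading.tuple.TupleEntry;

        // Declare a two-field layout and bind a tuple of values to it.
        Fields userFields = new Fields("name", "score");
        TupleEntry entry = new TupleEntry(userFields, new Tuple("alice", 0.9));

        // Fields are immutable; append() returns a new, combined declaration.
        Fields withTime = userFields.append(new Fields("fetchTime"));

The snippets below show the same class at work in real pipe assemblies.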


        Tap inputSource = platform.makeTap(platform.makeTextScheme(), crawlDbPath);
        Pipe importPipe = new Pipe("import pipe");
        // Apply a regex to extract the relevant fields
        RegexParser crawlDbParser = new RegexParser(CrawlDbDatum.FIELDS,
                                                        "^(.*?)\t(.*?)\t(.*?)\t(.*?)\t(.*)");
        importPipe = new Each(importPipe, new Fields("line"), crawlDbParser);

        // Split into tuples that are to be fetched and that have already been fetched
        SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

        Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
        Pipe urlsToFetchPipe = splitter.getLHSPipe();

        // Limit to MAX_DISTRIBUTED_FETCH when running on a real cluster,
        // or MAX_LOCAL_FETCH when running locally. So first sort the entries
        // from high to low by link score.
        // TODO add unit test
        urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
        long maxToFetch = isLocal ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
        urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

        BaseScoreGenerator scorer = new LinkScoreGenerator();

        // Create the sub-assembly that runs the fetch job
        int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
        fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
        fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
        fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

        FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, platform.getNumReduceTasks());
        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);

        // Create a parser that returns the raw HTML (cleaned up by Tika) as the parsed content.
        SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
        ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

        Pipe analyzerPipe = new Pipe("analyzer pipe", parsePipe.getTailPipe());
        analyzerPipe = new Each(analyzerPipe, new AnalyzeHtml());
       
        Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
        outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());

        Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
        resultsPipe = new Each(resultsPipe, new CreateResultsFunction());
       
        // Join the finished datums from the crawl DB with the status, analyzer
        // and outlink results, all keyed on URL.
        Pipe updatePipe = new CoGroup("update pipe", Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
                        Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                                        new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)), null, new OuterJoin());
        updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

       
        // output : loop dir specific crawldb
        BasePath outCrawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
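In the assembly above, Fields plays two roles: new Fields("line") is an argument selector that picks which incoming field the RegexParser sees, while CrawlDbDatum.FIELDS declares what the parser emits; Fields.fields(...) simply builds the Fields[] of per-pipe grouping keys that CoGroup expects. A stripped-down sketch of the selector/declaration pattern, with made-up field names:

        // The parser declares the fields it emits; the Each's argument
        // selector routes only the "line" field into the parser.
        Fields parsedFields = new Fields("url", "status", "score");
        RegexParser lineParser = new RegexParser(parsedFields, "^(.*?)\\t(.*?)\\t(.*)$");
        Pipe parsed = new Each(new Pipe("import"), new Fields("line"), lineParser);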


import cascading.tuple.Fields;

public class FieldUtils {

    public static Fields add(Fields fields, String... moreFieldNames) {
        Fields moreFields = new Fields(moreFieldNames);
        return fields.append(moreFields);
    }
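A quick usage sketch for the helper above (the field names are placeholders). Note that Fields.append() fails on a duplicate name, so callers must only add names not already present:

        Fields base = new Fields("url", "status");
        Fields extended = FieldUtils.add(base, "fetchTime", "score");
        // extended now declares "url", "status", "fetchTime", "score"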

    public void setScore(double score) {
        _tupleEntry.setDouble(SCORE_FN, score);
    }

    public static Fields getSortingField() {
        return new Fields(SCORE_FN);
    }

    }

    // ==================================================
   
    public static Fields getGroupingField() {
        return new Fields(GROUPING_KEY_FN);
    }

    public static Fields getGroupingField() {
        return new Fields(GROUPING_KEY_FN);
    }

    public static Fields getSortingField() {
        return new Fields(FETCH_TIME_FN);
    }
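Accessor pairs like the ones above are typically handed straight to a GroupBy, which takes separate Fields for the grouping key and the within-group sort. A sketch of that pattern, where Datum stands in for any of these datum classes and pipe is some upstream Pipe:

        // Group by key, then sort each group by fetch time, newest first.
        Pipe grouped = new GroupBy(pipe,
                Datum.getGroupingField(),   // e.g. new Fields(GROUPING_KEY_FN)
                Datum.getSortingField(),    // e.g. new Fields(FETCH_TIME_FN)
                true);                      // reverse the sort order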

       
        return result;
    }

    public static Fields getParsedTextField() {
        return new Fields(ParsedDatum.PARSED_TEXT_FN);
    }
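Single-field accessors like this also work as argument selectors. For example, a sketch that keeps only the parsed text after the parse step, using Cascading's Identity function (which passes its arguments through unchanged, so the default Fields.RESULTS output drops everything else):

        // Keep only the parsed-text field, discarding all other fields.
        Pipe textOnly = new Each(parsePipe.getTailPipe(),
                getParsedTextField(), new Identity());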

        Pipe skippedStatus = new Pipe("skipped status", new Each(splitter.getLHSPipe(), new MakeSkippedStatus()));
       
        // TODO KKr: the group name is already being set here (so that the tail
        // pipe gets the same name), which means a separate group name can't be
        // passed in for BaseTool.nameFlowSteps to use as the job name.
        Pipe joinedStatus = new GroupBy(STATUS_PIPE_NAME, Pipe.pipes(skippedStatus, fetchedStatus), new Fields(StatusDatum.URL_FN));

        setTails(fetchedContent, joinedStatus);
    }
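Note that a GroupBy over multiple pipes, as above, merges the streams before grouping, so Cascading requires every incoming pipe to declare the same field layout; presumably both the skipped and fetched branches here emit StatusDatum tuples keyed by StatusDatum.URL_FN.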

    public void setGroupKey(String groupKey) {
        _tupleEntry.setString(GROUP_KEY_FN, groupKey);
    }
   
    public static Fields getGroupingField() {
        return new Fields(GROUP_KEY_FN);
    }

        String[] columnNames;
        int numChunks;
        Options options;

        public DBMigrateScheme(int numChunks, String dbDriver, String dbUrl, String username, String pwd,
                String tableName, String pkColumn, String[] columnNames, Options options) {
            super(new Fields(columnNames));
            this.dbDriver = dbDriver;
            this.dbUrl = dbUrl;
            this.username = username;
            this.pwd = pwd;
            this.tableName = tableName;
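One detail worth noting above: new Fields(columnNames) works because the Fields constructor takes a varargs of Comparable names, so an existing String[] can be passed directly. A tiny sketch with hypothetical column names:

        String[] columnNames = { "id", "name", "email" }; // hypothetical columns
        Fields tableFields = new Fields(columnNames);      // one field per column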

    WriteDRMsToSolr(Map<String, String> fields) throws IOException {
        Configuration conf = new JobConf();
        fs = FileSystem.get(conf);
        iDFieldName = fields.get("iD1");
        dRM1FieldName = fields.get("dRM1FieldName");
        inFieldsDRM1 = new Fields(iDFieldName, dRM1FieldName);
        simpleOutFields = new Fields(iDFieldName, dRM1FieldName);
        if (fields.containsKey("dRM2FieldName")) { // joining two DRMs, so define the fields needed for the join
            iD2FieldName = iDFieldName + "2"; // make it unique relative to the other ID field name
            dRM2FieldName = fields.get("dRM2FieldName");
            inFieldsDRM2 = new Fields(iDFieldName, dRM2FieldName);
            common = new Fields(iDFieldName);
            grouped = new Fields(iDFieldName, dRM1FieldName, iD2FieldName, dRM2FieldName);
            joinedOutFields = new Fields(iDFieldName, dRM1FieldName, dRM2FieldName);
        }
    }
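The Fields built in this constructor line up the way a CoGroup would consume them; a sketch under that assumption, with hypothetical pipe names:

        // Join the two DRM pipes on their shared ID field. The declared
        // "grouped" layout renames the right side's ID to avoid a collision.
        Pipe joined = new CoGroup(drm1Pipe, common, drm2Pipe, common, grouped);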
