Examples of cascading.tap.hadoop.Hfs

cascading.tap.hadoop.Hfs
Where {@code hdfs://...} will denote Dfs, and {@code file://...} will denote Lfs.
Call {@link #setTemporaryDirectory(java.util.Map,String)} to use a different temporary file directory path other than the current Hadoop default path.
By default Cascading on Hadoop will assume any source or sink Tap using the {@code file://} URI scheme intends to read files from the local client filesystem (for example when using the {@code Lfs} Tap) where the Hadoop job jar is started, and so will force any MapReduce jobs reading or writing to {@code file://} resources to run in Hadoop "standalone mode" so that the file can be read.
To change this behavior, call {@link HfsProps#setLocalModeScheme(java.util.Map,String)} to set a different scheme value, or to "none" to disable it entirely for the case where the file to be read is available on every Hadoop processing node at the exact same path.
Hfs can optionally combine multiple small files (or a series of small "blocks") into larger "splits". This reduces the number of resulting map tasks created by Hadoop and can improve application performance.
This is enabled by calling {@link HfsProps#setUseCombinedInput(boolean)} with {@code true}. By default, merging or combining splits into larger ones is disabled.

        CascadeConnector cascadeConnector = new CascadeConnector(cfg);
        cascadeConnector.connect(flows).complete();
    }


    private Tap sourceTap() {
        return new Hfs(new TextDelimited(new Fields("id", "name", "url", "picture", "ts")), INPUT);
    }

View Full Code Here

        props.put(ConfigurationOptions.ES_INPUT_JSON, "true");
        return props;
    }


    private Tap sourceTap() {
        return new Hfs(new TextDelimited(new Fields("line")), INPUT);
    }

View Full Code Here

        groupByItemIDPipe.getStepConfigDef().setProperty("itemIndexPath", itemIndexPath.toString());
        // for these matrices the group by key is the id from the Mahout row key
        groupByItemIDPipe.getStepConfigDef().setProperty("rowIndexPath", iDIndexPath.toString());
        groupByItemIDPipe.getStepConfigDef().setProperty("joining", "true");


        Tap groupedOutputSink = new Hfs(new TextDelimited(true,","), groupedCSVOutputPath.toString());


        FlowDef flowDef = new FlowDef()
            .setName("group-DRMs-by-key")
            .addSource(lhs, dRM1Source)
            .addSource(rhs, dRM2Source)

View Full Code Here

        //pass these to the output function so the strings from the indexes can be written instead of the
        //binary values of the Keys and Vectors in the DRMs
        dRM1.getStepConfigDef().setProperty("itemIndexPath", itemIndexPath.toString());
        dRM1.getStepConfigDef().setProperty("rowIndexPath", iDIndexPath.toString());
        dRM1.getStepConfigDef().setProperty("joining", "false");
        Tap outputSink = new Hfs(new TextDelimited(true,","), cSVOutputPath.toString());


        FlowDef flowDef = new FlowDef()
            .setName("convert-to-CSV")
            .addSource(dRM1, dRM1Source)
            .addTailSink(dRM1, outputSink);

View Full Code Here

                //if(s.getPath().toString().contains("part-")){//found a part-xxxxx file
                if(s.getPath().getName().matches("^part.*")){//found a part-xxxxx file
                    Path filePath = new Path(s.getPath().toString());
                    //Tap t = new Hfs( inFields, filePath.toString());
                    //Tap t = new Hfs(new TextLine(), filePath.toString(), true);
                    Tap t = new Hfs( new WritableSequenceFile( f, LongWritable.class, VectorWritable.class ), filePath.toString() );
                    if( s.getLen() != 0 ){// then part file is not empty
                        all.add(t);
                    }
                }
            }

View Full Code Here

    }
    
    @Override
    protected Tap<?, ?, ?> makeSolrSink(Fields fields, String path) throws Exception {
        Scheme scheme = new SolrScheme(fields, SOLR_CORE_DIR);
        return new Hfs(scheme, path, SinkMode.REPLACE);
    }

View Full Code Here

        return new Hfs(scheme, path, SinkMode.REPLACE);
    }
    
    @Override
    protected Tap<?, ?, ?> makeSourceTap(Fields fields, String path) {
        return new Hfs(new SequenceFile(fields), path, SinkMode.REPLACE);
    }

View Full Code Here

    Tap tweetTap = makeTap( tweetPath, new TextDelimited( true, "\t" ) );


    Tap stopTap = makeTap( stopWords, new TextDelimited( new Fields( "stop" ), true, "\t" ) );


    // create SINK taps, replacing previous output if needed
    Tap tokenTap = new Hfs( new TextDelimited( true, "\t" ), tokenPath, SinkMode.REPLACE );
    Tap similarityTap = new Hfs( new TextDelimited( true, "\t" ), similarityPath, SinkMode.REPLACE );


    /*
    flow part #1
    generate a bipartite map of (uid, token), while filtering out stop-words
    */

View Full Code Here

    similarityFlow.complete();
    }


  public static Tap makeTap( String path, Scheme scheme )
    {
    return path.matches( "^[^:]+://.*" ) ? new Hfs( scheme, path ) : new Lfs( scheme, path );
    }

View Full Code Here

    Properties properties = new Properties();
    AppProps.setApplicationJarClass( properties, Main.class );
    HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );


    // create source and sink taps
    Tap docTap = new Hfs( new TextDelimited( true, "\t" ), docPath );
    Tap wcTap = new Hfs( new TextDelimited( true, "\t" ), wcPath );


    Fields stop = new Fields( "stop" );
    Tap stopTap = new Hfs( new TextDelimited( stop, true, "\t" ), stopPath );
    Tap tfidfTap = new Hfs( new TextDelimited( true, "\t" ), tfidfPath );


    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields( "token" );
    Fields text = new Fields( "text" );
    RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of cascading.tap.hadoop.Hfs

cascading.flow.hadoop.BuildJobsHadoopPlatformTest

cascading.flow.hadoop.FlowPlatformTest

cascading.flow.hadoop.MapReduceFlow

cascading.flow.hadoop.MapReduceFlowPlatformTest

cascading.flow.hadoop.planner.rule.scopeexpression.EquivalentTapsScopeExpression

cascading.flow.hadoop.util.HadoopMRUtil

cascading.flow.hadoop.util.HadoopUtil

cascading.lingual.platform.hadoop.HadoopDefaultFactory

cascading.lingual.platform.hadoop.HadoopPlatformBroker

cascading.lingual.platform.hadoop2.Hadoop2MR1DefaultFactory

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.