Examples of cascading.operation.regex.RegexFilter

cascading.operation.regex.RegexFilter
Class RegexFilter will apply the regex patternString against every input Tuple value and filter the Tuple stream accordingly.
By default, Tuples that match the given pattern are kept, and Tuples that do not match are filtered out. This can be changed by setting removeMatch to true.
Also, by default, the whole Tuple is matched against the given patternString (tab delimited). If matchEachElement is set to true, the pattern is applied to each Tuple value individually.
This operation uses java.util.regex.Matcher internally, specifically the method java.util.regex.Matcher#find(), so an unanchored pattern matches if it occurs anywhere in the input. See also: java.util.regex.Matcher, java.util.regex.Pattern.

    Fields outputSelector = new Fields( "uid", "token" );
    tweetPipe = new Each( tweetPipe, new Fields( "text" ), splitter, outputSelector );


    tweetPipe = new Unique( tweetPipe, Fields.ALL );


    RegexFilter filter = new RegexFilter( "^\\S\\S+$" );
    tweetPipe = new Each( tweetPipe, new Fields( "token" ), filter );


    // create PIPEs for left join on the stop words
    Pipe stopPipe = new Pipe( "stop" ); // name branch
    Pipe joinPipe = new HashJoin( tweetPipe, new Fields( "token" ), stopPipe, new Fields( "stop" ), new LeftJoin() );
    joinPipe = new Each( joinPipe, new Fields( "stop" ), new RegexFilter( "^$" ) );


    joinPipe = new Retain( joinPipe, new Fields( "uid", "token" ) );


    /*
    flow part #2

View Full Code Here


    // perform a left join to remove stop words, discarding the rows
    // which joined with stop words, i.e., were non-null after left join
    Pipe stopPipe = new Pipe( "stop" );
    Pipe tokenPipe = new HashJoin( docPipe, token, stopPipe, stop, new LeftJoin() );
    tokenPipe = new Each( tokenPipe, stop, new RegexFilter( "^$" ) );
    tokenPipe = new Retain( tokenPipe, fieldSelector );


    // one branch of the flow tallies the token counts for term frequency (TF)
    Pipe tfPipe = new Pipe( "TF", tokenPipe );
    Fields tf_count = new Fields( "tf_count" );

View Full Code Here

      {
      // split the text line into "url" and "raw" with the default delimiter of tab
      RegexSplitter regexSplitter = new RegexSplitter( new Fields( "url", "raw" ) );
      Pipe importPipe = new Each( name, new Fields( "line" ), regexSplitter );
      // remove all pdf documents from the stream
      importPipe = new Each( importPipe, new Fields( "url" ), new RegexFilter( ".*\\.pdf$", true ) );
      // replace ":nl:" with a new line, return the fields "url" and "page" to the stream.
      // discard the other fields in the stream
      RegexReplace regexReplace = new RegexReplace( new Fields( "page" ), ":nl:", "\n" );
      importPipe = new Each( importPipe, new Fields( "raw" ), regexReplace, new Fields( "url", "page" ) );

View Full Code Here

    Checkpoint tsvCheck = new Checkpoint( "tsv", gisPipe );


    // parse the "park" output
    Pipe parkPipe = new Pipe( "park", tsvCheck );
    regex = "^\\s+Community Type\\:\\s+Park.*$";
    parkPipe = new Each( parkPipe, new Fields( "misc" ), new RegexFilter( regex ) );


    // parse the "tree" output
    Pipe treePipe = new Pipe( "tree", tsvCheck );
    regex = "^\\s+Private\\:\\s+(\\S+)\\s+Tree ID\\:\\s+(\\d+)\\s+.*Situs Number\\:\\s+(\\d+)\\s+Tree Site\\:\\s+(\\d+)\\s+Species\\:\\s+(\\S.*\\S)\\s+Source.*$";
    treePipe = new Each( treePipe, new Fields( "misc" ), new RegexFilter( regex ) );


    Fields treeFields = new Fields( "priv", "tree_id", "situs", "tree_site", "raw_species" );
    int[] treeGroups = { 1, 2, 3, 4, 5 };
    parser = new RegexParser( treeFields, regex, treeGroups );
    treePipe = new Each( treePipe, new Fields( "misc" ), parser, Fields.ALL );


    // scrub "species" as a primary key
    regex = "^([\\w\\s]+).*$";
    int[] speciesGroups = { 1 };
    parser = new RegexParser( new Fields( "scrub_species" ), regex, speciesGroups );
    treePipe = new Each( treePipe, new Fields( "raw_species" ), parser, Fields.ALL );
    String expression = "scrub_species.trim().toLowerCase()";
    ExpressionFunction exprFunc = new ExpressionFunction( new Fields( "tree_species" ), expression, String.class );
    treePipe = new Each( treePipe, new Fields( "scrub_species" ), exprFunc, Fields.ALL );


    // join with tree metadata
    Pipe metaTreePipe = new Pipe( "meta_tree" );
    treePipe = new HashJoin( treePipe, new Fields( "tree_species" ), metaTreePipe, new Fields( "species" ), new InnerJoin() );
    treePipe = new Rename( treePipe, new Fields( "blurb" ), new Fields( "tree_name" ) );


    regex = "^(\\S+),(\\S+),(\\S+)\\s*$";
    int[] gpsGroups = { 1, 2, 3 };
    parser = new RegexParser( new Fields( "tree_lat", "tree_lng", "tree_alt" ), regex, gpsGroups );
    treePipe = new Each( treePipe, new Fields( "geo" ), parser, Fields.ALL );


    // determine a tree geohash
    Fields geohashArguments = new Fields( "tree_lat", "tree_lng" );
    treePipe = new Each( treePipe, geohashArguments, new GeoHashFunction( new Fields( "tree_geohash" ), 6 ), Fields.ALL );


    Fields fieldSelector = new Fields( "tree_name", "priv", "tree_id", "situs", "tree_site", "species", "wikipedia", "calflora", "min_height", "max_height", "tree_lat", "tree_lng", "tree_alt", "tree_geohash" );
    treePipe = new Retain( treePipe, fieldSelector );


    // parse the "road" output
    Pipe roadPipe = new Pipe( "road", tsvCheck );
    regex = "^\\s+Sequence\\:.*\\s+Year Constructed\\:\\s+(\\d+)\\s+Traffic Count\\:\\s+(\\d+)\\s+Traffic Index\\:\\s+(\\w.*\\w)\\s+Traffic Class\\:\\s+(\\w.*\\w)\\s+Traffic Date.*\\s+Paving Length\\:\\s+(\\d+)\\s+Paving Width\\:\\s+(\\d+)\\s+Paving Area\\:\\s+(\\d+)\\s+Surface Type\\:\\s+(\\w.*\\w)\\s+Surface Thickness.*\\s+Bike Lane\\:\\s+(\\w+)\\s+Bus Route\\:\\s+(\\w+)\\s+Truck Route\\:\\s+(\\w+)\\s+Remediation.*$";
    roadPipe = new Each( roadPipe, new Fields( "misc" ), new RegexFilter( regex ) );
    Fields roadFields = new Fields( "year_construct", "traffic_count", "traffic_index", "traffic_class", "paving_length", "paving_width", "paving_area", "surface_type", "bike_lane", "bus_route", "truck_route" );
    int[] roadGroups = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
    parser = new RegexParser( roadFields, regex, roadGroups );
    roadPipe = new Each( roadPipe, new Fields( "misc" ), parser, Fields.ALL );

View Full Code Here


    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );


    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    pipeUpper = new Each( pipeUpper, new Fields( "num" ), new RegexFilter( "^fobar" ) ); // intentionally filtering all
    pipeUpper = new GroupBy( pipeUpper, new Fields( "num" ) );


    Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ), new OuterJoin() );


    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

View Full Code Here

    Tap source = getPlatform().getTextFile( inputFileApache );
    Tap sink = getPlatform().getTextFile( getOutputPath( "filter" ), SinkMode.REPLACE );


    Pipe pipe = new Pipe( "test" );


    Filter filter = new RegexFilter( "^68.*" );


    pipe = new Each( pipe, new Fields( "line" ), filter );


    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );

View Full Code Here

    Tap source = getPlatform().getTextFile( inputFileApache );
    Tap sink = getPlatform().getTextFile( getOutputPath( "logicfilter" ), SinkMode.REPLACE );


    Pipe pipe = new Pipe( "test" );


    Filter filter = new And( new RegexFilter( "^68.*$" ), new RegexFilter( "^1000.*$" ) );


    pipe = new Each( pipe, new Fields( "line" ), filter );


    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );

View Full Code Here


    Pipe pipe = new Pipe( "test" );


    pipe = new Each( pipe, new Fields( "line" ), TestConstants.APACHE_COMMON_PARSER );


    pipe = new Each( pipe, new Fields( "method" ), new RegexFilter( "^POST" ) );
    pipe = new Each( pipe, new Fields( "method" ), new RegexFilter( "^POST" ) );


    pipe = new Each( pipe, new Fields( "method" ), new Identity( new Fields( "value" ) ), Fields.ALL );


    pipe = new GroupBy( pipe, new Fields( "value" ) );

View Full Code Here

    Fields fieldDeclaration = new Fields( "ip", "time", "method", "event", "status", "size" );
    int[] groups = {1, 2, 3, 4, 5, 6};
    RegexParser function = new RegexParser( fieldDeclaration, regex, groups );
    pipe = new Each( pipe, new Fields( "line" ), function );


    pipe = new Each( pipe, new Fields( "method" ), new RegexFilter( "^fobar" ) ); // intentionally filtering all


    pipe = new GroupBy( pipe, new Fields( "method" ) );


    pipe = new Each( pipe, new Fields( "method" ), new Identity( new Fields( "value" ) ), Fields.ALL );

View Full Code Here

    Tap sink1 = getPlatform().getTextFile( getOutputPath( "split1" ), SinkMode.REPLACE );
    Tap sink2 = getPlatform().getTextFile( getOutputPath( "split2" ), SinkMode.REPLACE );


    Pipe pipe = new Pipe( "split" );


    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );


    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );


    Map sources = new HashMap();
    sources.put( "split", source );


    Map sinks = new HashMap();

View Full Code Here

0 1 2 3 4 5

TOP

Related Classes of cascading.operation.regex.RegexFilter

cascading.assembly.CrossTab

cascading.BasicPipesPlatformTest

cascading.CoGroupFieldedPipesPlatformTest

cascading.DistanceUseCasePlatformTest

cascading.FieldedPipesPlatformTest

cascading.flow.hadoop.BuildJobsHadoopPlatformTest

cascading.flow.iso.graph.LoneGroupAssertionGraph

cascading.function.FunctionPlatformTest

cascading.JoinFieldedPipesPlatformTest

cascading.MergePipesPlatformTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.