Package cascading.flow.hadoop

Source Code of cascading.flow.hadoop.BuildJobsHadoopPlatformTest

/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cascading.flow.hadoop;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import cascading.PlatformTestCase;
import cascading.TestBuffer;
import cascading.TestFunction;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.FlowConnectorProps;
import cascading.flow.FlowElement;
import cascading.flow.FlowException;
import cascading.flow.FlowStep;
import cascading.flow.planner.BaseFlowStep;
import cascading.flow.planner.PlannerException;
import cascading.flow.planner.Scope;
import cascading.flow.planner.graph.ElementGraph;
import cascading.operation.AssertionLevel;
import cascading.operation.Function;
import cascading.operation.Identity;
import cascading.operation.aggregator.Count;
import cascading.operation.aggregator.First;
import cascading.operation.aggregator.MaxValue;
import cascading.operation.aggregator.Sum;
import cascading.operation.assertion.AssertNotNull;
import cascading.operation.assertion.AssertNull;
import cascading.operation.expression.ExpressionFilter;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexParser;
import cascading.operation.regex.RegexSplitter;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.HashJoin;
import cascading.pipe.Pipe;
import cascading.pipe.joiner.InnerJoin;
import cascading.scheme.Scheme;
import cascading.scheme.hadoop.SequenceFile;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tap.hadoop.util.TempHfs;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import org.jgrapht.alg.DijkstraShortestPath;
import org.junit.Ignore;
import org.junit.Test;

public class BuildJobsHadoopPlatformTest extends PlatformTestCase
  {
  public BuildJobsHadoopPlatformTest()
    {
    super( false );
    }

  /**
   * Test a single piece Pipe, should not fail, inserts Identity pipe
   *
   * @throws IOException
   */
  @Test
  public void testIdentity() throws Exception
    {
    Tap source = new Hfs( new TextLine(), "input/path" );
    Tap sink = new Hfs( new TextLine(), "output/path", true );

    Pipe pipe = new Pipe( "test" );

    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "wrong size", 1, steps.size() );

    HadoopFlowStep step = (HadoopFlowStep) steps.get( 0 );

    assertEquals( "not equal: step.sources.size()", 1, step.getSourceTaps().size() );
    assertNull( "not null: step.groupBy", step.getGroup() );
    assertNotNull( "null: step.sink", step.getSink() );
    }

  @Test
  public void testName()
    {
    Pipe count = new Pipe( "count" );
    Pipe pipe = new GroupBy( count, new Fields( 1 ) );
    pipe = new Every( pipe, new Fields( 1 ), new Count(), new Fields( 0, 1 ) );

    assertEquals( "not equal: count.getName()", "count", count.getName() );
    assertEquals( "not equal: pipe.getName()", "count", pipe.getName() );

    pipe = new Each( count, new Fields( 1 ), new RegexSplitter( Fields.size( 2 ) ) );
    assertEquals( "not equal: pipe.getName()", "count", pipe.getName() );
    }

  @Test
  public void testOneJob() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    sources.put( "count", new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path" ) );
    sinks.put( "count", new Hfs( new TextLine( new Fields( 0, 1 ) ), "output/path" ) );

    Pipe pipe = new Pipe( "count" );
    pipe = new GroupBy( pipe, new Fields( 1 ) );
    pipe = new Every( pipe, new Fields( 1 ), new Count(), new Fields( 0, 1 ) );

    List steps = getPlatform().getFlowConnector().connect( sources, sinks, pipe ).getFlowSteps();

    assertEquals( "wrong size", 1, steps.size() );

    BaseFlowStep step = (BaseFlowStep) steps.get( 0 );

    assertEquals( "not equal: step.sources.size()", 1, step.getSourceTaps().size() );
    assertNotNull( "null: step.groupBy", step.getGroup() );
    assertNotNull( "null: step.sink", step.getSink() );

    int mapDist = countDistance( step.getElementGraph(), (FlowElement) step.getSourceTaps().iterator().next(), step.getGroup() );
    assertEquals( "not equal: mapDist", 0, mapDist );

    int reduceDist = countDistance( step.getElementGraph(), step.getGroup(), step.getSink() );
    assertEquals( "not equal: reduceDist", 1, reduceDist );
    }

  @Test
  public void testOneJob2() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    sources.put( "count", new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path" ) );
    sinks.put( "count", new Hfs( new TextLine( new Fields( 0, 1 ) ), "output/path" ) );

    Pipe pipe = new Pipe( "count" );
    pipe = new Each( pipe, new Fields( 1 ), new Identity(), new Fields( 2 ) ); // in:second out:all
    pipe = new Each( pipe, new Fields( 0 ), new Identity( new Fields( "_all" ) ), new Fields( 1 ) ); // in:all out:_all
    pipe = new GroupBy( pipe, new Fields( 0 ) ); // in:_all out:_all
    pipe = new Every( pipe, new Fields( 0 ), new Count(), new Fields( 0, 1 ) ); // in:_all out:_all,count

    List steps = getPlatform().getFlowConnector().connect( sources, sinks, pipe ).getFlowSteps();

    assertEquals( "wrong size", 1, steps.size() );

    BaseFlowStep step = (BaseFlowStep) steps.get( 0 );

    assertEquals( "not equal: step.sources.size()", 1, step.getSourceTaps().size() );
    assertNotNull( "null: step.groupBy", step.getGroup() );
    assertNotNull( "null: step.sink", step.getSink() );

    int mapDist = countDistance( step.getElementGraph(), (FlowElement) step.getSourceTaps().iterator().next(), step.getGroup() );
    assertEquals( "not equal: mapDist", 2, mapDist );

    int reduceDist = countDistance( step.getElementGraph(), step.getGroup(), step.getSink() );
    assertEquals( "not equal: reduceDist", 1, reduceDist );
    }

  @Test
  public void testOneJob3() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    sources.put( "a", new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" ) );
    sources.put( "b", new Hfs( new TextLine( new Fields( "third", "fourth" ) ), "input/path/b" ) );

    Pipe pipeA = new Pipe( "a" );
    Pipe pipeB = new Pipe( "b" );

    Pipe splice = new CoGroup( pipeA, new Fields( 1 ), pipeB, new Fields( 1 ) );

    sinks.put( splice.getName(), new Hfs( new TextLine( new Fields( 0, 1 ) ), "output/path" ) );

    List steps = getPlatform().getFlowConnector().connect( sources, sinks, splice ).getFlowSteps();

    assertEquals( "wrong size", 1, steps.size() );

    BaseFlowStep step = (BaseFlowStep) steps.get( 0 );

    assertEquals( "not equal: step.sources.size()", 2, step.getSourceTaps().size() );
    assertNotNull( "null: step.groupBy", step.getGroup() );
    assertNotNull( "null: step.sink", step.getSink() );

    Iterator<Tap> iterator = step.getSourceTaps().iterator();
    int mapDist = countDistance( step.getElementGraph(), iterator.next(), step.getGroup() );
    assertEquals( "not equal: mapDist", 0, mapDist );
    mapDist = countDistance( step.getElementGraph(), iterator.next(), step.getGroup() );
    assertEquals( "not equal: mapDist", 0, mapDist );

    int reduceDist = countDistance( step.getElementGraph(), step.getGroup(), step.getSink() );
    assertEquals( "not equal: reduceDist", 0, reduceDist );
    }

  @Test
  public void testOneJob4() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    sources.put( "a", new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" ) );
    sources.put( "b", new Hfs( new TextLine( new Fields( "third", "fourth" ) ), "input/path/b" ) );

    Pipe pipeA = new Pipe( "a" );
    Pipe pipeB = new Pipe( "b" );

    Pipe cogroup = new CoGroup( pipeA, new Fields( 1 ), pipeB, new Fields( 1 ) );

    cogroup = new Each( cogroup, new Identity() );

    sinks.put( cogroup.getName(), new Hfs( new TextLine( new Fields( 0, 1 ) ), "output/path" ) );

    List steps = getPlatform().getFlowConnector().connect( sources, sinks, cogroup ).getFlowSteps();

    assertEquals( "wrong size", 1, steps.size() );

    BaseFlowStep step = (BaseFlowStep) steps.get( 0 );

    assertEquals( "not equal: step.sources.size()", 2, step.getSourceTaps().size() );
    assertNotNull( "null: step.groupBy", step.getGroup() );
    assertNotNull( "null: step.sink", step.getSink() );

    int mapDist = countDistance( step.getElementGraph(), (FlowElement) step.getSourceTaps().iterator().next(), step.getGroup() );
    assertEquals( "not equal: mapDist", 0, mapDist );

    int reduceDist = countDistance( step.getElementGraph(), step.getGroup(), step.getSink() );
    assertEquals( "not equal: reduceDist", 1, reduceDist );
    }

  @Test
  public void testOneJob5() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    sources.put( "a", new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" ) );
    sources.put( "b", new Hfs( new TextLine( new Fields( "third", "fourth" ) ), "input/path/b" ) );

    Pipe pipeA = new Pipe( "a" );
    Pipe pipeB = new Pipe( "b" );

    Pipe splice = new CoGroup( pipeA, pipeB );

    splice = new Each( splice, new Identity() );

    sinks.put( splice.getName(), new Hfs( new TextLine(), "output/path" ) );

    List steps = getPlatform().getFlowConnector().connect( sources, sinks, splice ).getFlowSteps();

    assertEquals( "wrong size", 1, steps.size() );

    BaseFlowStep step = (BaseFlowStep) steps.get( 0 );

    assertEquals( "not equal: step.sources.size()", 2, step.getSourceTaps().size() );
    assertNotNull( "null: step.groupBy", step.getGroup() );
    assertNotNull( "null: step.sink", step.getSink() );

    int mapDist = countDistance( step.getElementGraph(), (FlowElement) step.getSourceTaps().iterator().next(), step.getGroup() );
    assertEquals( "not equal: mapDist", 0, mapDist );

    int reduceDist = countDistance( step.getElementGraph(), step.getGroup(), step.getSink() );
    assertEquals( "not equal: reduceDist", 1, reduceDist );
    }

  @Test
  public void testNoGroup() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    sources.put( "count", new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path" ) );
    sinks.put( "count", new Hfs( new TextLine( new Fields( 0, 1 ) ), "output/path" ) );

    Pipe pipe = new Pipe( "count" );
    pipe = new Each( pipe, new Identity() );
    pipe = new Every( pipe, new Fields( 1 ), new Count(), new Fields( 0, 1 ) );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, pipe );
      fail( "did not throw flow exception" );
      }
    catch( Exception exception )
      {
      // ignore
//      exception.printStackTrace();
      }
    }

  /** This should result in only two steps, one for each side */
  @Test
  public void testSplit()
    {
    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
    Tap sink1 = new Hfs( new TextLine(), "foo/split1", true );
    Tap sink2 = new Hfs( new TextLine(), "foo/split2", true );

    Pipe pipe = new Pipe( "split" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );

    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

    Map sources = new HashMap();
    sources.put( "split", source );

    Map sinks = new HashMap();
    sinks.put( "left", sink1 );
    sinks.put( "right", sink2 );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, right );

//    flow.writeDOT( "safesplit.dot" );

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 2, steps.size() );
    }

  /** this test verifies that the planner recognizes there are fewer tails than sinks. */
  @Test
  public void testSplitHangingTails()
    {
    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
    Tap sink1 = new Hfs( new TextLine(), "foo/split1", true );
    Tap sink2 = new Hfs( new TextLine(), "foo/split2", true );

    Pipe pipe = new Pipe( "split" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );


    // this is ok
    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

    Map sources = new HashMap();
    sources.put( "split", source );

    Map sinks = new HashMap();
    sinks.put( "left", sink1 );
    sinks.put( "right", sink2 );

    try
      {
      getPlatform().getFlowConnector().connect( sources, sinks, pipe );
      fail( "did not catch missing tails" );
      }
    catch( Exception exception )
      {
      System.out.println( "exception.getMessage() = " + exception.getMessage() );
      assertTrue( exception.getMessage().contains( "'left', 'right'" ) );
      }
    }

  @Test
  public void testSplitOnNonSafeOperations()
    {
    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
    Tap sink1 = new Hfs( new TextLine(), "foo/split1", true );
    Tap sink2 = new Hfs( new TextLine(), "foo/split2", true );

    Pipe pipe = new Pipe( "split" );

    // this operation is not safe
    pipe = new Each( pipe, new Fields( "line" ), new TestFunction( new Fields( "ignore" ), new Tuple( 1 ), false ), new Fields( "line" ) );

    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );

    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

    Map sources = new HashMap();
    sources.put( "split", source );

    Map sinks = new HashMap();
    sinks.put( "left", sink1 );
    sinks.put( "right", sink2 );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, right );

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 3, steps.size() );

    FlowStep step = steps.get( 0 );

    assertEquals( "wrong number of operations", 2, ( (BaseFlowStep) step ).getAllOperations().size() );
    }

  @Test
  public void testSplitOnNonSafeOperationsSimple()
    {
    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
    Tap sink1 = new Hfs( new TextLine(), "foo/split1", true );
    Tap sink2 = new Hfs( new TextLine(), "foo/split2", true );

    Pipe pipe = new Pipe( "split" );

    // this operation is not safe
    pipe = new Each( pipe, new Fields( "line" ), new TestFunction( new Fields( "ignore" ), new Tuple( 1 ), false ), new Fields( "line" ) );

    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

    Map sources = new HashMap();
    sources.put( "split", source );

    Map sinks = new HashMap();
    sinks.put( "left", sink1 );
    sinks.put( "right", sink2 );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, right );

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 3, steps.size() );

    FlowStep step = steps.get( 0 );

    assertEquals( "wrong number of operations", 1, ( (BaseFlowStep) step ).getAllOperations().size() );
    }

  // verify unsafe splits happen when splitting on a pipe
  @Test
  public void testSplitOnNonSafeOperations2()
    {
    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
    Tap sink1 = new Hfs( new TextLine(), "foo/split1", true );
    Tap sink2 = new Hfs( new TextLine(), "foo/split2", true );
    Tap sink3 = new Hfs( new TextLine(), "foo/split3", true );

    Pipe pipe = new Pipe( "split" );

    // this operation is not safe
    pipe = new Each( pipe, new Fields( "line" ), new TestFunction( new Fields( "ignore" ), new Tuple( 1 ), false ), new Fields( "line" ) );

    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );

    pipe = new Pipe( "middle", pipe );

    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

    Map sources = new HashMap();
    sources.put( "split", source );

    Map sinks = new HashMap();
    sinks.put( "left", sink1 );
    sinks.put( "right", sink2 );
    sinks.put( "middle", sink3 );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, right );

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 4, steps.size() );

    FlowStep step = steps.get( 0 );

    assertEquals( "wrong number of operations", 2, ( (BaseFlowStep) step ).getAllOperations().size() );
    }

  /**
   * This should result in a Temp Tap after the Each split.
   * <p/>
   * We previously would push the each to the next step, but if there is already data being written, save the cpu.
   */
  @Test
  public void testSplitComplex()
    {
    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
    Tap sink1 = new Hfs( new TextLine(), "foo/split1", true );
    Tap sink2 = new Hfs( new TextLine(), "foo/split2", true );

    Pipe pipe = new Pipe( "split" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    pipe = new Every( pipe, new Fields( "ip" ), new Count(), new Fields( "ip", "count" ) );

    pipe = new Each( pipe, new Fields( "ip" ), new RegexFilter( "^68.*" ) );

    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "ip" ), new RegexFilter( ".*46.*" ) );

    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "ip" ), new RegexFilter( ".*192.*" ) );

    Map sources = new HashMap();
    sources.put( "split", source );

    Map sinks = new HashMap();
    sinks.put( "left", sink1 );
    sinks.put( "right", sink2 );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, right );

//    flow.writeDOT( "splitcomplex.dot" );

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 3, steps.size() );

    BaseFlowStep step = (BaseFlowStep) steps.get( 0 );

    Scope nextScope = step.getNextScope( step.getGroup() );
    FlowElement operator = step.getNextFlowElement( nextScope );

    assertTrue( "not an Every", operator instanceof Every );

    nextScope = step.getNextScope( operator );
    operator = step.getNextFlowElement( nextScope );

    assertTrue( "not a Each", operator instanceof Each );

    nextScope = step.getNextScope( operator );
    operator = step.getNextFlowElement( nextScope );

    assertTrue( "not a TempHfs", operator instanceof TempHfs );
    }

  /** same as splitComplex, except pipe/branch naming is after the Each, not before */
  @Test
  public void testSplitComplex2()
    {
    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
    Tap sink1 = new Hfs( new TextLine(), "foo/split1", true );
    Tap sink2 = new Hfs( new TextLine(), "foo/split2", true );

    Pipe pipe = new Pipe( "split" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    pipe = new Every( pipe, new Fields( "ip" ), new Count(), new Fields( "ip", "count" ) );

    pipe = new Each( pipe, new Fields( "ip" ), new RegexFilter( "^68.*" ) );

    Pipe left = new Each( pipe, new Fields( "ip" ), new RegexFilter( ".*46.*" ) );

    left = new Pipe( "left", left );

    Pipe right = new Each( pipe, new Fields( "ip" ), new RegexFilter( ".*192.*" ) );

    right = new Pipe( "right", right );

    Map sources = new HashMap();
    sources.put( "split", source );

    Map sinks = new HashMap();
    sinks.put( "left", sink1 );
    sinks.put( "right", sink2 );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, right );

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 3, steps.size() );

    BaseFlowStep step = (BaseFlowStep) steps.get( 0 );

    Scope nextScope = step.getNextScope( step.getGroup() );
    FlowElement operator = step.getNextFlowElement( nextScope );

    assertTrue( "not an Every", operator instanceof Every );

    nextScope = step.getNextScope( operator );
    operator = step.getNextFlowElement( nextScope );

    assertTrue( "not a Each", operator instanceof Each );

    nextScope = step.getNextScope( operator );
    operator = step.getNextFlowElement( nextScope );

    assertTrue( "not a TempHfs", operator instanceof TempHfs );
    }

  @Test
  public void testMerge()
    {
    Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge1" );
    Tap source2 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge2" );

    Tap sink = new Hfs( new TextLine(), "foo" );

    Pipe left = new Each( new Pipe( "left" ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right" ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

    Pipe merge = new GroupBy( "merge", Pipe.pipes( left, right ), new Fields( "offset" ) );

    Map sources = new HashMap();
    sources.put( "left", source1 );
    sources.put( "right", source2 );

    Map sinks = new HashMap();
    sinks.put( "merge", sink );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 1, steps.size() );
    }

  @Test
  public void testDupeSource()
    {
    Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
    Tap source2 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );

    Tap sink = new Hfs( new TextLine(), "foo" );

    Pipe left = new Each( new Pipe( "left" ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right" ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );
    right = new Each( right, new Fields( "line" ), new RegexFilter( ".*192.*" ) );
    right = new Each( right, new Fields( "line" ), new RegexFilter( ".*192.*" ) );
    right = new Each( right, new Fields( "line" ), new RegexFilter( ".*192.*" ) );

    Pipe merge = new GroupBy( "merge", Pipe.pipes( left, right ), new Fields( "offset" ) );

    Map sources = new HashMap();
    sources.put( "left", source1 );
    sources.put( "right", source2 );

    Map sinks = new HashMap();
    sinks.put( "merge", sink );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 1, steps.size() );
    }

  @Test
  public void testDupeSourceRepeat()
    {
    Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );

    Tap sink = new Hfs( new TextLine(), "foo" );

    Pipe pipe = new Pipe( "pipe" );

    Pipe merge = new CoGroup( "cogroup", pipe, new Fields( "offset" ), 1, Fields.size( 4 ) );

    Map sources = new HashMap();
    sources.put( "pipe", source1 );

    Map sinks = new HashMap();
    sinks.put( "cogroup", sink );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 1, steps.size() );
    }

  @Test
  public void testDupeSource2()
    {
    Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );

    Tap sink = new Hfs( new TextLine(), "foo" );

    Pipe left = new Pipe( "left" );
    Pipe right = new Pipe( "right" );

    Pipe merge = new CoGroup( "cogroup", left, new Fields( "offset" ), right, new Fields( "offset" ), Fields.size( 4 ) );

    Map sources = new HashMap();
    sources.put( "left", source1 );
    sources.put( "right", source1 );

    Map sinks = new HashMap();
    sinks.put( "cogroup", sink );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );
//    flow.writeDOT( "dupesource.dot" );
      fail( "did not throw planner exception" );
      }
    catch( Exception exception )
      {

      }
    }

  @Test
  public void testDupeSource3()
    {
    Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
    Tap source2 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar/merge" );

    Tap sink = new Hfs( new TextLine(), "foo" );

    Pipe left = new Pipe( "left" );
    Pipe middle = new Pipe( "middle" );
    Pipe right = new Pipe( "right" );

    Pipe[] pipes = Pipe.pipes( left, middle, right );
    Fields[] fields = Fields.fields( new Fields( "offset" ), new Fields( "offset" ), new Fields( "offset" ) );

    Pipe merge = new CoGroup( "cogroup", pipes, fields, Fields.size( 6 ) );

    Map sources = new HashMap();
    sources.put( "left", source1 );
    sources.put( "middle", source2 );
    sources.put( "right", source1 );

    Map sinks = new HashMap();
    sinks.put( "cogroup", sink );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );
      fail( "did not throw planner exception" );
      }
    catch( PlannerException exception )
      {
//      exception.printStackTrace();
      }
    }

//  public void testEquivalentPaths()
//    {
//    Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
//    Tap source2 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
//
//    Tap sink = new Hfs( new TextLine(), "foo" );
//
//    Pipe left = new Each( new Pipe( "left" ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
//    Pipe right = new Each( new Pipe( "right" ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
//
//    Pipe join = new CoGroup( "cogroup", left, new Fields( "offset" ), right, new Fields( "offset" ), Fields.size( 4 ) );
//
//    Map sources = new HashMap();
//    sources.put( "left", source1 );
//    sources.put( "right", source2 );
//
//    Map sinks = new HashMap();
//    sinks.put( "cogroup", sink );
//
//    Flow flow = new FlowConnector().connect( sources, sinks, join );
//    flow.writeDOT( "identicalpaths.dot" );
//
//    List<FlowStep> steps = flow.getSteps();
//
//    assertEquals( "not equal: steps.size()", 1, steps.size() );
//
//    FlowStep step = steps.get( 0 );
//    System.out.println( "size: " + step.sources.size() );
//
//    System.out.println( "size: " + step.getNextScopes( step.sources.keySet().iterator().next()).size() );
//    }

  @Test
  public void testMerge2()
    {
    Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge1" );
    Tap source2 = new Hfs( new SequenceFile( new Fields( "offset", "line" ) ), "foo/merge2" );

    Tap sink = new Hfs( new TextLine(), "foo" );

    Pipe left = new Each( new Pipe( "left" ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right" ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

    Pipe merge = new GroupBy( "merge", Pipe.pipes( left, right ), new Fields( "offset" ) );

    Map sources = new HashMap();
    sources.put( "left", source1 );
    sources.put( "right", source2 );

    Map sinks = new HashMap();
    sinks.put( "merge", sink );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 1, steps.size() );
    }

  /**
   * Tests the case where the same source is split, then re-merged
   */
  @Test
  public void testMergeSameSourceSplit()
    {
    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge1" );

    Tap sink = new Hfs( new TextLine(), "foo" );

    Pipe head = new Pipe( "source" );

    head = new Each( head, new Fields( "line" ), new ExpressionFilter( "line.length() != 0", String.class ) );

    Pipe left = new Each( new Pipe( "left", head ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", head ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

    Pipe merge = new GroupBy( "merge", Pipe.pipes( left, right ), new Fields( "offset" ) );

    Flow flow = getPlatform().getFlowConnector().connect( source, sink, merge );

//    flow.writeDOT( "splitmerge.dot" );

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 1, steps.size() );
    }

  @Test
  public void testCoGroupAroundCoGroup() throws Exception
    {
    Tap source10 = new Hfs( new TextLine( new Fields( "num" ) ), "foo" );
    Tap source20 = new Hfs( new TextLine( new Fields( "num" ) ), "bar" );

    Map sources = new HashMap();

    sources.put( "source20", source20 );
    sources.put( "source101", source10 );
    sources.put( "source102", source10 );

    // using null pos so all fields are written
    Tap sink = new Hfs( new TextLine(), "baz", true );

    Pipe pipeNum20 = new Pipe( "source20" );
    Pipe pipeNum101 = new Pipe( "source101" );
    Pipe pipeNum102 = new Pipe( "source102" );

    Pipe splice1 = new CoGroup( pipeNum20, new Fields( "num" ), pipeNum101, new Fields( "num" ), new Fields( "num1", "num2" ) );

    Pipe splice2 = new CoGroup( splice1, new Fields( "num1" ), pipeNum102, new Fields( "num" ), new Fields( "num1", "num2", "num3" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 );

    assertEquals( "not equal: steps.size()", 2, flow.getFlowSteps().size() );
    }

  @Test
  public void testCoGroupAroundCoGroupOptimized() throws Exception
    {
    Tap source10 = new Hfs( new TextLine( new Fields( "num" ) ), "foo" );
    Tap source20 = new Hfs( new TextLine( new Fields( "num" ) ), "bar" );

    Map sources = new HashMap();

    sources.put( "source20", source20 );
    sources.put( "source101", source10 );
    sources.put( "source102", source10 );

    // using null pos so all fields are written
    Tap sink = new Hfs( new TextLine(), "baz", true );

    Pipe pipeNum20 = new Pipe( "source20" );
    Pipe pipeNum101 = new Pipe( "source101" );
    Pipe pipeNum102 = new Pipe( "source102" );

    Pipe splice1 = new CoGroup( pipeNum20, new Fields( "num" ), pipeNum101, new Fields( "num" ), new Fields( "num1", "num2" ) );

    Pipe splice2 = new CoGroup( splice1, new Fields( "num1" ), pipeNum102, new Fields( "num" ), new Fields( "num1", "num2", "num3" ) );

    Properties properties = new Properties();
    FlowConnectorProps.setIntermediateSchemeClass( properties, TextLine.class );

    FlowConnector flowConnector = getPlatform().getFlowConnector( properties );

    Flow flow = flowConnector.connect( sources, sink, splice2 );

    assertEquals( "not equal: steps.size()", 2, flow.getFlowSteps().size() );
    }

  @Test
  public void testCoGroupAroundCoGroupAroundCoGroup() throws Exception
    {
    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper1", sourceUpper );
    sources.put( "upper2", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    // using null pos so all fields are written
    Tap sink = new Hfs( new TextLine(), "output", true );

    Pipe pipeLower = new Each( "lower", new Fields( "line" ), splitter );
    Pipe pipeUpper1 = new Each( "upper1", new Fields( "line" ), splitter );
    Pipe pipeUpper2 = new Each( "upper2", new Fields( "line" ), splitter );

    Pipe splice1 = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );

    splice1 = new Each( splice1, new Identity() );

    splice1 = new GroupBy( splice1, new Fields( 0 ) );

    Pipe splice2 = new CoGroup( splice1, new Fields( "num1" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );

    splice2 = new Each( splice2, new Identity() );

    splice2 = new GroupBy( splice2, new Fields( 0 ) );

    splice2 = new CoGroup( splice2, new Fields( "num1" ), splice1, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3", "num4", "char4", "num5", "char5" ) );

    Flow flow = null;
    try
      {
      flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 );
      }
    catch( FlowException exception )
      {
//      exception.writeDOT( "cogroupcogroup.dot" );
      throw exception;
      }

    assertEquals( "not equal: steps.size()", 5, flow.getFlowSteps().size() );
    }

  @Test
  public void testCoGroupWithResultGroupFieldsDefault() throws Exception
    {
    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    // using null pos so all fields are written
    Tap sink = new Hfs( new TextLine(), "/complex/cogroup/", true );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

    Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "lhs", "num2", "rhs" ) );

    splice = new Every( splice, new First( new Fields( "value" ) ), new Fields( "num1", "value" ) );

    Flow countFlow = getPlatform().getFlowConnector().connect( sources, sink, splice );
    }

  @Test
  public void testCoGroupWithResultGroupFields() throws Exception
    {
    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    // using null pos so all fields are written
    Tap sink = new Hfs( new TextLine(), "/complex/cogroup/", true );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

    Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "lhs", "num2", "rhs" ), new Fields( "somenum", "somenum2" ) );

    splice = new Every( splice, new First( new Fields( "value" ) ), new Fields( "somenum", "value" ) );

    Flow countFlow = getPlatform().getFlowConnector().connect( sources, sink, splice );
    }

  @Test
  public void testDirectCoGroup() throws Exception
    {
    Tap sourceLower = new Hfs( new TextLine( new Fields( "num", "char" ) ), "foo" );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "num", "char" ) ), "bar" );

    Map sources = new HashMap();

    sources.put( "lower1", sourceLower );
    sources.put( "lower2", sourceLower );
    sources.put( "upper1", sourceUpper );
    sources.put( "upper2", sourceUpper );

    // using null pos so all fields are written
    Tap sink1 = new Hfs( new TextLine(), "output1", true );
    Tap sink2 = new Hfs( new TextLine(), "output2", true );

    Map sinks = new HashMap();

    sinks.put( "output1", sink1 );
    sinks.put( "output2", sink2 );

    Pipe pipeLower1 = new Pipe( "lower1" );
    Pipe pipeLower2 = new Pipe( "lower2" );
    Pipe pipeUpper1 = new Pipe( "upper1" );
    Pipe pipeUpper2 = new Pipe( "upper2" );

    Pipe splice1 = new CoGroup( pipeLower1, new Fields( "num" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );

    Pipe splice2 = new CoGroup( splice1, new Fields( "num1" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );

    splice2 = new CoGroup( "output1", splice2, new Fields( "num1" ), splice1, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3", "num4", "char4", "num5", "char5" ) );

    Pipe splice3 = new CoGroup( "output2", pipeLower2, new Fields( "num" ), splice2, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3", "num4", "char4", "num5", "char5", "num6", "char6" ) );

    Flow flow = null;
    try
      {
      flow = getPlatform().getFlowConnector().connect( sources, sinks, splice3 );
      }
    catch( FlowException exception )
      {
//      exception.writeDOT( "directcogroup.dot" );
      throw exception;
      }

    assertEquals( "not equal: steps.size()", 5, flow.getFlowSteps().size() );
    }

  /**
   * verify case where same source is fed to multiple chained cogroups
   *
   * @throws Exception
   */
  @Test
  public void testMultipleCoGroupSimilarSources() throws Exception
    {
    Tap sourceLower = new Hfs( new TextLine( new Fields( "num", "char" ) ), "foo" );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "num", "char" ) ), "bar" );

    Map sources = new HashMap();

    sources.put( "lower1", sourceLower );
    sources.put( "upper1", sourceUpper );

    // using null pos so all fields are written
    Tap sink1 = new Hfs( new TextLine(), "output1", true );
    Tap sink2 = new Hfs( new TextLine(), "output2", true );

    Map sinks = new HashMap();

    sinks.put( "output1", sink1 );
    sinks.put( "output2", sink2 );

    Pipe pipeLower1 = new Pipe( "lower1" );
    Pipe pipeUpper1 = new Pipe( "upper1" );

    Pipe splice1 = new CoGroup( pipeLower1, new Fields( "num" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );

    Pipe splice2 = new CoGroup( splice1, new Fields( "num1" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );

    splice2 = new CoGroup( "output1", splice2, new Fields( "num1" ), splice1, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3", "num4", "char4", "num5", "char5" ) );

    Pipe splice3 = new CoGroup( "output2", pipeUpper1, new Fields( "num" ), splice2, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3", "num4", "char4", "num5", "char5", "num6", "char6" ) );

    Flow flow = null;
    try
      {
      flow = getPlatform().getFlowConnector().connect( sources, sinks, splice3 );
      }
    catch( FlowException exception )
      {
//      exception.writeDOT( "chainedcogroup.dot" );
      throw exception;
      }

    assertEquals( "not equal: steps.size()", 5, flow.getFlowSteps().size() );
    }

  /**
   * tests to make sure splits on a pipe before a cogroup and after result in proper normalization
   *
   * @throws Exception
   */
  @Test
  public void testMultipleCoGroupSplitSources() throws Exception
    {
    Tap sourceLower = new Hfs( new TextLine( new Fields( "num", "char" ) ), "foo" );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "num", "char" ) ), "bar" );

    Map sources = new HashMap();

    sources.put( "lower1", sourceLower );
    sources.put( "upper1", sourceUpper );

    // using null pos so all fields are written
    Tap sink1 = new Hfs( new TextLine(), "output1", true );
    Tap sink2 = new Hfs( new TextLine(), "output2", true );

    Map sinks = new HashMap();

    sinks.put( "output1", sink1 );
    sinks.put( "output2", sink2 );

    Pipe pipeLower1 = new Pipe( "lower1" );
    Pipe pipeUpper1 = new Pipe( "upper1" );

    Pipe pipeLower2 = new Each( pipeLower1, new Identity() );
    pipeLower2 = new GroupBy( pipeLower2, new Fields( "num", "char" ) );
    pipeLower2 = new Every( pipeLower2, new Fields( "num", "char" ), new Count(), new Fields( "num", "char" ) );

    pipeLower1 = new Each( pipeLower1, new Identity() );
    pipeLower1 = new Each( pipeLower1, new Identity() );
    pipeLower1 = new Each( pipeLower1, new Identity() );
    pipeLower1 = new Pipe( "lower2", pipeLower1 );

    pipeUpper1 = new Each( pipeUpper1, new Identity() );
    pipeUpper1 = new Each( pipeUpper1, new Identity() );
    pipeUpper1 = new Each( pipeUpper1, new Identity() );

    Pipe splice1 = new CoGroup( "group", Pipe.pipes( pipeLower1, pipeLower2, pipeUpper1 ), Fields.fields( new Fields( "num" ), new Fields( "num" ), new Fields( "num" ) ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ), new InnerJoin() );

    Pipe output1 = new Each( splice1, AssertionLevel.VALID, new AssertNotNull() );
    output1 = new Each( output1, new Identity() );
    output1 = new Pipe( "output1", output1 );

    Pipe output2 = new Each( splice1, AssertionLevel.VALID, new AssertNull() );
    output2 = new Each( output2, new Identity() );
    output2 = new Pipe( "output2", output2 );

    Flow flow = null;
    try
      {
      flow = getPlatform().getFlowConnector().connect( sources, sinks, output1, output2 );
      }
    catch( FlowException exception )
      {
//      exception.writeDOT( "chainedcogroup.dot" );
      throw exception;
      }

    assertEquals( "not equal: steps.size()", 4, flow.getFlowSteps().size() );
    }

  /**
   * verify split is homogeneous
   *
   * @throws Exception
   */
  @Test
  public void testSplitEachOnGroup() throws Exception
    {
    Tap sourceLower = new Hfs( new TextLine( new Fields( "num", "char" ) ), "foo" );

    Map sources = new HashMap();

    sources.put( "lower1", sourceLower );

    // using null pos so all fields are written
    Tap sink1 = new Hfs( new TextLine(), "output1", true );
    Tap sink2 = new Hfs( new TextLine(), "output2", true );

    Map sinks = new HashMap();

    sinks.put( "output1", sink1 );
    sinks.put( "output2", sink2 );

    Pipe pipeLower1 = new Pipe( "lower1" );

    Pipe pipe = new GroupBy( pipeLower1, new Fields( 0 ) );
    Pipe left = new Each( new Pipe( "output1", pipe ), new Identity() );
    Pipe right = new Each( new Pipe( "output2", pipe ), new Identity() );

    Flow flow = null;
    try
      {
      flow = getPlatform().getFlowConnector().connect( sources, sinks, Pipe.pipes( left, right ) );
      }
    catch( PlannerException exception )
      {
//      exception.writeDOT( "splitout.dot" );
      throw exception;
      }

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 3, steps.size() );
    }

  @Test
  public void testSplitEveryOnGroup() throws Exception
    {
    Tap sourceLower = new Hfs( new TextLine( new Fields( "num", "char" ) ), "foo" );

    Map sources = new HashMap();

    sources.put( "lower1", sourceLower );

    // using null pos so all fields are written
    Tap sink1 = new Hfs( new TextLine(), "output1", true );
    Tap sink2 = new Hfs( new TextLine(), "output2", true );

    Map sinks = new HashMap();

    sinks.put( "output1", sink1 );
    sinks.put( "output2", sink2 );

    Pipe pipeLower1 = new Pipe( "lower1" );

    Pipe pipe = new GroupBy( pipeLower1, new Fields( 0 ) );
    Pipe left = new Every( new Pipe( "output1", pipe ), new TestBuffer( new Fields( "left" ), true ) );
    Pipe right = new Every( new Pipe( "output2", pipe ), new TestBuffer( new Fields( "right" ), true ) );

    Flow flow = null;
    try
      {
      flow = getPlatform().getFlowConnector().connect( sources, sinks, Pipe.pipes( left, right ) );
      fail( "did not throw planner exception" );
      }
    catch( PlannerException exception )
      {
//      exception.printStackTrace(  );
      }
    }

  @Test
  public void testSplitOutput() throws Exception
    {
    Tap sourceLower = new Hfs( new TextLine( new Fields( "num", "char" ) ), "foo" );

    Map sources = new HashMap();

    sources.put( "lower1", sourceLower );

    // using null pos so all fields are written
    Tap sink1 = new Hfs( new TextLine(), "output1", true );
    Tap sink2 = new Hfs( new TextLine(), "output2", true );

    Map sinks = new HashMap();

    sinks.put( "output1", sink1 );
    sinks.put( "output2", sink2 );

    Pipe pipeLower1 = new Pipe( "lower1" );

    Pipe left = new GroupBy( "output1", pipeLower1, new Fields( 0 ) );
    Pipe right = new GroupBy( "output2", left, new Fields( 0 ) );

    Flow flow = null;
    try
      {
      flow = getPlatform().getFlowConnector().connect( sources, sinks, Pipe.pipes( left, right ) );
      }
    catch( FlowException exception )
      {
//      exception.writeDOT( "splitout.dot" );
      throw exception;
      }

    List<FlowStep> steps = flow.getFlowSteps();

    assertEquals( "not equal: steps.size()", 3, steps.size() );

//    for( FlowStep step : steps )
//      {
//      if( step.group != null )
//        continue;
//
//      Scope nextScope = step.getNextScope( step.sources.keySet().iterator().next() );
//      FlowElement operator = step.getNextFlowElement( nextScope );
//
//      assertTrue( "should be Pipe", operator instanceof Pipe );
//      }
    }

  /**
   * DISABLED
   * found having pipes with same names was too error prone. the workaround is to bind the tap to both names.
   * if the process logically must use the same tap for each branch, then the branch should be split
   *
   * This tests if two pipes can have the same name, and thus logically the same input source.
   * <p/>
   * Further, a GroupBy with two inputs would fail if the source was directly associated. but there is a Group
   * function between the source and the merge, so it passes.
   *
   *
   * @throws java.io.IOException
   */
//  public void testSameHeadName() throws IOException
//    {
//    Map sources = new HashMap();
//    Map sinks = new HashMap();
//
//    sources.put( "a", new Hfs( new Fields( "first", "second" ), "input/path/a" ) );
//
//    Pipe pipeA = new Pipe( "a" );
//    Pipe pipeB = new Pipe( "a" );
//
//    Pipe group1 = new GroupBy( "a1", pipeA, Fields.FIRST );
//    Pipe group2 = new GroupBy( "a2", pipeB, Fields.FIRST );
//
//    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );
//
//    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );
//
//    Flow flow = new FlowConnector().connect( sources, sinks, merge );
//
//    assertEquals( "not equal: steps.size()", 3, flow.getSteps().size() );
//    }

  /**
   * This is an alternative to having two pipes with the same name, but uses one pipe that is split
   * across two branches.
   *
   * @throws IOException
   */
  @Test
  public void testSameSourceForBranch() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    sources.put( "a", new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" ) );

    Pipe pipeA = new Pipe( "a" );

    Pipe group1 = new GroupBy( "a1", pipeA, Fields.FIRST );
    Pipe group2 = new GroupBy( "a2", pipeA, Fields.FIRST );

    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );

    assertEquals( "not equal: steps.size()", 3, flow.getFlowSteps().size() );
    }

  /**
   * Verifies the same tap instance can be shared between two logically different pipes.
   *
   * @throws IOException
   */
  @Test
  public void testSameTaps() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    Hfs tap = new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" );
    sources.put( "a", tap );
    sources.put( "b", tap );

    Pipe pipeA = new Pipe( "a" );
    Pipe pipeB = new Pipe( "b" );

    Pipe group1 = new GroupBy( pipeA );
    Pipe group2 = new GroupBy( pipeB );

    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );

    assertEquals( "not equal: steps.size()", 3, flow.getFlowSteps().size() );
    }

  @Test
  public void testDanglingHead() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    Hfs source = new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" );
    sources.put( "a", source );

    Pipe pipeA = new Pipe( "a" );
    Pipe pipeB = new Pipe( "b" );

    Pipe group1 = new GroupBy( pipeA );
    Pipe group2 = new GroupBy( pipeB );

    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );
      fail( "did not catch missing source tap" );
      }
    catch( PlannerException exception )
      {
      // do nothing
      }
    catch( Exception exception )
      {
      fail( "threw wrong exception" );
      }
    }

  @Test
  public void testDanglingTail() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    Hfs tap = new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" );
    sources.put( "a", tap );
    sources.put( "b", tap );

    Pipe pipeA = new Pipe( "a" );
    Pipe pipeB = new Pipe( "b" );

    Pipe group1 = new GroupBy( pipeA );
    Pipe group2 = new GroupBy( pipeB );

    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

//    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );
      fail( "did not catch missing sink tap" );
      }
    catch( PlannerException exception )
      {
      // do nothing
      }
    catch( Exception exception )
      {
      fail( "threw wrong exception" );
      }
    }

  @Test
  public void testExtraSource() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    Hfs tap = new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" );
    sources.put( "a", tap );
    sources.put( "b", tap );
    sources.put( "c", tap );

    Pipe pipeA = new Pipe( "a" );
    Pipe pipeB = new Pipe( "b" );

    Pipe group1 = new GroupBy( pipeA );
    Pipe group2 = new GroupBy( pipeB );

    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );
      fail( "did not catch extra source tap" );
      }
    catch( PlannerException exception )
      {
//      exception.printStackTrace();
      assertTrue( exception.getMessage().contains( "['c']" ) );
      }
    catch( Exception exception )
      {
      fail( "threw wrong exception" );
      }
    }

  @Test
  public void testExtraSink() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    Hfs tap = new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" );
    sources.put( "a", tap );
    sources.put( "b", tap );

    Pipe pipeA = new Pipe( "a" );
    Pipe pipeB = new Pipe( "b" );

    Pipe group1 = new GroupBy( pipeA );
    Pipe group2 = new GroupBy( pipeB );

    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );
    sinks.put( "c", new Hfs( new TextLine(), "output/path" ) );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );
      fail( "did not catch extra sink tap" );
      }
    catch( PlannerException exception )
      {
//      exception.printStackTrace();
      assertTrue( exception.getMessage().contains( "['c']" ) );
      }
    catch( Exception exception )
      {
      fail( "threw wrong exception" );
      }
    }

  @Test
  public void testBuffer() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    sources.put( "count", new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path" ) );
    sinks.put( "count", new Hfs( new TextLine( new Fields( 0, 1 ) ), "output/path" ) );

    Pipe pipe = new Pipe( "count" );
    pipe = new GroupBy( pipe, new Fields( 1 ) );
    pipe = new Every( pipe, new Fields( 1 ), new TestBuffer( new Fields( "fourth" ), "value" ), new Fields( 0, 1 ) );

    List steps = getPlatform().getFlowConnector().connect( sources, sinks, pipe ).getFlowSteps();

    assertEquals( "wrong size", 1, steps.size() );

    BaseFlowStep step = (BaseFlowStep) steps.get( 0 );

    assertEquals( "not equal: step.sources.size()", 1, step.getSourceTaps().size() );
    assertNotNull( "null: step.groupBy", step.getGroup() );
    assertNotNull( "null: step.sink", step.getSink() );

    int mapDist = countDistance( step.getElementGraph(), (FlowElement) step.getSourceTaps().iterator().next(), step.getGroup() );
    assertEquals( "not equal: mapDist", 0, mapDist );

    int reduceDist = countDistance( step.getElementGraph(), step.getGroup(), step.getSink() );
    assertEquals( "not equal: reduceDist", 1, reduceDist );
    }

  @Test
  public void testBufferFail() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    sources.put( "count", new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path" ) );
    sinks.put( "count", new Hfs( new TextLine( new Fields( 0, 1 ) ), "output/path" ) );

    Pipe pipe = new Pipe( "count" );
    pipe = new GroupBy( pipe, new Fields( 1 ) );
    pipe = new Every( pipe, new Fields( 1 ), new TestBuffer( new Fields( "fourth" ), "value" ), new Fields( 0, 1 ) );
    pipe = new Every( pipe, new Fields( 1 ), new Count(), new Fields( 0, 1 ) );

    try
      {
      getPlatform().getFlowConnector().connect( sources, sinks, pipe );
      fail( "did not throw planner exception" );
      }
    catch( Exception exception )
      {
      // ignore
//      exception.printStackTrace();
      }
    }

  @Test
  public void testBufferFail2() throws IOException
    {
    Map sources = new HashMap();
    Map sinks = new HashMap();

    sources.put( "count", new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path" ) );
    sinks.put( "count", new Hfs( new TextLine( new Fields( 0, 1 ) ), "output/path" ) );

    Pipe pipe = new Pipe( "count" );
    pipe = new GroupBy( pipe, new Fields( 1 ) );
    pipe = new Every( pipe, new Fields( 1 ), new Count(), new Fields( 0, 1 ) );
    pipe = new Every( pipe, new Fields( 1 ), new TestBuffer( new Fields( "fourth" ), "value" ), new Fields( 0, 1 ) );

    try
      {
      getPlatform().getFlowConnector().connect( sources, sinks, pipe );
      fail( "did not throw planner exception" );
      }
    catch( Exception exception )
      {
      // ignore
//      exception.printStackTrace();
      }
    }

  @Test
  public void testErrorMessages() throws Exception
    {
    Tap source10 = new Hfs( new TextLine( new Fields( "num" ) ), "foo" );
    Tap source20 = new Hfs( new TextLine( new Fields( "num" ) ), "bar" );

    Map sources = new HashMap();

    sources.put( "source20", source20 );
    sources.put( "source101", source10 );
    sources.put( "source102", source10 );

    // using null pos so all fields are written
    Tap sink = new Hfs( new TextLine(), "baz", true );

    Pipe pipeNum20 = new Pipe( "source20" );
    Pipe pipeNum101 = new Pipe( "source101" );
    Pipe pipeNum102 = new Pipe( "source102" );

    Pipe splice1 = new CoGroup( pipeNum20, new Fields( "num" ), pipeNum101, new Fields( "num" ), new Fields( "num1", "num2" ) );

    Pipe splice2 = new CoGroup( splice1, new Fields( "num9" ), pipeNum102, new Fields( "num" ), new Fields( "num1", "num2", "num3" ) );

    FlowConnector flowConnector = getPlatform().getFlowConnector();

    try
      {
      Flow flow = flowConnector.connect( sources, sink, splice2 );
      fail( "did not fail on bad field" );
      }
    catch( Exception exception )
      {
      // ignore
      assertTrue( "missing message", exception.getMessage().contains( "BuildJobsHadoopPlatformTest.testErrorMessages" ) );
      }
    }

  /**
   * This test verifies splits on Pipe instances are recognized
   * <p/>
   * This flow intentionally splits to a Each and a Tap from a Each
   * <pre>
   *
   *  .... E1 - T1 - E2 - T2
   *
   * </pre>
   * <p/>
   * this test also verifed T1 feeds E2, instead of a new copy job being created
   *
   * @throws Exception
   */
  @Test
  public void testSplitInMiddleBeforePipeOptimized() throws Exception
    {
    splitMiddle( true, true );
    }

  @Test
  public void testSplitInMiddleBeforePipe() throws Exception
    {
    splitMiddle( true, false );
    }

  @Test
  public void testSplitInMiddleAfterPipe() throws Exception
    {
    splitMiddle( false, false );
    }

  private void splitMiddle( boolean before, boolean testTempReplaced )
    {
    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "lower" );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "upper" );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Scheme leftScheme = testTempReplaced ? new SequenceFile( new Fields( "num", "lower", "num2", "upper" ) ) : new TextLine( new Fields( "offset", "line" ), new Fields( "lower" ) );
    Tap sinkLeft = new Hfs( leftScheme, "/splitmiddle/left", SinkMode.REPLACE );

    Scheme rightScheme = testTempReplaced ? new SequenceFile( new Fields( "lower" ) ) : new TextLine( new Fields( "offset", "line" ), new Fields( "lower" ) );
    Tap sinkRight = new Hfs( rightScheme, "/splitmiddle/right", SinkMode.REPLACE );

    Map sinks = new HashMap();

    sinks.put( "left", sinkLeft );
    sinks.put( "right", sinkRight );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

    Pipe splice = new CoGroup( "both", pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num", "lower", "num2", "upper" ) );

    splice = new Each( splice, new Fields( "num" ), new RegexFilter( ".*" ) );

    Pipe left = splice;

    if( before )
      left = new Pipe( "left", left );

    left = new Each( left, new Fields( "num" ), new RegexFilter( ".*" ) );

    if( !before )
      left = new Pipe( "left", left );

    Pipe right = left;

    if( before )
      right = new Pipe( "right", right );

    right = new Each( right, new Fields( "num" ), new RegexFilter( ".*" ) );

    if( !before )
      right = new Pipe( "right", right );

    FlowConnector flowConnector = getPlatform().getFlowConnector();

    Flow flow = flowConnector.connect( "splitmiddle", sources, sinks, left, right );

    List<FlowStep> steps = flow.getFlowSteps();

    // if the optimization isn't in place, don't test for it
    if( testTempReplaced && !flowConnector.getRuleRegistry().hasRule( "CombineAdjacentTapTransformer" ) )
      testTempReplaced = false;

    assertEquals( "not equal: steps.size()", testTempReplaced ? 2 : 3, steps.size() );

    BaseFlowStep step = (BaseFlowStep) steps.get( 0 );

    Scope nextScope = step.getNextScope( step.getGroup() );
    FlowElement operator = step.getNextFlowElement( nextScope );

    assertTrue( "not an Each", operator instanceof Each );

    nextScope = step.getNextScope( operator );
    operator = step.getNextFlowElement( nextScope );

    assertTrue( "not a Each", operator instanceof Each );

    nextScope = step.getNextScope( operator );
    operator = step.getNextFlowElement( nextScope );

    if( testTempReplaced )
      {
      assertEquals( "not proper sink", sinkLeft, operator );
      }
    else
      {
      assertTrue( "not a TempHfs", operator instanceof TempHfs );
      }
    }

  @Test
  public void testSourceIsSink()
    {
    Tap tap = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );

    Pipe pipe = new Pipe( "left" );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( tap, tap, pipe );
      fail( "did not throw planner exception" );
      }
    catch( Exception exception )
      {
//      exception.printStackTrace();
      }
    }

  @Test
  public void testReplaceFail() throws Exception
    {
    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
    Tap sink = new Hfs( new TextLine( new Fields( "offset", "line" ), new Fields( "offset", "line2" ) ), "bar", true );

    Pipe pipe = new Pipe( "test" );

    Function parser = new RegexParser( new Fields( 0 ), "^[^ ]*" );
    pipe = new Each( pipe, new Fields( "line" ), parser, Fields.REPLACE );
    pipe = new Each( pipe, new Fields( "line" ), new Identity( Fields.ARGS ), Fields.REPLACE );
    pipe = new Each( pipe, new Fields( "line" ), new Identity( new Fields( "line2" ) ), Fields.REPLACE );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
      fail( "did not fail" );
      }
    catch( Exception exception )
      {
      }
    }

  private int countDistance( ElementGraph graph, FlowElement lhs, FlowElement rhs )
    {
    return DijkstraShortestPath.findPathBetween( graph, lhs, rhs ).size() - 1;
    }

  @Test
  public void testNestedProperties() throws IOException
    {
    Tap source = new Hfs( new TextLine( new Fields( "line" ) ), "/input" );

    Pipe pipe = new Pipe( "test" );

    pipe = new Each( pipe, new RegexSplitter( new Fields( "first", "second", "third" ), "\\s" ), Fields.ALL );

    Tap sink = new Hfs( new TextLine(), "output", true );

    Properties defaultProperties = new Properties();

    defaultProperties.setProperty( "test.key", "test.value" );

    HadoopFlow flow = (HadoopFlow) getPlatform().getFlowConnector( new Properties( defaultProperties ) ).connect( source, sink, pipe );

    assertEquals( "test flow", "test.value", flow.getProperty( "test.key" ) );
    assertEquals( "test step", "test.value", ( (HadoopFlowStep) flow.getFlowSteps().get( 0 ) ).createInitializedConfig( flow.getFlowProcess(), flow.getConfig() ).get( "test.key" ) );
    }

  @Test
  public void testEveryAfterJoin()
    {
    Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge1" );
    Tap source2 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge2" );

    Tap sink = new Hfs( new TextLine(), "foo" );

    Pipe left = new Each( new Pipe( "left" ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right" ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

    Fields[] fields = Fields.fields( new Fields( "offset" ), new Fields( "offset" ) );
    Pipe merge = new HashJoin( "join", Pipe.pipes( left, right ), fields, Fields.size( 4 ), new InnerJoin() );

    merge = new Every( merge, new MaxValue() );

    Map sources = new HashMap();
    sources.put( "left", source1 );
    sources.put( "right", source2 );

    Map sinks = new HashMap();
    sinks.put( "join", sink );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );
      fail();
      }
    catch( Exception exception )
      {
      // do nothing
//      exception.printStackTrace();
      }
    }

  //  @Test
  @Ignore
  public void testManyJoins()
    {
    int n = 50;
    Map<String, Tap> sources = new HashMap<>();
    Map<String, Tap> sinks = new HashMap<>();
    Pipe[] pipes = new Pipe[ n ];

    int count = 0;

    for( int i = 0; i < n; i++ )
      {
      String nameIn = "in" + i;
      String nameOut = "out" + i;
      Pipe pipe = new Pipe( nameIn );
      sources.put( nameIn, new Hfs( new TextLine( new Fields( "key" + i ) ), "foo/in" + i ) );
      sinks.put( nameOut, new Hfs( new TextLine(), "foo/out" + i ) );

      count += 2; // 2 taps

      if( i > 0 )
        {
        pipe = new CoGroup( pipes[ i - 1 ], new Fields( "key" + ( i - 1 ) ), pipe, new Fields( "key" + i ) );
        pipe = new Every( pipe, new Fields( "key" + ( i - 1 ) ), new Sum() );
        count += 2; // 2 pipes
        }

      pipes[ i ] = new Pipe( nameOut, pipe );
      count += 1; // 1 pipe
      }

    long start = System.currentTimeMillis();
    Flow flow = new HadoopFlowConnector().connect( sources, sinks, pipes );
    long end = System.currentTimeMillis();
    System.out.printf( "n = %d: elements: %d: %.03f seconds\n", n, count, ( end - start ) / 1000.0 );
    }
  }
TOP

Related Classes of cascading.flow.hadoop.BuildJobsHadoopPlatformTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.