Package org.apache.crunch.contrib.text

Source Code of org.apache.crunch.contrib.text.ParseTest

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch.contrib.text;

import static org.apache.crunch.contrib.text.Extractors.*;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.util.Collection;

import org.apache.crunch.PCollection;
import org.apache.crunch.Pair;
import org.apache.crunch.Tuple3;
import org.apache.crunch.Tuple4;
import org.apache.crunch.TupleN;
import org.apache.crunch.impl.mem.MemPipeline;
import org.apache.crunch.types.avro.Avros;
import org.junit.Test;

import com.google.common.collect.ImmutableList;

public class ParseTest {

  @Test
  public void testInt() {
    assertEquals(Integer.valueOf(1729), xint().extract("1729"));
    assertEquals(Integer.valueOf(321), xint(321).extract("foo"));
  }

  @Test
  public void testString() {
    assertEquals("bar", xstring().extract("bar"));
  }

  @Test
  public void testPairWithDrop() {
    TokenizerFactory sf = TokenizerFactory.builder().delimiter(",").drop(0, 2).build();
    assertEquals(Pair.of(1, "abc"), xpair(sf, xint(), xstring()).extract("foo,1,17.29,abc"));
  }

  @Test
  public void testTripsWithSkip() {
    TokenizerFactory sf = TokenizerFactory.builder().delimiter(";").skip("^foo").build();
    assertEquals(Tuple3.of(17, "abc", 3.4f),
        xtriple(sf, xint(), xstring(), xfloat()).extract("foo17;abc;3.4"));
  }
 
  @Test
  public void testTripsWithKeep() {
    TokenizerFactory sf = TokenizerFactory.builder().delimiter(";").keep(1, 2, 3).build();
    assertEquals(Tuple3.of(17, "abc", 3.4f),
        xtriple(sf, xint(), xstring(), xfloat()).extract("foo;17;abc;3.4"));
  }
 
  @Test
  public void testQuadsWithWhitespace() {
    TokenizerFactory sf = TokenizerFactory.getDefaultInstance();
    assertEquals(Tuple4.of(1.3, "foo", true, 1L),
        xquad(sf, xdouble(), xstring(), xboolean(), xlong()).extract("1.3   foo  true 1"));
  }
 
  @Test
  public void testTupleN() {
    TokenizerFactory sf = TokenizerFactory.builder().delimiter(",").build();
    assertEquals(new TupleN(1, false, true, 2, 3),
        xtupleN(sf, xint(), xboolean(), xboolean(), xint(), xint()).extract("1,false,true,2,3"));
  }
 
  @Test
  public void testCollections() {
    TokenizerFactory sf = TokenizerFactory.builder().delimiter(";").build();
    // Use 3000 as the default for values we can't parse
    Extractor<Collection<Integer>> x = xcollect(sf, xint(3000));
   
    assertEquals(ImmutableList.of(1, 2, 3), x.extract("1;2;3"));
    assertFalse(x.errorOnLastRecord());
    assertEquals(ImmutableList.of(17, 29, 3000), x.extract("17;29;a"));
    assertTrue(x.errorOnLastRecord());
    assertEquals(1, x.getStats().getErrorCount());
  }
 
  @Test
  public void testNestedComposites() {
    TokenizerFactory outer = TokenizerFactory.builder().delimiter(";").build();
    TokenizerFactory inner = TokenizerFactory.builder().delimiter(",").build();
    Extractor<Pair<Pair<Long, Integer>, Tuple3<String, Integer, Float>>> extractor =
        xpair(outer, xpair(inner, xlong(), xint()), xtriple(inner, xstring(), xint(), xfloat()));
    assertEquals(Pair.of(Pair.of(1L, 2), Tuple3.of("a", 17, 29.0f)),
        extractor.extract("1,2;a,17,29"));
  }
 
  @Test
  public void testParse() {
    TokenizerFactory sf = TokenizerFactory.builder().delimiter(",").build();
    PCollection<String> lines = MemPipeline.typedCollectionOf(Avros.strings(), "1,3.0");
    Iterable<Pair<Integer, Float>> it = Parse.parse("test", lines,
        xpair(sf, xint(), xfloat())).materialize();
    assertEquals(ImmutableList.of(Pair.of(1, 3.0f)), it);
  }
}
TOP

Related Classes of org.apache.crunch.contrib.text.ParseTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.