Package com.ebay.erl.mobius.core.criterion

Source Code of com.ebay.erl.mobius.core.criterion.TupleRestrictions

package com.ebay.erl.mobius.core.criterion;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.Date;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.StringUtils;

import com.ebay.erl.mobius.core.builder.Dataset;
import com.ebay.erl.mobius.core.collection.CaseInsensitiveTreeSet;
import com.ebay.erl.mobius.core.model.Tuple;

/**
* Factory class that provides methods to define {@link TupleCriterion}
* for filtering {@link Tuple}s in a {@link Dataset}.
*
* <p>
* This product is licensed under the Apache License,  Version 2.0,
* available at http://www.apache.org/licenses/LICENSE-2.0.
*
* This product contains portions derived from Apache hadoop which is
* licensed under the Apache License, Version 2.0, available at
* http://hadoop.apache.org.
*
* © 2007 – 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
*/
public class TupleRestrictions
  /**
   * Hadoop configuration
   */
  protected static Configuration conf;
 
 
  /**
   * Setup Hadoop configuration.
   */
  public static final void configure(Configuration conf)
  {
    TupleRestrictions.conf = conf;
  }
 
  private static File checkFileExist(File file)
    throws FileNotFoundException
  {
    if( TupleRestrictions.conf!=null && TupleRestrictions.conf.get ("mobius.studio.workspace.base")!=null )
    {
      File base = new File(TupleRestrictions.conf.get ("mobius.studio.workspace.base"));
      File f = new File(base, file.getName ());
      if( !f.exists () )
      {
        throw new FileNotFoundException("File not found in:"+f.getAbsolutePath ());
      }
      return f;
    }
    else
    {   
      if( !file.exists () )
      {
        throw new FileNotFoundException("File not found:"+file.getAbsolutePath ());
      }
      return file;
    }
  }
 
 
  /**
   * Create a {@link TupleCriterion} that only accept tuples with
   * the value of the specified <code>column</code> that is within
   * the provide <code>list</code>.
   * <p>
   *
   * The value of the <code>column</code> will be converted into
   * string, if it's not string, to compare.
   */
  public static TupleCriterion withinString(final String column, final ArrayList<String> list)
  {
    return new StringCriterion(column, list, RelationalOperator.WITHIN);
  }
 
 
  /**
   * Create a {@link TupleCriterion} that only accepts tuples with
   * the value of the specified <code>column</code> that is within
   * the provide <code>list</code>.
   * <p>
   *
   * The value of the <code>column</code> will be converted into
   * double, if it's not number, to compare.
   */
  public static TupleCriterion withinNumber(final String column, final ArrayList<Double> list)
  {
    return new NumberCriterion(column, list, RelationalOperator.WITHIN);
  }
 
 
 
  /**
   * Create a tuple criterion that only accepts tuples when the value
   * of the <code>column</code> are presented in the given <code>file</code>
   * <p>
   *
   * The assumption of the file is that, it's single column and one to many
   * line text file.  Each line is read into a case insensitive set, and
   * using the set to check the value of the <code>column</code> within
   * the set or not.
   *
   *
   * @param column the name of a column to be tested that whether its value is in
   * the given <code>file</code> or not
   *
   * @param file a single column and multiple lines of file that contains strings/numbers,
   * each line is treated as a single unit.
   *
   * @return an instance of {@link TupleCriterion} that extracts only the records
   * when the value of its <code>column</code> are presented in the given
   * <code>file</code>.
   *
   * @throws FileNotFoundException if the given file cannot be found.
   */
  public static TupleCriterion within(final String column, File file)
    throws FileNotFoundException
  {
    final File f = TupleRestrictions.checkFileExist (file);
   
    return new TupleCriterion(){     
     
      private static final long serialVersionUID = -1121221619118915652L;
      private Set<String> set;
     
      @Override
      public void setConf(Configuration conf)
      {
        try
        {
          if( conf.get ("tmpfiles")==null || conf.get ("tmpfiles").trim ().length ()==0 )
          {
            conf.set ("tmpfiles", validateFiles (f.getAbsolutePath (), conf));
          }
          else
          {
            conf.set ("tmpfiles", validateFiles (f.getAbsolutePath (), conf)+","+conf.get("tmpfiles"));
          }
         
        }
        catch ( IOException e )
        {
          throw new IllegalArgumentException(e);
        }
      }
     
      /**
       * COPIED FROM org.apache.hadoop.util.GenericOptionsParser
       */
      private String validateFiles(String files, Configuration conf) throws IOException
      {
        if ( files == null )
          return null;
        String[] fileArr = files.split (",");
        String[] finalArr = new String[fileArr.length];
        for ( int i = 0; i < fileArr.length; i++ )
        {
          String tmp = fileArr[i];
          String finalPath;
          Path path = new Path (tmp);
          URI pathURI = path.toUri ();
          FileSystem localFs = FileSystem.getLocal (conf);
          if ( pathURI.getScheme () == null )
          {
            // default to the local file system
            // check if the file exists or not first
            if ( !localFs.exists (path) )
            {
              throw new FileNotFoundException ("File " + tmp + " does not exist.");
            }
            finalPath = path.makeQualified (localFs).toString ();
          } else
          {
            // check if the file exists in this file system
            // we need to recreate this filesystem object to copy
            // these files to the file system jobtracker is running
            // on.
            FileSystem fs = path.getFileSystem (conf);
            if ( !fs.exists (path) )
            {
              throw new FileNotFoundException ("File " + tmp + " does not exist.");
            }
            finalPath = path.makeQualified (fs).toString ();
            try
            {
              fs.close ();
            } catch ( IOException e )
            {
            }
            ;
          }
          finalArr[i] = finalPath;
        }
        return StringUtils.arrayToString (finalArr);
      }
     
     
       
      @Override
      protected boolean evaluate(Tuple tuple, Configuration configuration)
      { 
        if( set==null )
        {
          set = new CaseInsensitiveTreeSet();
          BufferedReader br = null;
          try
          {
            br = new BufferedReader(new FileReader(new File(f.getName ())));
            String newLine = null;
            while( (newLine=br.readLine ())!=null )
            {
              this.set.add (newLine);
            }
          }catch(IOException e)
          {
            throw new RuntimeException(e);
          }
          finally
          {
            try{br.close ();}catch(Throwable e){}
          }
        }
       
        String value = tuple.getString (column);
        if( value!=null )
        {
          return this.set.contains (value);
        }
        else
        {
          return false;
        }
      }
 
      @Override
      public String[] getInvolvedColumns()
      {       
        return new String[]{column};
      }
    };
  }
 
 
 
  /**
   * Create a tuple criterion that only accepts tuples
   * with the value of <code>column</code> that is <b>NOT</b>
   * presented in the given <code>file</code>
   *
   * The assumption of the file is that, it's single column and one to many
   * line text file.  Each line is read into a case insensitive set, and
   * using the set to check the value of the <code>column</code> within
   * the set or not.
   *
   * @param column the name of a column to be tested that whether its value is in
   * the given <code>file</code> or not
   *
   * @param file a single column and multiple lines of file that contains strings/numbers,
   * each line is treated as a single unit.
   *
   * @return an instance of {@link TupleCriterion} that extracts only the records
   * when the value of its <code>column</code> are <b>NOT</b>presented in the given
   * <code>file</code>.
   *
   * @throws FileNotFoundException if the given file cannot be found.
   */
  public static TupleCriterion not_within(final String column, final File file)
    throws FileNotFoundException
  {
    TupleCriterion criterion = TupleRestrictions.within (column, file);
    TupleCriterion notCriterion = criterion.not();
    return notCriterion;
  }
 
 
  /**
   * Create a {@link TupleCriterion} that only accept tuples with
   * the value of the specified <code>column</code> is <b>not</b>
   * within the provide <code>list</code>.
   * <p>
   *
   * The value of the <code>column</code> will be converted into
   * double to compare, if it's not double.
   */
  public static TupleCriterion notWithinNumber(final String column, final ArrayList<Double> values)   
  {   
    return TupleRestrictions.withinNumber(column, values).not();
  }
 
 
  /**
   * Create a {@link TupleCriterion} that only accept tuples with
   * the value of the specified <code>column</code> is <b>not</b>
   * within the provide <code>list</code>.
   * <p>
   *
   * The value of the <code>column</code> will be converted into
   * string to compare, if it's not string.
   */
  public static TupleCriterion notWithinString(final String column, final ArrayList<String> values)
  {   
    return TupleRestrictions.withinString(column, values).not();
  }
 
 
  /**
   * Define a {@link TupleCriterion} that only extracts records when the value of the
   * <code>column</code> meets the <cdoe>regex</code>.
   *
   * @param column the name of a column to be tested on its value whether it meets
   * the specified <code>regex</code> or not.
   *
   * @param regex a regular expression to test.
   *
   * @return a {@link TupleCriterion} accepts value from the <code>column</code>
   * match the given <code>regex</code>.
   *
   */ 
  public static TupleCriterion regex(final String column, final String regex)
  {
    return new TupleCriterion(){     
     
      private static final long serialVersionUID = -6630104271777176036L;
      private transient Pattern pattern = Pattern.compile (regex);
      private transient Matcher matcher = pattern.matcher ("");
     
      @Override
      protected boolean evaluate(Tuple tuple, Configuration configuration)
      { 
        if( pattern==null )
        {
          pattern = Pattern.compile (regex);
          matcher = pattern.matcher ("");
        }
       
        String value = tuple.getString (column);
        if( value!=null )
        {
          matcher.reset (value);
          return matcher.find ();
        }
        else
        {
          return false;
        }
      }

      @Override
      public String[] getInvolvedColumns()
      {       
        return new String[]{column};
      }};
  }
 
 
  /**
   * Create a {@link TupleCriterion} that only accepts
   * tuples with the value of the given <code>column<code>
   * is not null nor empty string.
   */
  public static TupleCriterion notNull(final String column)
  {
   
    return new TupleCriterion(){

      private static final long serialVersionUID = 1573625916312469904L;

      @Override
      protected boolean evaluate(Tuple tuple, Configuration configuration)
      {       
        return tuple.get (column)!=null && tuple.getString (column).trim ().length ()>0;
      }

      @Override
      public String[] getInvolvedColumns()
      {
        return new String[]{column};
      }};
  }
 
 
 
  /**
   * Specify the given <code>column</code>'s value equals to <code>value</code>
   */
  public static TupleCriterion eq(String column, String value)
  {
    return new StringCriterion(column, value, RelationalOperator.EQ);
  }
 
 
 
  /**
   * Specify the given <code>column</code>'s value equals to <code>value</code>
   */
  public static TupleCriterion eq(String column, Number value)
  {
    return new NumberCriterion(column, value.doubleValue(), RelationalOperator.EQ);
  }
 
 
 
  /**
   * Specify the given <code>column</code>'s value equals to <code>trueFalse</code>
   */
  public static TupleCriterion eq(final String column, final boolean trueFalse)
  {
    return new TupleCriterion()
    {     
      private static final long serialVersionUID = 3652448730224390852L;

      @Override
      protected boolean evaluate(Tuple tuple, Configuration configuration)
      {       
        return tuple.getBoolean(column)==trueFalse;
      }

      @Override
      public String[] getInvolvedColumns()
      {       
        return new String[]{column};
      }
     
    };
  }
 
 
 
  /**
   * Return a {@link TupleCriterion} that parses the value of <column>column</column>
   * with the given <column>columnDateFormat</column> into milliseconds, comparing the
   * milliseconds (A) with the <code>date</code> (B) and only accept tuples records when
   * A equals to B.
   *
   * @param column name of a column to be tested in a dataset.
   *
   * @param columnDateFormat the date format of the specified <code>column</code> in the dataset.
   * The <code>columnFormat</code> pattern is the same as {@link java.text.SimpleDateFormat}
   *
   * @param date a date constraint to be test.
   *
   */
  public static TupleCriterion eq(String column, String columnDateFormat, java.util.Date date)
  {
    return new DateCriterion(column, columnDateFormat, date.getTime(), RelationalOperator.EQ);
  }
 
 
  /**
   * Return a {@link TupleCriterion} that only accepts tuples with
   * the value of <code>column</code> is equal to the specified
   * <code>date</code>.
   * <p>
   *
   * If the type of the value for the <code>column</code> is and instance
   * of {@link java.util.Date}, then the comparison is done by calling the
   * method of {@link java.util.Date#getTime()} for the value and compare
   * it with <code>date.getTime()</code>.
   * <p>
   *
   * If the type of the value is not an instance of {@link java.util.Date},
   * then it will be parsed into date format using either the format of
   * <code>yyyy-MM-dd</code> or <code>yyyy-MM-dd HH:mm:ss</code>.
   *
   */
  public static TupleCriterion eq(String column, java.util.Date date)
  {
    return new DateCriterion(column, null, date.getTime(), RelationalOperator.EQ);
  }
 
 
 
  /**
   * Create a {@link TupleCriterion} that only accepts tuples with
   * the two columns' values are equals.
   */
  public static TupleCriterion eqColumns(final String column1, final String column2)
  {
    return new ColumnsCriterion(column1, column2, RelationalOperator.EQ);
  }
 
 
 
 
 
  /**
   * not equals
   */
  public static TupleCriterion ne(String columnName, String value)
  {
    return new StringCriterion(columnName, value, RelationalOperator.NE);
 
  public static TupleCriterion ne(String columnName, Number value)
  {
    return new NumberCriterion(columnName, value.doubleValue(), RelationalOperator.NE);
  }
  public static TupleCriterion ne(String columnName, String columnFormat, Date date)
  {
    return new DateCriterion(columnName, columnFormat, date.getTime (), RelationalOperator.NE);
  }
  public static TupleCriterion ne(String columnName, Date date)
  {
    return new DateCriterion(columnName, null, date.getTime (), RelationalOperator.NE);
  }
  public static TupleCriterion ne(String columnName, String columnFormat, Calendar date)
  {
    return new DateCriterion(columnName, columnFormat, date.getTimeInMillis (), RelationalOperator.NE);
  }
  public static TupleCriterion ne(String columnName, Calendar date)
  {
    return new DateCriterion(columnName, null, date.getTimeInMillis (), RelationalOperator.NE);
  }
  public static TupleCriterion ne(final String column, final boolean trueFalse)
  {
    return new TupleCriterion()
    {     
      private static final long serialVersionUID = 3652448730224390852L;

      @Override
      protected boolean evaluate(Tuple tuple, Configuration configuration)
      {       
        return tuple.getBoolean(column)!=trueFalse;
      }

      @Override
      public String[] getInvolvedColumns()
      {       
        return new String[]{column};
      }
     
    };
  }
  /**
   * compare if two column's values are not equals.
   */
  public static TupleCriterion neColumns(final String column1, final String column2)
  {
    return new ColumnsCriterion(column1, column2, RelationalOperator.NE);
  }
 
 
 
  /**
   * greater than
   */
  public static TupleCriterion gt(String columnName, String value)
  {
    return new StringCriterion(columnName, value, RelationalOperator.GT);
  }
  public static TupleCriterion gt(String columnName, Number value)
  {
    return new NumberCriterion(columnName, value.doubleValue(), RelationalOperator.GT);
 
  public static TupleCriterion gt(String columnName, String columnFormat, Date date)
  {
    return new DateCriterion(columnName, columnFormat, date.getTime (), RelationalOperator.GT);
  }
  public static TupleCriterion gt(String columnName, Date date)
  {
    return new DateCriterion(columnName, null, date.getTime (), RelationalOperator.GT);
  }
  public static TupleCriterion gt(String columnName, String columnFormat, Calendar date)
  {
    return new DateCriterion(columnName, columnFormat, date.getTimeInMillis (), RelationalOperator.GT);
  }
  public static TupleCriterion gt(String columnName, Calendar date)
  {
    return new DateCriterion(columnName, null, date.getTimeInMillis (), RelationalOperator.GT);
  }
  /**
   * compare if column1's value greater than column2's value
   */
  public static TupleCriterion gtColumns(final String column1, final String column2)
  {
    return new ColumnsCriterion(column1, column2, RelationalOperator.GT);
  }
 
 
 
  /**
   * greater than or equal
   */
  public static TupleCriterion ge(String columnName, String value)
  {
    return new StringCriterion(columnName, value, RelationalOperator.GE);
 
  public static TupleCriterion ge(String columnName, Number value)
  {
    return new NumberCriterion(columnName, value.doubleValue(), RelationalOperator.GE);
  }
  public static TupleCriterion ge(String columnName, String columnFormat, Date date)
  {
    return new DateCriterion(columnName, columnFormat, date.getTime (), RelationalOperator.GE);
  }
  public static TupleCriterion ge(String columnName, Date date)
  {
    return new DateCriterion(columnName, null, date.getTime (), RelationalOperator.GE);
  }
  public static TupleCriterion ge(String columnName, String columnFormat, Calendar date)
  {
    return new DateCriterion(columnName, columnFormat, date.getTimeInMillis (), RelationalOperator.GE);
  }
  public static TupleCriterion ge(String columnName, Calendar date)
  {
    return new DateCriterion(columnName, null, date.getTimeInMillis (), RelationalOperator.GE);
  }
  /**
   * compare if column1's value greater or equals to column2's value
   */
  public static TupleCriterion geColumns(final String column1, final String column2)
  {
    return new ColumnsCriterion(column1, column2, RelationalOperator.GE);
  }
 
 
 
  /**
   * less than or equal
   */
  public static TupleCriterion le(String columnName, String value)
  {
    return new StringCriterion(columnName, value, RelationalOperator.LE);
  }
  public static TupleCriterion le(String columnName, Number value)
  {
    return new NumberCriterion(columnName, value.doubleValue(), RelationalOperator.LE);
  }
  public static TupleCriterion le(String columnName, String columnFormat, Date date)
  {
    return new DateCriterion(columnName, columnFormat, date.getTime (), RelationalOperator.LE);
  }
  public static TupleCriterion le(String columnName, Date date)
  {
    return new DateCriterion(columnName, null, date.getTime (), RelationalOperator.LE);
  }
  public static TupleCriterion le(String columnName, String columnFormat, Calendar date)
  {
    return new DateCriterion(columnName, columnFormat, date.getTimeInMillis (), RelationalOperator.LE);
  }
  public static TupleCriterion le(String columnName, Calendar date)
  {
    return new DateCriterion(columnName, null, date.getTimeInMillis (), RelationalOperator.LE);
  }
  /**
   * compare if column1's value less than column2's value
   */
  public static TupleCriterion leColumns(final String column1, final String column2)
  {
    return new ColumnsCriterion(column1, column2, RelationalOperator.LE);
  }
 
 
 
  /**
   * less than
   */
  public static TupleCriterion lt(String columnName, String value)
  {
    return new StringCriterion(columnName, value, RelationalOperator.LT);
  }
  public static TupleCriterion lt(String columnName, Number value)
  {
    return new NumberCriterion(columnName, value.doubleValue(), RelationalOperator.LT);
  }
  public static TupleCriterion lt(String columnName, String columnFormat, Date date)
  {
    return new DateCriterion(columnName, columnFormat, date.getTime (), RelationalOperator.LT);
  }
  public static TupleCriterion lt(String columnName, Date date)
  {
    return new DateCriterion(columnName, null, date.getTime (), RelationalOperator.LT);
  }
  public static TupleCriterion lt(String columnName, String columnFormat, Calendar date)
  {
    return new DateCriterion(columnName, columnFormat, date.getTimeInMillis (), RelationalOperator.LT);
  }
  public static TupleCriterion lt(String columnName, Calendar date)
  {
    return new DateCriterion(columnName, null, date.getTimeInMillis (), RelationalOperator.LT);
  }
  /**
   * compare if column1's value less or equals to column2's value
   */
  public static TupleCriterion ltColumns(final String column1, final String column2)
  {
    return new ColumnsCriterion(column1, column2, RelationalOperator.LT);
  }
}
TOP

Related Classes of com.ebay.erl.mobius.core.criterion.TupleRestrictions

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.