Package eu.stratosphere.test.recordJobs.relational

Source Code of eu.stratosphere.test.recordJobs.relational.TPCHQueryAsterix$JoinCO

/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/

package eu.stratosphere.test.recordJobs.relational;

import java.io.Serializable;
import java.util.Iterator;

import eu.stratosphere.api.common.Plan;
import eu.stratosphere.api.common.Program;
import eu.stratosphere.api.common.ProgramDescription;
import eu.stratosphere.api.java.record.operators.FileDataSink;
import eu.stratosphere.api.java.record.operators.FileDataSource;
import eu.stratosphere.api.java.record.functions.JoinFunction;
import eu.stratosphere.api.java.record.functions.ReduceFunction;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFields;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFieldsSecondExcept;
import eu.stratosphere.api.java.record.io.CsvInputFormat;
import eu.stratosphere.api.java.record.io.CsvOutputFormat;
import eu.stratosphere.api.java.record.operators.JoinOperator;
import eu.stratosphere.api.java.record.operators.ReduceOperator;
import eu.stratosphere.api.java.record.operators.ReduceOperator.Combinable;
import eu.stratosphere.types.IntValue;
import eu.stratosphere.types.Record;
import eu.stratosphere.types.StringValue;
import eu.stratosphere.util.Collector;

/**
* The TPC-H is a decision support benchmark on relational data.
* Its documentation and the data generator (DBGEN) can be found
* on http://www.tpc.org/tpch/ . This implementation is tested with
* the DB2 data format.
*
* This program implements a query on the TPC-H schema
* including one join and an aggregation.
* This query is used as example in the Asterix project (http://asterix.ics.uci.edu/).
*
* SELECT c_mktsegment, COUNT(o_orderkey)
*   FROM orders, customer
*   WHERE c_custkey = o_custkey
* GROUP BY c_mktsegment;
*
*/

public class TPCHQueryAsterix implements Program, ProgramDescription {

  private static final long serialVersionUID = 1L;


  /**
   * Realizes the join between Customers and Order table.
   */
  @ConstantFieldsSecondExcept(0)
  public static class JoinCO extends JoinFunction implements Serializable {
    private static final long serialVersionUID = 1L;

    private final IntValue one = new IntValue(1);
   
    /**
     * Output Schema:
     *  0: PARTIAL_COUNT=1
     *  1: C_MKTSEGMENT
     */
    @Override
    public void join(Record order, Record cust, Collector<Record> out)
        throws Exception {
      cust.setField(0, one);
      out.collect(cust);
    }
  }

  /**
   * Reduce implements the aggregation of the results. The
   * Combinable annotation is set as the partial counts can be calculated
   * already in the combiner
   *
   */
  @Combinable
  @ConstantFields(1)
  public static class AggCO extends ReduceFunction implements Serializable {
    private static final long serialVersionUID = 1L;

    private final IntValue integer = new IntValue();
    private Record record = new Record();
 
    /**
     * Output Schema:
     * 0: COUNT
     * 1: C_MKTSEGMENT
     *
     */
    @Override
    public void reduce(Iterator<Record> records, Collector<Record> out)
        throws Exception {

      int count = 0;

      while (records.hasNext()) {
        record = records.next();
        count+=record.getField(0, integer).getValue();
      }

      integer.setValue(count);
      record.setField(0, integer);
      out.collect(record);
    }
   
    /**
     * Computes partial counts
     */
    public void combine(Iterator<Record> records, Collector<Record> out)
        throws Exception {
      reduce(records, out);
    }

  }


  @Override
  public Plan getPlan(final String... args) {

    // parse program parameters
    int numSubtasks       = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String ordersPath    = (args.length > 1 ? args[1] : "");
    String customerPath  = (args.length > 2 ? args[2] : "");
    String output        = (args.length > 3 ? args[3] : "");

    /*
     * Output Schema:
     * 0: CUSTOMER_ID
     */
    // create DataSourceContract for Orders input
    FileDataSource orders = new FileDataSource(new CsvInputFormat(), ordersPath, "Orders");
    orders.setDegreeOfParallelism(numSubtasks);
    CsvInputFormat.configureRecordFormat(orders)
      .recordDelimiter('\n')
      .fieldDelimiter('|')
      .field(IntValue.class, 1);
   
    /*
     * Output Schema:
     * 0: CUSTOMER_ID
     * 1: MKT_SEGMENT
     */
    // create DataSourceContract for Customer input
    FileDataSource customers = new FileDataSource(new CsvInputFormat(), customerPath, "Customers");
    customers.setDegreeOfParallelism(numSubtasks);
    CsvInputFormat.configureRecordFormat(customers)
      .recordDelimiter('\n')
      .fieldDelimiter('|')
      .field(IntValue.class, 0)
      .field(StringValue.class, 6);
   
    // create JoinOperator for joining Orders and LineItems
    JoinOperator joinCO = JoinOperator.builder(new JoinCO(), IntValue.class, 0, 0)
      .name("JoinCO")
      .build();
    joinCO.setDegreeOfParallelism(numSubtasks);

    // create ReduceOperator for aggregating the result
    ReduceOperator aggCO = ReduceOperator.builder(new AggCO(), StringValue.class, 1)
      .name("AggCo")
      .build();
    aggCO.setDegreeOfParallelism(numSubtasks);

    // create DataSinkContract for writing the result
    FileDataSink result = new FileDataSink(new CsvOutputFormat(), output, "Output");
    result.setDegreeOfParallelism(numSubtasks);
    CsvOutputFormat.configureRecordFormat(result)
      .recordDelimiter('\n')
      .fieldDelimiter('|')
      .field(IntValue.class, 0)
      .field(StringValue.class, 1);

    // assemble the plan
    result.setInput(aggCO);
    aggCO.setInput(joinCO);
    joinCO.setFirstInput(orders);
    joinCO.setSecondInput(customers);

    return new Plan(result, "TPCH Asterix");
  }


  @Override
  public String getDescription() {
    return "Parameters: [numSubStasks], [orders], [customer], [output]";
  }
}
TOP

Related Classes of eu.stratosphere.test.recordJobs.relational.TPCHQueryAsterix$JoinCO

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.