Package org.apache.flink.test.recordJobs.sort

Source Code of org.apache.flink.test.recordJobs.sort.TeraSort

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


package org.apache.flink.test.recordJobs.sort;

import org.apache.flink.api.common.Plan;
import org.apache.flink.api.common.Program;
import org.apache.flink.api.common.ProgramDescription;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.operators.Ordering;
import org.apache.flink.api.java.record.operators.FileDataSink;
import org.apache.flink.api.java.record.operators.FileDataSource;
import org.apache.flink.test.recordJobs.sort.tsUtil.TeraDistribution;
import org.apache.flink.test.recordJobs.sort.tsUtil.TeraInputFormat;
import org.apache.flink.test.recordJobs.sort.tsUtil.TeraKey;
import org.apache.flink.test.recordJobs.sort.tsUtil.TeraOutputFormat;

/**
* This is an example implementation of the TeraSort benchmark using the Flink system. The benchmark
* requires the input data to be generated according to the rules of Jim Gray's sort benchmark. A possible way to such
* input data is the Hadoop TeraGen program. For more details see <a
* href="http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/examples/terasort/TeraGen.html">
* http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/examples/terasort/TeraGen.html</a>.
*/
public final class TeraSort implements Program, ProgramDescription {

  private static final long serialVersionUID = 1L;


  @Override
  public String getDescription() {
    return "Parameters: [numSubStasks] [input] [output]";
  }


  @Override
  public Plan getPlan(String... args) throws IllegalArgumentException {
    // parse job parameters
    final int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    final String input = (args.length > 1 ? args[1] : "");
    final String output = (args.length > 2 ? args[2] : "");

    // This task will read the input data and generate the key/value pairs
    final FileDataSource source =
        new FileDataSource(new TeraInputFormat(), input, "Data Source");
    source.setDegreeOfParallelism(numSubTasks);

    // This task writes the sorted data back to disk
    final FileDataSink sink =
        new FileDataSink(new TeraOutputFormat(), output, "Data Sink");
    sink.setDegreeOfParallelism(numSubTasks);
    sink.setGlobalOrder(new Ordering(0, TeraKey.class, Order.ASCENDING), new TeraDistribution());

    sink.setInput(source);

    return new Plan(sink, "TeraSort");
  }
}
TOP

Related Classes of org.apache.flink.test.recordJobs.sort.TeraSort

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.