Package org.apache.tez.mapreduce.examples

Source Code of org.apache.tez.mapreduce.examples.UnionExample$UnionProcessor

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tez.mapreduce.examples;

import java.io.IOException;
import java.util.EnumSet;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.tez.client.TezClient;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.GroupInputEdge;
import org.apache.tez.dag.api.VertexGroup;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.Edge;
import org.apache.tez.dag.api.InputDescriptor;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.TezUncheckedException;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.dag.api.client.DAGClient;
import org.apache.tez.dag.api.client.DAGStatus;
import org.apache.tez.dag.api.client.StatusGetOpts;
import org.apache.tez.mapreduce.input.MRInput;
import org.apache.tez.mapreduce.output.MROutput;
import org.apache.tez.mapreduce.processor.SimpleMRProcessor;
import org.apache.tez.runtime.api.LogicalInput;
import org.apache.tez.runtime.api.Output;
import org.apache.tez.runtime.api.ProcessorContext;
import org.apache.tez.runtime.library.api.KeyValueReader;
import org.apache.tez.runtime.library.api.KeyValueWriter;
import org.apache.tez.runtime.library.api.KeyValuesReader;
import org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig;
import org.apache.tez.runtime.library.input.ConcatenatedMergedKeyValuesInput;
import org.apache.tez.runtime.library.partitioner.HashPartitioner;

import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;

public class UnionExample {

  public static class TokenProcessor extends SimpleMRProcessor {
    IntWritable one = new IntWritable(1);
    Text word = new Text();

    public TokenProcessor(ProcessorContext context) {
      super(context);
    }

    @Override
    public void run() throws Exception {
      Preconditions.checkArgument(getInputs().size() == 1);
      boolean inUnion = true;
      if (getContext().getTaskVertexName().equals("map3")) {
        inUnion = false;
      }
      Preconditions.checkArgument(getOutputs().size() == (inUnion ? 2 : 1));
      Preconditions.checkArgument(getOutputs().containsKey("checker"));
      MRInput input = (MRInput) getInputs().values().iterator().next();
      KeyValueReader kvReader = input.getReader();
      Output output =  getOutputs().get("checker");
      KeyValueWriter kvWriter = (KeyValueWriter) output.getWriter();
      MROutput parts = null;
      KeyValueWriter partsWriter = null;
      if (inUnion) {
        parts = (MROutput) getOutputs().get("parts");
        partsWriter = parts.getWriter();
      }
      while (kvReader.next()) {
        StringTokenizer itr = new StringTokenizer(kvReader.getCurrentValue().toString());
        while (itr.hasMoreTokens()) {
          word.set(itr.nextToken());
          kvWriter.write(word, one);
          if (inUnion) {
            partsWriter.write(word, one);
          }
        }
      }
    }

  }

  public static class UnionProcessor extends SimpleMRProcessor {
    IntWritable one = new IntWritable(1);

    public UnionProcessor(ProcessorContext context) {
      super(context);
    }

    @Override
    public void run() throws Exception {
      Preconditions.checkArgument(getInputs().size() == 2);
      Preconditions.checkArgument(getOutputs().size() == 2);
      MROutput out = (MROutput) getOutputs().get("union");
      MROutput allParts = (MROutput) getOutputs().get("all-parts");
      KeyValueWriter kvWriter = out.getWriter();
      KeyValueWriter partsWriter = allParts.getWriter();
      Map<String, AtomicInteger> unionKv = Maps.newHashMap();
      LogicalInput union = getInputs().get("union");
      KeyValuesReader kvReader = (KeyValuesReader) union.getReader();
      while (kvReader.next()) {
        String word = ((Text) kvReader.getCurrentKey()).toString();
        IntWritable intVal = (IntWritable) kvReader.getCurrentValues().iterator().next();
        for (int i = 0; i < intVal.get(); ++i) {
          partsWriter.write(word, one);
        }
        AtomicInteger value = unionKv.get(word);
        if (value == null) {
          unionKv.put(word, new AtomicInteger(intVal.get()));
        } else {
          value.addAndGet(intVal.get());
        }
      }
      LogicalInput map3 = getInputs().get("map3");
      kvReader = (KeyValuesReader) map3.getReader();
      while (kvReader.next()) {
        String word = ((Text) kvReader.getCurrentKey()).toString();
        IntWritable intVal = (IntWritable) kvReader.getCurrentValues().iterator().next();
        AtomicInteger value = unionKv.get(word);
        if (value == null) {
          throw new TezUncheckedException("Expected to exist: " + word);
        } else {
          value.getAndAdd(intVal.get() * -2);
        }
      }
      for (AtomicInteger value : unionKv.values()) {
        if (value.get() != 0) {
          throw new TezUncheckedException("Unexpected non-zero value");
        }
      }
      kvWriter.write("Union", new IntWritable(unionKv.size()));
    }

  }

  private DAG createDAG(FileSystem fs, TezConfiguration tezConf,
      Map<String, LocalResource> localResources, Path stagingDir,
      String inputPath, String outputPath) throws IOException {
    DAG dag = DAG.create("UnionExample");
   
    int numMaps = -1;
    Configuration inputConf = new Configuration(tezConf);
    inputConf.setBoolean("mapred.mapper.new-api", false);
    inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
    inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
    MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
    DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();

    Vertex mapVertex1 = Vertex.create("map1", ProcessorDescriptor.create(
        TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

    Vertex mapVertex2 = Vertex.create("map2", ProcessorDescriptor.create(
        TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

    Vertex mapVertex3 = Vertex.create("map3", ProcessorDescriptor.create(
        TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

    Vertex checkerVertex = Vertex.create("checker", ProcessorDescriptor.create(
        UnionProcessor.class.getName()), 1);

    Configuration outputConf = new Configuration(tezConf);
    outputConf.setBoolean("mapred.reducer.new-api", false);
    outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
    outputConf.set(FileOutputFormat.OUTDIR, outputPath);
    DataSinkDescriptor od = MROutput.createConfigBuilder(outputConf, null).build();
    checkerVertex.addDataSink("union", od);
   

    Configuration allPartsConf = new Configuration(tezConf);
    DataSinkDescriptor od2 = MROutput.createConfigBuilder(allPartsConf,
        TextOutputFormat.class, outputPath + "-all-parts").build();
    checkerVertex.addDataSink("all-parts", od2);

    Configuration partsConf = new Configuration(tezConf);   
    DataSinkDescriptor od1 = MROutput.createConfigBuilder(partsConf,
        TextOutputFormat.class, outputPath + "-parts").build();
    VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
    unionVertex.addDataSink("parts", od1);

    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), IntWritable.class.getName(),
            HashPartitioner.class.getName()).build();

    dag.addVertex(mapVertex1)
        .addVertex(mapVertex2)
        .addVertex(mapVertex3)
        .addVertex(checkerVertex)
        .addEdge(
            Edge.create(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty()))
        .addEdge(
            GroupInputEdge.create(unionVertex, checkerVertex, edgeConf.createDefaultEdgeProperty(),
                InputDescriptor.create(
                    ConcatenatedMergedKeyValuesInput.class.getName())));
    return dag; 
  }

  private static void printUsage() {
    System.err.println("Usage: " + " unionexample <in1> <out1>");
  }

  public boolean run(String inputPath, String outputPath, Configuration conf) throws Exception {
    System.out.println("Running UnionExample");
    // conf and UGI
    TezConfiguration tezConf;
    if (conf != null) {
      tezConf = new TezConfiguration(conf);
    } else {
      tezConf = new TezConfiguration();
    }
    UserGroupInformation.setConfiguration(tezConf);
    String user = UserGroupInformation.getCurrentUser().getShortUserName();

    // staging dir
    FileSystem fs = FileSystem.get(tezConf);
    String stagingDirStr = Path.SEPARATOR + "user" + Path.SEPARATOR
        + user + Path.SEPARATOR+ ".staging" + Path.SEPARATOR
        + Path.SEPARATOR + Long.toString(System.currentTimeMillis());   
    Path stagingDir = new Path(stagingDirStr);
    tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stagingDirStr);
    stagingDir = fs.makeQualified(stagingDir);
   

    // No need to add jar containing this class as assumed to be part of
    // the tez jars.

    // TEZ-674 Obtain tokens based on the Input / Output paths. For now assuming staging dir
    // is the same filesystem as the one used for Input/Output.
   
    TezClient tezSession = TezClient.create("UnionExampleSession", tezConf);
    tezSession.start();

    DAGClient dagClient = null;

    try {
        if (fs.exists(new Path(outputPath))) {
          throw new FileAlreadyExistsException("Output directory "
              + outputPath + " already exists");
        }
       
        Map<String, LocalResource> localResources =
          new TreeMap<String, LocalResource>();
       
        DAG dag = createDAG(fs, tezConf, localResources,
            stagingDir, inputPath, outputPath);

        tezSession.waitTillReady();
        dagClient = tezSession.submitDAG(dag);

        // monitoring
        DAGStatus dagStatus = dagClient.waitForCompletionWithStatusUpdates(EnumSet.of(StatusGetOpts.GET_COUNTERS));
        if (dagStatus.getState() != DAGStatus.State.SUCCEEDED) {
          System.out.println("DAG diagnostics: " + dagStatus.getDiagnostics());
          return false;
        }
        return true;
    } finally {
      fs.delete(stagingDir, true);
      tezSession.stop();
    }
  }

  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      printUsage();
      System.exit(2);
    }
    UnionExample job = new UnionExample();
    job.run(args[0], args[1], null);
  }
}
TOP

Related Classes of org.apache.tez.mapreduce.examples.UnionExample$UnionProcessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.