Examples of org.apache.hadoop.mapred.JobClient

  /**
   * Run the canopy clustering job over the given input.
   */
  public static void runJob(String input, String output,
                            String measureClassName, double t1, double t2) throws IOException {
    log.info("Input: {} Out: {} "
      + "Measure: {} t1: {} t2: {}", new Object[] {input, output, measureClassName, t1, t2});
    Configurable client = new JobClient();
    JobConf conf = new JobConf(CanopyDriver.class);
    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
    conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1));
    conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2));
   
    conf.setInputFormat(SequenceFileInputFormat.class);
   
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(VectorWritable.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Canopy.class);
   
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
   
    conf.setMapperClass(CanopyMapper.class);
    conf.setReducerClass(CanopyReducer.class);
    conf.setNumReduceTasks(1);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
   
    client.setConf(conf);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
      dfs.delete(outPath, true);
    }
    JobClient.runJob(conf);
  }
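All of the snippets on this page follow the same skeleton: build a JobConf, pick input/output formats and key/value classes, wire up mapper and reducer, delete any stale output directory, then submit with the static JobClient.runJob(conf). Below is a minimal, self-contained sketch of that pattern using only stock Hadoop classes; the class name and the assumption of Text/Text SequenceFile input are hypothetical placeholders, not Mahout code.

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public final class MinimalJobClientExample {
  public static void runJob(String input, String output) throws IOException {
    JobConf conf = new JobConf(MinimalJobClientExample.class);
    conf.setJobName("minimal-jobclient-example");

    // I/O formats and key/value types for a SequenceFile-to-SequenceFile job.
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    // The old mapred API fails if the output path already exists, hence the delete.
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    if (fs.exists(outPath)) {
      fs.delete(outPath, true);
    }

    // Submit the job and block until it completes.
    JobClient.runJob(conf);
  }
}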

Examples of org.apache.hadoop.mapred.JobClient

  public static void runJob(String points,
                            String canopies,
                            String output,
                            String measureClassName,
                            double t1,
                            double t2) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(ClusterDriver.class);
   
    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
    conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1));
    conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2));
    conf.set(CanopyConfigKeys.CANOPY_PATH_KEY, canopies);
   
    conf.setInputFormat(SequenceFileInputFormat.class);
   
    /*
     * conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(RandomAccessSparseVector.class);
     */
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(VectorWritable.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
   
    FileInputFormat.setInputPaths(conf, new Path(points));
    Path outPath = new Path(output + DEFAULT_CLUSTER_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(conf, outPath);
   
    conf.setMapperClass(ClusterMapper.class);
    conf.setReducerClass(IdentityReducer.class);
   
    client.setConf(conf);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
      dfs.delete(outPath, true);
    }
    JobClient.runJob(conf);
  }

Examples of org.apache.hadoop.mapred.JobClient

  /**
   * @param all
   *          if true select all categories
   */
  public static void runJob(String input, String output, String catFile,
                            boolean exactMatchOnly, boolean all) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(WikipediaToSequenceFile.class);
    if (WikipediaToSequenceFile.log.isInfoEnabled()) {
      log.info("Input: " + input + " Out: " + output + " Categories: " + catFile
                                       + " All Files: " + all);
    }
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.setBoolean("all.files", all);
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(WikipediaMapper.class);
    conf.setInputFormat(XmlInputFormat.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
   
    /*
     * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type",
     * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type",
     * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
     */
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
      dfs.delete(outPath, true);
    }
   
    Set<String> categories = new HashSet<String>();
    if (catFile.length() > 0) {
      for (String line : new FileLineIterable(new File(catFile))) {
        categories.add(line.trim().toLowerCase());
      }
    }
   
    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil
        .getClass(categories));
   
    String categoriesStr = setStringifier.toString(categories);
   
    conf.set("wikipedia.categories", categoriesStr);
   
    client.setConf(conf);
    JobClient.runJob(conf);
  }
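The driver above serializes the category Set into the job configuration with DefaultStringifier; the matching read happens on the task side. Below is a hedged sketch of what a consuming mapper's configure() could look like — the "wikipedia.categories" key matches the driver, but the class itself is a hypothetical illustration, not the actual WikipediaMapper. Note that DefaultStringifier depends on the io.serializations setting made a few lines earlier.

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.util.GenericsUtil;

public class CategoryAwareMapperSketch extends MapReduceBase {
  private Set<String> categories;

  @Override
  public void configure(JobConf job) {
    try {
      // A non-null prototype instance gives GenericsUtil the runtime class.
      Set<String> prototype = new HashSet<String>();
      DefaultStringifier<Set<String>> setStringifier =
          new DefaultStringifier<Set<String>>(job, GenericsUtil.getClass(prototype));
      // Reverse of the driver's conf.set("wikipedia.categories", categoriesStr).
      categories = setStringifier.fromString(job.get("wikipedia.categories"));
    } catch (IOException e) {
      throw new IllegalStateException(e);
    }
  }
}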

Examples of org.apache.hadoop.mapred.JobClient

  public static void runJob(String input,
                            String output,
                            String catFile,
                            boolean exactMatchOnly,
                            Class<? extends Analyzer> analyzerClass) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(WikipediaDatasetCreatorDriver.class);
    if (WikipediaDatasetCreatorDriver.log.isInfoEnabled()) {
      log.info("Input: {} Out: {} Categories: {}", new Object[] {input, output,
                                                                                               catFile});
    }
    conf.set("key.value.separator.in.input.line", " ");
    conf.set("xmlinput.start", "<text xml:space=\"preserve\">");
    conf.set("xmlinput.end", "</text>");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.set("analyzer.class", analyzerClass.getName());
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(WikipediaDatasetCreatorMapper.class);
    conf.setNumMapTasks(100);
    conf.setInputFormat(XmlInputFormat.class);
    // conf.setCombinerClass(WikipediaDatasetCreatorReducer.class);
    conf.setReducerClass(WikipediaDatasetCreatorReducer.class);
    conf.setOutputFormat(WikipediaDatasetCreatorOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters can make or break a piece of code.
   
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
      dfs.delete(outPath, true);
    }
   
    Set<String> categories = new HashSet<String>();
    for (String line : new FileLineIterable(new File(catFile))) {
      categories.add(line.trim().toLowerCase());
    }
   
    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil
        .getClass(categories));
   
    String categoriesStr = setStringifier.toString(categories);
   
    conf.set("wikipedia.categories", categoriesStr);
   
    client.setConf(conf);
    JobClient.runJob(conf);
  }

Examples of org.apache.hadoop.mapred.JobClient

  /**
   * @param output
   *          the output pathname String
   */
  @Override
  public void runJob(String input, String output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);
   
    conf.setJobName("Bayes Theta Normalizer Driver running over input: " + input);
   
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output + "/trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters can make or break a piece of code.
   
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
      dfs.delete(outPath, true);
    }
   
    Path sigmaKFiles = new Path(output + "/trainer-weights/Sigma_k/*");
    Map<String,Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String,Double>> mapStringifier = new DefaultStringifier<Map<String,Double>>(conf,
        GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);
   
    log.info("Sigma_k for Each Label");
    Map<String,Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);
   
    Path sigmaJSigmaKFile = new Path(output + "/trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaJSigmaKFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);
   
    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);
   
    Path vocabCountFile = new Path(output + "/trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);
   
    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);
    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output);
    client.setConf(conf);
   
    JobClient.runJob(conf);
   
  }

Examples of org.apache.hadoop.mapred.JobClient

  private static void makePartialVectors(String input,
                                         int maxNGramSize,
                                         Path dictionaryFilePath,
                                         Path output,
                                         int dimension,
                                         boolean sequentialAccess) throws IOException {
   
    Configurable client = new JobClient();
    JobConf conf = new JobConf(DictionaryVectorizer.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialization of conf values
   
    conf.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input
                    + ", dictionary-file: " + dictionaryFilePath.toString());
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
   
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(StringTuple.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(VectorWritable.class);
    DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
    FileInputFormat.setInputPaths(conf, new Path(input));
   
    FileOutputFormat.setOutputPath(conf, output);
   
    conf.setMapperClass(IdentityMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setReducerClass(TFPartialVectorReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    FileSystem dfs = FileSystem.get(output.toUri(), conf);
    if (dfs.exists(output)) {
      dfs.delete(output, true);
    }
   
    client.setConf(conf);
    JobClient.runJob(conf);
  }
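The driver above ships the dictionary to every task with DistributedCache.setCacheFiles(...). A sketch of the consuming side is shown below: the reducer recovers the dictionary URI in configure(). Reading the dictionary contents is elided, and the class is a hypothetical illustration rather than Mahout's TFPartialVectorReducer.

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public class DictionaryCacheSketch extends MapReduceBase {
  private Path dictionaryPath;

  @Override
  public void configure(JobConf job) {
    try {
      URI[] cacheFiles = DistributedCache.getCacheFiles(job);
      if (cacheFiles == null || cacheFiles.length == 0) {
        throw new IllegalStateException("No dictionary file in the DistributedCache");
      }
      // The first (and only) cached file is the dictionary set by the driver.
      dictionaryPath = new Path(cacheFiles[0].getPath());
    } catch (IOException e) {
      throw new IllegalStateException(e);
    }
  }
}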

Examples of org.apache.hadoop.mapred.JobClient

  /**
   * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
   * {@link SequenceFile} format
   */
  private static void startWordCounting(Path input, Path output, int minSupport) throws IOException {
   
    Configurable client = new JobClient();
    JobConf conf = new JobConf(DictionaryVectorizer.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialization of conf values
   
    conf.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input.toString());
    conf.setInt(MIN_SUPPORT, minSupport);
   
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(LongWritable.class);
   
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
   
    conf.setMapperClass(TermCountMapper.class);
   
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(TermCountReducer.class);
    conf.setReducerClass(TermCountReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
   
    FileSystem dfs = FileSystem.get(output.toUri(), conf);
    if (dfs.exists(output)) {
      dfs.delete(output, true);
    }
   
    client.setConf(conf);
    JobClient.runJob(conf);
  }
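The word-count job above registers the same class as both combiner and reducer. That only works when the reduce function is associative and commutative and its output types match its input types. A minimal hypothetical example of such a class is shown below; the real TermCountReducer presumably also honors the MIN_SUPPORT setting, which this sketch omits.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class LongSumReducerSketch extends MapReduceBase
    implements Reducer<Text, LongWritable, Text, LongWritable> {

  @Override
  public void reduce(Text key, Iterator<LongWritable> values,
                     OutputCollector<Text, LongWritable> output,
                     Reporter reporter) throws IOException {
    long sum = 0;
    while (values.hasNext()) {
      sum += values.next().get();
    }
    // Emitting (Text, LongWritable) keeps combiner and reducer interchangeable.
    output.collect(key, new LongWritable(sum));
  }
}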

Examples of org.apache.hadoop.mapred.JobClient

  /**
   * @throws IOException
   */
  public static void tokenizeDocuments(String input, Class<? extends Analyzer> analyzerClass,
                                       String output) throws IOException {
   
    Configurable client = new JobClient();
    JobConf conf = new JobConf(DocumentProcessor.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialization of conf values
   
    conf.set(ANALYZER_CLASS, analyzerClass.getName());
    conf.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: " + input);
   
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
   
    conf.setMapperClass(SequenceFileTokenizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setNumReduceTasks(0);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
      dfs.delete(outPath, true);
    }
   
    client.setConf(conf);
    JobClient.runJob(conf);
  }

Examples of org.apache.hadoop.mapred.JobClient

        return;
      }
     
      if (cmdLine.hasOption(seqOpt)) {
        Path path = new Path(cmdLine.getValue(seqOpt).toString());
        JobClient client = new JobClient();
        JobConf conf = new JobConf(Job.class);
        client.setConf(conf);
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
       
        Writer writer;
        if (cmdLine.hasOption(outputOpt)) {
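The snippet above is cut off right after the SequenceFile.Reader is opened. A typical read loop (an assumed completion, not the original tool's code) instantiates key/value objects from the classes recorded in the file header and iterates with reader.next():

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public final class SequenceFileDumpSketch {
  public static void dump(Path path, Configuration conf) throws Exception {
    FileSystem fs = FileSystem.get(path.toUri(), conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    try {
      // Key/value classes are recorded in the SequenceFile header.
      Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
      Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
      while (reader.next(key, value)) {
        System.out.println(key + "\t" + value);
      }
    } finally {
      reader.close();
    }
  }
}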

Examples of org.apache.hadoop.mapred.JobClient

  /**
   * @param output
   *          the output pathname String
   */
  @Override
  public void runJob(String input, String output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);
    conf.setJobName("Bayes Feature Driver running over input: " + input);
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
   
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
   
    conf.setMapperClass(BayesFeatureMapper.class);
   
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureReducer.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialization of conf values
   
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
      dfs.delete(outPath, true);
    }
    conf.set("bayes.parameters", params.toString());
   
    client.setConf(conf);
    JobClient.runJob(conf);
   
  }