Package org.apache.hadoop.mapred

Examples of org.apache.hadoop.mapred.JobConf
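
As an orienting sketch before the real-world excerpts below: a minimal job built with the classic org.apache.hadoop.mapred API. The class names MyJob, MyMapper and MyReducer are placeholders, not classes from the excerpts; the usual org.apache.hadoop.fs, io, and mapred imports are assumed.

    // Minimal sketch of the classic (pre-mapreduce) JobConf-based API.
    JobConf job = new JobConf(new Configuration(), MyJob.class);
    job.setJobName("jobconf-example");
    FileInputFormat.setInputPaths(job, new Path("/input"));
    FileOutputFormat.setOutputPath(job, new Path("/output"));
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    JobClient.runJob(job);   // blocks until the job finishes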


      // need to get unique dir per cluster
      System.setProperty("test.build.data", "build/test/data/" + clusterName);

      log.info("Starting cluster=" + clusterName);

      Configuration config = new JobConf();

      // Work around DataNode data directory permission checks: tell DFS to
      // expect directories created with the current umask.
      String umask = getCurrentUmask(tmpDir, config);
      if (umask != null) {
        log.info("Setting expected umask to " + umask);
        config.set("dfs.datanode.data.dir.perm", umask);
      }

      // The DFS cluster updates this config as it starts up.
      // Newer MiniDFSCluster versions use a builder pattern,
      // but we still need to support older versions
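
The truncated comment above refers to the two ways of starting a MiniDFSCluster. A hedged sketch of both, assuming the Hadoop HDFS test artifact is on the classpath and reusing the config variable from the excerpt:

      // Newer Hadoop versions expose a builder:
      MiniDFSCluster cluster = new MiniDFSCluster.Builder(config)
          .numDataNodes(1)
          .build();

      // Older versions only offer the legacy constructor
      // (conf, number of datanodes, format, racks):
      // MiniDFSCluster cluster = new MiniDFSCluster(config, 1, true, null);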


                                          "host2", "host3"});
      }
   @Test
   public void testNumInputs() throws Exception {
     Configuration conf = new Configuration();
     JobConf job = new JobConf(conf);
     MiniDFSCluster dfs = newDFSCluster(job);
        FileSystem fs = dfs.getFileSystem();
        System.out.println("FileSystem " + fs.getUri());
        Path inputDir = new Path("/foo/");
        final int numFiles = 10;
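
A hypothetical continuation of the truncated test above: write numFiles small files into inputDir and check that the old-API FileInputFormat produces one split per file. The file contents and the assertion are assumptions about the test's intent, not the original code.

    // Hypothetical continuation: create the input files...
    for (int i = 0; i < numFiles; i++) {
      Path file = new Path(inputDir, "file-" + i);
      FSDataOutputStream out = fs.create(file);
      out.writeBytes("some data\n");
      out.close();
    }

    // ...and verify that each small file becomes exactly one split.
    FileInputFormat.setInputPaths(job, inputDir);
    TextInputFormat inFormat = new TextInputFormat();
    inFormat.configure(job);
    InputSplit[] splits = inFormat.getSplits(job, numFiles);
    assertEquals(numFiles, splits.length);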

    // Switch to this if you'd like to look at all text files.  May take many minutes just to read the file listing.
    //String inputPath = "s3n://aws-publicdatasets/common-crawl/parse-output/segment/*/textData-*";

    // Read in any additional config parameters before the job configuration
    // is created, so that they are picked up by the JobConf below.
    if (configFile != null) {
      LOG.info("adding config parameters from '" + configFile + "'");
      this.getConf().addResource(configFile);
    }

    // Creates a new job configuration for this Hadoop job.
    JobConf job = new JobConf(this.getConf());

    job.setJarByClass(TotalAnalysis.class);

    // Build the input paths from the valid_segments list instead of one huge
    // glob (workaround from the Common Crawl Google Groups discussion).
    String segmentListFile = "s3n://aws-publicdatasets/common-crawl/parse-output/valid_segments.txt";

    FileSystem fsInput = FileSystem.get(new URI(segmentListFile), job);
    BufferedReader reader = new BufferedReader(new InputStreamReader(fsInput.open(new Path(segmentListFile))));

    String segmentId;

    while ((segmentId = reader.readLine()) != null) {
      String inputPath = "s3n://aws-publicdatasets/common-crawl/parse-output/segment/" + segmentId + "/textData-*";
      FileInputFormat.addInputPath(job, new Path(inputPath));
    }
    reader.close();

    // (Alternative) Scan a single provided input path instead of the segment list:
    //LOG.info("setting input path to '"+ inputPath + "'");
    //FileInputFormat.addInputPath(job, new Path(inputPath));
    //FileInputFormat.setInputPathFilter(job, SampleFilter.class);

    // Delete the output path directory if it already exists.
    LOG.info("clearing the output path at '" + outputPath + "'");

    FileSystem fs = FileSystem.get(new URI(outputPath), job);

    if (fs.exists(new Path(outputPath)))
      fs.delete(new Path(outputPath), true);

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    // Set which InputFormat class to use.
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Set which OutputFormat class to use.
    job.setOutputFormat(TextOutputFormat.class);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(TotalAnalysis.TotalAnalysisMapper.class);
    job.setReducerClass(TotalAnalysis.TotalAnalysisReducer.class);

    if (JobClient.runJob(job).isSuccessful())
      return 0;
    else
      return 1;
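
These run() methods follow the Hadoop Tool pattern, which the use of this.getConf() suggests. Assuming TotalAnalysis implements org.apache.hadoop.util.Tool, a sketch of the usual main() driver:

    public static void main(String[] args) throws Exception {
      // ToolRunner parses generic options (-D, -conf, -libjars, ...) before
      // handing the remaining arguments to run().
      int result = ToolRunner.run(new Configuration(), new TotalAnalysis(), args);
      System.exit(result);
    }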

      throw new IllegalArgumentException("'run()' must be passed an output path.");

    outputPath = args[0];

    // Creates a new job configuration for this Hadoop job.
    JobConf job = new JobConf(this.getConf());

    job.setJarByClass(ExampleMetadataStats.class);

    baseInputPath = "s3n://aws-publicdatasets/common-crawl/parse-output/segment";

    FileSystem fs = null;

    // If you would like to process all segments, comment this out and
    // uncomment the block of code below
    String inputPath = baseInputPath + "/1341690154994/metadata-00062";

    LOG.info("adding input path '" + inputPath + "'");
    FileInputFormat.addInputPath(job, new Path(inputPath));
    /*
    fs = FileSystem.get(new URI("s3n://aws-publicdatasets"), job);

    for (FileStatus fileStatus : fs.globStatus(new Path("/common-crawl/parse-output/valid_segments/[0-9]*"))) {
      String[] parts = fileStatus.getPath().toString().split("/");
      String inputPath = baseInputPath + "/" + parts[parts.length-1] + "/metadata-*";
      LOG.info("adding input path '" + inputPath + "'");
      FileInputFormat.addInputPath(job, new Path(inputPath));
    }
    */

    // Delete the output path directory if it already exists.
    LOG.info("clearing the output path at '" + outputPath + "'");

    fs = FileSystem.get(new URI(outputPath), job);

    if (fs.exists(new Path(outputPath)))
      fs.delete(new Path(outputPath), true);

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    // Set which InputFormat class to use.
    job.setInputFormat(SequenceFileInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormat(TextOutputFormat.class);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(ExampleMetadataStats.ExampleMetadataStatsMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    if (JobClient.runJob(job).isSuccessful())
      return 0;
    else
      return 1;
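
Using LongSumReducer as both combiner and reducer works because summing longs is associative and the reducer's input and output types match. A hypothetical mapper with the compatible (Text, LongWritable) output shape; the real ExampleMetadataStatsMapper is not shown here, and its input types may differ:

    public static class CountingMapper extends MapReduceBase
        implements Mapper<Text, Text, Text, LongWritable> {

      private static final LongWritable ONE = new LongWritable(1);

      public void map(Text key, Text value,
                      OutputCollector<Text, LongWritable> output, Reporter reporter)
          throws IOException {
        // Emit one count per record; LongSumReducer adds these up per key,
        // both as the combiner and as the final reducer.
        output.collect(new Text("total_records"), ONE);
      }
    }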

      LOG.info("adding config parameters from '"+ configFile + "'");
      this.getConf().addResource(configFile);
    }

    // Creates a new job configuration for this Hadoop job.
    JobConf job = new JobConf(this.getConf());

    job.setJarByClass(ExampleArcMicroformat.class);

    // Scan the provided input path for ARC files.
    LOG.info("setting input path to '"+ inputPath + "'");
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputPathFilter(job, SampleFilter.class);

    // Delete the output path directory if it already exists.
    LOG.info("clearing the output path at '" + outputPath + "'");

    FileSystem fs = FileSystem.get(new URI(outputPath), job);

    if (fs.exists(new Path(outputPath)))
      fs.delete(new Path(outputPath), true);

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    // Set which InputFormat class to use.
    job.setInputFormat(ArcInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormat(TextOutputFormat.class);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(ExampleArcMicroformat.ExampleArcMicroformatMapper.class);
    job.setReducerClass(LongSumReducer.class);

    if (JobClient.runJob(job).isSuccessful())
      return 0;
    else
      return 1;
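
setInputPathFilter expects a class implementing org.apache.hadoop.fs.PathFilter. The real SampleFilter is not shown in the excerpt; a hedged sketch of what such a filter can look like:

    // Hypothetical filter: accept only the first few matched files,
    // useful for sampling a large input set during development.
    public static class SampleFilter implements PathFilter {
      private static final int MAX_FILES = 10;
      private int accepted = 0;

      public boolean accept(Path path) {
        return accepted++ < MAX_FILES;
      }
    }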

      LOG.info("adding config parameters from '"+ configFile + "'");
      this.getConf().addResource(configFile);
    }

    // Creates a new job configuration for this Hadoop job.
    JobConf job = new JobConf(this.getConf());

    job.setJarByClass(ExampleTextWordCount.class);

    // Scan the provided input path for the text-data sequence files.
    LOG.info("setting input path to '"+ inputPath + "'");
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputPathFilter(job, SampleFilter.class);

    // Delete the output path directory if it already exists.
    LOG.info("clearing the output path at '" + outputPath + "'");

    FileSystem fs = FileSystem.get(new URI(outputPath), job);

    if (fs.exists(new Path(outputPath)))
      fs.delete(new Path(outputPath), true);

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    // Set which InputFormat class to use.
    job.setInputFormat(SequenceFileInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormat(TextOutputFormat.class);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(ExampleTextWordCount.ExampleTextWordCountMapper.class);
    job.setReducerClass(LongSumReducer.class);

    if (JobClient.runJob(job).isSuccessful())
      return 0;
    else
      return 1;
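
The examples above disable output compression. To get compressed part files instead, the old API takes a codec class; GzipCodec below is one real option, and whether to use it depends on what consumes the output.

    // Alternative to the setCompressOutput(job, false) lines above:
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);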

      LOG.info("adding config parameters from '"+ configFile + "'");
      this.getConf().addResource(configFile);
    }

    // Creates a new job configuration for this Hadoop job.
    JobConf job = new JobConf(this.getConf());

    job.setJarByClass(ExampleMetadataDomainPageCount.class);

    // Scan the provided input path for the metadata sequence files.
    LOG.info("setting input path to '"+ inputPath + "'");
    FileInputFormat.addInputPath(job, new Path(inputPath));

    // Optionally, you can add in a custom input path filter
    // FileInputFormat.setInputPathFilter(job, SampleFilter.class);

    // Delete the output path directory if it already exists.
    LOG.info("clearing the output path at '" + outputPath + "'");

    FileSystem fs = FileSystem.get(new URI(outputPath), job);

    if (fs.exists(new Path(outputPath)))
      fs.delete(new Path(outputPath), true);

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    // Set which InputFormat class to use.
    job.setInputFormat(SequenceFileInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormat(TextOutputFormat.class);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(ExampleMetadataDomainPageCount.ExampleMetadataDomainPageCountMapper.class);
    job.setReducerClass(LongSumReducer.class);

    if (JobClient.runJob(job).isSuccessful())
      return 0;
    else
      return 1;
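
JobClient.runJob(job) blocks until the job finishes. When you want to submit and poll instead, the old API also offers submitJob; a sketch with error handling omitted:

    // Non-blocking alternative to JobClient.runJob(job):
    JobClient client = new JobClient(job);
    RunningJob running = client.submitJob(job);
    while (!running.isComplete()) {
      Thread.sleep(5000);   // poll every five seconds
    }
    return running.isSuccessful() ? 0 : 1;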

    }

    ClassLoader loader = (beanClassLoader != null ? beanClassLoader : org.springframework.util.ClassUtils.getDefaultClassLoader());

    if (jar != null) {
      JobConf conf = (JobConf) job.getConfiguration();
      conf.setJar(jar.getURI().toString());
      loader = ExecutionUtils.createParentLastClassLoader(jar, beanClassLoader, cfg);
      conf.setClassLoader(loader);
    }


    // Set the mapper first so the key/value types can be auto-detected
    // and do not have to be specified explicitly.
    if (mapper != null) {
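
The Spring snippet above sets the job jar and classloader explicitly. JobConf supports both styles of pointing a job at its code; MyJob and the jar path below are placeholders:

    JobConf conf = new JobConf();
    // Locate the jar from a class it contains (used by most examples above):
    conf.setJarByClass(MyJob.class);
    // ...or point at a known jar file directly, as the Spring code does:
    conf.setJar("/path/to/my-job.jar");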

   * @param original initial configuration to read from. May be null.
   * @param properties properties object to add to the newly created configuration. May be null.
   * @return newly created configuration based on the input parameters.
   */
  public static JobConf createFrom(Configuration original, Properties properties) {
    JobConf cfg = null;
    if (original != null) {
      cfg = new JobConf(original);
    }
    else {
      cfg = new JobConf();
    }
    ConfigurationUtils.addProperties(cfg, properties);
    return cfg;
  }
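
A usage sketch of the factory above; the property name and value are just an illustration:

    Properties props = new Properties();
    props.setProperty("mapred.reduce.tasks", "4");

    // Start from an existing Configuration and layer the properties on top.
    JobConf conf = createFrom(new Configuration(), props);
    // conf.get("mapred.reduce.tasks") now returns "4".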

   * @return the result of merging the two configurations.
   */
  public static JobConf merge(Configuration one, Configuration two) {
    if (one == null) {
      if (two == null) {
        return new JobConf();
      }
      return new JobConf(two);
    }

    JobConf c = new JobConf(one);

    if (two == null) {
      return c;
    }

    for (Map.Entry<String, String> entry : two) {
      c.set(entry.getKey(), entry.getValue());
    }

    return c;
  }
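
And a usage sketch of merge, showing that entries from the second configuration win; the property is illustrative:

    Configuration defaults = new Configuration();
    defaults.set("io.sort.mb", "100");

    Configuration overrides = new Configuration(false);
    overrides.set("io.sort.mb", "200");

    // Entries from the second argument are applied last, so the override wins.
    JobConf merged = merge(defaults, overrides);
    // merged.get("io.sort.mb") returns "200".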
