Examples of HadoopInputFormat


Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

   
    delete(output);
   
    MapOnlyJobBuilder b = new MapOnlyJobBuilder(conf);
    b.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    b.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new GrepHandler(regex));
    Job job = b.createJob();
    try {
      job.waitForCompletion(true);
    } finally {
      b.cleanUpInstanceFiles();
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    for(Category category : Category.values()) { // For each Category
      String categoryString = category.toString().toLowerCase();
      // Add the category, book title input spec with the associated CategoryMapper
      for(FileStatus fileStatus : fileSystem.listStatus(new Path(input + "/" + categoryString))) {
        job.addInput(fileStatus.getPath(), new HadoopInputFormat(TextInputFormat.class),
            new CategoryMapper(category, fileStatus.getPath().getName()));
      }
      // Add a named output for each category
      job.addNamedOutput(categoryString, new TupleSolrOutputFormat(new File(
          "src/test/resources/shakespeare-solr"), job.getConf()), ITuple.class, NullWritable.class);
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    job.addIntermediateSchema(schema);
    job.setGroupByFields("min", "max");
    job.setCustomPartitionFields("min");
    // Define the input and its associated mapper
    // The mapper will just emit the (min, max) pairs to the reduce stage
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

          Tuple tuple = new Tuple(schema);

          @Override
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    mr.addIntermediateSchema(schema);
    mr.setGroupByFields("intField", "strField");
    mr.setOrderBy(new OrderBy().add("intField", Order.ASC).add("strField", Order.ASC)
        .add("longField", Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new IProcessor());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        DoubleWritable.class);

    try {
      mr.createJob().waitForCompletion(true);
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    // We only need to execute a Map-only job for this task.
    // Every map will process a HTML file and extract the reviews from it.
    MapOnlyJobBuilder builder = new MapOnlyJobBuilder(conf);

    builder.addInput(new Path(inputFolder), new HadoopInputFormat(TextInputFormat.class),
        new MapOnlyMapper<LongWritable, Text, Text, BSONObject>() {

          StringBuffer inMemoryHtml = new StringBuffer();

          @Override
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

   
    init(conf, new Path(modelFolder));
   
    MapOnlyJobBuilder job = new MapOnlyJobBuilder(conf);
    job.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new MapOnlyMapper<LongWritable, Text, Text, NullWritable>() {
      protected void map(LongWritable key, Text value, Context context) throws IOException ,InterruptedException {
        value.set(value.toString() + "\t" + classify(value.toString()));
        context.write(value, NullWritable.get());
      }
    });
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    mr.setGroupByFields("tweet_id");
    mr.setOrderBy(new OrderBy().add("tweet_id", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setSpecificOrderBy("retweet", new OrderBy().add("username", Order.ASC));

    mr.addInput(tweetsPath, new AvroInputFormat<Record>(getAvroTweetSchema()), new TweetsMapper());
    mr.addInput(retweetsPath, new HadoopInputFormat(TextInputFormat.class), new RetweetsMapper());
    mr.setOutput(outputPath, new AvroOutputFormat<Record>(getAvroOutputSchema()), AvroWrapper.class,
        NullWritable.class);

    mr.setTupleReducer(new Red());
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    cg.setGroupByFields("topic", "word");
    // Here we instantiate a mapper with stop words:
    // Note that we don't need to use the DistributedCache for that because mappers, reducers, etc. themselves are
    // instantiable
    StopWordMapper mapper = new StopWordMapper(stopWords);
    cg.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), mapper);
    // We'll use a TupleOutputFormat with the same schema as the intermediate schema
    cg.setTupleOutput(new Path(args[1]), TopicalWordCount.getSchema());
    cg.setTupleReducer(new CountReducer());
    cg.setTupleCombiner(new CountReducer());
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    delete(output);

    TupleMRBuilder job = new TupleMRBuilder(conf, "Naive Bayes Model Generator");
    job.addIntermediateSchema(INTERMEDIATE_SCHEMA);
    // perform per-category word count mapping
    job.addInput(new Path(inputExamples), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

          ITuple tuple = new Tuple(INTERMEDIATE_SCHEMA);

          @Override
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    mr.addIntermediateSchema(getSchema());
    mr.setGroupByFields("my_avro");
    // Here the custom comparator that groups by "topic,word" is used.
    MyAvroComparator customComp = new MyAvroComparator(getAvroSchema(), "topic", "word");
    mr.setOrderBy(new OrderBy().add("my_avro", Order.ASC, Criteria.NullOrder.NULL_SMALLEST, customComp));
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema as the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.