Package com.datasalt.pangool.tuplemr

Examples of com.datasalt.pangool.tuplemr.MapOnlyJobBuilder
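
All of the examples below follow the same basic pattern: create a MapOnlyJobBuilder from a Hadoop Configuration, register one or more inputs with addInput(path, inputFormat, mapper), declare the output with setOutput(...), then createJob(), waitForCompletion(true) and, finally, cleanUpInstanceFiles(). A minimal, self-contained sketch of that skeleton (paths and types are placeholders, not taken from any particular example below; the usual Hadoop and Pangool imports are assumed):

    Configuration conf = new Configuration();
    MapOnlyJobBuilder builder = new MapOnlyJobBuilder(conf);
    // One input: plain text lines, processed by an inline MapOnlyMapper.
    builder.addInput(new Path("input"), new HadoopInputFormat(TextInputFormat.class),
        new MapOnlyMapper<LongWritable, Text, Text, NullWritable>() {
          @Override
          protected void map(LongWritable key, Text value, Context context)
              throws IOException, InterruptedException {
            context.write(value, NullWritable.get()); // identity map: echo every line
          }
        });
    // Output: Text keys written with a standard Hadoop TextOutputFormat.
    builder.setOutput(new Path("output"), new HadoopOutputFormat(TextOutputFormat.class),
        Text.class, NullWritable.class);
    try {
      builder.createJob().waitForCompletion(true); // build the underlying Hadoop Job and run it
    } finally {
      builder.cleanUpInstanceFiles();              // always clean up the serialized instance files
    }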


   
    delete(outPath);

    // We only need to execute a Map-only job for this task.
    // Every mapper will process an HTML file and extract the reviews from it.
    MapOnlyJobBuilder builder = new MapOnlyJobBuilder(conf);

    builder.addInput(new Path(inputFolder), new HadoopInputFormat(TextInputFormat.class),
        new MapOnlyMapper<LongWritable, Text, Text, BSONObject>() {

          StringBuffer inMemoryHtml = new StringBuffer();

          @Override
          protected void map(LongWritable key, Text value, Context context) throws IOException,
              InterruptedException {
            // For every line in the HTML, just append it to a string buffer;
            // we will process the entire HTML at the end, in cleanup().
            inMemoryHtml.append(value.toString());
          }

          @Override
          protected void cleanup(Context context) throws IOException, InterruptedException {
            String html = inMemoryHtml.toString();

            Matcher startMatcher = startPattern.matcher(html);
            Matcher endMatcher = endPattern.matcher(html);

            Text documentId = new Text();
           
            Matcher placeMatcher = placePattern.matcher(html);
            // we assume this will always match - otherwise fail fast!
            placeMatcher.find();
            String placeId = placeMatcher.group(1);

            // We proceed as follows:
            // We create regex matchers for the start and the end of each review.
            // Within each (start, end) pair, we run an arbitrary number of matchers
            // for all the other properties (username, date, rating, review text, ...).
            // Finally, we add all the properties to a Mongo BSONObject that can be used as output.
            while(startMatcher.find()) {
              BSONObject review = new BasicBSONObject();
              review.put("place_id", placeId);
              int reviewStart = startMatcher.start();
              endMatcher.find();
              int reviewEnd = endMatcher.start();

              // Focus only on (start, end) text for this review
              String reviewText = html.substring(reviewStart, reviewEnd);
             
              for(Map.Entry<String, Pattern> parsingProperty : parsingConfig.entrySet()) {
                Matcher matcher = parsingProperty.getValue().matcher(reviewText);
                if(matcher.find()) {
                  review.put(parsingProperty.getKey(), matcher.group(1).trim());
                }
              }
             
              // The Mongo documentId of the review will be the review_id.
              documentId.set((String) review.get("review_id"));
              // Write the pair (id, document) to the output collector.
              context.write(documentId, review);
            }
          }
        });

    // --- This is the most important part (what makes it work with MongoDB): ---
    // Set the URI of the MongoDB instance we will write to. Here we specify the DB and the final collection.
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost/test.qype");
    // Set the output format to HadoopOutputFormat(MongoOutputFormat.class).
    // The key will be the documentId for the Mongo collection and the value a BSONObject with all the properties we wish.
    builder.setOutput(new Path(outPath), new HadoopOutputFormat(MongoOutputFormat.class), Text.class,
        BSONObject.class);

    // Finally, build and execute the Pangool Job.
    try {
      builder.createJob().waitForCompletion(true);
    } finally {
      builder.cleanUpInstanceFiles();
    }
   
    // we are not interested in the output folder, so delete it
    delete(outPath);
   
View Full Code Here
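
The snippet above refers to several members that are defined outside the excerpt: startPattern, endPattern, placePattern, parsingConfig and the delete() helper. Purely as a hypothetical illustration of their shape (the real regular expressions depend on the scraped HTML and are not shown on this page):

    // HYPOTHETICAL declarations -- the actual patterns live in the full source, not in this excerpt.
    static Pattern startPattern = Pattern.compile("<div class=\"review\">");  // start of one review block
    static Pattern endPattern   = Pattern.compile("<!-- end review -->");     // end of one review block
    static Pattern placePattern = Pattern.compile("/places/(\\d+)");          // group(1) = place id
    // One regex per property to extract; group(1) of each match becomes the value put into the BSONObject.
    static Map<String, Pattern> parsingConfig = new HashMap<String, Pattern>();
    static {
      parsingConfig.put("review_id", Pattern.compile("id=\"review_(\\d+)\""));
      parsingConfig.put("user_name", Pattern.compile("<span class=\"username\">(.*?)</span>"));
      parsingConfig.put("rating",    Pattern.compile("rating-(\\d)"));
    }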


    conf.setLong("mapred.max.split.size", 10 * 1024);

    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);

    MapOnlyJobBuilder mapOnly = new MapOnlyJobBuilder(conf);
    mapOnly.addInput(new Path(IN), inputFormat,
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

          protected void map(ITuple key, NullWritable value, Context context) throws IOException,
              InterruptedException {
            Assert.assertEquals("str1", key.get("a").toString());
            Assert.assertEquals("str2", key.get("b").toString());
            Assert.assertEquals((Integer) 30, (Integer) key.get("c"));
            Assert.assertEquals((Long) 4000L, (Long) key.get("d"));
            context.getCounter("stats", "nlines").increment(1);
          };
        });

    HadoopUtils.deleteIfExists(fS, outPath);
    mapOnly.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
        NullWritable.class);
    Job job = mapOnly.createJob();
    try {
      assertTrue(job.waitForCompletion(true));
    } finally {
      mapOnly.cleanUpInstanceFiles();
    }

    HadoopUtils.deleteIfExists(fS, new Path(IN));

    assertEquals(10000, job.getCounters().getGroup("stats").findCounter("nlines").getValue());
View Full Code Here
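
The four assertions in the mapper fix the field types of the input tuples, so the Schema this test presumably uses (reconstructed from those assertions, so treat it as an assumption) would look like:

    // a and b are strings, c an int, d a long -- matching the asserted values "str1", "str2", 30 and 4000L.
    Schema schema = new Schema("schema", Fields.parse("a:string,b:string,c:int,d:long"));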

    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);

    MapOnlyJobBuilder mapOnly = new MapOnlyJobBuilder(conf);
    mapOnly.addInput(new Path("src/test/resources/*.gz"), inputFormat,
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

          protected void map(ITuple key, NullWritable value, Context context) throws IOException,
              InterruptedException {
            Assert.assertNotNull(key.get("a").toString());
            Assert.assertNotNull(key.get("b").toString());
            Assert.assertTrue((Integer) key.get("c") > 0);
            Assert.assertTrue((Long) key.get("d") > 0);
            context.getCounter("stats", "nlines").increment(1);
          };
        });

    HadoopUtils.deleteIfExists(fS, outPath);
    mapOnly.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
        NullWritable.class);
    Job job = mapOnly.createJob();
    try {
      assertTrue(job.waitForCompletion(true));
    } finally {
      mapOnly.cleanUpInstanceFiles();
    }

    HadoopUtils.deleteIfExists(fS, new Path(IN));

    assertEquals(100, job.getCounters().getGroup("stats").findCounter("nlines").getValue());
View Full Code Here

    HadoopUtils.deleteIfExists(fS, outPath);

    Schema schema = new Schema("schema",
        Fields.parse("name:string,name2:string,age:int,name3:string,emptystring:string"));

    MapOnlyJobBuilder mO = new MapOnlyJobBuilder(conf);
    mO.addInput(inPath, new TupleTextInputFormat(schema, false, true, ',', '"', '\\',
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING),
        new MapOnlyMapper<ITuple, NullWritable, ITuple, NullWritable>() {

          protected void map(ITuple key, NullWritable value, Context context,
              MultipleOutputsCollector collector) throws IOException, InterruptedException {

            try {
              Assert.assertNull(key.get("name2"));
              Assert.assertNull(key.get("age"));
              Assert.assertEquals("Joe", key.get("name"));
              Assert.assertEquals("\"Joan\"", key.get("name3"));
              Assert.assertEquals("", key.get("emptystring"));
              context.write(key, value);
            } catch(Throwable t) {
              t.printStackTrace();
              throw new RuntimeException(t);
            }
          }
        });

    mO.setOutput(outPath, new TupleTextOutputFormat(schema, false, ',', '"', '\\', "\\N"), NullWritable.class,
        NullWritable.class);
    Job job = mO.createJob();
    try {
      assertTrue(job.waitForCompletion(true));
      String str = Files.toString(new File(outPath.toString() + "/part-m-00000"), Charset.defaultCharset());
      assertEquals("\"Joe\",\\N,\\N,\"\\\"Joan\\\"\",\"\"", str.trim());
    } finally {
      mO.cleanUpInstanceFiles();
    }

    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }
View Full Code Here
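
Reading the final assertion field by field against the schema gives: "Joe" (name, quoted), \N (name2, null), \N (age, null), "\"Joan\"" (name3, with its inner quotes escaped) and "" (emptystring). The CSV line that was read is not shown in the excerpt; a line consistent with those assertions, offered only as a guess, might be:

    "Joe",notquoted,25,"\"Joan\"",""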

    Path inPath = new Path(IN);
    HadoopUtils.deleteIfExists(fS, outPath);

    Schema schema = new Schema("schema", Fields.parse("n1:int,n2:long,n3:float,n4:double"));

    MapOnlyJobBuilder mO = new MapOnlyJobBuilder(conf);
    mO.addInput(inPath, new TupleTextInputFormat(schema, false, true, ',', '"', '\\',
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING),
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

          protected void map(ITuple key, NullWritable value, Context context,
              MultipleOutputsCollector collector) throws IOException, InterruptedException {

            try {
              Assert.assertNull(key.get("n1"));
              Assert.assertNull(key.get("n2"));
              Assert.assertNull(key.get("n3"));
              Assert.assertNull(key.get("n4"));
            } catch(Throwable t) {
              t.printStackTrace();
              throw new RuntimeException(t);
            }
          }
        });

    mO.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
        NullWritable.class);
    Job job = mO.createJob();
    try {
      assertTrue(job.waitForCompletion(true));
    } finally {
      mO.cleanUpInstanceFiles();
    }

    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }
View Full Code Here

  public final static String OUTPUT = "out-" + TestCascadingTupleInputFormat.class.getName();

  @Test
  public void test() throws Exception {
    MapOnlyJobBuilder builder = new MapOnlyJobBuilder(getConf());
    // Enable Cascading serialization in Hadoop config.
    CascadingTupleInputFormat.setSerializations(getConf());
    // Instantiate InputFormat
    InputFormat<ITuple, NullWritable> iF = new CascadingTupleInputFormat("logs", "day", "month", "year",
        "count", "metric", "value");
    builder.addInput(new Path("src/test/resources/cascading-binary"), iF,
        new MapOnlyMapper<ITuple, NullWritable, Text, NullWritable>() {

          @Override
          protected void map(ITuple key, NullWritable value, Context context) throws IOException,
              InterruptedException {
            context.write(new Text(key.toString()), NullWritable.get());
          }
        });

    builder.setOutput(new Path(OUTPUT), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    String expectedOutput = "{\"day\":20,\"month\":10,\"year\":2012,\"count\":97,\"metric\":\"ALL\",\"value\":\"\"}\n"
        + "{\"day\":21,\"month\":10,\"year\":2012,\"count\":717,\"metric\":\"ALL\",\"value\":\"\"}\n"
        + "{\"day\":22,\"month\":10,\"year\":2012,\"count\":186,\"metric\":\"ALL\",\"value\":\"\"}";
View Full Code Here
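
The excerpt is cut off right after expectedOutput is built. A minimal sketch of how the produced text output could be compared against it, mirroring the Files.toString pattern used in the TupleTextOutputFormat example above (the exact assertion in the full test may differ):

    String out = Files.toString(new File(OUTPUT + "/part-m-00000"), Charset.defaultCharset());
    assertEquals(expectedOutput, out.trim());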

        + "continent:string," + "region:string," + "surface_area:double," + "indep_year:int,"
        + "population:int," + "life_expectancy:double," + "gnp:double," + "gnp_old:double,"
        + "local_name:string," + "government_form:string," + "head_of_state:string," + "capital:int,"
        + "code2:string"));

    MapOnlyJobBuilder mO = new MapOnlyJobBuilder(conf);
    mO.addInput(inPath, new TupleTextInputFormat(schema, false, false, ',', '"', '\\',
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING),
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

          protected void map(ITuple key, NullWritable value, Context context,
              MultipleOutputsCollector collector) throws IOException, InterruptedException {

            try {
              Assert.assertEquals("Constitutional Monarchy, Federation", key.get("government_form")
                  .toString());
              Assert.assertEquals("Salahuddin Abdul Aziz Shah Alhaj", key.get("head_of_state")
                  .toString());
              Assert.assertEquals(2464, key.get("capital"));
            } catch(Throwable t) {
              t.printStackTrace();
              throw new RuntimeException(t);
            }
          }
        });
    mO.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
        NullWritable.class);
    Job job = mO.createJob();
    try {
      assertTrue(job.waitForCompletion(true));
    } finally {
      mO.cleanUpInstanceFiles();
    }

    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }
View Full Code Here

    Path inPath = new Path(IN);
    HadoopUtils.deleteIfExists(fS, outPath);

    Schema schema = new Schema("schema", Fields.parse("name:string,name2:string"));

    MapOnlyJobBuilder mO = new MapOnlyJobBuilder(conf);
    mO.addInput(inPath, new TupleTextInputFormat(schema, fieldsPos, false, "-"),
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

          protected void map(ITuple key, NullWritable value, Context context,
              MultipleOutputsCollector collector) throws IOException, InterruptedException {

            try {
              Assert.assertNull(key.get("name2"));
              Assert.assertEquals("1000", key.get("name"));
            } catch(Throwable t) {
              t.printStackTrace();
              throw new RuntimeException(t);
            }
          }
        });

    mO.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
        NullWritable.class);
    Job job = mO.createJob();
    try {
      assertTrue(job.waitForCompletion(true));
    } finally {
      mO.cleanUpInstanceFiles();
    }

    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }
View Full Code Here

    String output = args[2];
    deleteOutput(output);
   
    init(conf, new Path(modelFolder));
   
    MapOnlyJobBuilder job = new MapOnlyJobBuilder(conf);
    job.setMapper(new MapOnlyMapper<LongWritable, Text, Text, NullWritable>() {
      protected void map(LongWritable key, Text value, Context context) throws IOException ,InterruptedException {
        value.set(value.toString() + "\t" + classify(value.toString()));
        context.write(value, NullWritable.get());
      }
    });
    job.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class));
    job.createJob().waitForCompletion(true);
   
    return 1;
  }
View Full Code Here
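
This snippet is the body of a Tool-style run() method (note the int return value). A minimal sketch of how such a job is typically launched, assuming a hypothetical enclosing class Classifier that extends Configured and implements Tool:

    public static void main(String[] args) throws Exception {
      // ToolRunner parses the generic Hadoop options and passes the remaining args to run().
      int exitCode = ToolRunner.run(new Configuration(), new Classifier(), args);
      System.exit(exitCode);
    }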

      if(nativeLibs.exists()) {
        SploutHadoopConfiguration.addSQLite4JavaNativeLibsToDC(conf);
      }
    }
   
    MapOnlyJobBuilder job = new MapOnlyJobBuilder(conf);
    TableSpec tableSpec = new TableSpec(schema, schema.getFields().get(1));
   
    job.setOutput(new Path(out, "store"), new SploutSQLProxyOutputFormat(new SQLite4JavaOutputFormat(1000000, tableSpec)), ITuple.class,
        NullWritable.class);
    job.addInput(input, new HadoopInputFormat(TextInputFormat.class), new MapOnlyMapper<LongWritable, Text, ITuple, NullWritable>() {

      ITuple metaTuple = new Tuple(schema);

      protected void map(LongWritable key, Text value, Context context) throws IOException,
          InterruptedException {

        String[] partitionRange = value.toString().split("\t");
        Integer partition = Integer.parseInt(partitionRange[0]);
        metaTuple.set(SploutSQLOutputFormat.PARTITION_TUPLE_FIELD, partition);
        String[] minMax = partitionRange[1].split(":");
        Integer min = Integer.parseInt(minMax[0]);
        Integer max = Integer.parseInt(minMax[1]);
        for(int i = min; i < max; i++) {
          metaTuple.set("key", i);
          metaTuple.set("value", theValue);
          context.write(metaTuple, NullWritable.get());
        }
      }
    });

    job.createJob().waitForCompletion(true);

    HadoopUtils.deleteIfExists(inFs, input);
    return 0;
  }
View Full Code Here
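
The mapper in this last example parses each input line as "partition<TAB>min:max" and emits one tuple per key in the half-open range [min, max), all sharing the same theValue payload. An illustrative input line (not taken from the excerpt):

    String exampleLine = "0\t0:1000"; // partition 0, keys 0 through 999 (max is exclusive in the loop)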
