Package org.apache.hadoop.mapreduce

Examples of org.apache.hadoop.mapreduce.Job
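
A reference sketch before the examples: every snippet on this page follows the same basic driver pattern for the new-API Job class. A minimal, self-contained version of that pattern is shown below; the IdentityJobDriver class name, the pass-through Mapper/Reducer choice, and the argument handling are illustrative, not taken from any example on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class IdentityJobDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "identity pass-through");  // the constructor used throughout these examples;
                                                       // newer releases prefer Job.getInstance(conf, name)
    job.setJarByClass(IdentityJobDriver.class);        // ship the jar containing the job classes
    job.setMapperClass(Mapper.class);                  // base Mapper acts as an identity map
    job.setReducerClass(Reducer.class);                // base Reducer acts as an identity reduce
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);         // TextInputFormat keys are byte offsets
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);  // block until the job finishes
  }
}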


      boolean usedthedate,
      String custFields,
      updateStatus update, String uniqCheckField, Integer parallel
      ) throws Exception
  {
    Job job = new Job(new Configuration(jconf));
    JobIndexPublic.setJars(job.getConfiguration());
    if (filetype.equals("seq")) {
      job.setInputFormatClass(SequenceFileInputFormat.class);
      for (String input : inputs) {
        Path p = new Path(inputBase, "*" + input + "*/" + inputmatch);
        System.out.println(p.toString());
        FileStatus[] list = fs.globStatus(p);
        if (list == null || list.length == 0) {
          continue;
        }
        SequenceFileInputFormat.addInputPath(job, p);
      }
    } else {
      for (String input : inputs) {
        Path p = new Path(inputBase, "*" + input + "*/" + inputmatch);
        System.out.println(p.toString());
        FileStatus[] list = fs.globStatus(p);
        if (list == null || list.length == 0) {
          continue;
        }
        FileInputFormat.addInputPath(job, p);
      }
    }
   
    Path baseP = new Path(output);
    Path baseParent = new Path(output);
    if (baseP.getParent() != null && baseP.getParent().getParent() != null) {
      baseParent = baseP.getParent().getParent();
    }

    String jobnameOutput = baseParent.toString() + "*" + baseP.getName();
    int cutoutlen = 50;
    if (jobnameOutput.length() > cutoutlen) {
      jobnameOutput = "*" + jobnameOutput.substring(jobnameOutput.length() - cutoutlen);
    }
   
    System.out.println("output:"+output+"@"+jobnameOutput);
    System.out.println("tmp:"+smallindex.toString());
    job.setJobName("mdrill_stage_1@"+jobnameOutput);
    job.setJarByClass(JobIndexerPartion.class);

    fs.delete(new Path(output), true);
    fs.delete(smallindex, true);
    Configuration conf = job.getConfiguration();

    String fields = JobIndexPublic.readFieldsFromSchemaXml(solrHome + "/solr/conf/schema.xml", fs, conf);
    JobIndexPublic.setDistributecache(new Path(solrHome, "solr/conf"), fs, conf);
    if (!split.isEmpty() && !split.equals("default") && !split.equals("\001")) {
      conf.set("higo.column.split", split);
    }
   
    conf.set("uniq.check.field", uniqCheckField);
   
    if (split.equals("\t")) {
      conf.set("higo.column.split", "tab");
    }
    conf.set("higo.column.custfields", custFields);
    conf.set("higo.input.base", inputBase);
   
   
    conf.setBoolean("higo.column.userthedate", usedthedate);
    //conf.set("mapred.reduce.slowstart.completed.maps", "0.01");
    conf.set("higo.index.fields", fields);
    job.setPartitionerClass(PairPartion.class);

    job.setMapperClass(IndexMapper.class);
    job.setMapOutputKeyClass(PairWriteable.class);
    job.setMapOutputValueClass(DocumentMap.class);
    job.setReducerClass(IndexReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, smallindex);
    job.setNumReduceTasks(shards * parallel);
    int result = 0;

    if (update != null) {
      job.submit();
      while (!job.isComplete()) {
        update.update(1, job);
        Thread.sleep(3000);
      }
      if (update.dump(job)) {
        return -1;
      }
    } else {
      result = job.waitForCompletion(true) ? 0 : -1;
    }
       
       

    if (result == 0) {
      Job job2 = new Job(new Configuration(jconf));
      JobIndexPublic.setJars(job2.getConfiguration());
      job2.setJobName("mdrill_stage_2@" + jobnameOutput);
      Configuration conf2 = job2.getConfiguration();
      JobIndexPublic.setDistributecache(new Path(solrHome, "solr/conf"), fs, conf2);
      conf2.set("higo.index.fields", fields);
      job2.setJarByClass(JobIndexerPartion.class);
      job2.setInputFormatClass(SequenceFileInputFormat.class);
      SequenceFileInputFormat.addInputPath(job2, new Path(smallindex, "part-r-*"));
      job2.setMapOutputKeyClass(IntWritable.class);
      job2.setMapOutputValueClass(Text.class);
      job2.setPartitionerClass(IntPartion.class);
      job2.setReducerClass(IndexReducerMerge.class);
      job2.setOutputKeyClass(IntWritable.class);
      job2.setOutputValueClass(Text.class);
      job2.setOutputFormatClass(SequenceFileOutputFormat.class);
      job2.setNumReduceTasks(shards);
      SequenceFileOutputFormat.setOutputPath(job2, new Path(output));
      if (update != null) {
        job2.submit();
        while (!job2.isComplete()) {
          update.update(2, job2);
          Thread.sleep(3000);
        }
        update.finish();
      } else {
        result = job2.waitForCompletion(true) ? 0 : -1;
      }
    }
   
    fs.delete(smallindex, true);
View Full Code Here
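
The example above monitors both stages asynchronously: instead of blocking in waitForCompletion(), it calls submit() and polls isComplete(), pushing progress to its project-specific updateStatus callback every three seconds. Stripped of that callback, the polling pattern looks roughly like the sketch below; the runAndPoll name and the printf reporting are illustrative.

// Submit a fully configured Job and poll it instead of blocking in waitForCompletion().
static int runAndPoll(org.apache.hadoop.mapreduce.Job job)
    throws java.io.IOException, InterruptedException, ClassNotFoundException {
  job.submit();                                   // returns as soon as the job is submitted
  while (!job.isComplete()) {                     // true once the job finished, successfully or not
    System.out.printf("map %.1f%%  reduce %.1f%%%n",
        job.mapProgress() * 100, job.reduceProgress() * 100);
    Thread.sleep(3000);                           // same 3-second polling interval as the example
  }
  return job.isSuccessful() ? 0 : -1;             // mirror the 0 / -1 convention used above
}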


    FileSystem fs = FileSystem.get(conf);
    conf.set(CLUSTER_PATH_KEY, clustersIn.toString());
    conf.set(CLUSTER_CONVERGENCE_KEY, convergenceDelta);
    conf.setInt(CLUSTER_CONVERGENCE_ABTEST_REP, rep);

    Job job = new Job(conf,
        "KMeans Driver running clusterData over input: " + input);
//    job.setInputFormatClass(FileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
   
   

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(KMeansClusterMapper.class);
    job.setCombinerClass(KMeansClusterCombiner.class);
    job.setReducerClass(KMeansClusterReduce.class);
    job.setNumReduceTasks(this.reduce);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, new Path(output, "cluster_abtest"));

    job.setJarByClass(KMeansDriver.class);

    if (!job.waitForCompletion(true)) {
      throw new InterruptedException(
          "K-Means Clustering failed processing " + clustersIn);
    }
  }
View Full Code Here

    conf.set(CLUSTER_PATH_KEY, clustersIn.toString());
    conf.set(CLUSTER_CONVERGENCE_KEY, convergenceDelta);
    FileSystem fs = FileSystem.get(conf);

    Job job = new Job(conf);
    job.setJobName("KMeans Driver running runIteration over clustersIn: "
        + clustersIn);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Vector.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Cluster.class);

//    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(KMeansMapper.class);
    job.setCombinerClass(KMeansCombiner.class);
    job.setReducerClass(KMeansReducer.class);

    FileInputFormat.addInputPath(job, input);
    SequenceFileOutputFormat.setOutputPath(job, clustersOut);

    job.setNumReduceTasks(this.reduce);
    job.setJarByClass(KMeansDriver.class);
//    HadoopUtil.delete(conf, clustersOut);
    if (!job.waitForCompletion(true)) {
      throw new InterruptedException(
          "K-Means Iteration failed processing " + clustersIn);
    }

    return isConverged(clustersOut, conf, fs);
View Full Code Here
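
runIteration() above returns whether every cluster converged (via isConverged), so a driver typically calls it in a loop until convergence or an iteration cap. A rough sketch of that loop follows; the exact runIteration parameter list and the maxIterations and initialClusters names are assumptions, not this project's actual driver code.

// Iterate until all clusters converge or an illustrative maxIterations cap is reached.
Path clustersIn = initialClusters;
boolean converged = false;
for (int iteration = 1; iteration <= maxIterations && !converged; iteration++) {
  Path clustersOut = new Path(output, "clusters-" + iteration);
  converged = runIteration(input, clustersIn, clustersOut, convergenceDelta);
  clustersIn = clustersOut;                       // the next pass reads the clusters just written
}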

 
  private Path InitCenter(Configuration conf, Path input, Path output, int k)
      throws IOException, InterruptedException, ClassNotFoundException {
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    Path outFile = new Path(output, "part-InitCenter");
    Job job = new Job(conf);
    job.setJobName"KMeans Driver: "+ outFile);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Cluster.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Cluster.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(KMeansGroupMapper.class);
    job.setCombinerClass(KMeansGroupCombine.class);
    job.setReducerClass(KMeansGroupReducer.class);

    FileInputFormat.addInputPath(job, input);
    SequenceFileOutputFormat.setOutputPath(job, outFile);

    job.setNumReduceTasks(32);
    job.setJarByClass(KMeansDriver.class);
//    HadoopUtil.delete(conf, clustersOut);
    if (!job.waitForCompletion(true)) {
      throw new InterruptedException(
          "K-Means Iteration failed processing " + outFile);
    }
    return outFile;
View Full Code Here

  }

  /** {@inheritDoc} */
  public InputSplit[] getSplits(JobConf job, int chunks) throws IOException {
    List<org.apache.hadoop.mapreduce.InputSplit> newSplits =
      super.getSplits(new Job(job));
    InputSplit[] ret = new InputSplit[newSplits.size()];
    int i = 0;
    for (org.apache.hadoop.mapreduce.InputSplit s : newSplits) {
      org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit split =
          (org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit) s;
View Full Code Here

      Path file2 = new Path(dir2 + "/file2");
      writeFile(conf, file2, (short)2, 2);

      // split it using a CombinedFile input format
      DummyInputFormat inFormat = new DummyInputFormat();
      Job job = new Job(conf);
      FileInputFormat.setInputPaths(job, dir1 + "," + dir2);
      inFormat.setMinSplitSizeRack(BLOCKSIZE);
      List<InputSplit> splits = inFormat.getSplits(job);
      System.out.println("Made splits(Test1): " + splits.size());
View Full Code Here

  /*
   * Prints out the input splits for the specified files
   */
  private void splitRealFiles(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Job job = new Job();
    FileSystem fs = FileSystem.get(conf);
    if (!(fs instanceof DistributedFileSystem)) {
      throw new IOException("Wrong file system: " + fs.getClass().getName());
    }
    int blockSize = conf.getInt("dfs.block.size", 128 * 1024 * 1024);
 
View Full Code Here
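
For comparison, a minimal sketch of the same idea with stock classes: ask an InputFormat for its splits and print them. TextInputFormat and the /data/input path are illustrative stand-ins for whatever the real tool inspects.

// Enumerate the splits TextInputFormat would produce for a path and print them.
// (Fragment; assumes java.util.List, InputSplit and TextInputFormat are imported.)
Configuration conf = new Configuration();
Job job = new Job(conf);
FileInputFormat.addInputPath(job, new Path("/data/input"));   // illustrative input path
List<InputSplit> splits = new TextInputFormat().getSplits(job);
System.out.println("Total splits: " + splits.size());
for (InputSplit split : splits) {
  System.out.println(split);   // FileSplit.toString() shows path, start offset and length
}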

    String userName = getJobOwnerName();
    File workDir = new File(new Path(TEST_ROOT_DIR, "workdir").toString());

    // Configures a job with a regular file
    Job job1 = new Job(conf);
    Configuration conf1 = job1.getConfiguration();
    conf1.set("user.name", userName);
    DistributedCache.addCacheFile(secondCacheFile.toUri(), conf1);
   
    TrackerDistributedCacheManager.determineTimestamps(conf1);
    TrackerDistributedCacheManager.determineCacheVisibilities(conf1);

    // Task localizing for first job
    TaskDistributedCacheManager handle = manager
        .newTaskDistributedCacheManager(conf1);
    handle.setup(localDirAllocator, workDir, TaskTracker
          .getPrivateDistributedCacheDir(userName),
          TaskTracker.getPublicDistributedCacheDir());
    handle.release();
    for (TaskDistributedCacheManager.CacheFile c : handle.getCacheFiles()) {
      assertEquals(0, manager.getReferenceCount(c.uri, conf1, c.timestamp,
          c.owner));
    }
   
    Path thirdCacheFile = new Path(TEST_ROOT_DIR, "thirdcachefile");
    createPrivateTempFile(thirdCacheFile);
   
    // Configures another job with three regular files.
    Job job2 = new Job(conf);
    Configuration conf2 = job2.getConfiguration();
    conf2.set("user.name", userName);
    // add a file that would get failed to localize
    DistributedCache.addCacheFile(firstCacheFile.toUri(), conf2);
    // add a file that is already localized by different job
    DistributedCache.addCacheFile(secondCacheFile.toUri(), conf2);
View Full Code Here
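
The test above drives the cache bookkeeping directly; in ordinary job code the pattern is to register files with DistributedCache.addCacheFile(uri, conf) at submission time, as done here, and then read the localized copies from inside the task. A minimal sketch of the task side follows; the CacheAwareMapper class is illustrative, and newer releases would use Job.addCacheFile() and context.getCacheFiles() instead.

import java.io.IOException;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative mapper: in setup(), locate the local copies of whatever the driver
// registered with DistributedCache.addCacheFile(uri, conf) before submission.
public class CacheAwareMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    Path[] localFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
    if (localFiles != null) {
      for (Path p : localFiles) {
        // each Path points at a file on the task's local disk; open and load it as needed
      }
    }
  }
}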

        List<String> nodes = Lists.newArrayList();
        for (String link : options.type)
            nodes.add("<" + link + ">");
        getConf().set(EntityIsAReducer.TYPE_LIST, Joiner.on(",").join(nodes));

        Job job = new Job(getConf(), "extractIsA");
        job.setJarByClass(this.getClass());
        job.setMapperClass(EntityCentricMapper.class);
        job.setReducerClass(EntityIsAReducer.class);

        job.setNumReduceTasks(options.reducerCount);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        for(String path: options.input) {
            FileInputFormat.addInputPath(job, new Path(path));
        }

        FileOutputFormat.setOutputPath(job, new Path(options.output));
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }
View Full Code Here

            conf.set("mapred.compress.map.output", "true");
            conf.set("mapred.output.compression.type", "BLOCK");
            conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

            conf.set(RanSampleMapper.NULL_VALUE, Boolean.toString(reduceTasks == null || reduceTasks == 0));
            Job job = new Job(conf, "ranSample");
            FileInputFormat.addInputPath(job, input);
           
            job.setJarByClass(RanSampleTool.class);
            job.setMapperClass(RanSampleMapper.class);

            FileOutputFormat.setOutputPath(job, output);
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);
            if (reduceTasks == null) {
                job.setNumReduceTasks(0);
            } else {
                job.setNumReduceTasks(reduceTasks);
                job.setReducerClass(PassthroughReducer.class);
            }
            return job.waitForCompletion(true) ? 0 : 1;
        } catch(Main.IncorrectUsageException iue) {
            return 2;
        }
    }
View Full Code Here
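
The conf.set calls at the top of this example use the old mapred.* property names for map-output and job-output compression. Those keys still resolve through Hadoop's deprecation table, but on Hadoop 2.x and later the same settings are spelled as shown below; this is a sketch of the renamed keys only, not a change to the example's behavior.

// Hadoop 2.x spellings of the compression keys used above.
Configuration conf = new Configuration();
conf.setBoolean("mapreduce.map.output.compress", true);                 // was mapred.compress.map.output
conf.set("mapreduce.map.output.compress.codec",
    "org.apache.hadoop.io.compress.GzipCodec");                         // was mapred.map.output.compression.codec
conf.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");   // was mapred.output.compression.type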
