List<Path> dirs = asList(status.getPath());
Text key = new Text();
Text value = new Text();
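/*
 * Stage the bucket-to-file mapping in a block-compressed SequenceFile: each record
 * appended below maps a crush output file (bucket) id to one input file path.
 */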
Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class, CompressionType.BLOCK);
int numPartitions = Integer.parseInt(job.get("mapred.reduce.tasks"));
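/*
 * The partition bucketer spreads the crush output files across at most numPartitions
 * reducer partitions; its buckets become the partition map written at the end.
 */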
Bucketer partitionBucketer = new Bucketer(numPartitions, 0, false);
partitionBucketer.reset("partition-map");
jobCounters = new Counters();
try {
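    /*
     * Breadth-first walk of the input tree: dirs holds the directories at the current
     * depth, and subdirectories discovered along the way are queued in nextLevel.
     */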
    while (!dirs.isEmpty()) {
        List<Path> nextLevel = new LinkedList<Path>();
        for (Path dir : dirs) {
            jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
            print(Verbosity.INFO, "\n\n" + dir.toUri().getPath());
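            /*
             * List the directory, dropping anything that matches the ignore pattern (when
             * one was configured).
             */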
            FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                @Override
                public boolean accept(Path testPath) {
                    if (ignoredFiles == null) return true;
                    ignoredFiles.reset(testPath.toUri().getPath());
                    return !ignoredFiles.matches();
                }
            });
            if (contents == null || contents.length == 0) {
                print(Verbosity.INFO, " is empty");
                jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
            } else {
                List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                Set<String> uncrushedFiles = new HashSet<String>(contents.length);
                long crushableBytes = 0;
                /*
                 * Queue subdirectories for subsequent inspection and examine the files in
                 * this directory.
                 */
                for (FileStatus content : contents) {
                    Path path = content.getPath();
                    if (content.isDir()) {
                        nextLevel.add(path);
                    } else {
                        boolean changed = uncrushedFiles.add(path.toUri().getPath());
                        assert changed : path.toUri().getPath();
                        long fileLength = content.getLen();
                        if (fileLength <= maxEligibleSize) {
                            crushables.add(content);
                            crushableBytes += fileLength;
                        }
                    }
                }
                /*
                 * We found a directory with data in it. Make sure we know how to name the
                 * crush output file, then add this directory's files to the found-files
                 * counter.
                 */
                if (!uncrushedFiles.isEmpty()) {
                    if (-1 == findMatcher(dir)) {
                        throw new IllegalArgumentException("Could not find matching regex for directory: " + dir);
                    }
                    jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                }
                if (0 == crushableBytes) {
                    print(Verbosity.INFO, " has no crushable files");
                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    /*
                     * We found files to consider for crushing.
                     */
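                    /*
                     * Number of DFS blocks needed to hold the crushable bytes: an integer
                     * ceiling of crushableBytes / dfsBlockSize.
                     */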
                    long nBlocks = crushableBytes / dfsBlockSize;
                    if (nBlocks * dfsBlockSize != crushableBytes) {
                        nBlocks++;
                    }
                    /*
                     * maxFileBlocks will be huge in v1 mode, which will lead to one bucket
                     * per directory.
                     */
                    long dirBuckets = nBlocks / maxFileBlocks;
                    if (dirBuckets * maxFileBlocks != nBlocks) {
                        dirBuckets++;
                    }
                    if (dirBuckets > Integer.MAX_VALUE) {
                        throw new AssertionError("Too many buckets: " + dirBuckets);
                    }
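                    /*
                     * Pack this directory's small files into dirBuckets buckets, each holding
                     * roughly maxFileBlocks blocks' worth of data.
                     */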
                    Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);
                    directoryBucketer.reset(getPathPart(dir));
                    for (FileStatus file : crushables) {
                        directoryBucketer.add(new FileStatusHasSize(file));
                    }
                    List<Bucket> crushFiles = directoryBucketer.createBuckets();
                    if (crushFiles.isEmpty()) {
                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        nBuckets += crushFiles.size();
                        jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);
                        print(Verbosity.INFO, " => " + crushFiles.size() + " output files");
                        /*
                         * Write out the mapping between a bucket and a file.
                         */
                        for (Bucket crushFile : crushFiles) {
                            String bucketId = crushFile.name();
                            List<String> bucketFiles = crushFile.contents();
                            print(Verbosity.INFO, format("\n Output %s will include %,d input bytes from %,d files",
                                    bucketId, crushFile.size(), bucketFiles.size()));
                            key.set(bucketId);
                            for (String f : bucketFiles) {
                                boolean changed = uncrushedFiles.remove(f);
                                assert changed : f;
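                                /*
                                 * The path pattern is defined elsewhere; matches() is called
                                 * for its side effect of populating the groups, and group(5)
                                 * carries the portion of the path recorded as the value.
                                 */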
                                pathMatcher.reset(f);
                                pathMatcher.matches();
                                value.set(pathMatcher.group(5));
                                writer.append(key, value);
                                /*
                                 * Print the input file with four leading spaces.
                                 */
                                print(Verbosity.VERBOSE, "\n    " + f);
                            }
                            jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, bucketFiles.size());
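                            /*
                             * Assign the completed bucket to a reducer partition; the full
                             * partition map is written out after the walk.
                             */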
                            partitionBucketer.add(crushFile);
                        }
                    }
                }
                if (!uncrushedFiles.isEmpty()) {
                    print(Verbosity.INFO, "\n\n Skipped " + uncrushedFiles.size() + " files");
                    for (String uncrushed : uncrushedFiles) {
                        print(Verbosity.VERBOSE, "\n " + uncrushed);
                    }
                    jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                }
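                /*
                 * Anything still in uncrushedFiles was never assigned to a bucket, so record
                 * it as skipped.
                 */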
                skippedFiles.addAll(uncrushedFiles);
            }
        }
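        /*
         * Descend to the next level of the traversal.
         */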
        dirs = nextLevel;
    }
} finally {
    try {
        writer.close();
    } catch (Exception e) {
        LOG.error("Trapped exception during close: " + bucketFiles, e);
    }
}
/*
* Now that we have processed all the directories, write the partition map.
*/
List<Bucket> partitions = partitionBucketer.createBuckets();
assert partitions.size() <= numPartitions;
writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
IntWritable partNum = new IntWritable();
try {
    for (Bucket partition : partitions) {
        String partitionName = partition.name();
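        /*
         * Partition names end in "-N"; parse off N to recover the partition (reducer)
         * number.
         */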
        partNum.set(Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1)));
        for (String bucketId : partition.contents()) {
            key.set(bucketId);
            writer.append(key, partNum);
        }
    }
} finally {
    try {
        writer.close();
    } catch (Exception e) {
        LOG.error("Trapped exception during close: " + partitionMap, e);
    }
}