Package org.apache.hadoop.fs

Examples of org.apache.hadoop.fs.ContentSummary
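ContentSummary aggregates the total length in bytes, the file count, and the directory count (plus quota information) for a filesystem subtree. As a baseline for the snippets below, here is a minimal, self-contained sketch of the basic call; the path is hypothetical:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.ContentSummary;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ContentSummaryDemo {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path p = new Path("/user/hive/warehouse/t1");  // hypothetical path
        FileSystem fs = p.getFileSystem(conf);

        // getContentSummary() walks the subtree on the NameNode and returns
        // aggregated length (bytes), file count, and directory count.
        ContentSummary cs = fs.getContentSummary(p);
        System.out.println("length=" + cs.getLength()
            + " files=" + cs.getFileCount()
            + " dirs=" + cs.getDirectoryCount());
      }
    }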


    // Map-reduce jobs may be run locally based on data size.
    // First, find out if any of the jobs need to run non-locally.
    boolean hasNonLocalJob = false;
    for (ExecDriver mrtask: mrtasks) {
      try {
        ContentSummary inputSummary = Utilities.getInputSummary(
            ctx, (MapredWork) mrtask.getWork(), p);
        int numReducers = getNumberOfReducers(mrtask.getWork(), conf);

        if (LOG.isDebugEnabled()) {
          LOG.debug("Task: " + mrtask.getId() + ", Summary: " +
                   inputSummary.getLength() + "," + inputSummary.getFileCount() + ","
                   + numReducers);
        }

        if (MapRedTask.isEligibleForLocalMode(conf, inputSummary, numReducers) != null) {
          // A non-null return is the reason this task cannot run locally.
          hasNonLocalJob = true;
          break;
        } else {
          mrtask.setLocalMode(true);
        }


        if (filter != null && !filter.accept(p)) {
          continue;
        }

        ContentSummary cs = ctx.getCS(path);
        if (cs == null) {
          FileSystem fs = p.getFileSystem(ctx.getConf());
          cs = fs.getContentSummary(p);
          ctx.addCS(path, cs);
        }

        summary[0] += cs.getLength();
        summary[1] += cs.getFileCount();
        summary[2] += cs.getDirectoryCount();

      } catch (IOException e) {
        // Ignore unreadable paths: log, and cache a zero summary so the
        // same bad path is not repeatedly re-queried.
        LOG.info("Cannot get size of " + path + ". Safely ignored.");
        if (path != null) {
          ctx.addCS(path, new ContentSummary(0, 0, 0));
        }
      }
    }
    return new ContentSummary(summary[0], summary[1], summary[2]);
  }
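The ctx.getCS()/ctx.addCS() pair above is Hive's per-query cache of content summaries. The same idea in isolation, as a minimal sketch (the cache class and its names are hypothetical, not Hive's Context API):

    import java.io.IOException;
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.ContentSummary;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    final class SummaryCache {
      private final Map<String, ContentSummary> cache =
          new ConcurrentHashMap<String, ContentSummary>();

      // Return the cached summary for p, computing and caching it on first use.
      ContentSummary get(Path p, Configuration conf) throws IOException {
        ContentSummary cs = cache.get(p.toString());
        if (cs == null) {
          cs = p.getFileSystem(conf).getContentSummary(p);
          cache.put(p.toString(), cs);
        }
        return cs;
      }
    }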

    long r = 0;
    FileSystem fs = FileSystem.get(job);
    // For each input path, calculate the total size.
    for (String path: work.getPathToAliases().keySet()) {
      try {
        ContentSummary cs = fs.getContentSummary(new Path(path));
        r += cs.getLength();
      } catch (IOException e) {
        LOG.info("Cannot get size of " + path + ". Safely ignored.");
      }
    }
    return r;
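One caveat: FileSystem.get(job) returns the default filesystem, so the loop above assumes every input path lives there. When inputs can span schemes, resolving the filesystem from each path is more robust. A hedged sketch of that variant (the helper class and method names are hypothetical):

    import java.io.IOException;
    import java.util.Collection;
    import org.apache.hadoop.fs.ContentSummary;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapred.JobConf;

    final class InputSizes {
      // Sum the lengths of all input paths, resolving the filesystem from
      // each path instead of assuming the default FS.
      static long totalInputSize(Collection<String> paths, JobConf job) {
        long total = 0;
        for (String s : paths) {
          try {
            Path p = new Path(s);
            ContentSummary cs = p.getFileSystem(job).getContentSummary(p);
            total += cs.getLength();
          } catch (IOException e) {
            // Unreadable paths contribute zero, matching the loop above.
          }
        }
        return total;
      }
    }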

        build();
  }
 
  public static ContentSummary convert(ContentSummaryProto cs) {
    if (cs == null) return null;
    return new ContentSummary(
      cs.getLength(), cs.getFileCount(), cs.getDirectoryCount(), cs.getQuota(),
      cs.getSpaceConsumed(), cs.getSpaceQuota());
  }
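The dangling build() above is the tail of the conversion in the other direction (ContentSummary to ContentSummaryProto). A sketch of what that method plausibly looks like, assuming the standard protobuf builder pattern; the setter names mirror the getters used above and are an assumption, not quoted source:

    public static ContentSummaryProto convert(ContentSummary cs) {
      if (cs == null) return null;
      // Assumed builder-style protobuf construction.
      return ContentSummaryProto.newBuilder()
          .setLength(cs.getLength())
          .setFileCount(cs.getFileCount())
          .setDirectoryCount(cs.getDirectoryCount())
          .setQuota(cs.getQuota())
          .setSpaceConsumed(cs.getSpaceConsumed())
          .setSpaceQuota(cs.getSpaceQuota())
          .build();
    }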

    final long length = (Long)m.get("length");
    final long fileCount = (Long)m.get("fileCount");
    final long directoryCount = (Long)m.get("directoryCount");
    final long quota = (Long)m.get("quota");
    final long spaceConsumed = (Long)m.get("spaceConsumed");
    final long spaceQuota = (Long)m.get("spaceQuota");

    return new ContentSummary(length, fileCount, directoryCount,
        quota, spaceConsumed, spaceQuota);
  }
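The snippet above reads the fields back out of a map produced by a JSON parser, pairing each ContentSummary getter with a same-named key. A self-contained round-trip sketch using a plain map in place of a real parser (the values are made up):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.hadoop.fs.ContentSummary;

    public class JsonMapRoundTrip {
      public static void main(String[] args) {
        // Simulate the map a JSON parser would return for a summary payload.
        Map<String, Object> m = new HashMap<String, Object>();
        m.put("length", Long.valueOf(24930L));
        m.put("fileCount", Long.valueOf(1L));
        m.put("directoryCount", Long.valueOf(2L));
        m.put("quota", Long.valueOf(-1L));
        m.put("spaceConsumed", Long.valueOf(24930L));
        m.put("spaceQuota", Long.valueOf(-1L));

        ContentSummary cs = new ContentSummary(
            (Long) m.get("length"), (Long) m.get("fileCount"),
            (Long) m.get("directoryCount"), (Long) m.get("quota"),
            (Long) m.get("spaceConsumed"), (Long) m.get("spaceQuota"));
        System.out.println(cs.toString(true));  // include the quota columns
      }
    }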

        if (filter != null && !filter.accept(p)) {
          continue;
        }

        ContentSummary cs = ctx.getCS(path);
        if (cs == null) {
          if (path == null) {
            continue;
          }
          pathNeedProcess.add(path);
        } else {
          summary[0] += cs.getLength();
          summary[1] += cs.getFileCount();
          summary[2] += cs.getDirectoryCount();
        }
      }

      // Now handle the paths that require a NameNode call.
      final Map<String, ContentSummary> resultMap = new ConcurrentHashMap<String, ContentSummary>();
      ArrayList<Future<?>> results = new ArrayList<Future<?>>();
      final ThreadPoolExecutor executor;
      int maxThreads = ctx.getConf().getInt("mapred.dfsclient.parallelism.max", 0);
      if (pathNeedProcess.size() > 1 && maxThreads > 1) {
        int numExecutors = Math.min(pathNeedProcess.size(), maxThreads);
        LOG.info("Using " + numExecutors + " threads for getContentSummary");
        executor = new ThreadPoolExecutor(numExecutors, numExecutors, 60, TimeUnit.SECONDS,
            new LinkedBlockingQueue<Runnable>());
      } else {
        executor = null;
      }

      HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {
        @Override
        public void interrupt() {
          if (executor != null) {
            executor.shutdownNow();
          }
        }
      });
      try {
        Configuration conf = ctx.getConf();
        JobConf jobConf = new JobConf(conf);
        for (String path : pathNeedProcess) {
          final Path p = new Path(path);
          final String pathStr = path;
          // All threads share the same Configuration and JobConf, on the
          // assumption that both are thread-safe as long as only read
          // operations are performed. Hadoop's javadoc does not state this,
          // but the source code clearly makes an effort to support it, so we
          // believe the assumption holds. Revisit this code if it proves
          // incorrect.
          final Configuration myConf = conf;
          final JobConf myJobConf = jobConf;
          final PartitionDesc partDesc = work.getPathToPartitionInfo().get(
              p.toString());
          Runnable r = new Runnable() {
            public void run() {
              try {
                ContentSummary resultCs;

                Class<? extends InputFormat> inputFormatCls = partDesc
                    .getInputFileFormatClass();
                InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(
                    inputFormatCls, myJobConf);
                if (inputFormatObj instanceof ContentSummaryInputFormat) {
                  resultCs = ((ContentSummaryInputFormat) inputFormatObj).getContentSummary(p,
                      myJobConf);
                } else {
                  FileSystem fs = p.getFileSystem(myConf);
                  resultCs = fs.getContentSummary(p);
                }
                resultMap.put(pathStr, resultCs);
              } catch (IOException e) {
                // Safely ignore this exception for summary data. We do not
                // update the cache, to avoid polluting other usages. Worst
                // case, the path is simply retried by the next
                // getInputSummary() call, which is acceptable because an
                // IOException here is not the common case.
                LOG.info("Cannot get size of " + pathStr + ". Safely ignored.");
              }
            }
          };

          if (executor == null) {
            r.run();
          } else {
            Future<?> result = executor.submit(r);
            results.add(result);
          }
        }

        if (executor != null) {
          for (Future<?> result : results) {
            boolean executorDone = false;
            do {
              try {
                result.get();
                executorDone = true;
              } catch (InterruptedException e) {
                LOG.info("Interrupted when waiting threads: ", e);
                Thread.currentThread().interrupt();
                break;
              } catch (ExecutionException e) {
                throw new IOException(e);
              }
            } while (!executorDone);
          }
          executor.shutdown();
        }
        HiveInterruptUtils.checkInterrupted();
        for (Map.Entry<String, ContentSummary> entry : resultMap.entrySet()) {
          ContentSummary cs = entry.getValue();

          summary[0] += cs.getLength();
          summary[1] += cs.getFileCount();
          summary[2] += cs.getDirectoryCount();

          ctx.addCS(entry.getKey(), cs);
          LOG.info("Cache Content Summary for " + entry.getKey() + " length: " + cs.getLength()
              + " file count: "
              + cs.getFileCount() + " directory count: " + cs.getDirectoryCount());
        }

        perfLogger.PerfLogEnd(LOG, PerfLogger.INPUT_SUMMARY);
        return new ContentSummary(summary[0], summary[1], summary[2]);
      } finally {
        HiveInterruptUtils.remove(interrup);
      }
    }
  }
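The method above mixes caching, interrupt handling, and input-format dispatch. Stripped to its core, the parallel pattern is simply: submit one getContentSummary() call per path, then join the futures and sum. A self-contained sketch (the pool cap and class name are hypothetical):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutionException;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.ContentSummary;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ParallelSummary {
      public static ContentSummary summarize(final Configuration conf, List<Path> paths)
          throws InterruptedException, ExecutionException {
        ExecutorService pool = Executors.newFixedThreadPool(
            Math.max(1, Math.min(paths.size(), 8)));  // hypothetical cap
        try {
          List<Future<ContentSummary>> futures = new ArrayList<Future<ContentSummary>>();
          for (final Path p : paths) {
            futures.add(pool.submit(new Callable<ContentSummary>() {
              public ContentSummary call() throws Exception {
                // One NameNode round trip per path, executed concurrently.
                FileSystem fs = p.getFileSystem(conf);
                return fs.getContentSummary(p);
              }
            }));
          }
          long length = 0, files = 0, dirs = 0;
          for (Future<ContentSummary> f : futures) {
            ContentSummary cs = f.get();  // rethrows worker failures
            length += cs.getLength();
            files += cs.getFileCount();
            dirs += cs.getDirectoryCount();
          }
          return new ContentSummary(length, files, dirs);
        } finally {
          pool.shutdown();
        }
      }
    }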

    }
  }

  public static boolean isEmptyPath(JobConf job, Path dirPath, Context ctx)
      throws Exception {
    ContentSummary cs = ctx.getCS(dirPath);
    if (cs != null) {
      LOG.info("Content Summary " + dirPath + "length: " + cs.getLength() + " num files: "
          + cs.getFileCount() + " num directories: " + cs.getDirectoryCount());
      return (cs.getLength() == 0 && cs.getFileCount() == 0 && cs.getDirectoryCount() <= 1);
    } else {
      LOG.info("Content Summary not cached for " + dirPath);
    }
    return isEmptyPath(job, dirPath);
  }
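Without Hive's cache, the same emptiness test can be made directly against the filesystem. A minimal sketch (the class and method names are hypothetical):

    import java.io.IOException;
    import org.apache.hadoop.fs.ContentSummary;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    final class PathChecks {
      // A path is "empty" when it holds no bytes, no files, and at most
      // its own directory entry.
      static boolean isEmpty(FileSystem fs, Path dir) throws IOException {
        ContentSummary cs = fs.getContentSummary(dir);
        return cs.getLength() == 0 && cs.getFileCount() == 0
            && cs.getDirectoryCount() <= 1;
      }
    }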

              " is null, which is not expected.");
        }

        Path p = table.getPath();
        FileSystem fs = null;
        ContentSummary resultCs = null;
        try {
          fs = table.getPath().getFileSystem(pCtx.getConf());
          resultCs = fs.getContentSummary(p);
        } catch (IOException e) {
          LOG.warn("Encounter a error while querying content summary of table " +
              table.getCompleteName() + " from FileSystem. " +
              "Cannot guess if CommonJoinOperator will optimize " +
              joinOp.getName() + " " + joinOp.getIdentifier());
        }
        if (resultCs == null) {
          isAbleToGuess = false;
          break;
        }

        long size = resultCs.getLength();
        aliasTotalKnownInputSize += size;
        Long es = aliasToSize.get(alias);
        if (es == null) {
          es = Long.valueOf(0);
        }
        // Accumulate this table's size under its alias.
        es += size;
        aliasToSize.put(alias, es);

    // Map-reduce jobs may be run locally based on data size.
    // First, find out if any of the jobs need to run non-locally.
    boolean hasNonLocalJob = false;
    for (ExecDriver mrtask : mrtasks) {
      try {
        ContentSummary inputSummary = Utilities.getInputSummary(
            ctx, ((MapredWork) mrtask.getWork()).getMapWork(), p);
        int numReducers = getNumberOfReducers(mrtask.getWork(), conf);

        long estimatedInput;

        if (globalLimitCtx != null && globalLimitCtx.isEnable()) {
          // If the global limit optimization is triggered, we will
          // estimate input data actually needed based on limit rows.
          // estimated Input = (num_limit * max_size_per_row) * (estimated_map + 2)
          //
          long sizePerRow = HiveConf.getLongVar(conf,
              HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
          estimatedInput = globalLimitCtx.getGlobalLimit() * sizePerRow;
          long minSplitSize = HiveConf.getLongVar(conf,
              HiveConf.ConfVars.MAPREDMINSPLITSIZE);
          long estimatedNumMap = inputSummary.getLength() / minSplitSize + 1;
          estimatedInput = estimatedInput * (estimatedNumMap + 1);
        } else {
          estimatedInput = inputSummary.getLength();
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug("Task: " + mrtask.getId() + ", Summary: " +
              inputSummary.getLength() + "," + inputSummary.getFileCount() + ","
              + numReducers + ", estimated Input: " + estimatedInput);
        }

        if (MapRedTask.isEligibleForLocalMode(conf, numReducers,
            estimatedInput, inputSummary.getFileCount()) != null) {
          hasNonLocalJob = true;
          break;
        } else {
          mrtask.setLocalMode(true);
        }
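isEligibleForLocalMode() returns a reason string when the job cannot run locally, and null when it can. A hedged sketch of the kind of threshold check involved; the property names and defaults follow Hive's local-mode settings but are assumptions here, not the actual implementation:

    import org.apache.hadoop.conf.Configuration;

    final class LocalModeCheck {
      // Return a human-readable reason when the job must run on the cluster,
      // or null when local mode is acceptable.
      static String checkLocalMode(Configuration conf, long estimatedInputBytes,
          long inputFileCount, int numReducers) {
        long maxBytes =
            conf.getLong("hive.exec.mode.local.auto.inputbytes.max", 134217728L);
        long maxFiles =
            conf.getLong("hive.exec.mode.local.auto.input.files.max", 4L);
        if (estimatedInputBytes > maxBytes) {
          return "Input size " + estimatedInputBytes + " exceeds " + maxBytes;
        }
        if (inputFileCount > maxFiles) {
          return "Input file count " + inputFileCount + " exceeds " + maxFiles;
        }
        if (numReducers > 1) {
          return "Number of reducers " + numReducers + " exceeds 1";
        }
        return null;  // eligible for local mode
      }
    }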

      final StreamingOutput streaming = getListingStream(np, fullpath);
      return Response.ok(streaming).type(MediaType.APPLICATION_JSON).build();
    }
    case GETCONTENTSUMMARY:
    {
      final ContentSummary contentsummary = np.getContentSummary(fullpath);
      final String js = JsonUtil.toJsonString(contentsummary);
      return Response.ok(js).type(MediaType.APPLICATION_JSON).build();
    }
    case GETFILECHECKSUM:
    {
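On the client side, the same operation is reachable through the webhdfs:// FileSystem, which issues op=GETCONTENTSUMMARY over HTTP and parses the JSON body produced above. A sketch with a hypothetical host and path:

    import java.net.URI;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.ContentSummary;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class WebHdfsSummary {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(
            URI.create("webhdfs://namenode:50070"), conf);  // hypothetical host
        ContentSummary cs = fs.getContentSummary(new Path("/user/hive/warehouse"));
        System.out.println("spaceConsumed=" + cs.getSpaceConsumed()
            + " quota=" + cs.getQuota());
      }
    }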
