Examples of FileLineIterator


Examples of org.apache.mahout.common.FileLineIterator

    StringBuilder content = new StringBuilder();
    content.append(header);
    int filenumber = 0;
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    FileLineIterator it = new FileLineIterator(new File(dumpFilePath));
    while (it.hasNext()) {
      String thisLine = it.next();
      if(thisLine.trim().startsWith("<page>")){
        boolean end = false;
        while(thisLine.trim().startsWith("</page>") == false){
          content.append(thisLine).append('\n');
          if (it.hasNext()) {
            thisLine = it.next();
          } else {
            end = true;
            break;
          }
        }
View Full Code Here

Examples of org.apache.mahout.common.FileLineIterator

  }

  @Override
  protected DataModel buildModel() throws IOException {
    FastByIDMap<Collection<Preference>> data = new FastByIDMap<Collection<Preference>>();
    FileLineIterator iterator = new FileLineIterator(getDataFile(), false);
    processFile(iterator, data, ',');
    return new GenericDataModel(GenericDataModel.toDataMap(data, true));
  }
View Full Code Here

Examples of org.apache.mahout.common.FileLineIterator

   * @param path data path
   * @return the label of each instance read from {@code path}, in file order
   */
  public static int[] extractLabels(Dataset dataset, FileSystem fs, Path path) throws IOException {
    FSDataInputStream input = fs.open(path);
    FileLineIterator iterator = new FileLineIterator(input);

    int[] labels = new int[dataset.nbInstances()];
    DataConverter converter = new DataConverter(dataset);
   
    int index = 0;
   
    while (iterator.hasNext()) {
      labels[index++] = converter.convert(0, iterator.next()).label;
    }
   
    iterator.close();
   
    return labels;
  }
View Full Code Here

Examples of org.apache.mahout.common.FileLineIterator

    }
  }

  // Reads dictionary in created by the vector Driver in util
  private static List<String> readDictionary(File path) throws IOException {
    FileLineIterator it = new FileLineIterator(path);

    List<String> result = new ArrayList<String>();

    // skip 2 lines
    it.next();
    it.next();
    while (it.hasNext()) {
      String line = it.next();
      String[] parts = TAB_PATTERN.split(line);
      String word = parts[0];
      int index = Integer.parseInt(parts[2]);
      if (index != result.size()) {
        throw new IllegalArgumentException();
View Full Code Here

Examples of org.apache.mahout.common.iterator.FileLineIterator

     * <pre>
     * term DocFreq Index
     * </pre>
     */
    static String[] loadTermDictionary(InputStream is) throws IOException {
      FileLineIterator it = new FileLineIterator(is);

      int numEntries = Integer.parseInt(it.next());
      // System.out.println(numEntries);
      String[] result = new String[numEntries];

      while (it.hasNext()) {
        String line = it.next();
        if (line.startsWith("#")) {
          continue;
        }
        String[] tokens = ClusterUtil.TAB_PATTERN.split(line);
        if (tokens.length < 3) {
View Full Code Here

Examples of org.apache.mahout.common.iterator.FileLineIterator

   * <pre>
   * term DocFreq Index
   * </pre>
   */
  private static String[] loadTermDictionary(InputStream is) throws IOException {
    FileLineIterator it = new FileLineIterator(is);
   
    int numEntries = Integer.parseInt(it.next());
    String[] result = new String[numEntries];
   
    while (it.hasNext()) {
      String line = it.next();
      if (line.startsWith("#")) {
        continue;
      }
      String[] tokens = TAB_PATTERN.split(line);
      if (tokens.length < 3) {
View Full Code Here

Examples of org.apache.mahout.common.iterator.FileLineIterator

    files = out.listFiles();
    assertEquals("files Size: " + files.length + " is not: " + WORDS.length, files.length, WORDS.length);
    for (File file : files) {
      //should only be one line in the file, and it should be label label
      Iterator<String> it = new FileLineIterator(file);
      String line = it.next().trim();
      assertFalse(it.hasNext());
      String label = "animal" + '\t' + file.getName();
      assertEquals(line + ":::: is not equal to " + label + "::::", line, label);
    }
  }
View Full Code Here

Examples of org.apache.mahout.common.iterator.FileLineIterator

   
    StringBuilder content = new StringBuilder();
    content.append(header);
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    File dumpFile = new File(dumpFilePath);
    FileLineIterator it;
    if (dumpFilePath.endsWith(".bz2")) {
      // default compression format from http://download.wikimedia.org
      CompressionCodec codec = new BZip2Codec();
      it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
    } else {
      // assume the user has previously de-compressed the dump file
      it = new FileLineIterator(dumpFile);
    }
    int filenumber = 0;
    while (it.hasNext()) {
      String thisLine = it.next();
      if (thisLine.trim().startsWith("<page>")) {
        boolean end = false;
        while (!thisLine.trim().startsWith("</page>")) {
          content.append(thisLine).append('\n');
          if (it.hasNext()) {
            thisLine = it.next();
          } else {
            end = true;
            break;
          }
        }
View Full Code Here

Examples of org.apache.mahout.common.iterator.FileLineIterator

   * <pre>
   * term DocFreq Index
   * </pre>
   */
  private static String[] loadTermDictionary(InputStream is) throws IOException {
    FileLineIterator it = new FileLineIterator(is);

    int numEntries = Integer.parseInt(it.next());
    String[] result = new String[numEntries];

    while (it.hasNext()) {
      String line = it.next();
      if (line.startsWith("#")) {
        continue;
      }
      String[] tokens = TAB_PATTERN.split(line);
      if (tokens.length < 3) {
View Full Code Here

Examples of org.apache.mahout.common.iterator.FileLineIterator

    log.info("Creating FileDataModel for file {}", dataFile);

    this.lastModified = dataFile.lastModified();
    this.lastUpdateFileModified = readLastUpdateFileModified();

    FileLineIterator iterator = new FileLineIterator(dataFile, false);
    String firstLine = iterator.peek();
    while (firstLine.isEmpty() || firstLine.charAt(0) == COMMENT_CHAR) {
      iterator.next();
      firstLine = iterator.peek();
    }
    Closeables.close(iterator, true);

    delimiter = determineDelimiter(firstLine);
    delimiterPattern = Splitter.on(delimiter);
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.