Package cc.mallet.pipe.iterator

Source Code of cc.mallet.pipe.iterator.FileListIterator

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org.  For further
information, see the file `LICENSE' included with this distribution. */




/**
  @author Gary Huang <a href="mailto:ghuang@cs.umass.edu">ghuang@cs.umass.edu</a>
  */

package cc.mallet.pipe.iterator;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.net.URI;
import java.util.regex.*;
import java.io.*;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.Label;
import cc.mallet.util.Strings;

/**
* An iterator that generates instances for a pipe from a list of filenames.
* Each file is treated as a text file whose target is determined by
* a user-specified regular expression pattern applied to the filename
*
@author Gary Huang <a href="mailto:ghuang@cs.umass.edu">ghuang@cs.umass.edu</a>
*/
public class FileListIterator implements Iterator<Instance>
{
  FileFilter fileFilter;
  ArrayList fileArray;
  Iterator subIterator;
  Pattern targetPattern;  // Set target slot to string coming from 1st group of this Pattern
  int commonPrefixIndex;

  /** Special value that means to use the directories[i].getPath() as the target name */
  // xxx Note that these are specific to UNIX directory delimiter characters!  Fix this.

  /** Use as label names the directories of the given files,
   * optionally removing common prefix of all starting directories
   */
  public static final Pattern STARTING_DIRECTORIES = Pattern.compile ("_STARTING_DIRECTORIES_");
  /** Use as label names the first directory in the filename. */
  public static final Pattern FIRST_DIRECTORY = Pattern.compile ("/?([^/]*)/.+");
  /** Use as label name the last directory in the filename. */
  public static final Pattern LAST_DIRECTORY = Pattern.compile(".*/([^/]+)/[^/]+"); // was ("([^/]*)/[^/]+");
  /** Use as label names all the directory names in the filename. */
  public static final Pattern ALL_DIRECTORIES = Pattern.compile ("^(.*)/[^/]+");


  /* Pass null as targetPattern to get null targets */
  /**
   * Construct an iterator over the given arry of Files
   *
   * The instances constructed from the files are returned in the same order
   * as they appear in the given array
   *
   * @param files  Array of files from which to construct instances
   * @param fileFilter   class implementing interface FileFilter that will decide which names to accept.
   *                     May be null.
   * @param targetPattern  regex Pattern applied to the filename whose first parenthesized group
   *                       on matching is taken to be the target value of the generated instance.
   *                       The pattern is applied to the filename with the matcher.find() method.
   * @param removeCommonPrefix boolean that modifies the behavior of the STARTING_DIRECTORIES
   *                           pattern, removing the common prefix of all initially specified
   *                           directories, leaving the remainder of each filename as the target value.
   *
   */
  public FileListIterator(File[] files, FileFilter fileFilter,
      Pattern targetPattern, boolean removeCommonPrefix)
  {
    this.fileFilter = fileFilter;
    this.fileArray = new ArrayList();
    this.targetPattern = targetPattern;

    fillFileArrayAssignCommonPrefixIndexAndSubIterator(files, removeCommonPrefix);
  }

  public FileListIterator(String[] filenames, FileFilter fileFilter,
      Pattern targetPattern, boolean removeCommonPrefix)
  {
    this(FileIterator.stringArray2FileArray(filenames), fileFilter,
        targetPattern, removeCommonPrefix);
  }

  /**
   * Construct a FileListIterator with the file containing the list of files, which
   * contains one filename per line. 
   *
   * The instances constructed from the filelist are returned in the same order
   * as listed
   */
  public FileListIterator(File filelist, FileFilter fileFilter,
      Pattern targetPattern, boolean removeCommonPrefix) throws FileNotFoundException, IOException
  {
    this.fileFilter = fileFilter;
    this.fileArray = new ArrayList();
    this.targetPattern = targetPattern;

    List filenames = readFileNames (filelist);
    File[] fa = stringList2FileArray (filenames, null);

    fillFileArrayAssignCommonPrefixIndexAndSubIterator(fa, removeCommonPrefix);
  }

  /**
   * Construct a FileListIterator with the file containing the list of files
   *   of RELATIVE pathnames, one filename per line.
   * <p>
   * The instances constructed from the filelist are returned in the same order
   * as listed
   * @param filelist List of relative file names.
   * @param baseDirectory Base directory for relative file names.
   *
   */
  public FileListIterator(File filelist, File baseDirectory, FileFilter fileFilter,
      Pattern targetPattern, boolean removeCommonPrefix) throws FileNotFoundException, IOException
  {
    this.fileFilter = fileFilter;
    this.fileArray = new ArrayList();
    this.targetPattern = targetPattern;

    List filenames = readFileNames (filelist);
    File[] fa = stringList2FileArray (filenames, baseDirectory);

    fillFileArrayAssignCommonPrefixIndexAndSubIterator(fa, removeCommonPrefix);
  }


  private static File[] stringList2FileArray (List filenames, File baseDir)
  {
    File[] fa = new File[filenames.size()];

    for (int i = 0; i < filenames.size(); i++)
      if (baseDir != null) {
        fa[i] = new File (baseDir, (String) filenames.get(i));
      } else {
        fa[i] = new File ((String) filenames.get(i));
      }
    return fa;
  }

  private static List readFileNames (File filelist) throws IOException
  {
    ArrayList filenames = new ArrayList();
    BufferedReader reader = new BufferedReader(new FileReader (filelist));
    String filename = reader.readLine();

    while (filename != null && filename.trim().length() > 0) {
      filenames.add(filename.trim());
      filename = reader.readLine();
    }

    reader.close();
    return filenames;
  }

  public FileListIterator(String filelistName, FileFilter fileFilter,
                          Pattern targetPattern, boolean removeCommonPrefix) throws FileNotFoundException, IOException
  {
    this (new File(filelistName), fileFilter, targetPattern, removeCommonPrefix);
  }

  public FileListIterator(String filelistName, Pattern targetPattern) throws FileNotFoundException, IOException
  {
    this (new File(filelistName), null, targetPattern, true);
  }

  // The PipeInputIterator interface
  public Instance next ()
  {
    File nextFile = (File) subIterator.next();
    String path = nextFile.getParent();
    String targetName = null;

    if (targetPattern == STARTING_DIRECTORIES) {
      targetName = path.substring(commonPrefixIndex);
    }
    else if (targetPattern != null) {
      Matcher m = targetPattern.matcher(path);
      if (m.find ()){
        targetName = m.group (1);
      }
    }

    return new Instance (nextFile, targetName, nextFile.toURI(), null);
  }

  public File nextFile ()
  {
    return (File) subIterator.next();   
  }

  public boolean hasNext () 
  {
    return subIterator.hasNext();
  }
 
  public void remove () {
    throw new IllegalStateException ("This Iterator<Instance> does not support remove().");
  }


  public ArrayList getFileArray()
  {
    return fileArray;
  }

  private void fillFileArrayAssignCommonPrefixIndexAndSubIterator(File[] files, boolean removeCommonPrefix)
  {
    ArrayList filenames = new ArrayList();

    for (int i = 0; i < files.length; i++) {
      if (files[i].isDirectory())
        throw new IllegalArgumentException(files[i] + " is not a file.");
      else if (! files[i].exists())
        throw new IllegalArgumentException(files[i] + " does not exist.");

      if (this.fileFilter == null || this.fileFilter.accept(files[i])) {
        this.fileArray.add(files[i]);

        if (removeCommonPrefix)
          filenames.add(files[i].getPath());
      }
    }

    this.subIterator = this.fileArray.iterator();

    if (removeCommonPrefix) { // find the common prefix index of all filenames

      String[] fn = new String[filenames.size()];

      for (int i = 0; i < fn.length; i++)
        fn[i] = (String) filenames.get(i);

      this.commonPrefixIndex = Strings.commonPrefixIndex(fn);
    }
    else
      this.commonPrefixIndex = 0;


  }

}
TOP

Related Classes of cc.mallet.pipe.iterator.FileListIterator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.