// Source code of edu.stanford.nlp.trees.DiskTreebank

package edu.stanford.nlp.trees;


import java.io.*;
import java.util.*;


import edu.stanford.nlp.io.FileSequentialCollection;
import edu.stanford.nlp.io.RuntimeIOException;




/**
 * A <code>DiskTreebank</code> is a <code>Collection</code> of
 * <code>Tree</code>s.
 * A <code>DiskTreebank</code> object stores merely the information to
 * get at a corpus of trees that is stored on disk.  Access is usually
 * via apply()'ing a TreeVisitor to each Tree in the Treebank or by using
 * an iterator() to get an iteration over the Trees.
 * <p/>
 * If the root Label of the Tree objects built by the TreeReader
 * implements HasIndex, then the filename and index of the tree in
 * a corpus will be inserted as they are read in.
 *
 * @author Christopher Manning
 */
public final class DiskTreebank extends Treebank {


  private static final boolean PRINT_FILENAMES = false;


  private final ArrayList<File> filePaths = new ArrayList<File>();
  private final ArrayList<FileFilter> fileFilters = new ArrayList<FileFilter>();


  /**
   * Maintains as a class variable the <code>File</code> from which
   * trees are currently being read.
   */
  private File currentFile; // = null;




  /**
   * Create a new DiskTreebank.
   * The trees are made with a <code>LabeledScoredTreeReaderFactory</code>.
   * <p/>
   * <i>Compatibility note: Until Sep 2004, this used to create a Treebank
   * with a SimpleTreeReaderFactory, but this was changed as the old
   * default wasn't very useful, especially to naive users.</i>
   */
  public DiskTreebank() {
    this(new LabeledScoredTreeReaderFactory());
  }


  /**
   * Create a new treebank, set the encoding for file access.
   *
   * @param encoding The charset encoding to use for treebank file decoding
   */
  public DiskTreebank(String encoding) {
    this(new LabeledScoredTreeReaderFactory(), encoding);
  }


  /**
   * Create a new DiskTreebank.
   *
   * @param trf the factory class to be called to create a new
   *            <code>TreeReader</code>
   */
  public DiskTreebank(TreeReaderFactory trf) {
    super(trf);
  }


  /**
   * Create a new DiskTreebank.
   *
   * @param trf      the factory class to be called to create a new
   *                 <code>TreeReader</code>
   * @param encoding The charset encoding to use for treebank file decoding
   */
  public DiskTreebank(TreeReaderFactory trf, String encoding) {
    super(trf, encoding);
  }


  /**
   * Create a new Treebank.
   * The trees are made with a <code>LabeledScoredTreeReaderFactory</code>.
   * <p/>
   * <i>Compatibility note: Until Sep 2004, this used to create a Treebank
   * with a SimpleTreeReaderFactory, but this was changed as the old
   * default wasn't very useful, especially to naive users.</i>
   *
   * @param initialCapacity The initial size of the underlying Collection.
   *                        For a <code>DiskTreebank</code>, this parameter is ignored.
   */
  public DiskTreebank(int initialCapacity) {
    this(initialCapacity, new LabeledScoredTreeReaderFactory());
  }


  /**
   * Create a new Treebank.
   *
   * @param initialCapacity The initial size of the underlying Collection,
   *                        For a <code>DiskTreebank</code>, this parameter is ignored.
   * @param trf             the factory class to be called to create a new
   *                        <code>TreeReader</code>
   */
  @SuppressWarnings({"UnusedDeclaration"})
  public DiskTreebank(int initialCapacity, TreeReaderFactory trf) {
    this(trf);
  }




  /**
   * Empty a <code>Treebank</code>.
   */
  @Override
  public void clear() {
    filePaths.clear();
    fileFilters.clear();
  }


  /**
   * Load trees from given directory.  This version just records
   * the paths to be processed, and actually processes them at apply time.
   *
   * @param path file or directory to load from
   * @param filt a FilenameFilter of files to load
   */
  @Override
  public void loadPath(File path, FileFilter filt) {
    filePaths.add(path);
    fileFilters.add(filt);
  }


  /**
   * Applies the TreeVisitor to to all trees in the Treebank.
   *
   * @param tp A class that can process trees.
   */
  @Override
  public void apply(final TreeVisitor tp) {
    for (Tree t : this) {
      tp.visitTree(t);
    }
  }


  /**
   * Return the <code>File</code> from which trees are currently being
   * read by an Iterator or <code>apply()</code> and passed to a
   * <code>TreePprocessor</code>.
   * <p/>
   * This is useful if one wants to map the original file and
   * directory structure over to a set of modified trees.  New code
   * might prefer to build trees with labels that implement
   * HasIndex.
   *
   * @return the file that trees are currently being read from, or
   *         <code>null</code> if no file is currently open
   */
  public File getCurrentFile() {
    return currentFile;
  }




  private class DiskTreebankIterator implements Iterator<Tree> {


    private int fileUpto; // = 0 (will start on index array 0)
    Iterator<File> fileIterator;
    private TreeReader tr;
    private Tree storedTree;  // null means iterator is exhausted (or not yet constructed)


    private DiskTreebankIterator() {
      storedTree = primeNextTree();
    }


    private Tree primeNextTree() {
      Tree nextTree = null;
      int fpsize = filePaths.size();
      while (nextTree == null && fileUpto <= fpsize) {
        if (tr == null && (fileIterator == null || ! fileIterator.hasNext())) {
          if (fileUpto < fpsize) {
            FileSequentialCollection fsc = new FileSequentialCollection(Collections.singletonList(filePaths.get(fileUpto)), fileFilters.get(fileUpto));
            fileIterator = fsc.iterator();
          }
          // else we're finished, but increment anyway so we leave outermost loop
          fileUpto++;
        }
        while (nextTree == null && (tr != null || (fileIterator != null && fileIterator.hasNext()))) {
          try {
            while (nextTree == null && (tr != null || (fileIterator != null && fileIterator.hasNext()))) {
              if (tr != null) {
                nextTree = tr.readTree();
                if (nextTree == null) {
                  tr.close();
                  tr = null;
                }
              }
              if (nextTree == null && (fileIterator != null && fileIterator.hasNext())) {
                currentFile = fileIterator.next();
                // maybe print file name to stdout to get some feedback
                if (PRINT_FILENAMES) {
                  System.err.println(currentFile);
                }
                tr = treeReaderFactory().newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(currentFile), encoding())));
              }
            }
          } catch (IOException e) {
            throw new RuntimeIOException("primeNextTree IO Exception in file " + currentFile, e);
          }
        }
      }
      if (nextTree == null) {
        currentFile = null;
      }
      return nextTree;
    }




    /**
     * Returns true if the iteration has more elements.
     */
    public boolean hasNext() {
      return storedTree != null;
    }


    /**
     * Returns the next element in the iteration.
     */
    public Tree next() {
      if (storedTree == null) {
        throw new NoSuchElementException();
      }
      Tree ret = storedTree;
      storedTree = primeNextTree();
      return ret;
    }


    /**
     * Not supported
     */
    public void remove() {
      throw new UnsupportedOperationException();
    }


  } // end class DiskTreebankIterator




  /**
   * Return an Iterator over Trees in the Treebank.  This is implemented
   * by building per-file MemoryTreebanks for the files in the
   * DiskTreebank.  As such, it isn't as efficient as using
   * <code>apply()</code>.
   */
  @Override
  public Iterator<Tree> iterator() {
    return new DiskTreebankIterator();
  }


}
// Source code of edu.stanford.nlp.trees.DiskTreebank

// Related classes of edu.stanford.nlp.trees.DiskTreebank