Package edu.stanford.nlp.trees.international.pennchinese

Source Code of edu.stanford.nlp.trees.international.pennchinese.ChineseCollinizer

package edu.stanford.nlp.trees.international.pennchinese;

import edu.stanford.nlp.ling.StringLabel;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeTransformer;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
* Performs collinization operations on Chinese trees similar to
* those for English Namely: <ul>
* <li> strips all functional &amp; automatically-added tags
* <li> strips all punctuation
* <li> merges PRN and ADVP
* <li> eliminates ROOT (note that there are a few non-unary ROOT nodes;
* these are not eliminated)
* </ul>
*
* @author Roger Levy
* @author Christopher Manning
*/
public class ChineseCollinizer implements TreeTransformer {

  private final static boolean VERBOSE = false;

  private final boolean deletePunct;
  ChineseTreebankLanguagePack ctlp;

  protected TreeFactory tf = new LabeledScoredTreeFactory();


  public ChineseCollinizer(ChineseTreebankLanguagePack ctlp) {
    this(ctlp, true);
  }

  public ChineseCollinizer(ChineseTreebankLanguagePack ctlp, boolean deletePunct) {
    this.deletePunct = deletePunct;
    this.ctlp = ctlp;
  }


  public Tree transformTree(Tree tree) {
    return transformTree(tree, true);
  }

  private Tree transformTree(Tree tree, boolean isRoot) {
    String label = tree.label().value();

    // System.err.println("ChineseCollinizer: Node label is " + label);

    if (tree.isLeaf()) {
      if (deletePunct && ctlp.isPunctuationWord(label)) {
        return null;
      } else {
        return tf.newLeaf(new StringLabel(label));
      }
    }
    if (tree.isPreTerminal() && deletePunct && ctlp.isPunctuationTag(label)) {
      // System.out.println("Deleting punctuation");
      return null;
    }
    List<Tree> children = new ArrayList<Tree>();

    if (label.matches("ROOT.*") && tree.numChildren() == 1) { // keep non-unary roots for now
      return transformTree(tree.children()[0], true);
    }

    //System.out.println("Enhanced label is " + label);

    // remove all functional and machine-generated annotations
    label = label.replaceFirst("[^A-Z].*$", "");
    // merge parentheticals with adverb phrases
    label = label.replaceFirst("PRN", "ADVP");

    //System.out.println("New label is " + label);

    for (int cNum = 0; cNum < tree.children().length; cNum++) {
      Tree child = tree.children()[cNum];
      Tree newChild = transformTree(child, false);
      if (newChild != null) {
        children.add(newChild);
      }
    }
    // We don't delete the root because there are trees in the
    // Chinese treebank that only have punctuation in them!!!
    if (children.isEmpty() && ! isRoot) {
      if (VERBOSE) {
        System.err.println("ChineseCollinizer: all children of " + label +
                           " deleted; returning null");
      }
      return null;
    }
    return tf.newTreeNode(new StringLabel(label), children);
  }

}
TOP

Related Classes of edu.stanford.nlp.trees.international.pennchinese.ChineseCollinizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.