Package edu.stanford.nlp.international.arabic.pipeline

Source Code of edu.stanford.nlp.international.arabic.pipeline.MWETreeVisitorExternal

package edu.stanford.nlp.international.arabic.pipeline;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.TreeVisitor;
import edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory;
import edu.stanford.nlp.util.Generics;

/**
* Converts all contiguous MWEs listed in an MWE list to flattened trees.
*
* @author Spence Green
*
*/
public class MWETreeVisitorExternal implements TreeVisitor {

  private static final String mweFile = "/home/rayder441/sandbox/javanlp/projects/core/data/edu/stanford/nlp/pipeline/attia-mwe-list.txt.out.tok.fixed.proc.uniq";
 
  private final Set<String> mweDictionary;
 
  public MWETreeVisitorExternal() {
    mweDictionary = loadMWEs();
  }
 
  private Set<String> loadMWEs() {
    Set<String> mweSet = Generics.newHashSet()
    try {
      BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(mweFile), "UTF-8"));
      for (String line; (line = br.readLine()) != null;) {
        mweSet.add(line.trim());
      }
      br.close();
   
    } catch (UnsupportedEncodingException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } catch (FileNotFoundException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    return mweSet;
  }


  /**
   * Perform (possibly destructive) operations on the tree. Do a top-down DFS on the tree.
   */
  public void visitTree(Tree tree) {
    if (tree == null) return;
    String yield = Sentence.listToString(tree.yield());
    if (mweDictionary.contains(yield)) {
      List<Tree> children = getPreterminalSubtrees(tree);
      String newLabel = "MW" + tree.value();
      tree.setValue(newLabel);
      tree.setChildren(children);
      // Bottom out of the recursion
      return;
     
    } else {
      for (Tree subTree : tree.children()) {
        if (subTree.isPhrasal()) {
          // Only phrasal trees can have yields > 1!!
          visitTree(subTree);
        }
      }
    }
  }
 
  private List<Tree> getPreterminalSubtrees(Tree tree) {
    List<Tree> preterminals = new ArrayList<Tree>();
    for (Tree subTree : tree) {
      if (subTree.isPreTerminal()) {
        preterminals.add(subTree);
      }
    }
    return preterminals;
  }
 
  /**
   * For debugging.
   *
   * @param args
   */
  public static void main(String[] args) {
    if (args.length != 1) {
      System.err.printf("Usage: java %s atb_tree_file > atb_tree_file.out%n", MWETreeVisitorExternal.class.getName());
      System.exit(-1);
    }
   
    TreeReaderFactory trf = new ArabicTreeReaderFactory();
    try {
      TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF-8")));
      TreeVisitor visitor = new MWETreeVisitorExternal();
     
      int treeId = 0;
      for (Tree tree; (tree = tr.readTree()) != null; ++treeId) {
        if (tree.value().equals("ROOT")) {
          // Skip over the ROOT tag
          tree = tree.firstChild();
        }
        visitor.visitTree(tree);
        System.out.println(tree.toString());
      }
      tr.close();
     
      System.err.printf("Processed %d trees.%n", treeId);
   
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
TOP

Related Classes of edu.stanford.nlp.international.arabic.pipeline.MWETreeVisitorExternal

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.