Package opennlp.tools.lang.english

Source Code of opennlp.tools.lang.english.TreebankNameFinder

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.lang.english;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import opennlp.tools.namefind.NameFinderEventStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.parser.Parse;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;

/**
* Class is used to create a name finder for English.
*
* @deprecated will be removed soon!
*/
@Deprecated
public class TreebankNameFinder {
 
  public static String[] NAME_TYPES = {"person", "organization", "location", "date", "time", "percentage", "money"};

  private NameFinderME nameFinder;
 
  /** Creates an English name finder using the specified model.
   * @param mod The model used for finding names.
   */
  public TreebankNameFinder(TokenNameFinderModel mod) {
    nameFinder = new NameFinderME(mod);
  }

  private static void clearPrevTokenMaps(TreebankNameFinder[] finders) {
    for (int mi = 0; mi < finders.length; mi++) {
      finders[mi].nameFinder.clearAdaptiveData();
    }
  }

  private static void processParse(TreebankNameFinder[] finders, String[] tags, BufferedReader input) throws IOException {
    Span[][] nameSpans = new Span[finders.length][];
   
    for (String line = input.readLine(); null != line; line = input.readLine()) {
      if (line.equals("")) {
        System.out.println();
        clearPrevTokenMaps(finders);
        continue;
      }
      Parse p = Parse.parseParse(line);
      Parse[] tagNodes = p.getTagNodes();
      String[] tokens = new String[tagNodes.length];
      for (int ti=0;ti<tagNodes.length;ti++){
        tokens[ti] = tagNodes[ti].getCoveredText();
      }
      //System.err.println(java.util.Arrays.asList(tokens));
      for (int fi = 0, fl = finders.length; fi < fl; fi++) {
        nameSpans[fi] = finders[fi].nameFinder.find(tokens);
        //System.err.println("english.NameFinder.processParse: "+tags[fi] + " " + java.util.Arrays.asList(nameSpans[fi]));
      }
     
      for (int fi = 0, fl = finders.length; fi < fl; fi++) {
        Parse.addNames(tags[fi],nameSpans[fi],tagNodes);
      }
      p.show();
    }
  }
     
  /**
   * Adds sgml style name tags to the specified input buffer and outputs this information to stdout.
   * @param finders The name finders to be used.
   * @param tags The tag names for the corresponding name finder.
   * @param input The input reader.
   * @throws IOException
   */
  private static void processText(TreebankNameFinder[] finders, String[] tags, BufferedReader input) throws IOException {
    Span[][] nameSpans = new Span[finders.length][];
    String[][] nameOutcomes = new String[finders.length][];
    opennlp.tools.tokenize.Tokenizer tokenizer = new SimpleTokenizer();
    StringBuffer output = new StringBuffer();
    for (String line = input.readLine(); null != line; line = input.readLine()) {
      if (line.equals("")) {
        clearPrevTokenMaps(finders);
        System.out.println();
        continue;
      }
      output.setLength(0);
      Span[] spans = tokenizer.tokenizePos(line);
      String[] tokens = Span.spansToStrings(spans,line);
      for (int fi = 0, fl = finders.length; fi < fl; fi++) {
        nameSpans[fi] = finders[fi].nameFinder.find(tokens);
        //System.err.println("EnglighNameFinder.processText: "+tags[fi] + " " + java.util.Arrays.asList(finderTags[fi]));
        nameOutcomes[fi] = NameFinderEventStream.generateOutcomes(nameSpans[fi], null, tokens.length);
      }
     
      for (int ti = 0, tl = tokens.length; ti < tl; ti++) {
        for (int fi = 0, fl = finders.length; fi < fl; fi++) {
          //check for end tags
          if (ti != 0) {
            if ((nameOutcomes[fi][ti].equals(NameFinderME.START) || nameOutcomes[fi][ti].equals(NameFinderME.OTHER)) &&
                (nameOutcomes[fi][ti - 1].equals(NameFinderME.START) || nameOutcomes[fi][ti - 1].equals(NameFinderME.CONTINUE))) {
              output.append("</").append(tags[fi]).append(">");
            }
          }
        }
        if (ti > 0 && spans[ti - 1].getEnd() < spans[ti].getStart()) {
          output.append(line.substring(spans[ti - 1].getEnd(), spans[ti].getStart()));
        }
        //check for start tags
        for (int fi = 0, fl = finders.length; fi < fl; fi++) {
          if (nameOutcomes[fi][ti].equals(NameFinderME.START)) {
            output.append("<").append(tags[fi]).append(">");
          }
        }
        output.append(tokens[ti]);
      }
      //final end tags
      if (tokens.length != 0) {
        for (int fi = 0, fl = finders.length; fi < fl; fi++) {
          if (nameOutcomes[fi][tokens.length - 1].equals(NameFinderME.START) || nameOutcomes[fi][tokens.length - 1].equals(NameFinderME.CONTINUE)) {
            output.append("</").append(tags[fi]).append(">");
          }
        }
      }
      if (tokens.length != 0) {
        if (spans[tokens.length - 1].getEnd() < line.length()) {
          output.append(line.substring(spans[tokens.length - 1].getEnd()));
        }
      }
      System.out.println(output);
    }
  }

  public static void main(String[] args) throws IOException {
    if (args.length == 0) {
      System.err.println("Usage NameFinder -[parse] model1 model2 ... modelN < sentences");
      System.err.println(" -parse: Use this option to find names on parsed input.  Un-tokenized sentence text is the default.");
      System.exit(1);
    }
    int ai = 0;
    boolean parsedInput = false;
    while (args[ai].startsWith("-") && ai < args.length) {
      if (args[ai].equals("-parse")) {
        parsedInput = true;
      }
      else {
        System.err.println("Ignoring unknown option "+args[ai]);
      }
      ai++;
    }
    TreebankNameFinder[] finders = new TreebankNameFinder[args.length-ai];
    String[] names = new String[args.length-ai];
    for (int fi=0; ai < args.length; ai++,fi++) {
      String modelName = args[ai];
      finders[fi] = new TreebankNameFinder(new TokenNameFinderModel(new FileInputStream(modelName)));
      int nameStart = modelName.lastIndexOf(System.getProperty("file.separator")) + 1;
      int nameEnd = modelName.indexOf('.', nameStart);
      if (nameEnd == -1) {
        nameEnd = modelName.length();
      }
      names[fi] = modelName.substring(nameStart, nameEnd);
    }
    //long t1 = System.currentTimeMillis();
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    if (parsedInput) {
      processParse(finders,names,in);
    }
    else {
      processText(finders,names,in);
    }
    //long t2 = System.currentTimeMillis();
    //System.err.println("Time "+(t2-t1));
  }
}
TOP

Related Classes of opennlp.tools.lang.english.TreebankNameFinder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.