Package org.fnlp.nlp.cn

Source Code of org.fnlp.nlp.cn.CNFactory

/**
*  This file is part of FNLP (formerly FudanNLP).
*  FNLP is free software: you can redistribute it and/or modify
*  it under the terms of the GNU Lesser General Public License as published by
*  the Free Software Foundation, either version 3 of the License, or
*  (at your option) any later version.
*  FNLP is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*  You should have received a copy of the GNU General Public License
*  along with FudanNLP.  If not, see <http://www.gnu.org/licenses/>.
*  Copyright 2009-2014 www.fnlp.org. All rights reserved.
*/

package org.fnlp.nlp.cn;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;

import org.fnlp.ml.types.Dictionary;
import org.fnlp.nlp.cn.tag.CWSTagger;
import org.fnlp.nlp.cn.tag.NERTagger;
import org.fnlp.nlp.cn.tag.POSTagger;
import org.fnlp.nlp.parser.dep.DependencyTree;
import org.fnlp.nlp.parser.dep.JointParser;
import org.fnlp.nlp.parser.dep.TreeCache;
import org.fnlp.util.exception.LoadModelException;
/**
* 统一的自然语言处理入口
* 自然语言处理对象有此函数产生, 确保整个系统只有这一个对象,避免重复构造。
* @author xpqiu
*/
public class CNFactory {

  public static CWSTagger seg;
  public static POSTagger pos;
  public static NERTagger ner;
  public static JointParser parser;
  public static TreeCache treeCache;

  public static String segModel = "/seg.m";
  public static String posModel = "/pos.m";
  public static String parseModel = "/dep.m";
  public static Dictionary dict = new Dictionary(true);

  public static CNFactory factory = new CNFactory();
  public static ChineseTrans ct = new ChineseTrans();
  private static boolean isEnFilter = true;

  /**
   * 设置自定义词典
   * @param path
   * @throws IOException
   * @throws LoadModelException
   */
  public static void loadDict(String... path) throws LoadModelException {

    for(String file:path){
      dict.addFile(file);
    }
    setDict();

  }
  /**
   *
   * @param al 字典 ArrayList<String[]>
   *             每一个元素为一个单元String[].
   *             String[] 第一个元素为单词,后面为对应的词性
   * @throws LoadModelException
   */
  public static void addDict(ArrayList<String[]> al)  {
    dict.add(al);
    setDict();
  }
 
  public static void addDict(Collection<String> strs) {
    dict.addSegDict(strs);
    setDict();   
  }
 
  /**
   * 增加词典
   * @param words
   * @param pos
   */
  public static void addDict(Collection<String> words, String pos) {
    for(String w:words){
      dict.add(w, pos);
    }
    setDict();
  }

  /**
   * 更新词典
   */
  private static void setDict()  {
    if(dict==null||dict.size()==0)
      return;
    if(pos!=null){
      pos.setDictionary(dict, true);
    }
    else if(seg!=null){
      seg.setDictionary(dict);
    }
  }
 
  public static void saveDict(String path){
    if(dict==null||dict.size()==0)
      return;
    dict.save(path);
  }




  public enum Models{
    SEG,
    TAG,
    SEG_TAG,
    NER,
    PARSER,
    ALL,
  }


  /**
   *  初始化
   * @return
   */
  public static CNFactory getInstance(){
    return factory;
  }
  /**
   *  初始化,加载所有模型
   * @return
   * @throws LoadModelException
   */
  public static CNFactory getInstance(String path) throws LoadModelException{
    return getInstance(path,Models.ALL);
  }
  /**
   * 初始化
   * @param path 模型文件所在目录,结尾不带"/"。
   * @param model 载入模型类型
   * @return
   * @throws LoadModelException
   */
  public static CNFactory getInstance(String path,Models model) throws LoadModelException{
    if(path.endsWith("/"))
      path = path.substring(0,path.length()-1);
    if(model ==Models.SEG){
      loadSeg(path);
    }else if(model ==Models.SEG){
      loadTag(path);
    }else if(model ==Models.SEG_TAG){
      loadSeg(path);
      loadTag(path);
    }else if(model ==Models.ALL){
      loadSeg(path);
      loadTag(path);
      loadNER(path);
      loadParser(path);
    }   
    setDict();
    return factory;
  }
  /**
   * 读入句法模型
   * @param path 模型所在目录,结尾不带"/"。
   * @throws LoadModelException
   */
  public static void loadParser(String path) throws LoadModelException {
    if(parser==null){
      String file = path+parseModel;
      parser = new JointParser(file);
    }
  }
  /**
   * 读入实体名识别模型
   * @param path 模型所在目录,结尾不带"/"。
   * @throws LoadModelException
   */
  public static void loadNER(String path) throws LoadModelException {
    if(ner==null && pos!=null){
      ner = new NERTagger(pos);
    }
  }
  /**
   * 读入词性标注模型
   * @param path 模型所在目录,结尾不带"/"。
   * @throws LoadModelException
   */
  public static void loadTag(String path) throws LoadModelException {
    if(pos==null){

      String file = path+posModel;
      if(seg==null)
        pos = new POSTagger(file);
      else{
        pos = new POSTagger(seg,file);         
      }
    }
  }

  /**
   * 读入分词模型
   * @param path 模型所在目录,结尾不带"/"。
   * @throws LoadModelException
   */
  public static void loadSeg(String path) throws LoadModelException {
    if(seg==null)
    {
      String file = path+segModel;
      seg = new CWSTagger(file);
      seg.setEnFilter(isEnFilter);
    }
  }

  /**
   * 分词
   * @param input 字符串
   * @return 词数组
   */
  public String[] seg(String input){
    if(seg==null)
      return null;
    return seg.tag2Array(input);
  }

  /**
   * 词性标注
   * @param input 词数组
   * @return 词性数组
   */
  public String[] tag(String[] input){
    if(pos==null)
      return null;
    return pos.tagSeged(input);
  }

  /**
   * 词性标注
   * @param input 字符串
   * @return 词+词性数组
   */
  public String[][] tag(String input){
    if(pos==null||seg==null)
      return null;
    return pos.tag2Array(input);
  }
  /**
   * 词性标注
   * @param input 字符串
   * @return 词/词性 的连续字符串
   */
  public String tag2String(String input) {
    if(pos==null||seg==null)
      return null;
    return pos.tag(input);
  }

  /**
   * 句法分析
   * @param words 词数组
   * @param pos 词性数组
   * @return 依赖数组
   */
  public int[] parse(String[] words, String[] pos){
    if(parser==null)
      return null;
    return parser.parse(words, pos);
  }
  /**
   * 句法分析
   * @param tagg 词数组+词性数组
   * @return
   */
  public DependencyTree parse2T(String[][] tagg) {
    if(tagg==null)
      return null;
    return parse2T(tagg[0], tagg[1]);
  }

  /**
   * 句法分析
   * @param words 词数组
   * @param pos 词性数组
   * @return 句法树
   */
  public DependencyTree parse2T(String[] words, String[] pos) {
    if(parser==null)
      return null;
    if(treeCache!=null){
      DependencyTree tree = treeCache.get(words,pos);
      if(tree!=null)
        return tree;
    }
   
    if(words==null||pos==null||words.length==0||pos.length==0||words.length!=pos.length)
      return null;
    return parser.parse2T(words, pos);
  }
  /**
   * 句法树
   * @param sent 字符串
   * @return 句法树
   */
  public DependencyTree parse2T(String sent){
    if(pos==null||seg==null||parser==null||sent==null)
      return null;
    String[][] wc = tag(sent);
    if(wc==null)
      return null;   
   
    return parse2T(wc[0], wc[1]);
  }
  /**
   * 实体名识别
   * @param s
   * @return 实体名+类型
   */
  public static HashMap<String, String> ner(String s) {
    if(ner==null)
      return null;
    return ner.tag(s);
  }
  public static void setEnFilter(boolean b) {
    isEnFilter = b;

  }

  public static void readDepCache(String file) throws IOException{
    treeCache = new TreeCache();
    treeCache.read(file)
  }
 
 


}
TOP

Related Classes of org.fnlp.nlp.cn.CNFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.