// Package org.fnlp.nlp.cn.anaphora
//
// Source code of org.fnlp.nlp.cn.anaphora.EntitiesGetter

/**
*  This file is part of FNLP (formerly FudanNLP).
*  FNLP is free software: you can redistribute it and/or modify
*  it under the terms of the GNU Lesser General Public License as published by
*  the Free Software Foundation, either version 3 of the License, or
*  (at your option) any later version.
*  FNLP is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*  You should have received a copy of the GNU General Public License
*  along with FudanNLP.  If not, see <http://www.gnu.org/licenses/>.
*  Copyright 2009-2014 www.fnlp.org. All rights reserved.
*/

package org.fnlp.nlp.cn.anaphora;

import java.util.EnumSet;
import java.util.LinkedList;

import org.fnlp.nlp.cn.PartOfSpeech;
import org.fnlp.nlp.cn.anaphora.Entity.FUNC;
import org.fnlp.nlp.cn.anaphora.Entity.Singular;

/**
* 检测文中的实体和代词
* @author jszhao,xpqiu
* @version 1.2
* @since FudanNLP 1.5
*/
public class EntitiesGetter {

  static EnumSet<PartOfSpeech> NP = EnumSet.noneOf(PartOfSpeech.class);
  static{
    NP.add(PartOfSpeech.专有名);
    NP.add(PartOfSpeech.人名);
    NP.add(PartOfSpeech.机构名);
    NP.add(PartOfSpeech.地名);
    NP.add(PartOfSpeech.专有名);
    NP.add(PartOfSpeech.序数词);
    NP.add(PartOfSpeech.数词);
    NP.add(PartOfSpeech.量词);
    NP.add(PartOfSpeech.形谓词);
    NP.add(PartOfSpeech.形容词);
    NP.add(PartOfSpeech.限定词);
    NP.add(PartOfSpeech.名词);
    NP.add(PartOfSpeech.代词);
    NP.add(PartOfSpeech.指示代词);
    NP.add(PartOfSpeech.人称代词);
    NP.add(PartOfSpeech.疑问代词);
   
 

  public EntitiesGetter() {
   
  }
  /**
   * 是实体的一部分
   * @param pos
   * @param word
   * @return
   */
  private boolean isPart(PartOfSpeech pos, String word){
    if(NP.contains(pos))
      return true;
   
    if(pos == PartOfSpeech.结构助词 && word.equals("的"))
      return true;
   
    return false;
  }


  public LinkedList<Entity> parse(String[][][] taggedstr) { 
   
    LinkedList<Entity> EntityList = new LinkedList<Entity>();
    Entity ey = null
    int distance = 0;
    int index =  0;
    int subDistance = 0;
    String strdata= null;
    int flag = 0;
   
   
   
   

    for(int i=0;i<taggedstr.length;i++){
      String[] words = taggedstr[i][0];
      String[] pos = taggedstr[i][1];
     
      PartOfSpeech strtag = null;
     
      PartOfSpeech[] epos = PartOfSpeech.valueOf(pos);
     
      for(int j=0;j<words.length;j++){ 
        index++;
        subDistance = 0;
        String headword = null;
        if(epos[j]==PartOfSpeech.标点&&((words[j].equals(","))
            ||(words[j].equals(":")))){
          subDistance++;
        }
       
       
        if(isPart(epos[j],words[j])){
          int id = j;
          strdata = words[j];
          strtag = epos[j];
          headword = words[j];
          flag = 0;
          ey = new Entity();         
          ey.start = index;
         
          Singular isSing = Singular.UNKONW;
         
          while(j<words.length-1){
            boolean isModify = !(isNN(epos[j])&&words[j+1].equals("的"));
            if(isModify&&isPart(epos[j+1],words[j+1])){           
              if(epos[j]==PartOfSpeech.数词 &&(words[j].equals("一")||
                  words[j].equals("半")||words[j].equals("1")))
                isSing = Singular.Yes;
              else if (epos[j]==PartOfSpeech.数词 &&!(words[j].equals("一")
                  ||words[j].equals("半")||words[j].equals("1"))){
                isSing = Singular.No;
              }
              strdata+= words[j+1];
              strtag = epos[j+1];
              headword = words[j+1];
              j++;   
              flag++;
            }
            else
              break;
          }
          if(strtag.isPronoun()||strdata.contains("这")||
              strdata.contains("那")||strdata.contains("该")){
            ey.setIsResolution(true);
          }
          else
            ey.setIsResolution(false);
          int jj = j;
          while((!isNN(strtag))&&jj>=0){
            int ij = strdata.indexOf(words[jj]);
            if(ij>=0)
              strdata = strdata.substring(0,ij);
            else
              break;
            jj--;
            flag--;
            if(jj>=0)
              strtag = epos[jj];
          }
          if(strdata.length() == 0)
            continue;
          if(strdata.indexOf("的")==0){
            strdata = strdata.substring(1);
            ey.start = ey.start+1 ;
          }

          ey.setPosTag(strtag);
          ey.setData(strdata);
          ey.setHeadWord(headword);
         
          if(isSingular(ey.getData())){
            isSing = Singular.Yes;
          }
          else if(isNotSingular(ey.getData())){
            isSing = Singular.No;
          }


          if(this.isFemale(ey.getData())){
            ey.setFemale();
          }
          else if(this.isMale(ey.getData())){
            ey.setMale();
          }
         
          FUNC graTag = FUNC.SUB;
          while((j-flag-1)>=0&&!epos[j-flag-1].isMark()){
            if(isObj(epos[j-flag-1])){
              graTag = FUNC.OBJ;
              break;
            }
            flag++;
          }

          if(j<words.length-1&&pos[j+1].equals("DEG")&&
              words[j+1].equals("的")){
            graTag = FUNC.ADJ;
          }
          ey.setId(id);
          ey.setGraTag(graTag);
          ey.singular = isSing;
          ey.sentNo = i;   
          ey.setSubDistance(subDistance);
          ey.end = j;
         
          EntityList.add(ey);
        }

      }
    }
    return EntityList;
  }
 
  static EnumSet<PartOfSpeech> NN = EnumSet.noneOf(PartOfSpeech.class);
  static{
    NN.add(PartOfSpeech.名词);
    NN.add(PartOfSpeech.专有名);
    NN.add(PartOfSpeech.人名)
    NN.add(PartOfSpeech.地名);
    NN.add(PartOfSpeech.机构名);
    NN.add(PartOfSpeech.代词);
    NN.add(PartOfSpeech.人称代词);
    NN.add(PartOfSpeech.指示代词);
    NN.add(PartOfSpeech.疑问代词);
  }
 
  /**
   * 是否为NN
   * @param strtag
   * @return
   */
  public boolean isNN(PartOfSpeech pos) {
    return NN.contains(pos);
  }
 
 
  static EnumSet<PartOfSpeech> obj = EnumSet.noneOf(PartOfSpeech.class);
  static{
    obj.add(PartOfSpeech.副词);
    obj.add(PartOfSpeech.动词);
    obj.add(PartOfSpeech.介词);
    obj.add(PartOfSpeech.形谓词);
    obj.add(PartOfSpeech.形容词);
  }
 
  /**
   * 是否是宾语
   * @param pos
   * @return
   */
  private boolean isObj(PartOfSpeech pos) {
    return obj.contains(pos);
  }
 
 
  private Boolean isSingular(String str){
    if(str.contains("这个")||str.contains("这种")||
        str.contains("每")||str.equals("他")||
        str.equals("它")||str.equals("她")){
      return true;
    }
    else
      return false;

  }
 
  private Boolean isNotSingular(String str){
    if(str.startsWith("各")||str.contains("群")||
        str.contains("多")||str.startsWith("二者")||
        str.startsWith("全体")||str.startsWith("所有")
        ||str.contains("们")){
      return true;
    }
    else
      return false;

  }
 
  private Boolean isFemale(String str){
    if(str.contains("娘")||str.contains("妻")||
        str.contains("媳")||str.contains("姑")||
        str.contains("夫人")||str.contains("她")||
        str.contains("小姐")||str.contains("女")||
        str.contains("母")||str.contains("妞")||
        str.contains("妈")||str.contains("妇")||
        str.contains("婆")){
      return true;
    }
    else
      return false;

  }

  private Boolean isMale(String str){
    if(str.contains("先生")||str.contains("男")||
        str.contains("丈夫")||str.contains("父")||
        str.contains("兄")||str.contains("儿子")
        ||str.contains("哥")){
      return true;
    }
    else
      return false;

  }

  public static void main(String args[]) throws Exception{
    EntitiesGetter ep = new EntitiesGetter();
    Entity ey = null;
    String str2 = "复旦大学创建于1905年,它位于上海市,这个大学培育了好多优秀的学生。";
    String str3[] = {"复旦","大学","创建","于","1905年",",","它","位于","上海市",",","这个","大学","培育","了","好多","优秀","的","学生","。"};
    String str4[] = {"专有名","名词","动词","介词","时间短语","标点","代词","动词","专有名","标点","限定词","名词","动词","动态助词","数词","形容词","结构助词","名词","标点"};
    String str5[][][] = new String[1][2][str3.length];
    str5[0][0] = str3;
    str5[0][1] = str4;   
   
    LinkedList<Entity> list = ep.parse(str5);
    System.out.println(list);
  }


}
// TOP
//
// Related Classes of org.fnlp.nlp.cn.anaphora.EntitiesGetter
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.