Package org.fnlp.nlp.cn.rl

Source Code of org.fnlp.nlp.cn.rl.RLSeg

/**
*  This file is part of FNLP (formerly FudanNLP).
*  FNLP is free software: you can redistribute it and/or modify
*  it under the terms of the GNU Lesser General Public License as published by
*  the Free Software Foundation, either version 3 of the License, or
*  (at your option) any later version.
*  FNLP is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*  You should have received a copy of the GNU General Public License
*  along with FudanNLP.  If not, see <http://www.gnu.org/licenses/>.
*  Copyright 2009-2014 www.fnlp.org. All rights reserved.
*/

package org.fnlp.nlp.cn.rl;


import gnu.trove.iterator.hash.TObjectHashIterator;
import gnu.trove.set.hash.THashSet;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;

import org.fnlp.nlp.cn.Chars;
import org.fnlp.nlp.cn.tag.CWSTagger;
import org.fnlp.util.MyCollection;

/**
* 分词使用示例
* @author xpqiu
*
*/
public class RLSeg {

  static String dicfile;
  private CWSTagger tag;
  THashSet<String> dict;
  /**
   * 新的临时词典
   */
  static THashSet<String> tempdict;
  /**
   * 非词典
   */
  static THashSet<String> nodict;

  BufferedWriter bwNew;
  BufferedWriter bwNo;
  float prop = 0.5f;
  private String newdictfile = "../tmp/dict-new.txt";
  private String nodictfile = "../tmp/dict-no.txt";

  public RLSeg(CWSTagger tag, String string) throws IOException{
    this.tag = tag;
    dicfile = string;
    dict = MyCollection.loadTSet(dicfile);
    tempdict = MyCollection.loadTSet(newdictfile);
    nodict = MyCollection.loadTSet(nodictfile);

    bwNew = new BufferedWriter(new OutputStreamWriter(
        new FileOutputStream(newdictfile ,true), "UTF-8"));
    bwNo = new BufferedWriter(new OutputStreamWriter(
        new FileOutputStream(nodictfile ,true), "UTF-8"));

  }

  /**
   * 主程序
   * @param args
   * @throws IOException
   * @throws 
   */
  public static void main(String[] args) throws Exception {
    CWSTagger tag = new CWSTagger("./models/seg.m")

    RLSeg rlseg = new RLSeg(tag,"./tmpdata/FNLPDATA/all.dict");

    tag.setDictionary(tempdict);

    System.out.println("\n处理文件:");
    String s4 = tag.tagFile("../example-data/data-tag.txt");
    System.out.println(s4);
    String[] toks = s4.split("\\s+");
    int newset = rlseg.update(toks);
    rlseg.close();

    tag.setDictionary(tempdict);

    System.out.println("\n处理文件:");
    String s = tag.tagFile("../example-data/data-tag.txt");
    System.out.println(s);


  }

  void close() throws IOException {
    bwNew.close();
    bwNo.close();

  }

  int update(String[] toks) throws IOException {
    if(toks==null)
      return 0;
    THashSet<String> newdict = new THashSet<String>();
    String nowords = "";
    int count = 0;
    for(int i=0;i<toks.length;i++){//取得包含新词的最长子串
      if(Chars.isLetterOrDigitOrPunc(toks[i]))
        continue;

      if(!dict.contains(toks[i])&&!tempdict.contains(toks[i])){
        nowords += "" + toks[i];
        count++;
      }else{
        if(nowords.length()>0){
          System.out.println(nowords);
          newdict.add(nowords.trim());
          nowords = "";
        }
      }
    }


    TObjectHashIterator<String> it = newdict.iterator();
    while(it.hasNext()){
      String s = it.next();
      if(nodict.contains(s))
        continue;
      System.out.println("搜索: "+s);
      THashSet<String> sset = getNewWords(s);
      if(sset==null||sset.size()==0)
        continue;
      System.out.println(sset);
      tempdict.addAll(sset);
      if(!sset.contains(s)&&!nodict.contains(s)){
        nodict.add(s);
        bwNo.write(s);
        bwNo.write("\n");
      }

    }
    bwNew.flush();
    bwNo.flush();
    return count;
  }

  public static int getOccur(String src,String find){
    int o = 0;
    int index=-1;
    while((index=src.indexOf(find,index))>-1){
      ++index;
      ++o;
    }
    return o;
  }

  public static String genQuery(String src){

    StringBuilder sb = new StringBuilder(src);
    int ulen = src.length()-1;
    for(int i=0;i<src.length()-ulen+1;i++){
      sb.append("+");
      sb.append(src.substring(i, i+ulen));
    }
    return sb.toString();
  }

  public THashSet<String> getNewWords(String s) throws IOException {
    if(s.length()==0)
      return null;
    THashSet<String> newset = new THashSet<String>();
    HashMap<String,Float> map = new HashMap<String, Float>();
    String q = genQuery(s);
    String res = SearchByBaidu.search(q);
    if(res.length()==0)
      return null;

    String[] words = tag.tag2Array(res);

    for(int i=0;i<words.length;i++){
      String w = words[i];
      if(w.length()<2||dict.contains(w)||tempdict.contains(w))
        continue;
      //        if(dict.contains(words[i]))
      //          continue;
      if(map.containsKey(w))
        map.put(w, map.get(w)+1);
      else
        map.put(w, 1f);
    }
    //      Set<Entry<String, Float>> set = map.entrySet();
    //      for(Entry e:set){
    //        e.setValue((Float) e.getValue()/words.length);
    //      }
    List<Entry> list = MyCollection.sort(map);



    int num = getOccur(res, s);

    float thres = num*prop;
    thres = thres<50?50:thres;
    for(Entry e:list){
      String ss = (String) e.getKey();
      if((Float) e.getValue()>thres&&ss.length()>1&&!dict.contains(ss)&&!tempdict.contains(ss)){       
        newset.add(ss);
        bwNew.write(ss);
        bwNew.write("\n");
      }

    }

    newset.remove("快照");
    return newset;
  }

  /**
   *
   * @param toks
   * @param j
   * @return
   */
  public int calcOOV(String[] toks, int j) {
    int count = 0;
    for(int i=0;i<toks.length;i++){//取得包含新词的最长子串
      if(Chars.isLetterOrDigitOrPunc(toks[i]))
        continue;     
      if(toks[i].length()>j)
        continue;
      if(!dict.contains(toks[i])&&!tempdict.contains(toks[i])){       
        count++;
      }

    }
    return count;
  }
}
TOP

Related Classes of org.fnlp.nlp.cn.rl.RLSeg

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.