Package org.fnlp.nlp.cn.rl

Source Code of org.fnlp.nlp.cn.rl.Seg

/**
*  This file is part of FNLP (formerly FudanNLP).
*  FNLP is free software: you can redistribute it and/or modify
*  it under the terms of the GNU Lesser General Public License as published by
*  the Free Software Foundation, either version 3 of the License, or
*  (at your option) any later version.
*  FNLP is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*  You should have received a copy of the GNU General Public License
*  along with FudanNLP.  If not, see <http://www.gnu.org/licenses/>.
*  Copyright 2009-2014 www.fnlp.org. All rights reserved.
*/

package org.fnlp.nlp.cn.rl;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import org.fnlp.nlp.cn.tag.CWSTagger;
import org.fnlp.nlp.cn.tag.POSTagger;

import gnu.trove.set.hash.THashSet;

public class Seg {

  /**
   * @param args
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {
    CWSTagger seg = new CWSTagger("./models/seg.m")
    POSTagger pos = new POSTagger(seg, "./models/pos.m");

    RLSeg rlseg = new RLSeg(seg,"./tmpdata/FNLPDATA/all.dict");
//    tag.setDictionary(rlseg.tempdict);
    String file = "./tmpdata/20120927-微博分词-5000-test-utf-8.txt";
    BufferedReader bfr = new BufferedReader(new InputStreamReader(new FileInputStream(file),"utf8"));
    BufferedWriter bout = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("./tmp/complex.txt"), "UTF-8"));
    BufferedWriter bcqa = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("./tmp/seged.txt"), "UTF-8"));
    String line = null
    int i=0;
    while ((line = bfr.readLine()) != null) {
      System.out.println(i++);

      if(line.length()==0)
        continue;
      String[] toks = seg.tag2Array(line);
     
      for(int j=0;j<toks.length;j++){
        bcqa.write(toks[j]);
        if(j<toks.length-1)
        bcqa.write(" ");
      }
      bcqa.write("\n");
      bcqa.write("\n");
      int oov = rlseg.update(toks);
//      int oov = rlseg.calcOOV(toks,2);
      if(oov>3){
//      if(oov>2 || sent.length()>4&&toks.length<sent.length()/2.5){
        for(int j=0;j<toks.length;j++){
          bout.write(toks[j]);
          bout.write(" ");
        }
        bout.write("\n");
        bout.flush();
      }
//
//      tag.setDictionary(rlseg.tempdict);
    }
    bcqa.close();
    bout.close();
    bfr.close();
    System.out.println("Done!");
  }

}
TOP

Related Classes of org.fnlp.nlp.cn.rl.Seg

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.