Package org.fnlp.train.corpus

Source Code of org.fnlp.train.corpus.CoNLL2FNLP

/**
*  This file is part of FNLP (formerly FudanNLP).
*  FNLP is free software: you can redistribute it and/or modify
*  it under the terms of the GNU Lesser General Public License as published by
*  the Free Software Foundation, either version 3 of the License, or
*  (at your option) any later version.
*  FNLP is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*  You should have received a copy of the GNU General Public License
*  along with FudanNLP.  If not, see <http://www.gnu.org/licenses/>.
*  Copyright 2009-2014 www.fnlp.org. All rights reserved.
*/

package org.fnlp.train.corpus;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

import org.fnlp.nlp.cn.ChineseTrans;
import org.fnlp.nlp.corpus.fnlp.FNLPCorpus;
import org.fnlp.nlp.corpus.fnlp.FNLPDoc;
import org.fnlp.nlp.corpus.fnlp.FNLPSent;
import org.fnlp.util.MyCollection;
import org.fnlp.util.MyFiles;

/**
* 将CONLL格式转为FNLP格式
* @author xpqiu
*
*/
public class CoNLL2FNLP{

  private static boolean HASID = false;
  private static HashMap<String, String> posdict;
  private static HashMap<String, String> reldict;
  private static HashMap<String, String> NRdict;
  List<File> files;
  Charset charset;
  FNLPCorpus corpus;
  ChineseTrans ct = new ChineseTrans();
  private TagCorrect tc;

  public CoNLL2FNLP(String path) throws IOException {   
    this(path, "UTF8",null);
  }

  public CoNLL2FNLP(String path, String charsetName, String suffix) throws IOException {
    files = MyFiles.getAllFiles(path, suffix);
    charset = Charset.forName(charsetName);
    tc = new TagCorrect();
  }

  public void read() throws IOException {
    corpus = new FNLPCorpus();
    List<String> carrier = new ArrayList<String>();
    Iterator<File> it = files.iterator();
    while(it.hasNext()){
      BufferedReader bfr =null;
      File file = it.next();
      try {
       
        FileInputStream in = new FileInputStream(file);
        bfr = new BufferedReader(new InputStreamReader(in,charset));
      } catch (FileNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
      FNLPDoc docs = new FNLPDoc();
      docs.name = file.getName();
      String line = null;
      carrier.clear();
      while ((line = bfr.readLine()) != null) {
       
        line = line.trim();
       
        if (line.matches("^$")){
          if(carrier.size()>0){
            FNLPSent sent = new FNLPSent();           
            sent.parse(carrier,1,HASID); //TODO: 需要根据不同语料修改
            //归一化
            for(int i=0;i<sent.words.length;i++){
              sent.words[i] = ct.normalize(sent.words[i]);
            }
            correct(sent);
            docs.add(sent);
            carrier.clear();
          }
        }else
          carrier.add(line);
      }
      if(!carrier.isEmpty()){
        FNLPSent sent = new FNLPSent();
       
        sent.parse(carrier,1,HASID); //TODO: 需要根据不同语料修改
        correct(sent);
        docs.add(sent);
        carrier.clear();
      }
      corpus.add(docs);
    }
  }

  public void correct(FNLPSent sent) {
   
    for(int i=0;i<sent.tags.length;i++){
//      if(sent.words[i].equals("觉得"))
//        System.out.print("");
      String newtag = posdict.get(sent.tags[i]);
      if(newtag!=null)
        sent.tags[i] = newtag;
      String pos = NRdict.get(sent.words[i]);
      if(pos!=null){
        if(sent.tags[i].equals("实体名")){
          sent.tags[i] = pos;
        }
      }
      String rel = reldict.get(sent.relations[i]);
     
      if(rel!=null){
        sent.relations[i] = rel;
      }
      tc.checkPronoun(sent.words, sent.tags, i);
    }
   
  }

  public static void main(String[] args) throws IOException{
     posdict = MyCollection.loadStringStringMap("../data/map/pos-ctb2fnlp.txt");
     reldict = MyCollection.loadStringStringMap("../data/map/rel-ctb2fnlp.txt");
     NRdict = MyCollection.loadStringStringMap("../data/map/pos-nr.txt");
    CoNLL2FNLP reader = new CoNLL2FNLP("../data/ctb/result.txt","utf-8",".txt");
    HASID = true;
    reader.read();   
    reader.corpus.writeOne("../data/FNLPDATA/ctb7.dat");
    reader.corpus.count("../data/FNLPDATA/count", false);
    MyCollection.write(reader.tc.pronount,"../data/FNLPDATA/pronount.txt");
    System.out.println("Done!");
  }
 
}
TOP

Related Classes of org.fnlp.train.corpus.CoNLL2FNLP

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.