Package org.fnlp.train.pos

Source Code of org.fnlp.train.pos.POSPrepare

/**
*  This file is part of FNLP (formerly FudanNLP).
*  FNLP is free software: you can redistribute it and/or modify
*  it under the terms of the GNU Lesser General Public License as published by
*  the Free Software Foundation, either version 3 of the License, or
*  (at your option) any later version.
*  FNLP is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*  You should have received a copy of the GNU General Public License
*  along with FudanNLP.  If not, see <http://www.gnu.org/licenses/>.
*  Copyright 2009-2014 www.fnlp.org. All rights reserved.
*/

package org.fnlp.train.pos;

import java.io.File;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.TreeSet;

import org.fnlp.nlp.corpus.fnlp.FNLPCorpus;
import org.fnlp.util.MyCollection;
import org.fnlp.util.MyFiles;

public class POSPrepare {

  /**
   * @param args
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {
   
    TreeSet<String> allpostSet = new TreeSet<String>();

    String datapath = "../data";

    String ctbfile = datapath + "/FNLPDATA/ctb.pos";
    (new File(ctbfile)).delete();
    String trainfile = datapath + "/FNLPDATA/train.pos";
    (new File(trainfile)).delete();
    String testfile = datapath + "/FNLPDATA/test.pos";
    (new File(testfile)).delete();
    String dictfile = datapath + "/FNLPDATA/dict.pos";
    (new File(dictfile)).delete();

    FNLPCorpus corpus = new FNLPCorpus();
    //读FNLP数据
    corpus.read(datapath + "/FNLPDATA/ctb7.dat", null);
    corpus.read(datapath + "/FNLPDATA/WeiboFTB(v1.0)-train.dat", null);

    //读分词+词性文件
    corpus.readPOS(datapath + "/FNLPDATA/pos",".txt","UTF8")
    //读自有数据
    corpus.readOurCorpus(datapath + "/ourdata",".txt","UTF8");

    FNLP2POS.trans(corpus,ctbfile);

    allpostSet.addAll(corpus.getAllPOS());

   


    //读字典
    DictPOS dp = new DictPOS();
    String out = datapath + "/FNLPDATA/dict.pos";
//    dp.loadPath(datapath + "/FNLPDATA/词性字典",".txt");
//    dp.loadPath(datapath + "/FNLPDATA/dict-sogou-input/txt", ".txt");
    dp.save(out);
   
    allpostSet.addAll(dp.getPosSet());


    //合并

    List<File> files = MyFiles.getAllFiles(datapath + "/FNLPDATA/", ".pos");

    MyFiles.combine(trainfile,files.toArray(new File[files.size()]))


    //处理测试数据
    FNLPCorpus corpust = new FNLPCorpus();
    //读自有数据
    corpust.read(datapath + "/FNLPDATA/WeiboFTB(v1.0)-test.dat", null)
   
    allpostSet.addAll(corpust.getAllPOS());
   
   
    System.out.println(allpostSet);

    FNLP2POS.trans(corpust,testfile);
   
   
    //check
   
    String c2ePath = datapath + "/map/pos-fnlp2e.txt";
    HashMap<String, String> e2c = MyCollection.loadStringStringMap(c2ePath);   
   
    for(String pos: allpostSet){
      if(!e2c.containsKey(pos)){
        System.out.print("not pos:");
        System.out.println(pos);      }
    }
   
    String allposset = datapath + "/FNLPDATA/allpos";
    MyCollection.write(allpostSet, allposset);


    System.out.println(new Date().toString());
    System.out.println("Done!");



  }

}
TOP

Related Classes of org.fnlp.train.pos.POSPrepare

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.