Source Code of org.fnlp.demo.nlp.ChineseWordSegmentation

/**
*  This file is part of FNLP (formerly FudanNLP).
*  
*  FNLP is free software: you can redistribute it and/or modify
*  it under the terms of the GNU Lesser General Public License as published by
*  the Free Software Foundation, either version 3 of the License, or
*  (at your option) any later version.
*  
*  FNLP is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*  
*  You should have received a copy of the GNU General Public License
*  along with FudanNLP.  If not, see <http://www.gnu.org/licenses/>.
*  
*  Copyright 2009-2014 www.fnlp.org. All rights reserved. 
*/


package org.fnlp.demo.nlp;




import java.util.ArrayList;


import org.fnlp.ml.types.Dictionary;
import org.fnlp.nlp.cn.tag.CWSTagger;




/**
 * 分词使用示例
 * @author xpqiu
 *
 */
public class ChineseWordSegmentation {
  /**
   * 主程序
   * @param args 
   * @throws Exception
   * @throws  
   */
  public static void main(String[] args) throws Exception {
    CWSTagger tag = new CWSTagger("../models/seg.m");
    System.out.println("不使用词典的分词：");
    String str = " 媒体计算研究所成立了, 高级数据挖掘(data mining)很难。 乐phone热卖！";
    String s = tag.tag(str);
    System.out.println(s);
    
    //设置英文预处理
    tag.setEnFilter(true);
    s = tag.tag(str);
    System.out.println(s);
//    tag.setEnFilter(false);
    
    System.out.println("\n设置临时词典：");
    ArrayList<String> al = new ArrayList<String>();
    al.add("数据挖掘");
    al.add("媒体计算研究所");
    al.add("乐phone");
    Dictionary dict = new Dictionary(false);
    dict.addSegDict(al);
    tag.setDictionary(dict);
    s = tag.tag(str);
    System.out.println(s);
    
    
    CWSTagger tag2 = new CWSTagger("../models/seg.m", new Dictionary("../models/dict.txt"));
    System.out.println("\n使用词典的分词：");
    String str2 = "媒体计算研究所成立了, 高级数据挖掘很难。 乐phone热卖！";
    String s2 = tag2.tag(str2);
    System.out.println(s2);
    
    //使用不严格的词典
    CWSTagger tag3 = new CWSTagger("../models/seg.m", new Dictionary("../models/dict_ambiguity.txt",true));
    //尽量满足词典，比如词典中有“成立”“成立了”和“了”, 会使用Viterbi决定更合理的输出
    System.out.println("\n使用不严格的词典的分词：");
    String str3 = "媒体计算研究所成立了, 高级数据挖掘很难";
    String s3 = tag3.tag(str3);
    System.out.println(s3);
    str3 = "我送给力学系的同学一个玩具 (送给给力力学力学系都在词典中)";
    s3 = tag3.tag(str3);
    System.out.println(s3);
    
    System.out.println("\n处理文件：");
    String s4 = tag.tagFile("../example-data/data-tag.txt");
    System.out.println(s4);
    
    String s5 = tag2.tagFile("../example-data/data-tag.txt");
    System.out.println(s5);
    
  }


}
Source Code of org.fnlp.demo.nlp.ChineseWordSegmentation

Related Classes of org.fnlp.demo.nlp.ChineseWordSegmentation