Package org.wltea.analyzer.core

Source Code of org.wltea.analyzer.core.IKSegmenter

/**
* IK 中文分词  版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*/
package org.wltea.analyzer.core;

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

/**
* IK分词器主类
*
*/
public final class IKSegmenter {
 
  //字符窜reader
  private Reader input;
  //分词器配置项
  private Configuration cfg;
  //分词器上下文
  private AnalyzeContext context;
  //分词处理器列表
  private List<ISegmenter> segmenters;
  //分词歧义裁决器
  private IKArbitrator arbitrator;
    private  boolean useSmart = false;
 

  /**
   * IK分词器构造函数
   * @param input
     */
  public IKSegmenter(Reader input , Settings settings, Environment environment){
    this.input = input;
    this.cfg = new Configuration(environment);
        this.useSmart = settings.get("use_smart", "false").equals("true");
        this.init();
  }
 
  public IKSegmenter(Reader input){
    new IKSegmenter(input, null,null);
  }
 
//  /**
//   * IK分词器构造函数
//   * @param input
//   * @param cfg 使用自定义的Configuration构造分词器
//   *
//   */
//  public IKSegmenter(Reader input , Configuration cfg){
//    this.input = input;
//    this.cfg = cfg;
//    this.init();
//  }
 
  /**
   * 初始化
   */
  private void init(){
    //初始化词典单例
    Dictionary.initial(this.cfg);
    //初始化分词上下文
    this.context = new AnalyzeContext(useSmart);
    //加载子分词器
    this.segmenters = this.loadSegmenters();
    //加载歧义裁决器
    this.arbitrator = new IKArbitrator();
  }
 
  /**
   * 初始化词典,加载子分词器实现
   * @return List<ISegmenter>
   */
  private List<ISegmenter> loadSegmenters(){
    List<ISegmenter> segmenters = new ArrayList<ISegmenter>(4);
    //处理字母的子分词器
    segmenters.add(new LetterSegmenter());
    //处理中文数量词的子分词器
    segmenters.add(new CN_QuantifierSegmenter());
    //处理中文词的子分词器
    segmenters.add(new CJKSegmenter());
    return segmenters;
  }
 
  /**
   * 分词,获取下一个词元
   * @return Lexeme 词元对象
   * @throws java.io.IOException
   */
  public synchronized Lexeme next()throws IOException{
    Lexeme l = null;
    while((l = context.getNextLexeme()) == null ){
      /*
       * 从reader中读取数据,填充buffer
       * 如果reader是分次读入buffer的,那么buffer要  进行移位处理
       * 移位处理上次读入的但未处理的数据
       */
      int available = context.fillBuffer(this.input);
      if(available <= 0){
        //reader已经读完
        context.reset();
        return null;
       
      }else{
        //初始化指针
        context.initCursor();
        do{
              //遍历子分词器
              for(ISegmenter segmenter : segmenters){
                segmenter.analyze(context);
              }
              //字符缓冲区接近读完,需要读入新的字符
              if(context.needRefillBuffer()){
                break;
              }
           //向前移动指针
        }while(context.moveCursor());
        //重置子分词器,为下轮循环进行初始化
        for(ISegmenter segmenter : segmenters){
          segmenter.reset();
        }
      }
      //对分词进行歧义处理
      this.arbitrator.process(context, useSmart);
      //将分词结果输出到结果集,并处理未切分的单个CJK字符
      context.outputToResult();
      //记录本次分词的缓冲区位移
      context.markBufferOffset();     
    }
    return l;
  }

  /**
     * 重置分词器到初始状态
     * @param input
     */
  public synchronized void reset(Reader input) {
    this.input = input;
    context.reset();
    for(ISegmenter segmenter : segmenters){
      segmenter.reset();
    }
  }
}
TOP

Related Classes of org.wltea.analyzer.core.IKSegmenter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.