Source Code of com.chenlb.mmseg4j.MMSeg

package com.chenlb.mmseg4j;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.LinkedList;
import java.util.Queue;


/**
 * Reader 流的分词(有字母,数字等), 析出中文(其实是 CJK)成句子 {@link Sentence} 再对 mmseg 算法分词.<br/>
 * 
 * 非线程安全
 * @author chenlb 2009-9-20下午10:41:41
 */
public class MMSeg {
  
  private PushbackReader reader;
  private Seg seg;
  
  private StringBuilder bufSentence = new StringBuilder(256);
  private Sentence currentSentence;
  private Queue<Word> bufWord;  // word 缓存, 因为有 chunk 分析三个以上.
  
  public MMSeg(Reader input, Seg seg) {
    this.seg = seg;
    
    reset(input);
  }


  private int readedIdx = 0;
  
  public void reset(Reader input) {
    this.reader = new PushbackReader(new BufferedReader(input), 20);
    currentSentence = null;
    bufWord = new LinkedList<Word>();
    bufSentence.setLength(0);
    readedIdx = -1;
  }
  
  private int readNext() throws IOException {
    int d = reader.read();
    if(d > -1) {
      readedIdx++;
      d = Character.toLowerCase(d);
    }
    return d;
  }
  
  private void pushBack(int data) throws IOException {
    readedIdx--;
    reader.unread(data);
  }


  
  public Word next() throws IOException {
    //先从缓存中取
    Word word = bufWord.poll();;
    if(word == null) {
      bufSentence.setLength(0);


      int data = -1;
      boolean read = true;
      while(read && (data=readNext()) != -1) {
        read = false;  //默认一次可以读出同一类字符,就可以分词内容
        int type = Character.getType(data);
        String wordType = Word.TYPE_WORD;
        switch(type) {
        case Character.UPPERCASE_LETTER:
        case Character.LOWERCASE_LETTER:
        case Character.TITLECASE_LETTER:
        case Character.MODIFIER_LETTER:
          /*
           * 1. 0x410-0x44f -> А-я  //俄文
           * 2. 0x391-0x3a9 -> Α-Ω  //希腊大写
           * 3. 0x3b1-0x3c9 -> α-ω  //希腊小写
           */
          data = toAscii(data);
          NationLetter nl = getNation(data);
          if(nl == NationLetter.UNKNOW) {
            read = true;
            break;
          }
          wordType = Word.TYPE_LETTER;
          bufSentence.appendCodePoint(data);
          switch(nl) {
          case EN:
            //字母后面的数字,如: VH049PA
            ReadCharByAsciiOrDigit rcad = new ReadCharByAsciiOrDigit();
            readChars(bufSentence, rcad);
            if(rcad.hasDigit()) {
              wordType = Word.TYPE_LETTER_OR_DIGIT;
            }
            //only english
            //readChars(bufSentence, new ReadCharByAscii());
            break;
          case RA:
            readChars(bufSentence, new ReadCharByRussia());
            break;
          case GE:
            readChars(bufSentence, new ReadCharByGreece());
            break;
          }
          bufWord.add(createWord(bufSentence, wordType));


          bufSentence.setLength(0);


          break;
        case Character.OTHER_LETTER:
          /*
           * 1. 0x3041-0x30f6 -> ぁ-ヶ  //日文(平|片)假名
           * 2. 0x3105-0x3129 -> ㄅ-ㄩ  //注意符号
           */
          bufSentence.appendCodePoint(data);
          readChars(bufSentence, new ReadCharByType(Character.OTHER_LETTER));


          currentSentence = createSentence(bufSentence);


          bufSentence.setLength(0);


          break;
        case Character.DECIMAL_DIGIT_NUMBER:
          bufSentence.appendCodePoint(toAscii(data));
          readChars(bufSentence, new ReadCharDigit());  //读后面的数字, AsciiLetterOr
          wordType = Word.TYPE_DIGIT;
          int d = readNext();
          if(d > -1) {
            if(seg.isUnit(d)) {  //单位,如时间
              bufWord.add(createWord(bufSentence, startIdx(bufSentence)-1, Word.TYPE_DIGIT));  //先把数字添加(独立)


              bufSentence.setLength(0);


              bufSentence.appendCodePoint(d);
              wordType = Word.TYPE_WORD;  //单位是 word
            } else {  //后面可能是字母和数字
              pushBack(d);
              if(readChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0) {  //如果有字母或数字都会连在一起.
                wordType = Word.TYPE_DIGIT_OR_LETTER;
              }
            }
          }


          bufWord.add(createWord(bufSentence, wordType));




          bufSentence.setLength(0);  //缓存的字符清除


          break;
        case Character.LETTER_NUMBER:
          // ⅠⅡⅢ 单分
          bufSentence.appendCodePoint(data);
          readChars(bufSentence, new ReadCharByType(Character.LETTER_NUMBER));


          int startIdx = startIdx(bufSentence);
          for(int i=0; i<bufSentence.length(); i++) {
            bufWord.add(new Word(new char[] {bufSentence.charAt(i)}, startIdx++, Word.TYPE_LETTER_NUMBER));
          }


          bufSentence.setLength(0);  //缓存的字符清除


          break;
        case Character.OTHER_NUMBER:
          //①⑩㈠㈩⒈⒑⒒⒛⑴⑽⑾⒇ 连着用
          bufSentence.appendCodePoint(data);
          readChars(bufSentence, new ReadCharByType(Character.OTHER_NUMBER));


          bufWord.add(createWord(bufSentence, Word.TYPE_OTHER_NUMBER));
          bufSentence.setLength(0);
          break;
        default :
          //其它认为无效字符
          read = true;
        }//switch
      }
        
      // 中文分词
      if(currentSentence != null) {
        do {
          Chunk chunk = seg.seg(currentSentence);
          for(int i=0; i<chunk.getCount(); i++) {
            bufWord.add(chunk.getWords()[i]);
          }
        } while (!currentSentence.isFinish());
        
        currentSentence = null;
      }
      
      word = bufWord.poll();
    }
    
    return word;
  }
  
  
  /**
   * 读取下一串指定类型字符.
   * 
   * @author chenlb 2009-8-15下午09:09:50
   */
  private static abstract class ReadChar {
    /**
     * 这个字符是否读取, 不读取也不会读下一个字符.
     */
    abstract boolean isRead(int codePoint);
    int transform(int codePoint) {
      return codePoint;
    }
  }
  
  /**
   * 读取下一串指定类型的字符放到 bufSentence 中.
   * @param bufSentence
   * @param readChar 判断字符的细节.
   * @return 返回读取的个数
   * @throws IOException {@link #readNext()} 或 {@link #pushBack()} 抛出的.
   */
  private int readChars(StringBuilder bufSentence, ReadChar readChar) throws IOException {
    int num = 0;
    int data = -1;
    while((data = readNext()) != -1) {
      int d = readChar.transform(data);
      if(readChar.isRead(d)) {
        bufSentence.appendCodePoint(d);
        num++;
      } else {  //不是数字回压,要下一步操作
        pushBack(data);
        break;
      }
    }
    return num;
  }
  
  /**读取数字*/
  private static class ReadCharDigit extends ReadChar {


    boolean isRead(int codePoint) {
      int type = Character.getType(codePoint);
      return isDigit(type);
    }
    
    int transform(int codePoint) {
      return toAscii(codePoint);
    }
    
  }
  
  /**读取字母或数字*/
  private static class ReadCharByAsciiOrDigit extends ReadCharDigit {


    private boolean hasDigit = false;
    boolean isRead(int codePoint) {
      boolean isRead = super.isRead(codePoint);
      hasDigit |= isRead;
      return isAsciiLetter(codePoint) || isRead;
    }
    boolean hasDigit() {
      return hasDigit;
    }
  }
  
  /**读取字母*/
  @SuppressWarnings("unused")
  private static class ReadCharByAscii extends ReadCharDigit {
    boolean isRead(int codePoint) {
      return isAsciiLetter(codePoint);
    }
  }
  
  /**读取俄语*/
  private static class ReadCharByRussia extends ReadCharDigit {


    boolean isRead(int codePoint) {
      return isRussiaLetter(codePoint);
    }
    
  }
  
  /**读取希腊 */
  private static class ReadCharByGreece extends ReadCharDigit {


    boolean isRead(int codePoint) {
      return isGreeceLetter(codePoint);
    }
    
  }
  
  /**读取指定类型的字符*/
  private static class ReadCharByType extends ReadChar {
    int charType;
    public ReadCharByType(int charType) {
      this.charType = charType;
    }


    boolean isRead(int codePoint) {
      int type = Character.getType(codePoint);
      return type == charType;
    }
    
  }
  
  private Word createWord(StringBuilder bufSentence, String type) {
    return new Word(toChars(bufSentence), startIdx(bufSentence), type);
  }
  
  private Word createWord(StringBuilder bufSentence, int startIdx, String type) {
    return new Word(toChars(bufSentence), startIdx, type);
  }
  
  private Sentence createSentence(StringBuilder bufSentence) {
    return new Sentence(toChars(bufSentence), startIdx(bufSentence));
  }
  
  /**取得 bufSentence 的第一个字符在整个文本中的位置*/
  private int startIdx(StringBuilder bufSentence) {
    return readedIdx - bufSentence.length() + 1;
  }
  
  /**从 StringBuilder 里复制出 char[] */
  private static char[] toChars(StringBuilder bufSentence) {
    char[] chs = new char[bufSentence.length()];
    bufSentence.getChars(0, bufSentence.length(), chs, 0);
    return chs;
  }
  
  /**
   * 双角转单角
   */
  private static int toAscii(int codePoint) {
    if((codePoint>=65296 && codePoint<=65305)  //０-９
        || (codePoint>=65313 && codePoint<=65338)  //Ａ-Ｚ
        || (codePoint>=65345 && codePoint<=65370)  //ａ-ｚ
        ) {  
      codePoint -= 65248;
    }
    return codePoint;
  }
  
  private static boolean isAsciiLetter(int codePoint) {
    return (codePoint >= 'A' && codePoint <= 'Z') || (codePoint >= 'a' && codePoint <= 'z');
  }
  
  private static boolean isRussiaLetter(int codePoint) {
    return (codePoint >= 'А' && codePoint <= 'я') || codePoint=='Ё' || codePoint=='ё';
  }
  
  private static boolean isGreeceLetter(int codePoint) {
    return (codePoint >= 'Α' && codePoint <= 'Ω') || (codePoint >= 'α' && codePoint <= 'ω');
  }
  /**
   * EN -> 英语
   * RA -> 俄语
   * GE -> 希腊
   * 
   */
  private static enum NationLetter {EN, RA, GE, UNKNOW};
  
  private NationLetter getNation(int codePoint) {
    if(isAsciiLetter(codePoint)) {
      return NationLetter.EN;
    }
    if(isRussiaLetter(codePoint)) {
      return NationLetter.RA;
    }
    if(isGreeceLetter(codePoint)) {
      return NationLetter.GE;
    }
    return NationLetter.UNKNOW;
  }
  
  @SuppressWarnings("unused")
  private static boolean isCJK(int type) {
    return type == Character.OTHER_LETTER;
  }
  private static boolean isDigit(int type) {
    return type == Character.DECIMAL_DIGIT_NUMBER;
  }
  @SuppressWarnings("unused")
  private static boolean isLetter(int type) {
    return type <= Character.MODIFIER_LETTER && type >= Character.UPPERCASE_LETTER;
  }
}
Source Code of com.chenlb.mmseg4j.MMSeg

Related Classes of com.chenlb.mmseg4j.MMSeg