Package org.ictclas4j.segment

Source Code of org.ictclas4j.segment.GraphGenerate

package org.ictclas4j.segment;

import java.util.ArrayList;

import org.ictclas4j.bean.Atom;
import org.ictclas4j.bean.DictLib;
import org.ictclas4j.bean.Dictionary;
import org.ictclas4j.bean.POSTag;
import org.ictclas4j.bean.Pos;
import org.ictclas4j.bean.SegAtom;
import org.ictclas4j.bean.SegNode;
import org.ictclas4j.util.Utility;


public class GraphGenerate {

  /**
   * ȫ�з�,�����з�ͼ.���ҳ����п��ܵĴ���
   *
   * @param atoms
   * @return
   */
  public static SegGraph generate(ArrayList<Atom> atoms, DictLib dictLib) {
    SegGraph segGraph = null;
    SegNode sn = null;
    Atom atom = null;

    if (atoms != null && atoms.size() > 0 && dictLib != null) {
      segGraph = new SegGraph();
      Dictionary dict = dictLib.getCoreDict();

      // �ȰѷǺ����ַ��Ĵ���ʶ�����
      for (int i = 0; i < atoms.size(); i++) {
        atom = atoms.get(i);
        String word = atom.getWord();
        if (atom.getPos() == Utility.CT_CHINESE)
          sn = new SegNode(i, i + 1, 0, 0, atom.getWord());
        else {
          int pos = 0;
          double value = Utility.MAX_FREQUENCE;

          switch (atom.getPos()) {
          case Utility.CT_INDEX:
          case Utility.CT_NUM:
            pos = -POSTag.NUM;// 'm'*256
            word = Utility.UNKNOWN_NUM;
            value = 0;
            break;
          case Utility.CT_DELIMITER:
            pos = POSTag.PUNC;// 'w'*256;
            break;
          case Utility.CT_LETTER:
            pos = -POSTag.NOUN_LETTER;//
            value = 0;
            word = Utility.UNKNOWN_LETTER;
            break;
          case Utility.CT_SINGLE:// 12021-2129-3121
            if (Utility.getCharCount("+-1234567890", atom.getWord()) == atom.getLen()) {
              pos = -POSTag.NUM;// 'm'*256
              word = Utility.UNKNOWN_NUM;
            } else {
              pos = -POSTag.NOUN_LETTER;//
              word = Utility.UNKNOWN_LETTER;
            }
            value = 0;
            break;
          default:
            pos = atom.getPos();// '?'*256;
            break;
          }

          int gbkID = dictLib.getGBKID(word);
          sn = new SegNode(i, i + 1, pos, value, word);
          sn.setGbkID(gbkID);
        }

        sn.setSrcWord(atom.getWord());
        segGraph.insert(sn, true);
      }

      StringBuffer words = new StringBuffer();
      for (int i = 0; i < atoms.size(); i++) {
        int j = i + 1;
        words.delete(0, words.length());
        words.append(atoms.get(i).getWord());

        // ����ǡ��·ݡ�����Ҫ�ָ�
        boolean flag = false;
        if (j < atoms.size()) {
          Atom a2 = atoms.get(j);
          if ("��".equals(words.toString()) && "��".equals(a2.getWord())) {
            segGraph.delete(i, j);
            segGraph.delete(i + 1, j + 1);
            words.append(a2.getWord());
            flag = true;
            j++;
          }
        }

        SegAtom sa = null;
        String word = words.toString();
        int gbkID = dictLib.getGBKID(word);
        int wordMaxLen = dict.getWordMaxLen(word, gbkID);
        for (; j <= atoms.size() && word.length() < wordMaxLen; j++) {
          word = words.toString();
          sa = dict.getSegAtom(word, gbkID);
          if (sa != null) {
            // 1���ڣ�1999��ĩ
            // if (word.length() == 2 && segGraph.getSize() > 0) {
            // SegNode g2 = segGraph.getLast();
            // if (Utility.isAllNum(g2.getWord()) ||
            // Utility.isAllChinese(g2.getWord())
            // && (g2.getWord().indexOf("��") == 0 ||
            // g2.getWord().indexOf("��") == 0)) {
            // if ("ĩ���е�ǰ���".indexOf(words.substring(1)) != -1)
            // break;
            // }
            // }
            // ֻ��һ���Դʣ�������
            SegNode sg = null;
            if (sa.getPosCount() == 1) {
              Pos pos = sa.getPos(0);
              sg = new SegNode(i, j, pos.getTag(), sa.getTotalFreq(), word);
            } else
              sg = new SegNode(i, j, 0, sa.getTotalFreq(), word);
            sg.setSrcWord(word);
            sg.setGbkID(gbkID);
            segGraph.insert(sg, true);
          }

          if (j < atoms.size()) {
            String word2 = atoms.get(j).getWord();
            words.append(word2);
          }
        }
        if (flag)
          i++;
      }

    }
    return segGraph;
  }

  /**
   * ���ɶ���ͼ��,ÿ���ڵ��ʾ���������������Ϲ�ϵ,��:˵@��ȷ
   *
   * @param sgs
   */
  public static SegGraph biGenerate(SegGraph seg, DictLib dictLib) {
    double curFreq;
    SegGraph segGraph = null;
    final double smoothParam = 0.1;
    if (dictLib == null)
      return null;
    Dictionary dict = dictLib.getCoreDict();
    Dictionary biDict = dictLib.getBigramDict();

    if (seg != null && dict != null && biDict != null) {
      segGraph = new SegGraph();
      ArrayList<SegNode> sgs = seg.getSnList();

      for (int i = 0; sgs != null && i < sgs.size(); i++) {
        SegNode sg = sgs.get(i);
        if (sg.getPos() >= 0)
          curFreq = sg.getWeight();
        else {
          int gbkID = sg.getGbkID();// dictLib.getGBKID(sg.getWord());
          curFreq = dict.getFreq(sg.getWord(), 2, gbkID);
        }

        // �õ�������ֵ�͸���ֵ��ȵ�����Ԫ��
        ArrayList<SegNode> nextSgs = seg.getNextElements(i);
        for (SegNode graph : nextSgs) {
          String twoWords = sg.getWord();
          twoWords += Utility.WORD_SEGMENTER;
          twoWords += graph.getWord();
          int gbkID = sg.getGbkID();// dictLib.getGBKID(twoWords);

          // ��������������֮���ƽ��ֵ
          // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
          int twoFreq = biDict.getFreq(twoWords, 3, gbkID);
          double temp = (double) 1 / Utility.MAX_FREQUENCE;
          double value = smoothParam * (1 + curFreq) / (Utility.MAX_FREQUENCE + 80000);
          value += (1 - smoothParam) * ((1 - temp) * twoFreq / (1 + curFreq) + temp);
          value = -Math.log(value);

          if (value < 0) {
            value += sg.getFreq();
          }

          SegNode sg2 = new SegNode();
          // �ָ���@ǰ�Ĵ��������е�λ��
          int wordIndex = getWordIndex(sgs, sg);
          sg2.setRow(wordIndex);

          // �ָ���@��Ĵ��������е�λ��
          wordIndex = getWordIndex(sgs, graph);
          sg2.setCol(wordIndex);
          sg2.setWord(twoWords);
          sg2.setPos(sg.getPos());
          sg2.setWeight(value);
          sg2.setGbkID(gbkID);
          segGraph.insert(sg2, false);
        }
      }
    }
    return segGraph;
  }

  private static int getWordIndex(ArrayList<SegNode> sgs, SegNode graph) {
    if (sgs != null && graph != null) {
      for (int i = 0; i < sgs.size(); i++) {
        if (sgs.get(i) == graph)
          return i;
      }
    }

    return -1;
  }

}
TOP

Related Classes of org.ictclas4j.segment.GraphGenerate

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.