Package org.ictclas4j.bean

Examples of org.ictclas4j.bean.Atom


      result = new ArrayList<Atom>();
      String[] ss = GFString.atomSplit(str);

      int index = str.indexOf(Utility.SENTENCE_BEGIN);
      if (index == 0) {
        Atom atom = new Atom();
        atom.setWord(Utility.SENTENCE_BEGIN);
        atom.setLen(Utility.SENTENCE_BEGIN.length());
        atom.setPos(Utility.CT_SENTENCE_BEGIN);
        result.add(atom);
        index += Utility.SENTENCE_BEGIN.length();
      }

      if (index == -1)
        index = 0;
      for (int i = index; i < ss.length; i++) {
        if (Utility.SENTENCE_END.equals(str.substring(i))) {
          Atom atom = new Atom();
          atom.setWord(Utility.SENTENCE_END);
          atom.setLen(Utility.SENTENCE_END.length());
          atom.setPos(Utility.CT_SENTENCE_END);
          result.add(atom);
          break;
        }

        String s = ss[i];
        sAtom += s;
        int curType = Utility.charType(s);
        if (".".equals(s)
            && (i + 1 < ss.length && (Utility.charType(ss[i + 1]) == Utility.CT_NUM || GFString
                .isNumeric(ss[i+1]))))
          curType = Utility.CT_NUM;

        // If it is a Chinese character, a delimiter, etc., it forms an atom on its own
        if (curType == Utility.CT_CHINESE || curType == Utility.CT_INDEX || curType == Utility.CT_DELIMITER
            || curType == Utility.CT_OTHER) {

          Atom atom = new Atom();
          atom.setWord(s);
          atom.setLen(s.length());
          atom.setPos(curType);
          result.add(atom);
          sAtom = "";
        }
        // If it is a digit, letter or single-byte symbol, adjacent characters of this kind are merged into one atom. For example, in the phrase "Samsung SHX-123 model phone", SHX-123 is a single atom
        else {
          int nextType = 255;// type of the next character
          if (i < ss.length - 1)
            nextType = Utility.charType(ss[i + 1]);
          if (nextType != curType || i == ss.length - 1) {
            Atom atom = new Atom();
            atom.setWord(sAtom);
            atom.setLen(sAtom.length());
            atom.setPos(curType);
            result.add(atom);
            sAtom = "";
          }
        }
      }
View Full Code Here


   * @return
   */
  public static SegGraph generate(ArrayList<Atom> atoms, DictLib dictLib) {
    SegGraph segGraph = null;
    SegNode sn = null;
    Atom atom = null;

    if (atoms != null && atoms.size() > 0 && dictLib != null) {
      segGraph = new SegGraph();
      Dictionary dict = dictLib.getCoreDict();

      // First identify the words made of non-Chinese characters
      for (int i = 0; i < atoms.size(); i++) {
        atom = atoms.get(i);
        String word = atom.getWord();
        if (atom.getPos() == Utility.CT_CHINESE)
          sn = new SegNode(i, i + 1, 0, 0, atom.getWord());
        else {
          int pos = 0;
          double value = Utility.MAX_FREQUENCE;

          switch (atom.getPos()) {
          case Utility.CT_INDEX:
          case Utility.CT_NUM:
            pos = -POSTag.NUM;// 'm'*256
            word = Utility.UNKNOWN_NUM;
            value = 0;
            break;
          case Utility.CT_DELIMITER:
            pos = POSTag.PUNC;// 'w'*256;
            break;
          case Utility.CT_LETTER:
            pos = -POSTag.NOUN_LETTER;//
            value = 0;
            word = Utility.UNKNOWN_LETTER;
            break;
          case Utility.CT_SINGLE:// 12021-2129-3121
            if (Utility.getCharCount("+-1234567890", atom.getWord()) == atom.getLen()) {
              pos = -POSTag.NUM;// 'm'*256
              word = Utility.UNKNOWN_NUM;
            } else {
              pos = -POSTag.NOUN_LETTER;//
              word = Utility.UNKNOWN_LETTER;
            }
            value = 0;
            break;
          default:
            pos = atom.getPos();// '?'*256;
            break;
          }

          int gbkID = dictLib.getGBKID(word);
          sn = new SegNode(i, i + 1, pos, value, word);
          sn.setGbkID(gbkID);
        }

        sn.setSrcWord(atom.getWord());
        segGraph.insert(sn, true);
      }

      StringBuffer words = new StringBuffer();
      for (int i = 0; i < atoms.size(); i++) {
        int j = i + 1;
        words.delete(0, words.length());
        words.append(atoms.get(i).getWord());

        // If the two atoms form the word for "month" (yue fen), do not split them
        boolean flag = false;
        if (j < atoms.size()) {
          Atom a2 = atoms.get(j);
          if ("��".equals(words.toString()) && "��".equals(a2.getWord())) {
            segGraph.delete(i, j);
            segGraph.delete(i + 1, j + 1);
            words.append(a2.getWord());
            flag = true;
            j++;
          }
        }

View Full Code Here

   * @return
   */
  public static SegGraph generate(ArrayList<Atom> atoms,Dictionary dict) {
    SegGraph segGraph = null;
    SegNode sn = null;
    Atom atom = null;

    if (atoms != null && atoms.size() > 0 && dict != null) {
      segGraph = new SegGraph();
      for (int i = 0; i < atoms.size(); i++) {
        atom = atoms.get(i);
        String word = atom.getWord();
        if (atom.getPos() == Utility.CT_CHINESE)
          sn = new SegNode(i, i + 1, 0,0, atom.getWord());
        else {
          double value = Utility.MAX_FREQUENCE;
          int pos = 0;

          switch (atom.getPos()) {
          case Utility.CT_INDEX:
          case Utility.CT_NUM:
            pos = -POSTag.NUM;// 'm'*256
            word = Utility.UNKNOWN_NUM;
            value = 0;
            break;
          case Utility.CT_DELIMITER:
            pos = POSTag.PUNC;// 'w'*256;
            break;
          case Utility.CT_LETTER:
            pos = -POSTag.NOUN_LETTER;//
            value = 0;
            word = Utility.UNKNOWN_LETTER;
            break;
          case Utility.CT_SINGLE:// 12021-2129-3121
            if (Utility.getCharCount("+-1234567890", atom.getWord()) == atom.getLen()) {
              pos = -POSTag.NUM;// 'm'*256
              word = Utility.UNKNOWN_NUM;
            } else {
              pos = -POSTag.NOUN_LETTER;//
              word = Utility.UNKNOWN_LETTER;
            }
            value = 0;
            break;
          default:
            pos = atom.getPos();// '?'*256;
            break;
          }

          sn = new SegNode(i, i + 1,pos, value , word);
        }

        sn.setSrcWord(atom.getWord());
        segGraph.insert(sn, true);
      }

      String word = null;
      for (int i = 0; i < atoms.size(); i++) {
        int j = i + 1;
        word = atoms.get(i).getWord();
        // If the two atoms form the word for "month" (yue fen), do not split them
        boolean flag = false;
        if (j < atoms.size()) {
          Atom a2 = atoms.get(j);
          if ("��".equals(word) && "��".equals(a2.getWord())) {
            segGraph.delete(i, j);
            segGraph.delete(i + 1, j + 1);
            word += a2.getWord();
            flag = true;
            j++;
          }
        }

View Full Code Here

      result = new ArrayList<Atom>();
      String[] ss = GFString.atomSplit(str);

      int index = str.indexOf(Utility.SENTENCE_BEGIN);
      if (index == 0) {
        Atom atom = new Atom();
        atom.setWord(Utility.SENTENCE_BEGIN);
        atom.setLen(Utility.SENTENCE_BEGIN.length());
        atom.setPos(Utility.CT_SENTENCE_BEGIN);
        result.add(atom);
        index += Utility.SENTENCE_BEGIN.length();
      }

      if (index == -1)
        index = 0;
      for (int i = index; i < ss.length; i++) {
        if (Utility.SENTENCE_END.equals(str.substring(i))) {
          Atom atom = new Atom();
          atom.setWord(Utility.SENTENCE_END);
          atom.setLen(Utility.SENTENCE_END.length());
          atom.setPos(Utility.CT_SENTENCE_END);
          result.add(atom);
          break;
        }

        String s = ss[i];
        sAtom += s;
        int curType = Utility.charType(s);
        if (".".equals(s)
            && (i + 1 < ss.length && (Utility.charType(ss[i + 1]) == Utility.CT_NUM || GFString
                .isNumeric(ss[i+1]))))
          curType = Utility.CT_NUM;

        // If it is a Chinese character, a delimiter, etc., it forms an atom on its own
        if (curType == Utility.CT_CHINESE || curType == Utility.CT_INDEX || curType == Utility.CT_DELIMITER
            || curType == Utility.CT_OTHER) {

          Atom atom = new Atom();
          atom.setWord(s);
          atom.setLen(s.length());
          atom.setPos(curType);
          result.add(atom);
          sAtom = "";
        }
        // If it is a digit, letter or single-byte symbol, adjacent characters of this kind are merged into one atom. For example, in the phrase "Samsung SHX-123 model phone", SHX-123 is a single atom
        else {
          int nextType = 255;// type of the next character
          if (i < ss.length - 1)
            nextType = Utility.charType(ss[i + 1]);
          if (nextType != curType || i == ss.length - 1) {
            Atom atom = new Atom();
            atom.setWord(sAtom);
            atom.setLen(sAtom.length());
            atom.setPos(curType);
            result.add(atom);
            sAtom = "";
          }
        }
      }
View Full Code Here

TOP

Related Classes of org.ictclas4j.bean.Atom

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.