Package net.paoding.analysis.dictionary

Examples of net.paoding.analysis.dictionary.Hit


      // 大概有5639个字有词语,故取0x2fff=x^13>8000>8000*0.75=6000>5639
      vocabularyDictionary = new HashBinaryDictionary(
          getVocabularyWords(), 0x2fff, 0.75f);
      Dictionary noiseWordsDic = getNoiseWordsDictionary();
      for (int i = 0; i < noiseWordsDic.size(); i++) {
        Hit hit = vocabularyDictionary.search(noiseWordsDic.get(i), 0, noiseWordsDic.get(i).length());
        if (hit.isHit()) {
          hit.getWord().setNoiseWord();
        }
      }
      Dictionary noiseCharactorsDic = getNoiseCharactorsDictionary();
      for (int i = 0; i < noiseCharactorsDic.size(); i++) {
        Hit hit = vocabularyDictionary.search(noiseCharactorsDic.get(i), 0, noiseCharactorsDic.get(i).length());
        if (hit.isHit()) {
          hit.getWord().setNoiseCharactor();
        }
      }
     
    }
    return vocabularyDictionary;
View Full Code Here


      chs[curWord.length()] = (char) -1;
      Beef beef = new Beef(chs, 0, curWord.length() + 1);
      final BitSet bs = new BitSet(curWord.length());
      knife.dissect(new Collector(){
        public void collect(String word, int offset, int end) {
          Hit hit = vocabularyDictionary.search(word, 0, word.length());
          if (hit.isHit() && hit.getWord().length() != curWord.length()) {
            for (int j = offset; j < end; j++) {
              bs.set(j, true);
            }
          }
        }
View Full Code Here

    if (endPos == -1) {
      return Hit.UNDEFINED;
    }
    //中文数字还没结束,后面可能还有
    if (endPos == count - 1) {
      return new Hit(Hit.UNCLOSED_INDEX, null, null);
    }
    //只有一个中文数字,不是连续的,不处理
    if (endPos == 0) {
      return Hit.UNDEFINED;
    }
   
    //部分含有中文数字,取这一部分出来
    //trick: 我们这里用index参数传递该部分中文的结束位置
    return new Hit(offset + endPos + 1, new Word(nums.toString()), null);
  }
View Full Code Here

        binOffset = tempEnd;
        continue;
      }

      // 如果当前字符是noise单字,其不参加二元分词
      Hit curSearch = noiseCharactors.search(beef, curSearchOffset, 1);
      if (curSearch.isHit()) {
        binDissect(collector, beef, binOffset, curSearchOffset);
        binOffset = ++curSearchOffset;
        continue;
      }
      curSearchOffset++;
View Full Code Here

      }
      collector.collect(String.valueOf(number1), offset, curTail);
     
      if (units != null) {
        // 后面可能跟了计量单位
        Hit wd = null;
        Hit wd2 = null;
        int i = curTail + 1;
       
        /*
         * Fix issue 48: 查找计量单位引起的高亮越界错误
         */
        while (i <= limit && (wd = units.search(beef, curTail, i - curTail)).isHit()) {
          wd2 = wd;
          i ++;
          if (!wd.isUnclosed()) {
            break;
          }
        }
        i --;
        if (wd2 != null) {
          collector.collect(wd2.getWord().getText(), curTail, i);
          return i;
        }
      }
    }

View Full Code Here

    return curTail;
  }

  protected int skipNoiseWords(Collector collector, Beef beef, int offset,
      int end, int binOffset) {
    Hit word;
    for (int k = offset + 2; k <= end; k++) {
      word = noiseWords.search(beef, offset, k - offset);
      if (word.isHit()) {
        // 二元分词
        if (binOffset > 0 && offset > binOffset) {
          binDissect(collector, beef, binOffset, offset);
          binOffset = -1;
        }
        offset = k;
      }
      if (word.isUndefined() || !word.isUnclosed()) {
        break;
      }
    }
    return offset;
  }
View Full Code Here

    // 记录当前被检视的字符串的长度,它的值恒等于(curSearchEnd - curSearchOffset)
    int curSearchLength;

    // 当前检视的字符串的判断结果
    Hit curSearch = null;

    // 限制要判断的字符串的最大开始位置
    // 这个变量不随着程序的运行而变化
    final int offsetLimit;
    if (point != -1)
      offsetLimit = point;
    else
      offsetLimit = limit;

    // 记录到当前为止所分出的词典词语的最大结束位置
    int maxDicWordEnd = offset;

    // 记录最近的不在词典中的字符串(称为孤立字符串)在beef的位置,-1表示没有这个位置
    int isolatedOffset = -1;

    // 记录到当前为止经由词典所切出词的最大长度。
    // 用于辅助判断是否调用shouldBeWord()方法,以把前后有如引号、书名号之类的,但还没有被切出的字符串当成一个词
    // 详见本方法后面对maxDicWordLength的应用以及shouldBeWord()的实现
    int maxDicWordLength = 0;

    // 第1个循环定位被检视字符串的开始位置
    // 被检视的字符串开始位置的极限是offsetLimit,而非limit
    for (; curSearchOffset < offsetLimit; curSearchOffset++) {

      // 第二个循环定位被检视字符串的结束位置(不包含该位置的字符)
      // 它的起始状态是:被检视的字符串一长度为1,即结束位置为开始位置+1
      curSearchEnd = curSearchOffset + 1;
      curSearchLength = 1;
      for (; curSearchEnd <= limit; curSearchEnd++, curSearchLength++) {

        /*
         * Fix issue 50: 中文数字解析问题
         */       
        //先搜索连续的中文数字
        curSearch = searchNumber(beef, curSearchOffset, curSearchLength);
        if (curSearch.isHit()) {
          if (isolatedOffset >= 0) {
            dissectIsolated(collector, beef, isolatedOffset,
                curSearchOffset);
            isolatedOffset = -1;
          }
         
          // trick: 用index返回中文数字实际结束位置
          int numberSearchEnd = curSearch.getIndex();
          int numberSearchLength = curSearch.getIndex() - curSearchOffset;

          // 1.2)
          // 更新最大结束位置
          if (maxDicWordEnd < numberSearchEnd) {
            maxDicWordEnd = numberSearchEnd;
          }

          // 1.3)
          // 更新词语最大长度变量的值
          if (curSearchOffset == offset
              && maxDicWordLength < numberSearchLength) {
            maxDicWordLength = numberSearchLength;
          }

          Word word = curSearch.getWord();
          if (!word.isNoise()) {
            dissectIsolated(collector, beef, curSearchOffset,
                curSearch.getIndex());
          }
          curSearchOffset = numberSearchEnd - 1;
          break;
        }
        if (curSearch.isUnclosed()) {
          continue;
        }

        // 通过词汇表判断,返回判断结果curSearch
        curSearch = vocabulary.search(beef, curSearchOffset,
            curSearchLength);

        // ---------------分析返回的判断结果--------------------------

        // 1)
        // 从词汇表中找到了该词语...
        if (curSearch.isHit()) {

          // 1.1)
          // 确认孤立字符串的结束位置=curSearchOffset,
          // 并调用子方法分解把从isolatedOffset开始的到curSearchOffset之间的孤立字符串
          // 孤立字符串分解完毕,将孤立字符串开始位置isolatedOffset清空
          if (isolatedOffset >= 0) {
            dissectIsolated(collector, beef, isolatedOffset,
                curSearchOffset);
            isolatedOffset = -1;
          }

          // 1.2)
          // 更新最大结束位置
          if (maxDicWordEnd < curSearchEnd) {
            maxDicWordEnd = curSearchEnd;
          }

          // 1.3)
          // 更新词语最大长度变量的值
          if (curSearchOffset == offset
              && maxDicWordLength < curSearchLength) {
            maxDicWordLength = curSearchLength;
          }

          // 1.2)
          // 通知collector本次找到的词语
          Word word = curSearch.getWord();
          if (!word.isNoise()) {
            collector.collect(word.getText(), curSearchOffset,
                curSearchEnd);
          }
        }

        // 若isolatedFound==true,表示词典没有该词语
        boolean isolatedFound = curSearch.isUndefined();

        // 若isolatedFound==false,则通过Hit的next属性检视词典没有beef的从offset到curWordEnd
        // + 1位置的词
        // 这个判断完全是为了减少一次词典检索而设计的,
        // 如果去掉这个if判断,并不影响程序的正确性(但是会多一次词典检索)
        if (!isolatedFound && !curSearch.isHit()
            && curSearch.getNext() != null) {
          isolatedFound = curSearchEnd >= limit
              || beef.charAt(curSearchEnd) < curSearch.getNext()
                  .charAt(curSearchLength);
        }
        // 2)
        // 词汇表中没有该词语,且没有以该词语开头的词汇...
        // -->将它记录为孤立词语
View Full Code Here

      chs[curWord.length()] = (char) -1;
      Beef beef = new Beef(chs, 0, curWord.length() + 1);
      final BitSet bs = new BitSet(curWord.length());
      knife.dissect(new Collector(){
        public void collect(String word, int offset, int end) {
          Hit hit = vocabularyDictionary.search(word, 0, word.length());
          if (hit.isHit() && hit.getWord().length() != curWord.length()) {
            for (int j = offset; j < end; j++) {
              bs.set(j, true);
            }
          }
        }
View Full Code Here

      // 大概有5639个字有词语,故取0x2fff=x^13>8000>8000*0.75=6000>5639
      vocabularyDictionary = new HashBinaryDictionary(
          getVocabularyWords(), 0x2fff, 0.75f);
      Dictionary noiseWordsDic = getNoiseWordsDictionary();
      for (int i = 0; i < noiseWordsDic.size(); i++) {
        Hit hit = vocabularyDictionary.search(noiseWordsDic.get(i), 0, noiseWordsDic.get(i).length());
        if (hit.isHit()) {
          hit.getWord().setNoiseWord();
        }
      }
      Dictionary noiseCharactorsDic = getNoiseCharactorsDictionary();
      for (int i = 0; i < noiseCharactorsDic.size(); i++) {
        Hit hit = vocabularyDictionary.search(noiseCharactorsDic.get(i), 0, noiseCharactorsDic.get(i).length());
        if (hit.isHit()) {
          hit.getWord().setNoiseCharactor();
        }
      }
     
    }
    return vocabularyDictionary;
View Full Code Here

    curTail = curTail > limit ? curTail : limit;
   
    //
    // 后面可能跟了计量单位
    if (units != null && CharSet.isCjkUnifiedIdeographs(beef.charAt(curTail))) {
      Hit wd = null;
      Hit wd2 = null;
      int i = curTail + 1;
     
      /*
       * Fix issue 48: 查找计量单位引起的高亮越界错误
       */
      while (i <= limit && (wd = units.search(beef, curTail, i - curTail)).isHit()) {
        wd2 = wd;
        i++;
        if (!wd.isUnclosed()) {
          break;
        }
      }
      i --;
      if (wd2 != null) {
        collector.collect(wd2.getWord().getText(), curTail, i);
        return i;
      }
    }
    //
   
View Full Code Here

TOP

Related Classes of net.paoding.analysis.dictionary.Hit

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.