Package: org.apache.lucene.analysis.ja.dict

Examples of org.apache.lucene.analysis.ja.dict.Dictionary


          attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
        } else {
          attrs = "";
        }

        final Dictionary dict = tok.getDict(posData.backType[idx]);
        final int wordCost = dict.getWordCost(posData.backID[idx]);
        final int bgCost = costs.get(backPosData.lastRightID[posData.backIndex[idx]],
                                     dict.getLeftId(posData.backID[idx]));

        final String surfaceForm = new String(fragment,
                                              posData.backPos[idx] - startPos,
                                              pos - posData.backPos[idx]);
       
View Full Code Here


          rightID = getDict(posData.backType[bestStartIDX]).getRightId(posData.backID[bestStartIDX]);
        }
        final int pathCost = posData.costs[bestStartIDX];
        for(int forwardArcIDX=0;forwardArcIDX<posData.forwardCount;forwardArcIDX++) {
          final Type forwardType = posData.forwardType[forwardArcIDX];
          final Dictionary dict2 = getDict(forwardType);
          final int wordID = posData.forwardID[forwardArcIDX];
          final int toPos = posData.forwardPos[forwardArcIDX];
          final int newCost = pathCost + dict2.getWordCost(wordID) +
            costs.get(rightID, dict2.getLeftId(wordID)) +
            computePenalty(pos, toPos-pos);
          if (VERBOSE) {
            System.out.println("      + " + forwardType + " word " + new String(buffer.get(pos, toPos-pos)) + " toPos=" + toPos + " cost=" + newCost + " penalty=" + computePenalty(pos, toPos-pos) + " toPos.idx=" + positions.get(toPos).count);
          }
          positions.get(toPos).add(newCost,
                                   dict2.getRightId(wordID),
                                   pos,
                                   bestStartIDX,
                                   wordID,
                                   forwardType);
        }
View Full Code Here

          assert discardPunctuation;
        }
        altToken = null;
      }

      final Dictionary dict = getDict(backType);

      if (backType == Type.USER) {

        // Expand the phraseID we recorded into the actual
        // segmentation:
        final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
        int wordID = wordIDAndLength[0];
        int current = 0;
        for(int j=1; j < wordIDAndLength.length; j++) {
          final int len = wordIDAndLength[j];
          //System.out.println("    add user: len=" + len);
          pending.add(new Token(wordID+j-1,
                                fragment,
                                current + offset,
                                len,
                                Type.USER,
                                current + backPos,
                                dict));
          if (VERBOSE) {
            System.out.println("    add USER token=" + pending.get(pending.size()-1));
          }
          current += len;
        }

        // Reverse the tokens we just added, because when we
        // serve them up from incrementToken we serve in
        // reverse:
        Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1),
                                            pending.size()));

        backCount += wordIDAndLength.length-1;
      } else {

        if (extendedMode && backType == Type.UNKNOWN) {
          // In EXTENDED mode we convert unknown word into
          // unigrams:
          int unigramTokenCount = 0;
          for(int i=length-1;i>=0;i--) {
            int charLen = 1;
            if (i > 0 && Character.isLowSurrogate(fragment[offset+i])) {
              i--;
              charLen = 2;
            }
            //System.out.println("    extended tok offset="
            //+ (offset + i));
            if (!discardPunctuation || !isPunctuation(fragment[offset+i])) {
              pending.add(new Token(CharacterDefinition.NGRAM,
                                    fragment,
                                    offset + i,
                                    charLen,
                                    Type.UNKNOWN,
                                    backPos + i,
                                    unkDictionary));
              unigramTokenCount++;
            }
          }
          backCount += unigramTokenCount;
         
        } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
          pending.add(new Token(backID,
                                fragment,
                                offset,
                                length,
                                backType,
                                backPos,
                                dict));
          if (VERBOSE) {
            System.out.println("    add token=" + pending.get(pending.size()-1));
          }
          backCount++;
        } else {
          if (VERBOSE) {
            System.out.println("    skip punctuation token=" + new String(fragment, offset, length));
          }
        }
      }

      lastLeftWordID = dict.getLeftId(backID);
      pos = backPos;
      bestIDX = nextBestIDX;
    }

    lastBackTracePos = endPos;
View Full Code Here

          rightID = getDict(posData.backType[bestStartIDX]).getRightId(posData.backID[bestStartIDX]);
        }
        final int pathCost = posData.costs[bestStartIDX];
        for(int forwardArcIDX=0;forwardArcIDX<posData.forwardCount;forwardArcIDX++) {
          final Type forwardType = posData.forwardType[forwardArcIDX];
          final Dictionary dict2 = getDict(forwardType);
          final int wordID = posData.forwardID[forwardArcIDX];
          final int toPos = posData.forwardPos[forwardArcIDX];
          final int newCost = pathCost + dict2.getWordCost(wordID) +
            costs.get(rightID, dict2.getLeftId(wordID)) +
            computePenalty(pos, toPos-pos);
          if (VERBOSE) {
            System.out.println("      + " + forwardType + " word " + new String(buffer.get(pos, toPos-pos)) + " toPos=" + toPos + " cost=" + newCost + " penalty=" + computePenalty(pos, toPos-pos) + " toPos.idx=" + positions.get(toPos).count);
          }
          positions.get(toPos).add(newCost,
                                   dict2.getRightId(wordID),
                                   pos,
                                   bestStartIDX,
                                   wordID,
                                   forwardType);
        }
View Full Code Here

          assert discardPunctuation;
        }
        altToken = null;
      }

      final Dictionary dict = getDict(backType);

      if (backType == Type.USER) {

        // Expand the phraseID we recorded into the actual
        // segmentation:
        final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
        int wordID = wordIDAndLength[0];
        int current = 0;
        for(int j=1; j < wordIDAndLength.length; j++) {
          final int len = wordIDAndLength[j];
          //System.out.println("    add user: len=" + len);
          pending.add(new Token(wordID+j-1,
                                fragment,
                                current + offset,
                                len,
                                Type.USER,
                                current + backPos,
                                dict));
          if (VERBOSE) {
            System.out.println("    add USER token=" + pending.get(pending.size()-1));
          }
          current += len;
        }

        // Reverse the tokens we just added, because when we
        // serve them up from incrementToken we serve in
        // reverse:
        Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1),
                                            pending.size()));

        backCount += wordIDAndLength.length-1;
      } else {

        if (extendedMode && backType == Type.UNKNOWN) {
          // In EXTENDED mode we convert unknown word into
          // unigrams:
          int unigramTokenCount = 0;
          for(int i=length-1;i>=0;i--) {
            int charLen = 1;
            if (i > 0 && Character.isLowSurrogate(fragment[offset+i])) {
              i--;
              charLen = 2;
            }
            //System.out.println("    extended tok offset="
            //+ (offset + i));
            if (!discardPunctuation || !isPunctuation(fragment[offset+i])) {
              pending.add(new Token(CharacterDefinition.NGRAM,
                                    fragment,
                                    offset + i,
                                    charLen,
                                    Type.UNKNOWN,
                                    backPos + i,
                                    unkDictionary));
              unigramTokenCount++;
            }
          }
          backCount += unigramTokenCount;
         
        } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
          pending.add(new Token(backID,
                                fragment,
                                offset,
                                length,
                                backType,
                                backPos,
                                dict));
          if (VERBOSE) {
            System.out.println("    add token=" + pending.get(pending.size()-1));
          }
          backCount++;
        } else {
          if (VERBOSE) {
            System.out.println("    skip punctuation token=" + new String(fragment, offset, length));
          }
        }
      }

      lastLeftWordID = dict.getLeftId(backID);
      pos = backPos;
      bestIDX = nextBestIDX;
    }

    lastBackTracePos = endPos;
View Full Code Here

          rightID = getDict(posData.backType[bestStartIDX]).getRightId(posData.backID[bestStartIDX]);
        }
        final int pathCost = posData.costs[bestStartIDX];
        for(int forwardArcIDX=0;forwardArcIDX<posData.forwardCount;forwardArcIDX++) {
          final Type forwardType = posData.forwardType[forwardArcIDX];
          final Dictionary dict2 = getDict(forwardType);
          final int wordID = posData.forwardID[forwardArcIDX];
          final int toPos = posData.forwardPos[forwardArcIDX];
          final int newCost = pathCost + dict2.getWordCost(wordID) +
            costs.get(rightID, dict2.getLeftId(wordID)) +
            computePenalty(pos, toPos-pos);
          if (VERBOSE) {
            System.out.println("      + " + forwardType + " word " + new String(buffer.get(pos, toPos-pos)) + " toPos=" + toPos + " cost=" + newCost + " penalty=" + computePenalty(pos, toPos-pos) + " toPos.idx=" + positions.get(toPos).count);
          }
          positions.get(toPos).add(newCost,
                                   dict2.getRightId(wordID),
                                   pos,
                                   bestStartIDX,
                                   wordID,
                                   forwardType);
        }
View Full Code Here

          assert discardPunctuation;
        }
        altToken = null;
      }

      final Dictionary dict = getDict(backType);

      if (backType == Type.USER) {

        // Expand the phraseID we recorded into the actual
        // segmentation:
        final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
        int wordID = wordIDAndLength[0];
        int current = 0;
        for(int j=1; j < wordIDAndLength.length; j++) {
          final int len = wordIDAndLength[j];
          //System.out.println("    add user: len=" + len);
          pending.add(new Token(wordID+j-1,
                                fragment,
                                current + offset,
                                len,
                                Type.USER,
                                current + backPos,
                                dict));
          if (VERBOSE) {
            System.out.println("    add USER token=" + pending.get(pending.size()-1));
          }
          current += len;
        }

        // Reverse the tokens we just added, because when we
        // serve them up from incrementToken we serve in
        // reverse:
        Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1),
                                            pending.size()));

        backCount += wordIDAndLength.length-1;
      } else {

        if (extendedMode && backType == Type.UNKNOWN) {
          // In EXTENDED mode we convert unknown word into
          // unigrams:
          int unigramTokenCount = 0;
          for(int i=length-1;i>=0;i--) {
            int charLen = 1;
            if (i > 0 && Character.isLowSurrogate(fragment[offset+i])) {
              i--;
              charLen = 2;
            }
            //System.out.println("    extended tok offset="
            //+ (offset + i));
            if (!discardPunctuation || !isPunctuation(fragment[offset+i])) {
              pending.add(new Token(CharacterDefinition.NGRAM,
                                    fragment,
                                    offset + i,
                                    charLen,
                                    Type.UNKNOWN,
                                    backPos + i,
                                    unkDictionary));
              unigramTokenCount++;
            }
          }
          backCount += unigramTokenCount;
         
        } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
          pending.add(new Token(backID,
                                fragment,
                                offset,
                                length,
                                backType,
                                backPos,
                                dict));
          if (VERBOSE) {
            System.out.println("    add token=" + pending.get(pending.size()-1));
          }
          backCount++;
        } else {
          if (VERBOSE) {
            System.out.println("    skip punctuation token=" + new String(fragment, offset, length));
          }
        }
      }

      lastLeftWordID = dict.getLeftId(backID);
      pos = backPos;
      bestIDX = nextBestIDX;
    }

    lastBackTracePos = endPos;
View Full Code Here

          rightID = getDict(posData.backType[bestStartIDX]).getRightId(posData.backID[bestStartIDX]);
        }
        final int pathCost = posData.costs[bestStartIDX];
        for(int forwardArcIDX=0;forwardArcIDX<posData.forwardCount;forwardArcIDX++) {
          final Type forwardType = posData.forwardType[forwardArcIDX];
          final Dictionary dict2 = getDict(forwardType);
          final int wordID = posData.forwardID[forwardArcIDX];
          final int toPos = posData.forwardPos[forwardArcIDX];
          final int newCost = pathCost + dict2.getWordCost(wordID) +
            costs.get(rightID, dict2.getLeftId(wordID)) +
            computePenalty(pos, toPos-pos);
          if (VERBOSE) {
            System.out.println("      + " + forwardType + " word " + new String(buffer.get(pos, toPos-pos)) + " toPos=" + toPos + " cost=" + newCost + " penalty=" + computePenalty(pos, toPos-pos) + " toPos.idx=" + positions.get(toPos).count);
          }
          positions.get(toPos).add(newCost,
                                   dict2.getRightId(wordID),
                                   pos,
                                   bestStartIDX,
                                   wordID,
                                   forwardType);
        }
View Full Code Here

          assert discardPunctuation;
        }
        altToken = null;
      }

      final Dictionary dict = getDict(backType);

      if (backType == Type.USER) {

        // Expand the phraseID we recorded into the actual
        // segmentation:
        final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
        int wordID = wordIDAndLength[0];
        int current = 0;
        for(int j=1; j < wordIDAndLength.length; j++) {
          final int len = wordIDAndLength[j];
          //System.out.println("    add user: len=" + len);
          pending.add(new Token(wordID+j-1,
                                fragment,
                                current + offset,
                                len,
                                Type.USER,
                                current + backPos,
                                dict));
          if (VERBOSE) {
            System.out.println("    add USER token=" + pending.get(pending.size()-1));
          }
          current += len;
        }

        // Reverse the tokens we just added, because when we
        // serve them up from incrementToken we serve in
        // reverse:
        Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1),
                                            pending.size()));

        backCount += wordIDAndLength.length-1;
      } else {

        if (extendedMode && backType == Type.UNKNOWN) {
          // In EXTENDED mode we convert unknown word into
          // unigrams:
          int unigramTokenCount = 0;
          for(int i=length-1;i>=0;i--) {
            int charLen = 1;
            if (i > 0 && Character.isLowSurrogate(fragment[offset+i])) {
              i--;
              charLen = 2;
            }
            //System.out.println("    extended tok offset="
            //+ (offset + i));
            if (!discardPunctuation || !isPunctuation(fragment[offset+i])) {
              pending.add(new Token(CharacterDefinition.NGRAM,
                                    fragment,
                                    offset + i,
                                    charLen,
                                    Type.UNKNOWN,
                                    backPos + i,
                                    unkDictionary));
              unigramTokenCount++;
            }
          }
          backCount += unigramTokenCount;
         
        } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
          pending.add(new Token(backID,
                                fragment,
                                offset,
                                length,
                                backType,
                                backPos,
                                dict));
          if (VERBOSE) {
            System.out.println("    add token=" + pending.get(pending.size()-1));
          }
          backCount++;
        } else {
          if (VERBOSE) {
            System.out.println("    skip punctuation token=" + new String(fragment, offset, length));
          }
        }
      }

      lastLeftWordID = dict.getLeftId(backID);
      pos = backPos;
      bestIDX = nextBestIDX;
    }

    lastBackTracePos = endPos;
View Full Code Here

          rightID = getDict(posData.backType[bestStartIDX]).getRightId(posData.backID[bestStartIDX]);
        }
        final int pathCost = posData.costs[bestStartIDX];
        for(int forwardArcIDX=0;forwardArcIDX<posData.forwardCount;forwardArcIDX++) {
          final Type forwardType = posData.forwardType[forwardArcIDX];
          final Dictionary dict2 = getDict(forwardType);
          final int wordID = posData.forwardID[forwardArcIDX];
          final int toPos = posData.forwardPos[forwardArcIDX];
          final int newCost = pathCost + dict2.getWordCost(wordID) +
            costs.get(rightID, dict2.getLeftId(wordID)) +
            computePenalty(pos, toPos-pos);
          if (VERBOSE) {
            System.out.println("      + " + forwardType + " word " + new String(buffer.get(pos, toPos-pos)) + " toPos=" + toPos + " cost=" + newCost + " penalty=" + computePenalty(pos, toPos-pos) + " toPos.idx=" + positions.get(toPos).count);
          }
          positions.get(toPos).add(newCost,
                                   dict2.getRightId(wordID),
                                   pos,
                                   bestStartIDX,
                                   wordID,
                                   forwardType);
        }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.ja.dict.Dictionary

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle, Inc. Contact coftware#gmail.com.