Package org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText

Examples of org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token


    public void process() throws EngineException {
        int debugedIndex = 0;
        while(state.next()) {
            if(log.isDebugEnabled() && (state.getTokenIndex() > debugedIndex || state.getTokenIndex() ==  0)){
                debugedIndex = state.getTokenIndex();
                Token token = state.getToken();
                log.debug(" {} {} (pos:{}|prop:{})",new Object[]{
                    isProcessableToken(token)? '+':'-',
                    token.getText(),token.getPosTags(),token.getPosProbabilities()
                });
            }
            if(isProcessableToken(state.getToken())){
                List<String> searchStrings = new ArrayList<String>(config.getMaxSearchTokens());
                searchStrings.add(state.getToken().getText());
                //get the list of all tokens that can possibly be matched
                int includeTokenIndex = state.getTokenIndex();
                includeTokenIndex++;
                while(searchStrings.size() < config.getMaxSearchTokens() && //more search strings
                        (includeTokenIndex <= (state.getChunk() != null ? //still within
                                state.getChunk().getEnd() : //the chunk
                                    state.getSentence().getTokens().size()-1))){ //or sentence
                    Token included = state.getSentence().getTokens().get(includeTokenIndex);
                    if(log.isDebugEnabled()  && includeTokenIndex > debugedIndex){
                        debugedIndex = includeTokenIndex;
                        log.debug(" {} {} (pos:{}|prop:{})",new Object[]{
                            isProcessableToken(included)? '+':'-',
                            included.getText(),included.getPosTags(),included.getPosProbabilities()
                        });
                    }
                    includeTokenIndex++;
                    if(isProcessableToken(included)){
                        searchStrings.add(included.getText());
                    }
                }
                //search for Entities
                List<Suggestion> suggestions = lookupEntities(searchStrings);
                if(!suggestions.isEmpty()){
View Full Code Here


        boolean search = true;
        int firstFoundIndex = -1;
        int lastFoundIndex = -1;
        int firstFoundLabelIndex = -1;
        int lastfoundLabelIndex = -1;
        Token currentToken;
        String currentTokenText;
        int currentTokenLength;
        int notFound = 0;
        float minTokenMatchFactor = config.getMinTokenMatchFactor();
        //search for matches within the correct order
        for(int currentIndex = state.getTokenIndex();
                currentIndex < state.getSentence().getTokens().size()
                && search ;currentIndex++){
            currentToken = state.getSentence().getTokens().get(currentIndex);
            if(currentToken.hasAplhaNumericChar()){
                currentTokenText = currentToken.getText();
                if(!config.isCaseSensitiveMatching()){
                    currentTokenText = currentTokenText.toLowerCase();
                }
                currentTokenLength = currentTokenText.length();
                boolean isProcessable = isProcessableToken(currentToken);
                boolean found = false;
                float matchFactor = 0f;
                //iteration starts at the next token after the last matched one
                //so it is OK to skip tokens in the label, but not within the text
                for(int i = lastfoundLabelIndex+1;!found && i < labelTokens.length;i ++){
                    String labelTokenText = labelTokens[i];
                    int labelTokenLength = labelTokenText.length();
                    float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
                    float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
                    if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison
                        int matchCount = compareTokens(currentTokenText, labelTokenText);
                        if(matchCount/maxLength >= minTokenMatchFactor){
                            lastfoundLabelIndex = i; //set the last found index to the current position
                            found = true; //set found to true -> stops iteration
                            matchFactor = matchCount/maxLength; //how good is the match
                            //remove matched labels from the set to disable them for
                            //a later random order search
                            labelTokenSet.remove(labelTokenText);
                        }
                    }
                }
                if(!found){
                    //search for a match in the wrong order
                    //currently only exact matches (for testing)
                    if(found = labelTokenSet.remove(currentTokenText)){
                        matchFactor = 0.7f;
                    }
                }
                //int found = text.indexOf(currentToken.getText().toLowerCase());
                if(found){ //found
                    if(isProcessable){
                        foundProcessableTokens++; //only count processable Tokens
                    }
                    foundTokens++;
                    foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
                    if(firstFoundIndex < 0){
                        firstFoundIndex = currentIndex;
                        firstFoundLabelIndex = lastfoundLabelIndex;
                    }
                    lastFoundIndex = currentIndex;
                } else { //not found
                    notFound++;
                    if(isProcessable || notFound > config.getMaxNotFound()){
                        //stop as soon as a token that needs to be processed is
                        //not found in the label or the maximum number of tokens
                        //that are not processable are not found
                        search = false;
                    }
                }
            } // else token without alpha or numeric characters are not processed
        }
        //search backwards for label tokens until firstFoundLabelIndex if there
        //are unconsumed Tokens in the sentence before state.getTokenIndex
        int currentIndex = state.getTokenIndex()-1;
        int labelIndex = firstFoundLabelIndex-1;
        notFound = 0;
        search = true;
        while(search && labelIndex >= 0 && currentIndex > state.getConsumedIndex()){
            String labelTokenText = labelTokens[labelIndex];
            if(labelTokenSet.remove(labelTokenText)){ //still not matched
                currentToken = state.getSentence().getTokens().get(currentIndex);
                boolean isProcessable = isProcessableToken(currentToken);
                currentTokenText = currentToken.getText();
                if(!config.isCaseSensitiveMatching()){
                    currentTokenText = currentTokenText.toLowerCase();
                }
                currentTokenLength = currentTokenText.length();
                boolean found = false;
View Full Code Here

                includeTokenIndex++;
                while(searchStrings.size() < config.getMaxSearchTokens() && //more search strings
                        (includeTokenIndex <= (state.getChunk() != null ? //still within
                                state.getChunk().getEnd() : //the chunk
                                    state.getSentence().getTokens().size()-1))){ //or sentence
                    Token included = state.getSentence().getTokens().get(includeTokenIndex);
                    includeTokenIndex++;
                    if(isProcessableToken(included)){
                        searchStrings.add(included.getText());
                    }
                }
                //search for Entities
                List<Suggestion> suggestions = lookupEntities(searchStrings);
                if(!suggestions.isEmpty()){
View Full Code Here

        boolean search = true;
        int firstFoundIndex = -1;
        int lastFoundIndex = -1;
        int firstFoundLabelIndex = -1;
        int lastfoundLabelIndex = -1;
        Token currentToken;
        String currentTokenText;
        int currentTokenLength;
        int notFound = 0;
        //search for matches within the correct order
        for(int currentIndex = state.getTokenIndex();
                currentIndex < state.getSentence().getTokens().size()
                && search ;currentIndex++){
            currentToken = state.getSentence().getTokens().get(currentIndex);
            if(currentToken.hasAplhaNumericChar()){
                currentTokenText = currentToken.getText();
                if(!config.isCaseSensitiveMatching()){
                    currentTokenText = currentTokenText.toLowerCase();
                }
                currentTokenLength = currentTokenText.length();
                boolean isProcessable = isProcessableToken(currentToken);
                boolean found = false;
                float matchFactor = 0f;
                //iteration starts at the next token after the last matched one
                //so it is OK to skip tokens in the label, but not within the text
                for(int i = lastfoundLabelIndex+1;!found && i < labelTokens.length;i ++){
                    String labelTokenText = labelTokens[i];
                    int labelTokenLength = labelTokenText.length();
                    float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
                    float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
                    if((lengthDif/maxLength)<=0.3f){ //this prevents unnecessary string comparison
                        int matchCount = compairTokens(currentTokenText, labelTokenText);
                        if(matchCount/maxLength >= 0.7f){
                            lastfoundLabelIndex = i; //set the last found index to the current position
                            found = true; //set found to true -> stops iteration
                            matchFactor = matchCount/maxLength; //how good is the match
                            //remove matched labels from the set to disable them for
                            //a later random order search
                            labelTokenSet.remove(labelTokenText);
                        }
                    }
                }
                if(!found){
                    //search for a match in the wrong order
                    //currently only exact matches (for testing)
                    if(found = labelTokenSet.remove(currentTokenText)){
                        matchFactor = 0.7f;
                    }
                }
                //int found = text.indexOf(currentToken.getText().toLowerCase());
                if(found){ //found
                    if(isProcessable){
                        foundProcessableTokens++; //only count processable Tokens
                    }
                    foundTokens++;
                    foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
                    if(firstFoundIndex < 0){
                        firstFoundIndex = currentIndex;
                        firstFoundLabelIndex = lastfoundLabelIndex;
                    }
                    lastFoundIndex = currentIndex;
                } else { //not found
                    notFound++;
                    if(isProcessable || notFound > maxNotFound){
                        //stop as soon as a token that needs to be processed is
                        //not found in the label or the maximum number of tokens
                        //that are not processable are not found
                        search = false;
                    }
                }
            } // else token without alpha or numeric characters are not processed
        }
        //search backwards for label tokens until firstFoundLabelIndex if there
        //are unconsumed Tokens in the sentence before state.getTokenIndex
        int currentIndex = state.getTokenIndex()-1;
        int labelIndex = firstFoundLabelIndex-1;
        notFound = 0;
        search = true;
        while(search && labelIndex >= 0 && currentIndex > state.getConsumedIndex()){
            String labelTokenText = labelTokens[labelIndex];
            if(labelTokenSet.remove(labelTokenText)){ //still not matched
                currentToken = state.getSentence().getTokens().get(currentIndex);
                currentTokenText = currentToken.getText();
                if(!config.isCaseSensitiveMatching()){
                    currentTokenText = currentTokenText.toLowerCase();
                }
                currentTokenLength = currentTokenText.length();
                boolean found = false;
View Full Code Here

TOP

Related Classes of org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.