Package: org.apache.stanbol.enhancer.nlp.model

Examples of org.apache.stanbol.enhancer.nlp.model.Token


        // Tokenize the text sentence-by-sentence and register every detected
        // word as a Token on the AnalyzedText 'at'.
        TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
        try {
          tokens.reset();
            while(tokens.incrementToken()){
                // NOTE(review): addAttribute presumably returns the stream's
                // existing OffsetAttribute rather than a fresh instance per
                // iteration -- confirm against the Lucene TokenStream API.
                OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
                Token t = at.addToken(offset.startOffset(), offset.endOffset());
                log.trace("detected {}",t);
            }
        } catch (IOException e) {
            // build a contextual message for the IOException (handling continues
            // beyond this excerpt)
            String message = String.format("IOException while reading from "
                +"CharSequenceReader of AnalyzedText for ContentItem %s",ci.getUri());
View Full Code Here


                continue; //ignore terms without readings
            }
            //Add the LexicalEntry as Token to the Text. NOTE that if a
            //Token with the same start/end positions already exist this
            //Method returns the existing instance
            Token token = at.addToken(term.getFrom(), term.getTo());
            //Now try to get POS annotations for the Token. Remember the first
            //(best) probability seen per LexicalCategory so it can be used
            //below to score the MorphoFeatures parsed from the readings.
            for(Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)){
                if(posAnno.value().isMapped()){
                    for(LexicalCategory cat :posAnno.value().getCategories()){
                        if(!tokenLexCats.containsKey(cat)){ //do not override with lower prob
                            //NOTE(review): assumes getAnnotations(..) returns
                            //values ordered by descending probability -- confirm
                            tokenLexCats.put(cat, posAnno.probability());
                        }
                    }
                }
            }
            for(Reading reading : term.getTermReadings()){
                MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
                //add the readings (MorphoFeatures)
                if(mf != null){
                    //use the POS tags of the morpho analysis and compare it
                    //with existing POS tags.
                    double posProbability = -1; // -1 ... no matching POS category found
                    Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                    for(PosTag mfPos : mf.getPosList()){
                        mfCats.addAll(mfPos.getCategories());
                    }
                    //take the highest probability among the matching categories
                    for(LexicalCategory mfCat : mfCats){
                        Double prob = tokenLexCats.get(mfCat);
                        if(prob != null && posProbability < prob){
                            posProbability = prob;
                        }
                    }
                    //add the morpho features with the posProbability
                    Value<MorphoFeatures> value = Value.value(mf,
                        posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                    token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
                }
            }
        }
    }
View Full Code Here

        // Test fixture: the first sentence ends at the first '.' (inclusive)
        int sentence = text.indexOf('.')+1;
        Sentence sent1 = analysedTextWithData.addSentence(0, sentence);
        expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " +
            "cities such as Paris and people such as Bob Marley.");
       
        Token the = sent1.addToken(0, 3);
        expectedTokens.put(the, "The");
        the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
            new PosTag("PREP",Pos.Preposition), 0.85));
       
        Token stanbol = sent1.addToken(4,11);
        expectedTokens.put(stanbol, "Stanbol");
        stanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
            new PosTag("PN", Pos.ProperNoun),0.95));
        stanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(
            0.5));
       
        //use index to create Tokens
        int enhancerStart = sent1.getSpan().toString().indexOf("enhancer");
        Token enhancer = sent1.addToken(enhancerStart,enhancerStart+"enhancer".length());
        expectedTokens.put(enhancer, "enhancer");
        // two POS annotations on the same token (multiple candidate tags)
        enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
            new PosTag("PN", Pos.ProperNoun),0.95));
        enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
            new PosTag("N", LexicalCategory.Noun),0.87));
        // exercise the various MorphoFeatures fields with dummy tag values
        MorphoFeatures morpho = new MorphoFeatures("enhance");
        morpho.addCase(new CaseTag("test-case-1",Case.Comitative));
        morpho.addCase(new CaseTag("test-case-2",Case.Abessive));
        morpho.addDefinitness(Definitness.Definite);
        morpho.addPerson(Person.First);
        morpho.addPos(new PosTag("PN", Pos.ProperNoun));
        morpho.addGender(new GenderTag("test-gender", Gender.Masculine));
        morpho.addNumber(new NumberTag("test-number", NumberFeature.Plural));
        morpho.addTense(new TenseTag("test-tense", Tense.Present));
        morpho.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
        enhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(morpho));

        //create a chunk covering "Stanbol enhancer" with NER and phrase annotations
        Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
        expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
        stanbolEnhancer.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(
            new NerTag("organization", DBPEDIA_ORGANISATION)));
        stanbolEnhancer.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(
            new PhraseTag("NP", LexicalCategory.Noun),0.98));
View Full Code Here

    assertAnalysedTextEquality(parsedAt);
  }
 
  private static void initCorefAnnotations() {
    Sentence sentence1 = at.addSentence(0, sentenceText1.indexOf(".") + 1);
        Token obama = sentence1.addToken(0, "Obama".length());
       
        Sentence sentence2 = at.addSentence(sentenceText1.indexOf(".") + 2, sentenceText2.indexOf(".") + 1);
        int heStartIdx = sentence2.getSpan().toString().indexOf("He");
        Token he = sentence2.addToken(heStartIdx, heStartIdx + "He".length());
       
        Set<Span> obamaMentions = new HashSet<Span>();
        obamaMentions.add(he);
        obama.addAnnotation(NlpAnnotations.COREF_ANNOTATION,
          Value.value(new CorefFeature(true, obamaMentions)));
       
        Set<Span> heMentions = new HashSet<Span>();
        heMentions.add(obama);
        he.addAnnotation(NlpAnnotations.COREF_ANNOTATION,
          Value.value(new CorefFeature(false, heMentions)));
  }
View Full Code Here

    assertAnalysedTextEquality(parsedAt);
  }
 
  private static void initDepTreeAnnotations() {
    Sentence sentence = at.addSentence(0, text.indexOf(".") + 1);
        Token obama = sentence.addToken(0, "Obama".length());
       
        int visitedStartIdx = sentence.getSpan().toString().indexOf("visited");
        Token visited = sentence.addToken(visitedStartIdx, visitedStartIdx + "visited".length());
       
        int chinaStartIdx = sentence.getSpan().toString().indexOf("China");
        Token china = sentence.addToken(chinaStartIdx, chinaStartIdx + "China".length());
       
        GrammaticalRelationTag nSubjGrammRelTag = new GrammaticalRelationTag(
                "nsubj", GrammaticalRelation.NominalSubject);
        obama.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION,
          Value.value(new DependencyRelation(nSubjGrammRelTag, true, visited)));
       
        GrammaticalRelationTag rootGrammRelTag = new GrammaticalRelationTag(
                "root", GrammaticalRelation.Root);
        GrammaticalRelationTag dobjGrammRelTag = new GrammaticalRelationTag(
                "dobj", GrammaticalRelation.DirectObject);
        visited.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION,
          Value.value(new DependencyRelation(rootGrammRelTag, true, null)));
        visited.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION,
            Value.value(new DependencyRelation(nSubjGrammRelTag, false, obama)));
        visited.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION,
            Value.value(new DependencyRelation(dobjGrammRelTag, false, china)));
       
        china.addAnnotation(NlpAnnotations.DEPENDENCY_ANNOTATION,
            Value.value(new DependencyRelation(dobjGrammRelTag, true, visited)));
  }
View Full Code Here

        final List<SentimentPhrase> sentimentPhrases = new ArrayList<SentimentPhrase>();
        // Walk the spans and sort each Token into the index-keyed helper maps
        // (negations, nouns/pronouns, section borders, verbs, conjunctions)
        // used later when assembling sentiment phrases.
        while(tokenIt.hasNext()){
            Span span = tokenIt.next();
            switch (span.getType()) {
                case Token:
                    Token word = (Token)span;
                    // index this token WILL get if added to sentimentTokens
                    Integer wordIndex = sentimentTokens.size();
                    Value<Double> sentimentAnnotation = span.getAnnotation(SENTIMENT_ANNOTATION);
                    boolean addToList = false;
                    Sentiment sentiment = null;
                    if(sentimentAnnotation != null && sentimentAnnotation.value() != null &&
                            !sentimentAnnotation.value().equals(ZERO)){
                        // only link the sentence when the token lies inside it
                        sentiment = new Sentiment(word, sentimentAnnotation.value(),
                            sentence == null || word.getEnd() > sentence.getEnd() ?
                                    null : sentence);
                        addToList = true;
                    }
                    if(isNegation((Token)span, language)){
                        addToList = true;
                        negations.put(wordIndex, word);
                    } else if(isNoun(word, firstTokenInSentence, language) ||
                            isPronoun(word,language)){
                        addToList = true;
                        nounsAndPronouns.put(wordIndex, word);
                    } else if(isSectionBorder(word, language)){
                        addToList = true;
                        sectionBorders.put(wordIndex, word);
                    } else if(isVerb(word, language)){
                        addToList = true;
                        verbs.put(wordIndex, word);
                    } else if(isCoordinatingConjuction(word,language)){
                        addToList = true;
                        conjuctions.put(wordIndex, word);
                    } else if(isCountable(word, language)){
                        addToList = true;
                    }
                    if(log.isDebugEnabled()){
                        Value<PosTag> pos = word.getAnnotation(NlpAnnotations.POS_ANNOTATION);
                        // NOTE(review): 'pos' may be null for tokens without a
                        // POS annotation -- pos.value() would then NPE inside
                        // this debug log; consider a null guard.
                        log.debug(" [{}] '{}' pos: {}, sentiment {}", new Object[]{
                                addToList ? sentimentTokens.size() : "-",
                                word.getSpan(),pos.value().getCategories(),
                                sentiment == null ? "none" : sentiment.getValue()});
                    }
                    if(addToList){
                        // 'sentiment' may still be null here -- the list keeps
                        // positional placeholders for non-sentiment tokens
                        sentimentTokens.add(sentiment); //add the token
                    }
View Full Code Here

            return; //deactivate test
        }
        //now validate the enhancements:
        //count the tokens carrying at least one SENTIMENT_ANNOTATION
        int sentimentExpressionCnt=0;
        for(Iterator<Token> tokens = at.getTokens(); tokens.hasNext();){
            Token token = tokens.next();
            log.info("Token: {}",token);
            List<Value<Double>> sentimentExpressionsList = token.getAnnotations(NlpAnnotations.SENTIMENT_ANNOTATION);
            if(sentimentExpressionsList!=null && sentimentExpressionsList.size()>0)
              sentimentExpressionCnt++;
        }
      
        Assert.assertTrue("2 sentiment expressions should be recognized in: "+text,sentimentExpressionCnt==2);
View Full Code Here

        //TODO: locking for AnalysedText not yet defined
//        ci.getLock().writeLock().lock();
//        try {
        Iterator<Token> tokens = analysedText.getTokens();
        while(tokens.hasNext()){
            Token token = tokens.next();
            Set<LexicalCategory> cats = null;
            boolean process = false;
            if(!adjectivesOnly){
                process = true;
                Value<PosTag> posTag = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
                if(posTag != null && posTag.probability() == Value.UNKNOWN_PROBABILITY
                        || posTag.probability() >= (minPOSConfidence/2.0)){
                    cats = classifier.getCategories(posTag.value());
                } else { //no POS tags or probability to low
                    cats = Collections.emptySet();
                }
            } else { //check PosTags if we need to lookup this word
                Iterator<Value<PosTag>> posTags = token.getAnnotations(NlpAnnotations.POS_ANNOTATION).iterator();
                boolean ignore = false;
                while(!ignore && !process && posTags.hasNext()) {
                    Value<PosTag> value = posTags.next();
                    PosTag tag = value.value();
                    cats = classifier.getCategories(tag);
                    boolean state = cats.contains(LexicalCategory.Adjective)
                            || cats.contains(LexicalCategory.Noun);
                    ignore = !state && (value.probability() == Value.UNKNOWN_PROBABILITY ||
                            value.probability() >= minPOSConfidence);
                    process = state && (value.probability() == Value.UNKNOWN_PROBABILITY ||
                            value.probability() >= (minPOSConfidence/2.0));
                }
            } //else process all tokens ... no POS tag checking needed
            if(process){
                String word = token.getSpan();
                double sentiment = 0.0;
                if(cats.isEmpty()){
                    sentiment = classifier.classifyWord(null, word);
                } else { //in case of multiple Lexical Cats
                    //we build the average over NOT NULL sentiments for the word
                    int catSentNum = 0;
                    for(LexicalCategory cat : cats){
                        double catSent = classifier.classifyWord(cat, word);
                        if(catSent != 0.0){
                            catSentNum++;
                            sentiment = sentiment + catSent;
                        }
                    }
                    if(catSentNum > 0){
                        sentiment = sentiment / (double) catSentNum;
                    }
                }
                if(sentiment != 0.0){
                    token.addAnnotation(SENTIMENT_ANNOTATION, new Value<Double>(sentiment));
                } //else do not set sentiments with 0.0
            } // else do not process
        }
//        } finally {
//            ci.getLock().writeLock().unlock();
View Full Code Here

        ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
        Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(ci));
       
        //Add some Tokens with POS annotations to test the usage of
        //existing POS annotations by the lemmatizer
        Token verbrachten = at.addToken(de_verbStart,de_verbStart+de_verb.length());
        verbrachten.addAnnotation(POS_ANNOTATION, Value.value(
            new PosTag("V",LexicalCategory.Verb), de_verbProb));
       
        Token schonen = at.addToken(de_adjectiveStart,de_adjectiveStart+de_adjective.length());
        schonen.addAnnotation(POS_ANNOTATION, Value.value(
            new PosTag("ADJ",LexicalCategory.Adjective), de_adjectiveProb));
       
        Token urlaub = at.addToken(de_nounStart,de_nounStart+de_noun.length());
        urlaub.addAnnotation(POS_ANNOTATION, Value.value(
            new PosTag("NC",LexicalCategory.Noun), de_nounProb));
       
        Assert.assertEquals("Can not enhance Test ContentItem",
            EnhancementEngine.ENHANCE_ASYNC,engine.canEnhance(ci));
        //compute the enhancements
        try {
            engine.computeEnhancements(ci);
        } catch (EngineException e) {
            // presumably skips the test when the remote service is not
            // reachable (checkServiceUnavailable rethrows otherwise) -- TODO confirm
            RemoteServiceHelper.checkServiceUnavailable(e);
            return; //deactivate test
        }
        //now validate the enhancements: each prepared token is expected to
        //carry MORPHO_ANNOTATIONs that keep the probability of its POS tag
        boolean foundVerb = false;
        boolean foundAdjective = false;
        boolean foundNoun = false;
        for(Iterator<Token> tokens = at.getTokens(); tokens.hasNext();){
            Token token = tokens.next();
            log.info("Token: {}",token);
            List<Value<MorphoFeatures>> mfs = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
            if(de_verb.equals(token.getSpan())){
                foundVerb = !mfs.isEmpty();
                validateMorphFeatureProbability(mfs,LexicalCategory.Verb,de_verbProb);
            } else if(de_adjective.equals(token.getSpan())){
                foundAdjective = !mfs.isEmpty();
                validateMorphFeatureProbability(mfs,LexicalCategory.Adjective,de_adjectiveProb);
            } else if(de_noun.equals(token.getSpan())){
                foundNoun = !mfs.isEmpty();
                validateMorphFeatureProbability(mfs,LexicalCategory.Noun,de_nounProb);
            }
            for(Value<MorphoFeatures> mf : mfs){
                log.info("  - {}",mf);
View Full Code Here

        }
       
        for(SentimentExpression se : seList){
            //Add the Sentiment Expression as Token to the Text. NOTE that if a Token with the same start/end positions already exist this
            //Method returns the existing instance
            Token token = at.addToken(se.getStartSnippet(),se.getEndSnippet());
            //store the polarity of the expression as SENTIMENT_ANNOTATION
            token.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, new Value<Double>(se.getSentimentPolarityAsDoubleValue()) );
        }
    }
View Full Code Here

TOP

Related Classes of org.apache.stanbol.enhancer.nlp.model.Token

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.