Package opennlp.ccg.parse.tagger

Examples of opennlp.ccg.parse.tagger.TaggedWord


                int cursor = -1;
                while(++cursor < taggedSent.size()) {
                    Word wdIn = inLine.get(cursor);
                    // word form...
                    out.write(wdIn.getForm());
                    TaggedWord posT = posTagging.get(cursor);
                    // print out number of POS tags, followed by tab-separated probabilized POS tagging.
                    out.write("\t" + posT.getPOSTagging().size());
                    for(Pair<Double,String> pt : posT.getPOSTagging()) {
                        out.write("\t" + pt.b + "\t" + pt.a);
                    }
                    // now print out number of and list of tab-separated, probabilized supertags.
                    out.write("\t" + taggedSent.get(cursor).size());
                    for(Pair<Double,String> stg : taggedSent.get(cursor)) {
View Full Code Here


            List<Pair<Double, String>> tmpTagging = new ArrayList<Pair<Double, String>>(tw.getPOSTagging().size());
            for (Pair<Double, String> tg : tw.getPOSTagging()) {
                tmpTagging.add(tg);
            }
            tmpTagging.subList(endIndex, tmpTagging.size()).clear();
            TaggedWord twTmp = new TaggedWord(w);
            twTmp.setPOSTagging(tmpTagging);
            res.add(twTmp);
        }
        return res;
    }
View Full Code Here

        List<List<Pair<Double,String>>> taggings = new ArrayList<List<Pair<Double,String>>>(sentence.size());
       
        Map<Integer, TaggedWord> sentMap = new HashMap<Integer, TaggedWord>(sentence.size());
        int ind = 0;
        for(Word w : sentence) {       
            sentMap.put(ind++, new TaggedWord(w));
        }
        List<Collection<Pair<String,Double>>> ftss = posFex.getSentenceFeatures(sentMap);
       
        double[] distro = null;       
       
        int wordIndex = 0;
        for(Collection<Pair<String,Double>> fts : ftss) {
           
            distro = tagMod.eval(fts);
            List<Pair<Double,Integer>> distroList = new ArrayList<Pair<Double,Integer>>(distro.length);
            ind = 0; for(double prob : distro) { distroList.add(new Pair<Double,Integer>(prob, ind++)); }           
            Collections.sort(distroList, comp);                       
            // widen beta a little bit (we're going to do some fwd-bwd rescoring inp a minute, but we don't
            // want to do the fwd-bwd alg over ALL possible tags -- too inefficient).
            List<Pair<Double,String>> tagging = new ArrayList<Pair<Double,String>>(distro.length);
            double best = distroList.get(0).a;
            double widenedBeta = beta/8;           
           
            String goldPOS = sentence.get(wordIndex).getPOS();
           
            for(Pair<Double,Integer> outcome : distroList) {
                if( (outcome.a >= (widenedBeta * best)) || (includeGold && tagMod.getOutcome(outcome.b).equals(goldPOS)) ) {
                   tagging.add(new Pair<Double,String>(outcome.a, tagMod.getOutcome(outcome.b)));
                } else {
                    if(!includeGold) {  // if not still potentially fishing for a gold POS tag, then break (they're in sorted order).
                        break;
                    }
                }
            }
            taggings.add(tagging);
            wordIndex++;
        }
        // rescore using forward-backward.
        taggings = posSeqMod.rescoreSequence(taggings);       
        // add these rescored taggings to the list of TaggedWord's.
        int wInd = 0;
        for(List<Pair<Double,String>> tagging : taggings) {
            TaggedWord tmpWd = new TaggedWord(sentence.get(wInd++));
            tmpWd.setPOSTagging(tagging);           
            result.add(tmpWd);
        }
        // now filter down to the beta-best.
        return betaBestFilter(result);
    }
View Full Code Here

    public List<TaggedWord> tagSentence(List<Word> sentence) {
        List<TaggedWord> result = new ArrayList<TaggedWord>(sentence.size());
        for(Word w : sentence) {           
            List<Pair<Double,String>> tmpTagging = new ArrayList<Pair<Double,String>>(1);
            tmpTagging.add(new Pair<Double,String>(1.0,w.getPOS()));           
            TaggedWord tmp = new TaggedWord(w);
            tmp.setPOSTagging(tmpTagging);
            result.add(tmp);           
        }
        return result;
    }
View Full Code Here

     * sentence-level contextual feature extractor. (VERY todo-ish, though.)
     */
    public Collection<Pair<String, Double>> getFeatures(Map<Integer, TaggedWord> sentence, Integer wordIndex, boolean training) {
        Collection<Pair<String, Double>> result = new ArrayList<Pair<String, Double>>(30);

        TaggedWord current, prev, prevPrev, next, nextNext;
        current = sentence.get(wordIndex);
        // -------- The left periphery ------------
        int wind = wordIndex.intValue();
        if (wind > 1) {
            prev = sentence.get(wind - 1);
            prevPrev = sentence.get(wind - 2);
        } else if (wind > 0) {
            prev = sentence.get(wind - 1);
            prevPrev = Constants.OOB;
        } else {
            prev = prevPrev = Constants.OOB;
        }

        // -------- The right periphery -----------
        int tempSize = sentence.size();
        if ((tempSize - (wind + 1)) >= 2) {
            next = sentence.get(wind + 1);
            nextNext = sentence.get(wind + 2);
        } else if (tempSize - (wind + 1) >= 1) {
            next = sentence.get(wind + 1);
            nextNext = Constants.OOB;
        } else {
            next = nextNext = Constants.OOB;
        }

        Double activation = Constants.one;
       
        if (training) {
            result.add(new Pair<String, Double>(current.getPOS(), activation));
        }

        // we do not use tag-sequence features in this model.
        // these are in a separate sequence model (n-gram model over POS sequences).
       
        // standard contextual features (word to the left, current word, word to the right, etc.).
        // these features are from Ratnaparkhi (1996).
        result.add(new Pair<String, Double>(curL + "=" + current.getForm(), activation));
        result.add(new Pair<String, Double>(prevL + "=" + prev.getForm(), activation));
        result.add(new Pair<String, Double>(prevPrevL + "=" + prevPrev.getForm(), activation));
        result.add(new Pair<String, Double>(nextL + "=" + next.getForm(), activation));
        result.add(new Pair<String, Double>(nextNextL + "=" + nextNext.getForm(), activation));
       
        // features that replace the tagging dictionary.
        // add real-valued (activation = prior log-prob) features for each of the beta-best prior
        // tags, given this word.
        if(posPrior != null) {
            List<Pair<Double,String>> priors = posPrior.getPriors(current.getWord());
            double beta = 0.1;
            double best = priors.get(0).a;
            String wform = current.getForm();
            for(Pair<Double,String> prior : priors) {
                if(prior.a > (beta * best)) {
                    // add the features PPOS=<POSTAG>:<log-prob> and PPOS_word=<POSTAG>_<wordForm>:<log-prob>.
                    result.add(new Pair<String,Double>(priorF + "=" + prior.b, prior.a));
                    result.add(new Pair<String,Double>(priorF + "_word" + "=" + prior.b + "_" + wform, prior.a));
                } else {
                    break;
                }
            }
        }
       
        // these are in addition to Ratnaparkhi's (1996) contextual features.
        // now for conjunctions of features: w-2w-1=..., w-1w+1=..., w+1w+2=... (same for posp).
        // (i.e., bigram features over words and parts of speech and bigrams of words and POSs that straddle the current token).
        // N.B. only use single-best POSs (maybe change later).
        TaggedWord[] wds = {prevPrev, prev, current, next, nextNext};

        for (int j = 1; j < wds.length; j++) {
            result.add(new Pair<String, Double>(lxfLabs[j - 1] + "|" + lxfLabs[j] + "=" + wds[j - 1].getForm() + "|" + wds[j].getForm(), activation));
            // also, if at the current word slot, add bigrams that straddle the current word.
            if (j == 2) {
                result.add(new Pair<String, Double>(lxfLabs[j - 1] + "|" + lxfLabs[j + 1] + "=" + wds[j - 1].getForm() + "|" + wds[j + 1].getForm(), activation));
            }
        }
       
        // affix features from Ratnaparkhi (1996).
        // if the word's length is > 4, then extract the 1-, 2-, 3- and 4-character affixes.       
        if(current.getForm().length() > 4) {
            StringBuffer prefixes = new StringBuffer(4), suffixes = new StringBuffer(4);
            char[] wdForm = current.getForm().toCharArray();
            // prefixes.
            int cursor = 0;
            for(cursor = 0; cursor < 4; cursor++) {
                prefixes.append(wdForm[cursor]);
                result.add(new Pair<String,Double>(prefix+"="+prefixes.toString(), Constants.one));
            }
            // suffixes.
            for(cursor = wdForm.length-1; cursor >= wdForm.length-5; cursor--) {
                suffixes.insert(0, wdForm[cursor]);
                result.add(new Pair<String,Double>(suffix+"="+suffixes.toString(), Constants.one));
            }
        }
       
        // now do "contains hyphen", "contains number", "contains uppercase letter" and contains fused NE connecter (_) features.
        // also from Ratnaparkhi (1996).       
        if(current.getForm().contains("-")) { result.add(new Pair<String,Double>(hyphen, Constants.one)); }
        if(current.getForm().matches(".*[0-9]+.*")) { result.add(new Pair<String,Double>(num, Constants.one)); }
        if(!current.getForm().toLowerCase().equals(current.getForm())) { result.add(new Pair<String,Double>(caps, Constants.one)); }
  // if we see a NE connector, this is likely a NNP (in English, e.g.).
        if(current.getForm().contains(neConnecter)) { result.add(new Pair<String,Double>(neConn, Constants.one)); }
        return result;
    }
View Full Code Here

                    // turn the sent into a map from integer string indices to Words.
                    int index = 0;
                    snt = new TreeMap<Integer, TaggedWord>();
                    for (Word w : sent) {
                        snt.put(index++, new TaggedWord(w));
                    }

                    // 'true' says "we're getting training feats"
                    for (Collection<Pair<String, Double>> sentFeatsWithActivation : fexer.getSentenceFeatures(snt, true)) {
                        try {
View Full Code Here

        }
        POSTagFex fexer = new POSTagFex(posPriorMod);       
        for(List<Word> sentence : corp) {
            Map<Integer, TaggedWord> sent = new HashMap<Integer, TaggedWord>(sentence.size());
            int index = 0;
            for(Word w : sentence) { sent.put(index++, new TaggedWord(w)); }
           
            List<Collection<Pair<String,Double>>> ftss = fexer.getSentenceFeatures(sent, true);
           
            for(Collection<Pair<String,Double>> fts : ftss) {
                index = 0;
View Full Code Here

     * for the word at index <tt>wordIndex</tt>
     */
    public Collection<Pair<String, Double>> getFeatures(Map<Integer, TaggedWord> sentence, Integer wordIndex, boolean training) {
        Collection<Pair<String, Double>> result = new ArrayList<Pair<String, Double>>(30);

        TaggedWord current, prev, prevPrev, next, nextNext;
        current = sentence.get(wordIndex);
        // -------- The left periphery ------------
        int wind = wordIndex.intValue();
        if (wind > 1) {
            prev = sentence.get(wind - 1);
            prevPrev = sentence.get(wind - 2);
        } else if (wind > 0) {
            prev = sentence.get(wind - 1);
            prevPrev = outOfBounds;
        } else {
            prev = prevPrev = outOfBounds;
        }

        // -------- The right periphery -----------
        int tempSize = sentence.size();
        if ((tempSize - (wind + 1)) >= 2) {
            next = sentence.get(wind + 1);
            nextNext = sentence.get(wind + 2);
        } else if (tempSize - (wind + 1) >= 1) {
            next = sentence.get(wind + 1);
            nextNext = outOfBounds;
        } else {
            next = nextNext = outOfBounds;
        }
        Double activation = new Double(1.0);
        if (training)
            result.add(new Pair<String, Double>(current.getSupertag(), activation));     
       
        result.add(new Pair<String, Double>(curL + current.getForm(), activation));       
        if(useMultiPOS) { for(Pair<Double,String> tg : current.getPOSTagging()) result.add(new Pair<String,Double>(curP + tg.b, tg.a)); }
        else { result.add(new Pair<String, Double>(curP + current.getPOS(), activation)); }
       
        result.add(new Pair<String, Double>(prevL + prev.getForm(), activation));       
        if(useMultiPOS && prev != Constants.OOB) { for(Pair<Double,String> tg : prev.getPOSTagging()) result.add(new Pair<String,Double>(prevP + tg.b, tg.a)); }
        else { result.add(new Pair<String, Double>(prevP + prev.getPOS(), activation)); }       
       
        result.add(new Pair<String, Double>(prevPrevL + prevPrev.getForm(), activation));       
        if(useMultiPOS && prevPrev != Constants.OOB) { for(Pair<Double,String> tg : prevPrev.getPOSTagging()) result.add(new Pair<String,Double>(prevPrevP + tg.b, tg.a)); }
        else { result.add(new Pair<String, Double>(prevPrevP + prevPrev.getPOS(), activation)); }
       
        result.add(new Pair<String, Double>(nextL + next.getForm(), activation));       
        if(useMultiPOS && next != Constants.OOB) { for(Pair<Double,String> tg : next.getPOSTagging()) result.add(new Pair<String,Double>(nextP + tg.b, tg.a)); }
        else { result.add(new Pair<String, Double>(nextP + next.getPOS(), activation)); }
       
        result.add(new Pair<String, Double>(nextNextL + nextNext.getForm(), activation));
        if(useMultiPOS && nextNext != Constants.OOB) { for(Pair<Double,String> tg : nextNext.getPOSTagging()) result.add(new Pair<String,Double>(nextNextP + tg.b, tg.a)); }
        else { result.add(new Pair<String, Double>(nextNextP + nextNext.getPOS(), activation)); }
       
        // now for conjunctions of features: w-2w-1=..., w-1w+1=..., w+1w+2=... (same for posp).
        // (i.e., bigram features over words and parts of speech and bigrams of words and POSs that straddle the current token).
        // N.B. only use single-best POSs (maybe change later).      
        TaggedWord[] wds = {prevPrev,   prev,       current, next,       nextNext};
       
        for (int j = 1; j < wds.length; j++) {
            // add bigram features (only for single-best POS).          
            result.add(new Pair<String,Double>(lxfLabs[j - 1] + "|" + lxfLabs[j] + "=" + wds[j - 1].getForm() + "|" + wds[j].getForm(), activation));
            result.add(new Pair<String,Double>(posfLabs[j - 1] + "|" + posfLabs[j] + "=" + wds[j - 1].getPOS() + "|" + wds[j].getPOS(), activation));
            // also, if at the current word slot, add bigrams that straddle the current word.
            if (j == 2) {
                result.add(new Pair<String,Double>(lxfLabs[j - 1] + "|" + lxfLabs[j + 1] + "=" + wds[j - 1].getForm() + "|" + wds[j + 1].getForm(), activation));
                result.add(new Pair<String,Double>(posfLabs[j - 1] + "|" + posfLabs[j + 1] + "=" + wds[j - 1].getPOS() + "|" + wds[j + 1].getPOS(), activation));
            }
        }
       
        // If the prior model is not null, extract a feature for the beta-best (beta = 0.1) classes
        // predicted by the prior model (for all output classes -- supertags -- seen with this word's
        // POS).
        // Extract prior features from these.
        if(priorMod != null) {
            priorMod.computePriors(current.getWord());
            for(Pair<String,Double> priorClassActivationPair : priorMod.getRestrictedBetaBestPriors(current.getWord(), 0.1)) { // TODO: make beta parameterizable.
                double act = Math.log(priorClassActivationPair.b);             
                String wd = current.getForm().intern(), pos = current.getPOS().intern();
                result.add(new Pair<String,Double>(priorST + priorClassActivationPair.a.intern(), act)); // log(prob)
                result.add(new Pair<String,Double>(wordPriorST + priorClassActivationPair.a.intern()+"_"+wd, act)); // log(prob)
                result.add(new Pair<String,Double>(wordPOSPriorST + priorClassActivationPair.a.intern()+"_"+wd+"_"+pos, act)); // log(prob)
                result.add(new Pair<String,Double>(POSPriorST + priorClassActivationPair.a.intern()+"_"+pos, act)); // log(prob)
                result.add(new Pair<String,Double>(antiPriorST + priorClassActivationPair.a.intern(), Math.log(1-Math.exp(act)))); // log(1-prob)
View Full Code Here

       
        for(List<Word> sentence : corp) {
            Map<Integer, TaggedWord> sent = new HashMap<Integer, TaggedWord>(sentence.size());
            int index = 0;               
            if(posT == null) {               
                for(Word w : sentence) { sent.put(index++, new TaggedWord(w)); }           
            } else {
                List<TaggedWord> posTagging = posT.tagSentence(sentence);
                for(TaggedWord tw : posTagging) { sent.put(index++, tw); }
            }
            List<Collection<Pair<String,Double>>> ftss = fexer.getSentenceFeatures(sent, true);           
View Full Code Here

TOP

Related Classes of opennlp.ccg.parse.tagger.TaggedWord

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.