Source Code of org.dbpedia.spotlight.lucene.disambiguate.MergedOccurrencesDisambiguator

/**
 * Copyright 2011 Pablo Mendes, Max Jakob
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.dbpedia.spotlight.lucene.disambiguate;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.ScoreDoc;
import org.dbpedia.spotlight.disambiguate.Disambiguator;
import org.dbpedia.spotlight.disambiguate.ParagraphDisambiguator;
import org.dbpedia.spotlight.exceptions.*;
import org.dbpedia.spotlight.lucene.LuceneFeatureVector;
import org.dbpedia.spotlight.lucene.search.MergedOccurrencesContextSearcher;
import org.dbpedia.spotlight.model.*;
import org.dbpedia.spotlight.model.vsm.FeatureVector;


import java.io.IOException;
import java.util.*;


public class MergedOccurrencesDisambiguator implements Disambiguator {


    final Log LOG = LogFactory.getLog(this.getClass());


    MergedOccurrencesContextSearcher mMergedSearcher;


    public MergedOccurrencesDisambiguator(ContextSearcher searcher) throws IOException, ConfigurationException {
        //TODO this is horrible, but it's a temp fix until we organize the interfaces
        //FIXME
        if (searcher instanceof MergedOccurrencesContextSearcher) {
            mMergedSearcher = (MergedOccurrencesContextSearcher) searcher;
        } else {
            throw new ConfigurationException("You cannot use MergedOccurrencesDisambiguator with a searcher that is not MergedOccurrencesContextSearcher.");
        }
    }


    public MergedOccurrencesDisambiguator(MergedOccurrencesContextSearcher searcher) throws IOException {
        this.mMergedSearcher = searcher;
    }


    /*
      IDF of surface form
      */
    public Double importance(SurfaceForm sf) throws SearchException {
        long freqUnannotated = this.mMergedSearcher.getConceptNeighborhoodCount(sf);
        long totalDocs = 1;
        try {
            totalDocs = this.mMergedSearcher.getNumberOfResources();
        } catch (IOException e) {
            throw new SearchException("Error getting count of resources. ", e);
        }
        //long freqAnnotated = this.mMergedSearcher.getAmbiguity(sf);
        double prob = (freqUnannotated == 0) ?  0.0 : Math.log(totalDocs / (1+freqUnannotated)); //TODO throw exception: freqUnannotated==0 should not happen!
        //LOG.info(String.format("FreqAnnotated %s; FreqUnnanotated %s; Prob: %s", freqAnnotated, freqUnannotated, prob));
        LOG.info(String.format("FreqAnnotated %s; FreqUnnanotated %s; Prob: %s", 1, freqUnannotated, prob));
        return prob;
    }


    @Override
    @Deprecated
    public List<SurfaceFormOccurrence> spotProbability(List<SurfaceFormOccurrence> sfOccurrences) throws SearchException {
        return sfOccurrences;
    }


    public DBpediaResourceOccurrence disambiguate(SurfaceFormOccurrence sfOcc) throws SearchException, ItemNotFoundException, InputException  {
        List<DBpediaResourceOccurrence> occs = bestK(sfOcc,1);
        if (occs.size()==0)
            throw new ItemNotFoundException(String.format("Surface form not found: %s",sfOcc.surfaceForm().toString()));
        return occs.get(0);
    }


    public List<DBpediaResourceOccurrence> bestK(SurfaceFormOccurrence sfOccurrence, int k) throws SearchException, ItemNotFoundException, InputException {
        LOG.debug("Disambiguating "+sfOccurrence.surfaceForm());


        // search index for surface form
        ScoreDoc[] hits = mMergedSearcher.getHits(sfOccurrence);


        if (hits.length == 0) { //TODO this hack can be implemented correctly as an analyzer that sits within getQuery in LuceneManager.
            String sfName = sfOccurrence.surfaceForm().name().trim();
            if (sfName.toLowerCase().startsWith("the ")) {
                LOG.debug("Trying to HACK(the) -> not found in index: "+sfOccurrence);
                String newName = sfName.substring(3).trim();
                hits = mMergedSearcher.getHitsSurfaceFormHack(sfOccurrence, new SurfaceForm(newName));
                LOG.debug("New sfName="+newName+" hits="+hits.length);
            } else if (sfName.toLowerCase().startsWith("a ")) {
                LOG.debug("Trying to HACK(a) -> not found in index: "+sfOccurrence);
                String newName = sfName.substring(1).trim();
                hits = mMergedSearcher.getHitsSurfaceFormHack(sfOccurrence, new SurfaceForm(newName));
                LOG.debug("New sfName="+newName+" hits="+hits.length);
            }
            if (hits.length == 0 && sfName.toLowerCase().endsWith("'s")) {
                LOG.debug("Trying to HACK('s) -> not found in index: "+sfOccurrence);
                String newName = sfName.substring(0,sfName.length()-2).trim();
                hits = mMergedSearcher.getHitsSurfaceFormHack(sfOccurrence, new SurfaceForm(newName));
                LOG.debug("New sfName="+newName+" hits="+hits.length);
            }
            if (hits.length == 0 && sfName.toLowerCase().endsWith("s")) {
                LOG.debug("Trying to HACK(s) -> not found in index: "+sfOccurrence);
                String newName = sfName.substring(0,sfName.length()-1).trim();
                hits = mMergedSearcher.getHitsSurfaceFormHack(sfOccurrence, new SurfaceForm(newName));
                LOG.debug("New sfName="+newName+" hits="+hits.length);
            }
            if (hits.length == 0 && sfName.toLowerCase().endsWith("'")) {
                LOG.debug("Trying to HACK(') -> not found in index: "+sfOccurrence);
                String newName = sfName.substring(0,sfName.length()-1).trim();
                hits = mMergedSearcher.getHitsSurfaceFormHack(sfOccurrence, new SurfaceForm(newName));
                LOG.debug("New sfName="+newName+" hits="+hits.length);
            }
        }


        // Loop through all hits, build a map from URI to score
        List<DBpediaResourceOccurrence> rankedOccs = new LinkedList<DBpediaResourceOccurrence>();


        if (hits.length > 0) {
            for (int i=0; i < hits.length && i < k; i++) {
                DBpediaResource resource = mMergedSearcher.getDBpediaResource(hits[i].doc);
                //resource can be null! not handled here
                //if (resource==null)
                //    throw new ItemNotFoundException("Could not choose a URI for "+sfOcc.surfaceForm());


                Double score = new Double(hits[i].score);
                Double percentageOfSecond = new Double(-1);
                if (hits.length > i+1) {
                    percentageOfSecond = hits[i+1].score / score;
                }
                DBpediaResourceOccurrence resultOcc = new DBpediaResourceOccurrence("",
                        resource,
                        sfOccurrence.surfaceForm(),
                        sfOccurrence.context(),
                        sfOccurrence.textOffset(),
                        Provenance.Annotation(),
                        score,
                        percentageOfSecond,
                        score); //TODO abusing what was spotProb here. now we have contextual score. need better way to do this
                rankedOccs.add(resultOcc);
            }


            LOG.debug(String.format("Object creation time took %f ms.",mMergedSearcher.objectCreationTime/1000000.0));
            mMergedSearcher.objectCreationTime = 0;
        } else {
            LOG.debug(String.format("Not found in index: %s", sfOccurrence.surfaceForm().toString()));
        }


        return rankedOccs;
    }


    public List<DBpediaResourceOccurrence> disambiguate(List<SurfaceFormOccurrence> sfOccs) throws InputException {
        long startTime = System.currentTimeMillis();


        List<DBpediaResourceOccurrence> results = new LinkedList<DBpediaResourceOccurrence>();


        for (SurfaceFormOccurrence sfOcc : sfOccs) {
            try {
                results.add(disambiguate(sfOcc));
            } catch (ItemNotFoundException e) {
                LOG.error("Could not disambiguate "+sfOcc.surfaceForm()+": "+e);
            } catch (SearchException e) {
                LOG.error("Could not disambiguate "+sfOcc.surfaceForm()+": "+e);
            }
        }


        double totalSeconds = (System.currentTimeMillis() - startTime) / 1000.0;
        LOG.info("Total time of all disambiguations: "+totalSeconds+" s");
        LOG.info("Average time of one disambiguation: "+totalSeconds/sfOccs.size()+" s");  // counts a cache look-up as disambiguation
        
        return results;
    }


    @Override
    public String name() {
        return this.getClass().getSimpleName() + ":" + mMergedSearcher.getSimilarity().getClass().getSimpleName();
    }


    @Override
    public int ambiguity(SurfaceForm sf) throws SearchException {
        return mMergedSearcher.getAmbiguity(sf);
    }


    public int support(DBpediaResource res) throws SearchException {
        int n = 0;
        try {
         n = mMergedSearcher.getSupport(res);
        } catch (SearchException e) {
            if (!e.getCause().getMessage().equals("read past EOF"))
                throw e;
        }
        return n;
    }


    /**
     * Generates explanations for how a given SurfaceFormOccurrence has been disambiguated into a DBpediaResourceOccurrence
     * @param goldStandardOccurrence
     * @param nExplanations
     * @return a list of explanations
     * @throws SearchException
     */
    public List<Explanation> explain(DBpediaResourceOccurrence goldStandardOccurrence, int nExplanations) throws SearchException {
        return mMergedSearcher.explain(goldStandardOccurrence, nExplanations);
    }




    @Override
    public int contextTermsNumber(DBpediaResource resource) throws SearchException {
        int termsCount = 0;
        for (ScoreDoc hit : mMergedSearcher.getHits(resource)) {
            TermFreqVector vector = mMergedSearcher.getVector(hit.doc);
            termsCount += vector.getTerms().length;
        }
        return termsCount;
    }


    @Override
    public double averageIdf(Text context) throws IOException {
        return mMergedSearcher.getAverageIdf(context);
    }


}
Source Code of org.dbpedia.spotlight.lucene.disambiguate.MergedOccurrencesDisambiguator

Related Classes of org.dbpedia.spotlight.lucene.disambiguate.MergedOccurrencesDisambiguator