Package org.dbpedia.spotlight.spot

Source Code of org.dbpedia.spotlight.spot.CoOccurrenceBasedSelector

/*
* Copyright 2011 DBpedia Spotlight Development Team
*
*  Licensed under the Apache License, Version 2.0 (the "License");
*  you may not use this file except in compliance with the License.
*  You may obtain a copy of the License at
*
*  http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*
*  Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
*/

package org.dbpedia.spotlight.spot;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dbpedia.spotlight.exceptions.InitializationException;
import org.dbpedia.spotlight.model.*;
import org.dbpedia.spotlight.spot.cooccurrence.ClassifierFactory;
import org.dbpedia.spotlight.spot.cooccurrence.classification.SpotClass;
import org.dbpedia.spotlight.spot.cooccurrence.classification.SpotClassification;
import org.dbpedia.spotlight.spot.cooccurrence.classification.SpotClassifier;
import org.dbpedia.spotlight.spot.cooccurrence.features.data.OccurrenceDataProviderSQL;
import org.dbpedia.spotlight.spot.cooccurrence.filter.FilterPOS;
import org.dbpedia.spotlight.spot.cooccurrence.filter.FilterPattern;
import org.dbpedia.spotlight.spot.cooccurrence.filter.FilterTermsize;
import org.dbpedia.spotlight.tagging.TaggedToken;
import org.dbpedia.spotlight.tagging.TaggedTokenProvider;

import java.util.LinkedList;
import java.util.List;

/**
* Spot selector based on co-occurrence data using two classifiers for unigram
* and ngram candidates.
*
* @author Joachim Daiber
*/
public class CoOccurrenceBasedSelector implements TaggedSpotSelector {

  private final Log LOG = LogFactory.getLog(this.getClass());


  /**
   * Creates a spot selector based on n-gram co-occurrence. A SpotterConfiguration object must be
   * passed as a parameter since the selector must use and initialize an occurrence
   * data provider and a factory for classifiers.
   *
   * @see org.dbpedia.spotlight.spot.cooccurrence.features.data.OccurrenceDataProvider
   * @see ClassifierFactory
   *
   * @param spotterConfiguration SpotterConfiguration object with classifier paths and JDBC
   *       description of occurrence data provider.
     * @throws InitializationException Either the OccurrenceDataProvider or the ClassifierFactory
   *       could not be initialized.
   */
  public CoOccurrenceBasedSelector(SpotterConfiguration spotterConfiguration) throws InitializationException {
   
    LOG.info("Initializing spot occurrence data provider.");
    OccurrenceDataProviderSQL.initialize(spotterConfiguration);
    LOG.info("Done.");

    LOG.info("Initializing spot candidate classifiers.");
    new ClassifierFactory(spotterConfiguration.getCoOcSelectorClassifierUnigram(),
        spotterConfiguration.getCoOcSelectorClassifierNGram(),
        spotterConfiguration.getCoOcSelectorDatasource(),
        OccurrenceDataProviderSQL.getInstance()
      );
        LOG.info("Done.");
    }
   

    /**
   * Creates a spot selector based on n-gram co-occurrence. A SpotterConfiguration object must be
   * passed as a parameter since the selector must use and initialize an occurrence
   * data provider and a factory for classifiers.
   *
   * @see org.dbpedia.spotlight.spot.cooccurrence.features.data.OccurrenceDataProvider
   * @see ClassifierFactory
   *
   * @param spotterConfiguration SpotterConfiguration object with classifier paths and JDBC
   *       description of occurrence data provider.
     * @param taggedTokenProvider TaggedTokenProvider used to create a tagged text to test the
     *          classifiers.
     * @throws InitializationException Either the OccurrenceDataProvider or the ClassifierFactory
   *       could not be initialized.
   */

    public CoOccurrenceBasedSelector(SpotterConfiguration spotterConfiguration, TaggedTokenProvider taggedTokenProvider) throws InitializationException {

        this(spotterConfiguration);

        //TODO Instead of doing a test classification here, we should properly check if the serialized model suits the WEKA instances that are produced from SurfaceFormOccurrences.
        LOG.info("Testing classifiers for co-occurrence based spot selector.");
    SpotClassifier unigramClassifier = ClassifierFactory.getClassifierInstanceUnigram();
    SpotClassifier ngramClassifier = ClassifierFactory.getClassifierInstanceNGram();

        Text taggedText = new TaggedText("Bill Gates is a software developer from Berlin.", taggedTokenProvider);

        SurfaceFormOccurrence ngramOccurrence = new SurfaceFormOccurrence(new SurfaceForm("Bill Gates"),
                        taggedText, 0, Provenance.Undefined(), -1);

        SurfaceFormOccurrence unigramOccurrence = new SurfaceFormOccurrence(new SurfaceForm("Berlin"),
                        taggedText, 41, Provenance.Undefined(), -1);

        try {
            unigramClassifier.classify(unigramOccurrence);
            ngramClassifier.classify(ngramOccurrence);
        } catch (Exception e) {
            throw new InitializationException("An error occurred while classifying a test spot using the co-occurrence " +
                    "based spot selector. This is most probably caused by an outdated spot selector model. Please " +
                    "check the spot selector models defined 'org.dbpedia.spotlight.spot.cooccurrence.classifier.*'.", e);
        }
        LOG.info("Done.");

  }
 

  /**
   * Filter the list of surface form occurrences, removing all occurrences that are considered
   * common.
   *
   * @param surfaceFormOccurrences spotted surface form occurrences
   * @return List of non-common surface form occurrences
   */
  public List<SurfaceFormOccurrence> select(List<SurfaceFormOccurrence> surfaceFormOccurrences) {

    List<SurfaceFormOccurrence> selectedOccurrences = new LinkedList<SurfaceFormOccurrence>();

    FilterPOS filterPOS = new FilterPOS();
    FilterTermsize unigramFilter = new FilterTermsize(FilterTermsize.Termsize.unigram);
    FilterPattern filterPattern = new FilterPattern();

    SpotClassifier unigramClassifier = ClassifierFactory.getClassifierInstanceUnigram();
    SpotClassifier ngramClassifier = ClassifierFactory.getClassifierInstanceNGram();

    assert unigramClassifier != null;
    assert ngramClassifier != null;

    //ngramClassifier.setVerboseMode(true);                                         f
    //unigramClassifier.setVerboseMode(true);
    List<String> decisions = new LinkedList<String>();

    for(SurfaceFormOccurrence surfaceFormOccurrence : surfaceFormOccurrences) {

            if (surfaceFormOccurrence.surfaceForm().name().trim().length()==0) {
                LOG.warn("I have an occurrence with empty surface form. :-O Ignoring.");
                LOG.error(surfaceFormOccurrence);
                continue;
            }

            if (! (surfaceFormOccurrence.context() instanceof TaggedText)) { //FIXME added this to avoid breaking, but code below will never run if we don't pass the taggedtext
                LOG.error(String.format("SurfaceFormOccurrence did not contain TaggedText. Cannot apply %s",this.getClass()));
       
                selectedOccurrences.add(surfaceFormOccurrence);
                continue;
            }


      if(unigramFilter.applies(surfaceFormOccurrence)) {

        /**
         * Unigram (n = 1)
         */

        if(!filterPOS.applies(surfaceFormOccurrence)) {

          /**
           * The Surface Form is on the POS blacklist, i.e. a single adjective,
           * verb, etc.
           */


          if(Character.isUpperCase(surfaceFormOccurrence.surfaceForm().name().charAt(0))){
            TaggedToken taggedToken = ((TaggedText) surfaceFormOccurrence.context()).taggedTokenProvider().getTaggedTokens(surfaceFormOccurrence).get(0);

            /**
             * Add uppercase adjectives (e.g. Canadian tv star)
             */
            if(taggedToken.getPOSTag() != null && taggedToken.getPOSTag().startsWith("j"))
              selectedOccurrences.add(surfaceFormOccurrence);

          }else{
            decisions.add("Dropped by POS filter: " + surfaceFormOccurrence);

          }

        }else if(!filterPattern.applies(surfaceFormOccurrence)){
          decisions.add("Dropped by Pattern filter: " + surfaceFormOccurrence);
        }else{

                    SpotClassification spotClassification;
                    try {
                        spotClassification = unigramClassifier.classify(surfaceFormOccurrence);

                        if(spotClassification.getCandidateClass() == SpotClass.valid) {
                            selectedOccurrences.add(surfaceFormOccurrence);
                            //LOG.info(("Kept by UnigramClassifier (Confidence: " + spotClassification.getConfidence() + "): " + surfaceFormOccurrence);
                        }else{
                            decisions.add("Dropped by UnigramClassifier (Confidence: " + spotClassification.getConfidence() + "): " + surfaceFormOccurrence);
                        }

                    } catch (Exception e) {
                        LOG.error("Exception when classifying unigram candidate: " + e);
                    }

        }


      }else{

        /**
         * n > 1
         */

        SpotClassification spotClassification;
        try{
          spotClassification = ngramClassifier.classify(surfaceFormOccurrence);
        }catch (Exception e) {
                    LOG.error("Exception when classifying ngram candidate: " + e);
                    continue;
        }

        if(spotClassification.getCandidateClass() == SpotClass.valid) {
          selectedOccurrences.add(surfaceFormOccurrence);
          //LOG.info("Kept by nGramClassifier (Confidence: " + spotClassification.getConfidence() + "): " + surfaceFormOccurrence);
        }else{
          decisions.add("Dropped by NGramClassifier: " + surfaceFormOccurrence);
        }

      }

    }

        if (LOG.isDebugEnabled())
            for (String decision : decisions) {
                LOG.debug(decision);
            }
   
    return selectedOccurrences;
  }

}
TOP

Related Classes of org.dbpedia.spotlight.spot.CoOccurrenceBasedSelector

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.