Package org.dbpedia.spotlight.spot.cooccurrence.filter

Examples of org.dbpedia.spotlight.spot.cooccurrence.filter.FilterPattern


    List<SurfaceFormOccurrence> selectedOccurrences = new LinkedList<SurfaceFormOccurrence>();

    FilterPOS filterPOS = new FilterPOS();
    FilterTermsize unigramFilter = new FilterTermsize(FilterTermsize.Termsize.unigram);
    FilterPattern filterPattern = new FilterPattern();

    SpotClassifier unigramClassifier = ClassifierFactory.getClassifierInstanceUnigram();
    SpotClassifier ngramClassifier = ClassifierFactory.getClassifierInstanceNGram();

    assert unigramClassifier != null;
    assert ngramClassifier != null;

    //ngramClassifier.setVerboseMode(true);
    //unigramClassifier.setVerboseMode(true);
    List<String> decisions = new LinkedList<String>();

    for(SurfaceFormOccurrence surfaceFormOccurrence : surfaceFormOccurrences) {

            if (surfaceFormOccurrence.surfaceForm().name().trim().length()==0) {
                LOG.warn("I have an occurrence with empty surface form. :-O Ignoring.");
                LOG.error(surfaceFormOccurrence);
                continue;
            }

            if (! (surfaceFormOccurrence.context() instanceof TaggedText)) { //FIXME added this to avoid breaking, but code below will never run if we don't pass the taggedtext
                LOG.error(String.format("SurfaceFormOccurrence did not contain TaggedText. Cannot apply %s",this.getClass()));
       
                selectedOccurrences.add(surfaceFormOccurrence);
                continue;
            }


      if(unigramFilter.applies(surfaceFormOccurrence)) {

        /**
         * Unigram (n = 1)
         */

        if(!filterPOS.applies(surfaceFormOccurrence)) {

          /**
           * The Surface Form is on the POS blacklist, i.e. a single adjective,
           * verb, etc.
           */


          if(Character.isUpperCase(surfaceFormOccurrence.surfaceForm().name().charAt(0))){
            TaggedToken taggedToken = ((TaggedText) surfaceFormOccurrence.context()).taggedTokenProvider().getTaggedTokens(surfaceFormOccurrence).get(0);

            /**
             * Add uppercase adjectives (e.g. Canadian tv star)
             */
            if(taggedToken.getPOSTag() != null && taggedToken.getPOSTag().startsWith("j"))
              selectedOccurrences.add(surfaceFormOccurrence);

          }else{
            decisions.add("Dropped by POS filter: " + surfaceFormOccurrence);

          }

        }else if(!filterPattern.applies(surfaceFormOccurrence)){
          decisions.add("Dropped by Pattern filter: " + surfaceFormOccurrence);
        }else{

                    SpotClassification spotClassification;
                    try {
View Full Code Here


    /** Filter the data set: */
    FilterTermsize filterTermsize = new FilterTermsize(FilterTermsize.Termsize.unigram, spotlightFactory.textUtil());
    filterTermsize.inverse();

    FilterPattern filterPattern = new FilterPattern();

    filters.add(filterTermsize);
    filters.add(filterPattern);

    /** Create a new header object: */
 
View Full Code Here

   
    instanceBuilder.setVerboseMode(true);

    filters.add(new FilterTermsize(FilterTermsize.Termsize.unigram));
    filters.add(new FilterPOS());
    filters.add(new FilterPattern());

    header = new Instances("UnigramTraining", buildAttributeList(), buildAttributeList().size());

  }
View Full Code Here

TOP

Related Classes of org.dbpedia.spotlight.spot.cooccurrence.filter.FilterPattern

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.