Package org.dbpedia.spotlight.spot.cooccurrence.weka

Source Code of org.dbpedia.spotlight.spot.cooccurrence.weka.InstanceBuilderNGram

package org.dbpedia.spotlight.spot.cooccurrence.weka;

import org.dbpedia.spotlight.spot.cooccurrence.features.CandidateFeatures;
import org.dbpedia.spotlight.spot.cooccurrence.features.data.CandidateData;
import org.dbpedia.spotlight.spot.cooccurrence.features.data.CoOccurrenceData;
import org.dbpedia.spotlight.spot.cooccurrence.features.data.OccurrenceDataProvider;
import org.dbpedia.spotlight.exceptions.ItemNotFoundException;
import org.dbpedia.spotlight.model.SurfaceFormOccurrence;
import org.dbpedia.spotlight.model.TaggedText;
import org.dbpedia.spotlight.tagging.TaggedToken;
import weka.core.Attribute;
import weka.core.Instance;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;


/**
* Abstract Builder for WEKA instances for n-grams.
*
* @author Joachim Daiber
*/
public abstract class InstanceBuilderNGram extends InstanceBuilder {

  public static Attribute count_corpus = new Attribute("count_corpus");
  public static Attribute count_web = new Attribute("count_web");
  public static Attribute contains_verb = new Attribute("contains_verb", Arrays.asList("no", "vb", "vbd", "vbg", "vbn", "be", "multiple"));
  public static Attribute term_case = new Attribute("case", Arrays.asList("all_lowercase", "mixed", "titlecase", "all_uppercase", "first_uppercase"));
  public static Attribute quoted = new Attribute("quoted", Arrays.asList("yes"));
  public static Attribute candidate_size = new Attribute("candidate_size");
  public static Attribute pre_pos = new Attribute("token_left", Arrays.asList("to", "verb", "a"));
  public static Attribute next_pos = new Attribute("token_right", Arrays.asList("of", "to", "be", "verb"));
  public static Attribute ends_with = new Attribute("ends_with", Arrays.asList("prep"));
  public static Attribute bigram_left_significance_web = new Attribute("bigram_left_signifance_web"); //, AttributesUnigram.BIGRAM_COUNTS_WEB);
  public static Attribute bigram_right_significance_web = new Attribute("bigram_right_significance_web"); //, AttributesUnigram.BIGRAM_COUNTS_WEB);
  public static Attribute trigram_left = new Attribute("trigram_left");
  public static Attribute trigram_right = new Attribute("trigram_right");


  /**
   * Default Thresholds:
   */
  protected long bigramLeftWebMin = 0;
  protected long bigramRightWebMin = 0;
  protected long trigramLeftWebMin = 0;
  protected long trigramMiddleWebMin = 0;
  protected long trigramRightWebMin = 0;


  /**
   * Create an instance builder for n-grams.
   *
   * @param dataProvider occurrence data provider
   */
  protected InstanceBuilderNGram(OccurrenceDataProvider dataProvider) {
    super(dataProvider);

  }

  @Override
  /** {@inheritDoc} */
  public ArrayList<Attribute> buildAttributeList() {

    ArrayList<Attribute> attributeList = new ArrayList<Attribute>();
    attributeList.addAll(Arrays.asList(
      contains_verb,
      term_case,
      candidate_size,
      pre_pos,
      next_pos,
      ends_with,
      quoted,
      bigram_left_significance_web,
      bigram_right_significance_web,
      trigram_left,
      trigram_right,
      candidate_class
    ));
   
    return attributeList;
  }


  @Override
  /** {@inheritDoc} */
  public Instance buildInstance(SurfaceFormOccurrence surfaceFormOccurrence, Instance instance) {

    TaggedText text = (TaggedText) surfaceFormOccurrence.context();
    List<TaggedToken> candidateTokens = text.taggedTokenProvider().getTaggedTokens(surfaceFormOccurrence);
    int termSize = candidateTokens.size();

    TaggedToken firstTaggedToken = candidateTokens.get(0);
    CandidateData firstTaggedTokenData = null;
    try {
      firstTaggedTokenData = dataProvider.getCandidateData(firstTaggedToken.getToken());
    } catch (ItemNotFoundException e) {
      //No information about the token!
    }


    CandidateData secondTaggedTokenData = null;
    if(candidateTokens.size() > 1){
      TaggedToken secondTaggedToken = candidateTokens.get(1);
      try {
        secondTaggedTokenData = dataProvider.getCandidateData(secondTaggedToken.getToken());
      } catch (ItemNotFoundException e) {
        //No information about the token!
      }
    }

    TaggedToken lastTaggedToken = candidateTokens.get(candidateTokens.size()-1);
    CandidateData lastTaggedTokenData = null;
    try {
      lastTaggedTokenData = dataProvider.getCandidateData(lastTaggedToken.getToken());
    } catch (ItemNotFoundException e) {
      //No information about the token!
    }


    CandidateData lastBut1TaggedTokenData = null;

    if(candidateTokens.size() > 1) {
      TaggedToken lastBut1TaggedToken = candidateTokens.get(candidateTokens.size()-2);
      try {
        lastBut1TaggedTokenData = dataProvider.getCandidateData(lastBut1TaggedToken.getToken());
      } catch (ItemNotFoundException e) {
        //No information about the token!
      }
    }


    /**
     * Left context
     */

    List<TaggedToken> leftContext = null;
    try {
      leftContext = text.taggedTokenProvider().getLeftContext(surfaceFormOccurrence, 2);
    } catch (ItemNotFoundException ignored) {}

    CandidateData left1 = null;
    if(leftContext != null && leftContext.size() > 0) {
      try {
        String token;
        if(leftContext.size() == 1) {
          /**
           * There are no more tokens to the left, the token is sentence initial.
           */
          token = leftContext.get(0).getToken().toLowerCase();

        }else{
          token = leftContext.get(0).getToken();

        }
        left1 = dataProvider.getCandidateData(token);
      } catch (ItemNotFoundException e) {
        //No information about the token
      }
    }


    /**
     * Right context
     */

    List<TaggedToken> rightContext = null;
    try {
      rightContext = text.taggedTokenProvider().getRightContext(surfaceFormOccurrence, 2);
    } catch (ItemNotFoundException ignored) {}

    CandidateData right1 = null;
    if(rightContext != null && rightContext.size() > 0) {
      try {
        right1 = dataProvider.getCandidateData(rightContext.get(0).getToken());
      } catch (ItemNotFoundException e) {
        //No information about the token
      }
    }



    /**
     * Features:
     */

    if(termSize == 2) {

      try {
        if(firstTaggedTokenData != null && secondTaggedTokenData != null) {
          CoOccurrenceData bigramData = dataProvider.getBigramData(firstTaggedTokenData, secondTaggedTokenData);

          //if (bigramData.getUnitCountWeb() > bigramLeftWebMin)
            instance.setValue(i(count_web, buildAttributeList()), bigramData.getUnitCountWeb());
        }
      } catch (ItemNotFoundException ignored) {}
      catch (ArrayIndexOutOfBoundsException ignored) {}



    }



    List<String> verbs = new LinkedList<String>();

    boolean allLowercase = surfaceFormOccurrence.surfaceForm().name().toLowerCase().equals(surfaceFormOccurrence.surfaceForm().name());
    boolean allUppercase = surfaceFormOccurrence.surfaceForm().name().toUpperCase().equals(surfaceFormOccurrence.surfaceForm().name());

    int capitalizedWords = 0;

    for(TaggedToken candidateToken : candidateTokens) {
      if(candidateToken.getPOSTag().startsWith("v") || candidateToken.getPOSTag().equals("be")) {
        verbs.add(candidateToken.getPOSTag());
      }

      if(Character.isUpperCase(candidateToken.getToken().charAt(0)))
        capitalizedWords++;
    }

    try{
      if(verbs.size() > 1)
        instance.setValue(i(contains_verb, buildAttributeList()), 5);
      else if(verbs.size()==0)
        instance.setValue(i(contains_verb, buildAttributeList()), 0);
      else if(verbs.get(0).equals("vb"))
        instance.setValue(i(contains_verb, buildAttributeList()), 1);
      else if(verbs.get(0).equals("vbd"))
        instance.setValue(i(contains_verb, buildAttributeList()), 2);
      else if(verbs.get(0).equals("vbg"))
        instance.setValue(i(contains_verb, buildAttributeList()), 3);
      else if(verbs.get(0).equals("vbn"))
        instance.setValue(i(contains_verb, buildAttributeList()), 4);
      else if(verbs.get(0).equals("be"))
        instance.setValue(i(contains_verb, buildAttributeList()), 5);
    } catch (ArrayIndexOutOfBoundsException ignored) {}

    try{
      if(allLowercase)
        instance.setValue(i(term_case, buildAttributeList()), 0);
      else if(allUppercase)
        instance.setValue(i(term_case, buildAttributeList()), 3);
      else if(capitalizedWords == candidateTokens.size())
        instance.setValue(i(term_case, buildAttributeList()), 2);
      else if(capitalizedWords == 1 && Character.isUpperCase(candidateTokens.get(0).getToken().charAt(0)))
        instance.setValue(i(term_case, buildAttributeList()), 4);
      else
        instance.setValue(i(term_case, buildAttributeList()), 1);


    } catch (ArrayIndexOutOfBoundsException ignored) {}


    try{
      instance.setValue(i(candidate_size, buildAttributeList()), termSize);
    } catch (ArrayIndexOutOfBoundsException ignored) {}

    try {
      TaggedToken leftNeighbourToken = text.taggedTokenProvider().getLeftNeighbourToken(surfaceFormOccurrence);

      if(leftNeighbourToken.getPOSTag().equals("to")) {
        instance.setValue(i(pre_pos, buildAttributeList()), 0);
      }
      else if(leftNeighbourToken.getPOSTag().matches("[mnf].*")) {
        instance.setValue(i(pre_pos, buildAttributeList()), 1);
      }else if(leftNeighbourToken.getToken().matches("[aA][nN]?")) {
        instance.setValue(i(pre_pos, buildAttributeList()), 2);
      }

    } catch (ItemNotFoundException ignored) {

    } catch (ArrayIndexOutOfBoundsException ignored) {}


    try {

      if(leftContext.size() > 0) {

        if(leftContext.get(0).getPOSTag().equals("to")) {
          instance.setValue(i(pre_pos, buildAttributeList()), 0);
        }
        else if(leftContext.get(0).getPOSTag().matches("[mnf].*")) {
          instance.setValue(i(pre_pos, buildAttributeList()), 1);
        }else if(leftContext.get(0).getToken().matches("[aA][nN]?")) {
          instance.setValue(i(pre_pos, buildAttributeList()), 2);
        }
      }

    } catch (ArrayIndexOutOfBoundsException ignored) {}

    try{
      if (CandidateFeatures.quoted(surfaceFormOccurrence) == 1)
        instance.setValue(i(quoted, buildAttributeList()), 0);

    } catch (ArrayIndexOutOfBoundsException ignored) {}




    try {
      if(rightContext.size() > 0) {

        if(rightContext.get(0).getToken().equals("of")) {
          instance.setValue(i(next_pos, buildAttributeList()), 0);
        }else if(rightContext.get(0).getToken().equals("to")) {
          instance.setValue(i(next_pos, buildAttributeList()), 1);
        }else if(rightContext.get(0).getPOSTag().startsWith("be")) {
          instance.setValue(i(next_pos, buildAttributeList()), 2);
        }else if(rightContext.get(0).getPOSTag().startsWith("v")) {
          instance.setValue(i(next_pos, buildAttributeList()), 3);
        }
      }
    } catch (ArrayIndexOutOfBoundsException ignored) {}



    try {
      TaggedToken lastToken = candidateTokens.get(candidateTokens.size() - 1);


      if(lastToken.getPOSTag().equals("in")) {
        instance.setValue(i(ends_with, buildAttributeList()), 0);
      }
    } catch (ArrayIndexOutOfBoundsException ignored) {}



    /**
     * Co-Occurrence data of the left neighbour token:
     */


    if(left1 != null && firstTaggedTokenData != null && leftContext.size() > 0 && !leftContext.get(0).getPOSTag().matches(FUNCTION_WORD_PATTERN) && !leftContext.get(0).getPOSTag().equals("in")) {

      CoOccurrenceData bigramLeft = null;
      try {
        bigramLeft = dataProvider.getBigramData(left1, firstTaggedTokenData);
      } catch (ItemNotFoundException ignored) {}

      if(bigramLeft != null && bigramLeft.getUnitSignificanceWeb() > bigramLeftWebMin) {

        try{
          instance.setValue(i(bigram_left_significance_web, buildAttributeList()), bigramLeft.getUnitSignificanceWeb());
        } catch (ArrayIndexOutOfBoundsException ignored) {}

      }
    }

    /**
     * Co-Occurrence data for the left trigram
     */

    if(firstTaggedTokenData != null && secondTaggedTokenData != null && left1 != null) {

      CoOccurrenceData trigramLeft = null;
      try {
        trigramLeft = dataProvider.getTrigramData(left1, firstTaggedTokenData, secondTaggedTokenData);
      } catch (ItemNotFoundException ignored) {}

      if(trigramLeft != null && trigramLeft.getUnitCountWeb() > trigramLeftWebMin) {

        try{
          instance.setValue(i(trigram_left, buildAttributeList()), trigramLeft.getUnitCountWeb());
        } catch (ArrayIndexOutOfBoundsException ignored) {}

      }
    }


    if(lastTaggedTokenData != null && lastBut1TaggedTokenData != null && right1 != null) {

      CoOccurrenceData trigramRight = null;
      try {
        trigramRight = dataProvider.getTrigramData(lastBut1TaggedTokenData, lastTaggedTokenData, right1);
      } catch (ItemNotFoundException ignored) {}
      catch(NullPointerException ignored) {}

      if(trigramRight != null && trigramRight.getUnitCountWeb() > trigramRightWebMin) {

        try{
          instance.setValue(i(trigram_right, buildAttributeList()), trigramRight.getUnitCountWeb());
        } catch (ArrayIndexOutOfBoundsException ignored) {}

      }
    }



    /**
     * Co-Occurrence data of the right neighbour token:
     */

    if(lastTaggedTokenData != null && right1 != null && !rightContext.get(0).getPOSTag().matches(FUNCTION_WORD_PATTERN) && !rightContext.get(0).getPOSTag().equals("in")) {
      CoOccurrenceData bigramRight = null;
      try {
        bigramRight = dataProvider.getBigramData(lastTaggedTokenData, right1);
      } catch (ItemNotFoundException ignored) {}

      if(bigramRight != null && bigramRight.getUnitSignificanceWeb() > bigramRightWebMin) {

        try {
          instance.setValue(i(bigram_right_significance_web, buildAttributeList()), bigramRight.getUnitSignificanceWeb());
        } catch (ArrayIndexOutOfBoundsException ignored) {}

      }
    }


    if (verboseMode)
      explain(surfaceFormOccurrence, instance);

    return instance;
  }

}
TOP

Related Classes of org.dbpedia.spotlight.spot.cooccurrence.weka.InstanceBuilderNGram

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.