Package edu.stanford.nlp.pipeline

Source Code of edu.stanford.nlp.pipeline.AnnotatorFactories

package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ie.NERClassifierCombiner;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;

import java.io.FileNotFoundException;
import java.util.Arrays;
import java.util.Properties;
import java.util.Set;

/**
 * A companion to {@link AnnotatorFactory} defining the common annotators.
 * These are primarily used in {@link StanfordCoreNLP#getDefaultAnnotatorPool(java.util.Properties, AnnotatorImplementations)}.
 *
 * @author Gabor Angeli
 */
public class AnnotatorFactories {

  public static AnnotatorFactory tokenize(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;
      @Override
      public Annotator create() {
        String extraOptions = null;
        boolean keepNewline = Boolean.valueOf(properties.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY,
            "false"));
        if (properties.getProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY) != null) {
          keepNewline = true;
        }
        if (keepNewline) {
          extraOptions = "tokenizeNLs,";
        }
        return annotatorImplementation.tokenizer(properties, false, extraOptions);
      }

      @Override
      public String additionalSignature() {
        // keep track of all relevant properties for this annotator here!
        StringBuilder os = new StringBuilder();
        os.append("tokenize.whitespace:").append(properties.getProperty("tokenize.whitespace", "false"));
        if (properties.getProperty("tokenize.options") != null) {
          os.append(":tokenize.options:").append(properties.getProperty("tokenize.options"));
        }
        if (properties.getProperty("tokenize.language") != null) {
          os.append(":tokenize.language:").append(properties.getProperty("tokenize.language"));
        }
        if (properties.getProperty("tokenize.class") != null) {
          os.append(":tokenize.class:").append(properties.getProperty("tokenize.class"));
        }
        if (Boolean.valueOf(properties.getProperty("tokenize.whitespace",
            "false"))) {
          os.append(TokenizerAnnotator.EOL_PROPERTY + ":").append(properties.getProperty(TokenizerAnnotator.EOL_PROPERTY,
              "false"));
          os.append(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY + ":").append(properties.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY,
              "false"));
          return os.toString();
        } else {
          os.append(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY + ":").append(Boolean.valueOf(properties.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY,
              "false")));
          os.append(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY + ":").append(properties.getProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY, StanfordCoreNLP.DEFAULT_NEWLINE_IS_SENTENCE_BREAK));
        }
        return os.toString();
      }
    };
  }





  public static AnnotatorFactory cleanXML(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;
      @Override
      public Annotator create() {
        String xmlTags =
            properties.getProperty("clean.xmltags",
                CleanXmlAnnotator.DEFAULT_XML_TAGS);
        String sentenceEndingTags =
            properties.getProperty("clean.sentenceendingtags",
                CleanXmlAnnotator.DEFAULT_SENTENCE_ENDERS);
        String singleSentenceTags =
            properties.getProperty("clean.singlesentencetags",
                CleanXmlAnnotator.DEFAULT_SINGLE_SENTENCE_TAGS);
        String allowFlawedString = properties.getProperty("clean.allowflawedxml");
        boolean allowFlawed = CleanXmlAnnotator.DEFAULT_ALLOW_FLAWS;
        if (allowFlawedString != null)
          allowFlawed = Boolean.valueOf(allowFlawedString);
        String dateTags =
            properties.getProperty("clean.datetags",
                CleanXmlAnnotator.DEFAULT_DATE_TAGS);
        String docIdTags =
            properties.getProperty("clean.docIdtags",
                CleanXmlAnnotator.DEFAULT_DOCID_TAGS);
        String docTypeTags =
            properties.getProperty("clean.docTypetags",
                CleanXmlAnnotator.DEFAULT_DOCTYPE_TAGS);
        String utteranceTurnTags =
            properties.getProperty("clean.turntags",
                CleanXmlAnnotator.DEFAULT_UTTERANCE_TURN_TAGS);
        String speakerTags =
            properties.getProperty("clean.speakertags",
                CleanXmlAnnotator.DEFAULT_SPEAKER_TAGS);
        String docAnnotations =
            properties.getProperty("clean.docAnnotations",
                CleanXmlAnnotator.DEFAULT_DOC_ANNOTATIONS_PATTERNS);
        String tokenAnnotations =
            properties.getProperty("clean.tokenAnnotations",
                CleanXmlAnnotator.DEFAULT_TOKEN_ANNOTATIONS_PATTERNS);
        String sectionTags =
            properties.getProperty("clean.sectiontags",
                CleanXmlAnnotator.DEFAULT_SECTION_TAGS);
        String sectionAnnotations =
            properties.getProperty("clean.sectionAnnotations",
                CleanXmlAnnotator.DEFAULT_SECTION_ANNOTATIONS_PATTERNS);
        String ssplitDiscardTokens =
            properties.getProperty("clean.ssplitDiscardTokens");
        CleanXmlAnnotator annotator = annotatorImplementation.cleanXML(properties, xmlTags,
            sentenceEndingTags,
            dateTags,
            allowFlawed);
        annotator.setSingleSentenceTagMatcher(singleSentenceTags);
        annotator.setDocIdTagMatcher(docIdTags);
        annotator.setDocTypeTagMatcher(docTypeTags);
        annotator.setDiscourseTags(utteranceTurnTags, speakerTags);
        annotator.setDocAnnotationPatterns(docAnnotations);
        annotator.setTokenAnnotationPatterns(tokenAnnotations);
        annotator.setSectionTagMatcher(sectionTags);
        annotator.setSectionAnnotationPatterns(sectionAnnotations);
        annotator.setSsplitDiscardTokensMatcher(ssplitDiscardTokens);
        return annotator;
      }

      @Override
      public String additionalSignature() {
        // keep track of all relevant properties for this annotator here!
        return "clean.xmltags:" +
            properties.getProperty("clean.xmltags",
                CleanXmlAnnotator.DEFAULT_XML_TAGS) +
            "clean.sentenceendingtags:" +
            properties.getProperty("clean.sentenceendingtags",
                CleanXmlAnnotator.DEFAULT_SENTENCE_ENDERS) +
            "clean.sentenceendingtags:" +
            properties.getProperty("clean.singlesentencetags",
                CleanXmlAnnotator.DEFAULT_SINGLE_SENTENCE_TAGS) +
            "clean.allowflawedxml:" +
            properties.getProperty("clean.allowflawedxml", "") +
            "clean.datetags:" +
            properties.getProperty("clean.datetags",
                CleanXmlAnnotator.DEFAULT_DATE_TAGS) +
            "clean.docidtags:" +
            properties.getProperty("clean.docid",
                CleanXmlAnnotator.DEFAULT_DOCID_TAGS) +
            "clean.doctypetags:" +
            properties.getProperty("clean.doctype",
                CleanXmlAnnotator.DEFAULT_DOCTYPE_TAGS) +
            "clean.turntags:" +
            properties.getProperty("clean.turntags",
                CleanXmlAnnotator.DEFAULT_UTTERANCE_TURN_TAGS) +
            "clean.speakertags:" +
            properties.getProperty("clean.speakertags",
                CleanXmlAnnotator.DEFAULT_SPEAKER_TAGS) +
            "clean.docAnnotations:" +
            properties.getProperty("clean.docAnnotations",
                CleanXmlAnnotator.DEFAULT_DOC_ANNOTATIONS_PATTERNS) +
            "clean.tokenAnnotations:" +
            properties.getProperty("clean.tokenAnnotations",
                CleanXmlAnnotator.DEFAULT_TOKEN_ANNOTATIONS_PATTERNS) +
            "clean.sectiontags:" +
            properties.getProperty("clean.sectiontags",
                CleanXmlAnnotator.DEFAULT_SECTION_TAGS) +
            "clean.sectionAnnotations:" +
            properties.getProperty("clean.sectionAnnotations",
                CleanXmlAnnotator.DEFAULT_SECTION_ANNOTATIONS_PATTERNS);
      }
    };
  }

  //
  // Sentence splitter: splits the above sequence of tokens into
  // sentences.  This is required when processing entire documents or
  // text consisting of multiple sentences.
  //
  public static AnnotatorFactory sentenceSplit(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;
      @Override
      public Annotator create() {
        System.err.println(signature());
        boolean nlSplitting = Boolean.valueOf(properties.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false"));
        if (nlSplitting) {
          boolean whitespaceTokenization = Boolean.valueOf(properties.getProperty("tokenize.whitespace", "false"));
          if (whitespaceTokenization) {
            if (System.getProperty("line.separator").equals("\n")) {
              return WordsToSentencesAnnotator.newlineSplitter(false, "\n");
            } else {
              // throw "\n" in just in case files use that instead of
              // the system separator
              return WordsToSentencesAnnotator.newlineSplitter(false, System.getProperty("line.separator"), "\n");
            }
          } else {
            return WordsToSentencesAnnotator.newlineSplitter(false, PTBTokenizer.getNewlineToken());
          }

        } else {
          // Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence.
          String isOneSentence = properties.getProperty("ssplit.isOneSentence");
          if (Boolean.parseBoolean(isOneSentence)) { // this method treats null as false
            return WordsToSentencesAnnotator.nonSplitter(false);
          }

          // multi token sentence boundaries
          String boundaryMultiTokenRegex = properties.getProperty("ssplit.boundaryMultiTokenRegex");

          // Discard these tokens without marking them as sentence boundaries
          String tokenPatternsToDiscardProp = properties.getProperty("ssplit.tokenPatternsToDiscard");
          Set<String> tokenRegexesToDiscard = null;
          if (tokenPatternsToDiscardProp != null){
            String [] toks = tokenPatternsToDiscardProp.split(",");
            tokenRegexesToDiscard = Generics.newHashSet(Arrays.asList(toks));
          }
          // regular boundaries
          String boundaryTokenRegex = properties.getProperty("ssplit.boundaryTokenRegex");
          Set<String> boundariesToDiscard = null;

          // newline boundaries which are discarded.
          String bounds = properties.getProperty("ssplit.boundariesToDiscard");
          if (bounds != null) {
            String [] toks = bounds.split(",");
            boundariesToDiscard = Generics.newHashSet(Arrays.asList(toks));
          }
          Set<String> htmlElementsToDiscard = null;
          // HTML boundaries which are discarded
          bounds = properties.getProperty("ssplit.htmlBoundariesToDiscard");
          if (bounds != null) {
            String [] elements = bounds.split(",");
            htmlElementsToDiscard = Generics.newHashSet(Arrays.asList(elements));
          }
          String nlsb = properties.getProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY,
              StanfordCoreNLP.DEFAULT_NEWLINE_IS_SENTENCE_BREAK);

          return annotatorImplementation.wordToSentences(properties,
              false, boundaryTokenRegex, boundariesToDiscard, htmlElementsToDiscard,
              nlsb, boundaryMultiTokenRegex, tokenRegexesToDiscard);
        }
      }

      @Override
      public String additionalSignature() {
        // keep track of all relevant properties for this annotator here!
        StringBuilder os = new StringBuilder();
        if (Boolean.valueOf(properties.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false"))) {
          os.append(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY + "=").append(properties.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false")).append("\n");
          os.append("tokenize.whitespace=").append(properties.getProperty("tokenize.whitespace", "false")).append("\n");
        } else {
          os.append(baseSignature(properties, StanfordCoreNLP.STANFORD_SSPLIT));
        }
        return os.toString();
      }
    };
  }

  //
  // POS tagger
  //
  public static AnnotatorFactory posTag(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;
      @Override
      public Annotator create() {
        try {
          return annotatorImplementation.posTagger(properties);
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }

      @Override
      public String additionalSignature() {
        // keep track of all relevant properties for this annotator here!
        return POSTaggerAnnotator.signature(properties);
      }
    };
  }

  //
  // Lemmatizer
  //
  public static AnnotatorFactory lemma(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;
      @Override
      public Annotator create() {
        return annotatorImplementation.morpha(properties, false);
      }

      @Override
      public String additionalSignature() {
        // keep track of all relevant properties for this annotator here!
        // nothing for this one
        return "";
      }
    };
  }

  //
  // NER
  //
  public static AnnotatorFactory nerTag(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;
      @Override
      public Annotator create() {
        try {
          return annotatorImplementation.ner(properties);
        } catch (FileNotFoundException e) {
          throw new RuntimeIOException(e);
        }
      }

      @Override
      public String additionalSignature() {
        // keep track of all relevant properties for this annotator here!
        return "ner.model:" +
            properties.getProperty("ner.model", "") +
            NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_PROPERTY + ":" +
            properties.getProperty(NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_PROPERTY,
                Boolean.toString(NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_DEFAULT)) +
            NumberSequenceClassifier.USE_SUTIME_PROPERTY + ":" +
            properties.getProperty(NumberSequenceClassifier.USE_SUTIME_PROPERTY,
                Boolean.toString(NumberSequenceClassifier.USE_SUTIME_DEFAULT));
      }
    };
  }

  //
  // Regex NER
  //
  public static AnnotatorFactory regexNER(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;
      @Override
      public Annotator create() {
        return annotatorImplementation.tokensRegexNER(properties, "regexner");
      }

      @Override
      public String additionalSignature() {
        // keep track of all relevant properties for this annotator here!
        return PropertiesUtils.getSignature("regexner", properties, TokensRegexNERAnnotator.SUPPORTED_PROPERTIES);
      }
    };
  }

  //
  // Gender Annotator
  //
  public static AnnotatorFactory gender(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;
      @Override
      public Annotator create() {
        return annotatorImplementation.gender(properties, false);
      }

      @Override
      public String additionalSignature() {
        // keep track of all relevant properties for this annotator here!
        return "gender.firstnames:" +
            properties.getProperty("gender.firstnames",
                DefaultPaths.DEFAULT_GENDER_FIRST_NAMES);
      }
    };
  }


  //
  // True caser
  //
  public static AnnotatorFactory truecase(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;
      @Override
      public Annotator create() {
        String model = properties.getProperty("truecase.model", DefaultPaths.DEFAULT_TRUECASE_MODEL);
        String bias = properties.getProperty("truecase.bias", TrueCaseAnnotator.DEFAULT_MODEL_BIAS);
        String mixed = properties.getProperty("truecase.mixedcasefile", DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST);
        return annotatorImplementation.trueCase(properties, model, bias, mixed, false);
      }

      @Override
      public String additionalSignature() {
        // keep track of all relevant properties for this annotator here!
        return "truecase.model:" +
            properties.getProperty("truecase.model",
                DefaultPaths.DEFAULT_TRUECASE_MODEL) +
            "truecase.bias:" +
            properties.getProperty("truecase.bias",
                TrueCaseAnnotator.DEFAULT_MODEL_BIAS) +
            "truecase.mixedcasefile:" +
            properties.getProperty("truecase.mixedcasefile",
                DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST);
      }
    };
  }

  //
  // Parser
  //
  public static AnnotatorFactory parse(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        return annotatorImplementation.parse(properties);
      }

      @Override
      public String additionalSignature() {
        // keep track of all relevant properties for this annotator here!
        String type = properties.getProperty("parse.type", "stanford");
        if (type.equalsIgnoreCase("stanford")) {
          return ParserAnnotator.signature("parse", properties);
        } else if (type.equalsIgnoreCase("charniak")) {
          return "parse.model:" +
              properties.getProperty("parse.model", "") +
              "parse.executable:" +
              properties.getProperty("parse.executable", "") +
              "parse.maxlen:" +
              properties.getProperty("parse.maxlen", "");
        } else {
          throw new RuntimeException("Unknown parser type: " + type +
              " (currently supported: stanford and charniak)");
        }
      }
    };
  }

  //
  // Coreference resolution
  //
  public static AnnotatorFactory coref(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        return annotatorImplementation.coref(properties);
      }

      @Override
      public String additionalSignature() {
        // keep track of all relevant properties for this annotator here!
        return DeterministicCorefAnnotator.signature(properties);
      }
    };
  }


  public static AnnotatorFactory relation(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        return annotatorImplementation.relations(properties);
      }

      @Override
      public String additionalSignature() {
        // keep track of all relevant properties for this annotator here!
        return "sup.relation.verbose:" +
            properties.getProperty("sup.relation.verbose",
                "false") +
            properties.getProperty("sup.relation.model",
                DefaultPaths.DEFAULT_SUP_RELATION_EX_RELATION_MODEL);
      }
    };
  }

  public static AnnotatorFactory sentiment(Properties properties, final AnnotatorImplementations annotatorImplementation) {
    return new AnnotatorFactory(properties, annotatorImplementation) {
      private static final long serialVersionUID = 1L;

      @Override
      public Annotator create() {
        return annotatorImplementation.sentiment(properties, StanfordCoreNLP.STANFORD_SENTIMENT);
      }

      @Override
      public String additionalSignature() {
        return "sentiment.model=" + properties.get("sentiment.model");
      }
    };
  }

  public static AnnotatorFactory columnDataClassifier(Properties properties, final AnnotatorImplementations annotatorImpls) {
    return new AnnotatorFactory(properties, annotatorImpls) {
      @Override
      public Annotator create() {
        if(!properties.containsKey("loadClassifier"))
          throw new RuntimeException("Must load a classifier when creating a column data classifier annotator");
        return new ColumnDataClassifierAnnotator(properties);
      }

      @Override
      protected String additionalSignature() {
        return "classifier="+properties.get("loadClassifier="+properties.get("loadClassifier"));
      }
    };
  }

  //
  // Dependency parsing
  //
  public static AnnotatorFactory dependencies(Properties properties, final AnnotatorImplementations annotatorImpl) {
    return new AnnotatorFactory(properties, annotatorImpl) {
      @Override
      public Annotator create() {
        return annotatorImpl.dependencies(properties);
      }

      @Override
      protected String additionalSignature() {
        return "";
      }
    };
  }

}
TOP

Related Classes of edu.stanford.nlp.pipeline.AnnotatorFactories

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.