Package edu.stanford.nlp.tagger.maxent

Source Code of edu.stanford.nlp.tagger.maxent.TaggerConfig

package edu.stanford.nlp.tagger.maxent;

import edu.stanford.nlp.util.StringUtils;

import java.io.*;
import java.util.Map;
import java.util.Properties;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.util.Generics;

/**
* Reads and stores configuration information for a POS tagger.
*
* <i>Implementation note:</i> To add a new parameter: (1) define a default
* String value, (2) add it to defaultValues map, (3) add line to constructor,
* (4) add getter method, (5) add to dump() method, (6) add to printGenProps()
* method, (7) add to class javadoc of MaxentTagger.
*
@author William Morgan
@author Anna Rafferty
@author Michel Galley
*/
public class TaggerConfig extends Properties /* Inherits implementation of Serializable! */ {

  private static final long serialVersionUID = -4136407850147157497L;

  public enum Mode {
    TRAIN, TEST, TAG, DUMP
  }

  /* defaults. sentenceDelimiter might be null; the others all have non-null values. */
  public static final String
  SEARCH = "qn",
  TAG_SEPARATOR = "/",
  TOKENIZE = "true",
  DEBUG = "false",
  ITERATIONS = "100",
  ARCH = "",
  WORD_FUNCTION = "",
  RARE_WORD_THRESH = "5",
  MIN_FEATURE_THRESH = "5",
  CUR_WORD_MIN_FEATURE_THRESH = "2",
  RARE_WORD_MIN_FEATURE_THRESH = "10",
  VERY_COMMON_WORD_THRESH = "250",
  OCCURRING_TAGS_ONLY = "false",
  POSSIBLE_TAGS_ONLY = "false",
  SIGMA_SQUARED = String.valueOf(0.5),
  ENCODING = "UTF-8",
  LEARN_CLOSED_CLASS = "false",
  CLOSED_CLASS_THRESHOLD = "40",
  VERBOSE = "false",
  VERBOSE_RESULTS = "true",
  SGML = "false",
  LANG = "",
  TOKENIZER_FACTORY = "",
  XML_INPUT = "",
  TAG_INSIDE = "",
  APPROXIMATE = "-1.0",
  TOKENIZER_OPTIONS = "",
  DEFAULT_REG_L1 = "1.0",
  OUTPUT_FILE = "",
  OUTPUT_FORMAT = "slashTags",
  OUTPUT_FORMAT_OPTIONS = "",
  NTHREADS = "1";

  public static final String ENCODING_PROPERTY = "encoding",
  TAG_SEPARATOR_PROPERTY = "tagSeparator";


  private static final Map<String, String> defaultValues = Generics.newHashMap();
  static {
    defaultValues.put("arch", ARCH);
    defaultValues.put("wordFunction", WORD_FUNCTION);
    defaultValues.put("closedClassTags", "");
    defaultValues.put("closedClassTagThreshold", CLOSED_CLASS_THRESHOLD);
    defaultValues.put("search", SEARCH);
    defaultValues.put(TAG_SEPARATOR_PROPERTY, TAG_SEPARATOR);
    defaultValues.put("tokenize", TOKENIZE);
    defaultValues.put("debug", DEBUG);
    defaultValues.put("iterations", ITERATIONS);
    defaultValues.put("rareWordThresh", RARE_WORD_THRESH);
    defaultValues.put("minFeatureThresh", MIN_FEATURE_THRESH);
    defaultValues.put("curWordMinFeatureThresh", CUR_WORD_MIN_FEATURE_THRESH);
    defaultValues.put("rareWordMinFeatureThresh", RARE_WORD_MIN_FEATURE_THRESH);
    defaultValues.put("veryCommonWordThresh", VERY_COMMON_WORD_THRESH);
    defaultValues.put("occurringTagsOnly", OCCURRING_TAGS_ONLY);
    defaultValues.put("possibleTagsOnly", POSSIBLE_TAGS_ONLY);
    defaultValues.put("sigmaSquared", SIGMA_SQUARED);
    defaultValues.put(ENCODING_PROPERTY, ENCODING);
    defaultValues.put("learnClosedClassTags", LEARN_CLOSED_CLASS);
    defaultValues.put("verbose", VERBOSE);
    defaultValues.put("verboseResults", VERBOSE_RESULTS);
    defaultValues.put("openClassTags", "");
    defaultValues.put("lang", LANG);
    defaultValues.put("tokenizerFactory", TOKENIZER_FACTORY);
    defaultValues.put("xmlInput", XML_INPUT);
    defaultValues.put("tagInside", TAG_INSIDE);
    defaultValues.put("sgml", SGML);
    defaultValues.put("approximate", APPROXIMATE);
    defaultValues.put("tokenizerOptions", TOKENIZER_OPTIONS);
    defaultValues.put("regL1", DEFAULT_REG_L1);
    defaultValues.put("outputFile", OUTPUT_FILE);
    defaultValues.put("outputFormat", OUTPUT_FORMAT);
    defaultValues.put("outputFormatOptions", OUTPUT_FORMAT_OPTIONS);
    defaultValues.put("nthreads", NTHREADS);
  }

  /**
   * This constructor is just for creating an instance with default values.
   * Used internally.
   */
  private TaggerConfig() {
    super();
    this.putAll(defaultValues);
  }

  /**
   * We force you to pass in a TaggerConfig rather than any other
   * superclass so that we know the arg error checking has already occurred
   */
  public TaggerConfig(TaggerConfig old) {
    super(old);
  }

  public TaggerConfig(String... args) {
    this(StringUtils.argsToProperties(args));
  }

  public TaggerConfig(Properties props) {
    // load up the default properties
    this();

    /* Try and use the default properties from the model */
    // Properties modelProps = new Properties();
    // TaggerConfig oldConfig = new TaggerConfig(); // loads default values in oldConfig
    if (! props.containsKey("trainFile")) {
      String name = props.getProperty("model");
      if (name == null) {
        name = props.getProperty("dump");
      }
      if (name != null) {
        try {
          System.err.println("Loading default properties from tagger " + name);
          DataInputStream in = new DataInputStream(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(name));
          this.putAll(TaggerConfig.readConfig(in)); // overwrites defaults with any serialized values.
          in.close();
        } catch (Exception e) {
          throw new RuntimeIOException("No such trained tagger config file found: " + name);
        }
      }
    }

    setProperties(props);
  }

  public void setProperties(Properties props) {
    if (props.getProperty("") != null) {
      throw new RuntimeException("unknown argument(s): \"" + props.getProperty("") + '\"');
    }

    if (props.getProperty("genprops") != null) {
      printGenProps(System.out);
      System.exit(0);
    }

    if (props.containsKey("mode") && props.containsKey("file")) {
      this.setProperty("mode", props.getProperty("mode"));
      this.setProperty("file", props.getProperty("file"));
    } else if (props.containsKey("trainFile")) {
      //Training mode
      this.setProperty("mode", Mode.TRAIN.toString());
      this.setProperty("file", props.getProperty("trainFile", "").trim());
    } else if (props.containsKey("testFile")) {
      //Testing mode
      this.setProperty("mode", Mode.TEST.toString());
      this.setProperty("file", props.getProperty("testFile", "").trim());
    } else if (props.containsKey("textFile")) {
      //Tagging mode
      this.setProperty("mode", Mode.TAG.toString());
      this.setProperty("file", props.getProperty("textFile", "").trim());
    } else if (props.containsKey("dump")) {
      this.setProperty("mode", Mode.DUMP.toString());
      // this.setProperty("file", props.getProperty("dump").trim());
      props.setProperty("model", props.getProperty("dump").trim());
    } else {
      this.setProperty("mode", Mode.TAG.toString());
      this.setProperty("file", "stdin");
    }
    //for any mode other than train, we load a classifier, which means we load a config - model always needs to be specified
    //on command line/in props file
    //Get the path to the model (or the path where you'd like to save the model); this is necessary for training, testing, and tagging
    this.setProperty("model", props.getProperty("model", this.getProperty("model", "")).trim());
    if ( ! (this.getMode() == Mode.DUMP) && this.getProperty("model").equals("")) {
      throw new RuntimeException("'model' parameter must be specified");
    }

    this.setProperty("search", props.getProperty("search", this.getProperty("search")).trim().toLowerCase());
    String srch = this.getProperty("search");
    if ( ! (srch.equals("cg") || srch.equals("iis") || srch.equals("owlqn") || srch.equals("qn") || srch.equals("owlqn2"))) {
      throw new RuntimeException("'search' must be one of 'iis', 'cg', 'qn' or 'owlqn' or 'owlqn2': " + srch);
    }

    this.setProperty("sigmaSquared", props.getProperty("sigmaSquared", this.getProperty("sigmaSquared")));

    this.setProperty(TAG_SEPARATOR_PROPERTY, props.getProperty(TAG_SEPARATOR_PROPERTY, this.getProperty(TAG_SEPARATOR_PROPERTY)));

    this.setProperty("iterations", props.getProperty("iterations", this.getProperty("iterations")));
    this.setProperty("rareWordThresh", props.getProperty("rareWordThresh", this.getProperty("rareWordThresh")));
    this.setProperty("minFeatureThresh", props.getProperty("minFeatureThresh", this.getProperty("minFeatureThresh")));
    this.setProperty("curWordMinFeatureThresh", props.getProperty("curWordMinFeatureThresh", this.getProperty("curWordMinFeatureThresh")));
    this.setProperty("rareWordMinFeatureThresh", props.getProperty("rareWordMinFeatureThresh", this.getProperty("rareWordMinFeatureThresh")));
    this.setProperty("veryCommonWordThresh", props.getProperty("veryCommonWordThresh", this.getProperty("veryCommonWordThresh")));
    this.setProperty("occurringTagsOnly", props.getProperty("occurringTagsOnly", this.getProperty("occurringTagsOnly", OCCURRING_TAGS_ONLY)));
    this.setProperty("possibleTagsOnly", props.getProperty("possibleTagsOnly", this.getProperty("possibleTagsOnly")));

    this.setProperty("lang", props.getProperty("lang", this.getProperty("lang")));

    this.setProperty("openClassTags", props.getProperty("openClassTags", this.getProperty("openClassTags")).trim());
    this.setProperty("closedClassTags", props.getProperty("closedClassTags", this.getProperty("closedClassTags")).trim());

    this.setProperty("learnClosedClassTags", props.getProperty("learnClosedClassTags", this.getProperty("learnClosedClassTags")));

    this.setProperty("closedClassTagThreshold", props.getProperty("closedClassTagThreshold", this.getProperty("closedClassTagThreshold")));

    this.setProperty("arch", props.getProperty("arch", this.getProperty("arch")));
    if (this.getMode() == Mode.TRAIN && this.getProperty("arch").equals("")) {
      throw new IllegalArgumentException("No architecture specified; " +
                                         "set the -arch flag with " +
                                         "the features to be used");
    }

    this.setProperty("wordFunction", props.getProperty("wordFunction", this.getProperty("wordFunction", WORD_FUNCTION)));

    this.setProperty("tokenize", props.getProperty("tokenize", this.getProperty("tokenize")));
    this.setProperty("tokenizerFactory", props.getProperty("tokenizerFactory", this.getProperty("tokenizerFactory")));

    this.setProperty("debugPrefix", props.getProperty("debugPrefix", this.getProperty("debugPrefix", "")));
    this.setProperty("debug", props.getProperty("debug", DEBUG));

    this.setProperty(ENCODING_PROPERTY, props.getProperty(ENCODING_PROPERTY, this.getProperty(ENCODING_PROPERTY)));
    this.setProperty("sgml", props.getProperty("sgml", this.getProperty("sgml")));
    this.setProperty("verbose", props.getProperty("verbose", this.getProperty("verbose")));
    this.setProperty("verboseResults", props.getProperty("verboseResults", this.getProperty("verboseResults")));

    this.setProperty("regL1", props.getProperty("regL1", this.getProperty("regL1")));

    //this is a property that is stored (not like the general properties)
    this.setProperty("xmlInput", props.getProperty("xmlInput", this.getProperty("xmlInput")).trim());

    this.setProperty("tagInside", props.getProperty("tagInside", this.getProperty("tagInside"))); //this isn't something we save from time to time
    this.setProperty("approximate", props.getProperty("approximate", this.getProperty("approximate"))); //this isn't something we save from time to time
    this.setProperty("tokenizerOptions", props.getProperty("tokenizerOptions", this.getProperty("tokenizerOptions"))); //this isn't something we save from time to time
    this.setProperty("outputFile", props.getProperty("outputFile", this.getProperty("outputFile")).trim()); //this isn't something we save from time to time
    this.setProperty("outputFormat", props.getProperty("outputFormat", this.getProperty("outputFormat")).trim()); //this isn't something we save from time to time
    this.setProperty("outputFormatOptions", props.getProperty("outputFormatOptions", this.getProperty("outputFormatOptions")).trim()); //this isn't something we save from time to time
    this.setProperty("nthreads", props.getProperty("nthreads", this.getProperty("nthreads", NTHREADS)).trim());
    String sentenceDelimiter = props.getProperty("sentenceDelimiter", this.getProperty("sentenceDelimiter"));
    if (sentenceDelimiter != null) {
      // this isn't something we save from time to time.
      // It is only relevant when tagging text files.
      // In fact, we let this one be null, as it really is useful to
      // let the null value represent no sentence delimiter.
      this.setProperty("sentenceDelimiter", sentenceDelimiter);
    }
  }


  public String getModel() { return getProperty("model"); }

  public String getFile() { return getProperty("file"); }

  public String getOutputFile() { return getProperty("outputFile"); }

  public String getOutputFormat() { return getProperty("outputFormat"); }

  public String[] getOutputOptions() { return getProperty("outputFormatOptions").split("\\s*,\\s*"); }

  public boolean getOutputVerbosity() {
    return getOutputOptionsContains("verbose");
  }

  public boolean getOutputLemmas() {
    return getOutputOptionsContains("lemmatize");
  }

  public boolean keepEmptySentences() {
    return getOutputOptionsContains("keepEmptySentences");   
  }

  public boolean getOutputOptionsContains(String sought) {
    String[] options = getOutputOptions();
    for (String option : options) {
      if (option.equals(sought)) {
        return true;
      }
    }
    return false;
  }

  public String getSearch() { return getProperty("search"); }

  public double getSigmaSquared() { return Double.parseDouble(getProperty("sigmaSquared")); }

  public int getIterations() { return Integer.parseInt(getProperty("iterations")); }

  public int getRareWordThresh() { return Integer.parseInt(getProperty("rareWordThresh")); }

  public int getMinFeatureThresh() { return Integer.parseInt(getProperty("minFeatureThresh")); }

  public int getCurWordMinFeatureThresh() { return Integer.parseInt(getProperty("curWordMinFeatureThresh")); }

  public int getRareWordMinFeatureThresh() { return Integer.parseInt(getProperty("rareWordMinFeatureThresh")); }

  public int getVeryCommonWordThresh() { return Integer.parseInt(getProperty("veryCommonWordThresh")); }

  public boolean occurringTagsOnly() { return Boolean.parseBoolean(getProperty("occurringTagsOnly")); }

  public boolean possibleTagsOnly() { return Boolean.parseBoolean(getProperty("possibleTagsOnly")); }

  public String getLang() { return getProperty("lang"); }

  public String[] getOpenClassTags() {
    return wsvStringToStringArray(getProperty("openClassTags"));
  }

  public String[] getClosedClassTags() {
    return wsvStringToStringArray(getProperty("closedClassTags"));
  }

  private static String[] wsvStringToStringArray(String str) {
    if (str == null || str.equals("")) {
      return StringUtils.EMPTY_STRING_ARRAY;
    } else {
      return str.split("\\s+");
    }
  }

  public boolean getLearnClosedClassTags() { return Boolean.parseBoolean(getProperty("learnClosedClassTags")); }

  public int getClosedTagThreshold() { return Integer.parseInt(getProperty("closedClassTagThreshold")); }

  public String getArch() { return getProperty("arch"); }

  public String getWordFunction() { return getProperty("wordFunction"); }

  public boolean getDebug() { return Boolean.parseBoolean(getProperty("debug")); }

  public String getDebugPrefix() { return getProperty("debugPrefix"); }

  public String getTokenizerFactory() { return getProperty("tokenizerFactory"); }

  public static String getDefaultTagSeparator() { return TAG_SEPARATOR; }

  public final String getTagSeparator() { return getProperty(TAG_SEPARATOR_PROPERTY); }

  public boolean getTokenize() { return Boolean.parseBoolean(getProperty("tokenize")); }

  public String getEncoding() { return getProperty(ENCODING_PROPERTY); }

  public double getRegL1() { return Double.parseDouble(getProperty("regL1")); }

  public String[] getXMLInput() {
    return wsvStringToStringArray(getProperty("xmlInput"));
  }

  public boolean getVerbose() { return Boolean.parseBoolean(getProperty("verbose")); }

  public boolean getVerboseResults() { return Boolean.parseBoolean(getProperty("verboseResults")); }

  public boolean getSGML() { return Boolean.parseBoolean(getProperty("sgml")); }

  public int getNThreads() { return Integer.parseInt(getProperty("nthreads")); }


  /** Return a regex of XML elements to tag inside of.  This may return an
   *  empty String, but never null.
   *
   * @return A regex of XML elements to tag inside of
   */
  public String getTagInside() {
    String str = getProperty("tagInside");
    if (str == null) {
      return "";
    }
    return str;
  }

  public String getTokenizerOptions() { return getProperty("tokenizerOptions"); }

  public boolean getTokenizerInvertible() {
    String tokenizerOptions = getTokenizerOptions();
    if (tokenizerOptions != null &&
        tokenizerOptions.matches("(^|.*,)invertible=true"))
      return true;
    return getOutputVerbosity() || getOutputLemmas();
  }

  /**
   * Returns a default score to be used for each tag that is incompatible with
   * the current word (e.g., the tag CC for the word "apple"). Using a default
   * score may slightly decrease performance for some languages (e.g., Chinese and
   * German), but allows the tagger to run considerably faster (since the computation
   * of the normalization term Z requires much less feature extraction). This approximation
   * does not decrease performance in English (on the WSJ). If this function returns
   * 0.0, the tagger will compute exact scores.
   *
   * @return default score
   */
  public double getDefaultScore() {
    String approx = getProperty("approximate");
    if ("false".equalsIgnoreCase(approx)) {
      return -1.0;
    } else if ("true".equalsIgnoreCase(approx)) {
      return 1.0;
    } else {
      return Double.parseDouble(approx);
    }
  }


  public void dump() { dump(new PrintWriter(System.err)); }

  public void dump(PrintStream stream) {
    PrintWriter pw = new PrintWriter(stream);
    dump(pw);
  }

  private void dump(PrintWriter pw) {
    pw.println("                   model = " + getProperty("model"));
    pw.println("                    arch = " + getProperty("arch"));
    pw.println("            wordFunction = " + getProperty("wordFunction"));
    if (this.getMode() == Mode.TRAIN || this.getMode() == Mode.DUMP) {
      pw.println("               trainFile = " + getProperty("file"));
    } else if (this.getMode() == Mode.TAG) {
      pw.println("                textFile = " + getProperty("file"));
    } else if (this.getMode() == Mode.TEST) {
      pw.println("                testFile = " + getProperty("file"));
    }

    pw.println("         closedClassTags = " + getProperty("closedClassTags"));
    pw.println(" closedClassTagThreshold = " + getProperty("closedClassTagThreshold"));
    pw.println(" curWordMinFeatureThresh = " + getProperty("curWordMinFeatureThresh"));
    pw.println("                   debug = " + getProperty("debug"));
    pw.println("             debugPrefix = " + getProperty("debugPrefix"));
    pw.println("            " + TAG_SEPARATOR_PROPERTY + " = " +
               getProperty(TAG_SEPARATOR_PROPERTY));
    pw.println("                " + ENCODING_PROPERTY + " = " +
               getProperty(ENCODING_PROPERTY));
    pw.println("              iterations = " + getProperty("iterations"));
    pw.println("                    lang = " + getProperty("lang"));
    pw.println("    learnClosedClassTags = " + getProperty("learnClosedClassTags"));
    pw.println("        minFeatureThresh = " + getProperty("minFeatureThresh"));
    pw.println("           openClassTags = " + getProperty("openClassTags"));
    pw.println("rareWordMinFeatureThresh = " + getProperty("rareWordMinFeatureThresh"));
    pw.println("          rareWordThresh = " + getProperty("rareWordThresh"));
    pw.println("                  search = " + getProperty("search"));
    pw.println("                    sgml = " + getProperty("sgml"));
    pw.println("            sigmaSquared = " + getProperty("sigmaSquared"));
    pw.println("                   regL1 = " + getProperty("regL1"));
    pw.println("               tagInside = " + getProperty("tagInside"));
    pw.println("                tokenize = " + getProperty("tokenize"));
    pw.println("        tokenizerFactory = " + getProperty("tokenizerFactory"));
    pw.println("        tokenizerOptions = " + getProperty("tokenizerOptions"));
    pw.println("                 verbose = " + getProperty("verbose"));
    pw.println("          verboseResults = " + getProperty("verboseResults"));
    pw.println("    veryCommonWordThresh = " + getProperty("veryCommonWordThresh"));
    pw.println("                xmlInput = " + getProperty("xmlInput"));
    pw.println("              outputFile = " + getProperty("outputFile"));
    pw.println("            outputFormat = " + getProperty("outputFormat"));
    pw.println("     outputFormatOptions = " + getProperty("outputFormatOptions"));
    pw.println("                nthreads = " + getProperty("nthreads"));
    pw.flush();
  }

  @Override
  public String toString() {
    StringWriter sw = new StringWriter(200);
    PrintWriter pw = new PrintWriter(sw);
    dump(pw);
    return sw.toString();
  }

  /**
   * This returns the sentence delimiter used when tokenizing text
   * using the tokenizer requested in this config.  In general, it is
   * assumed the tokenizer doesn't need a sentence delimiter.... If you
   * use the whitespace tokenizer, though, a newline breaks sentences.
   *
   * @return A null String unless tokenize is false and then the String
   */
  public String getSentenceDelimiter() {
    String delimiter = getProperty("sentenceDelimiter");
    if (delimiter == null && !getTokenize()) {
      delimiter = "\n";
    }
    return delimiter;
  }

  /**
   * Returns whether or not we should use stdin for reading when
   * tagging data.  For now, this returns true iff the filename given
   * was "stdin".
   * (TODO: kind of ugly)
   */
  public boolean useStdin() {
    return getFile().trim().equalsIgnoreCase("stdin");
  }

  /**
   * Prints out the automatically generated props file - in its own
   * method to make code above easier to read
   */
  private static void printGenProps(PrintStream out) {
    out.println("## Sample properties file for maxent tagger. This file is used for three main");
    out.println("## operations: training, testing, and tagging. It may also be used to dump");
    out.println("## the contents of a model.");
    out.println("## To train or test a model, or to tag something, run:");
    out.println("##   java edu.stanford.nlp.tagger.maxent.MaxentTagger -prop <properties file>");
    out.println("## Arguments can be overridden on the commandline, e.g.:");
    out.println("##   java ....MaxentTagger -prop <properties file> -testFile /other/file ");
    out.println();

    out.println("# Model file name (created at train time; used at tag and test time)");
    out.println("# (you can leave this blank and specify it on the commandline with -model)");
    out.println("# model = ");
    out.println();

    out.println("# Path to file to be operated on (trained from, tested against, or tagged)");
    out.println("# Specify -textFile <filename> to tag text in the given file, -trainFile <filename> to");
    out.println("# to train a model using data in the given file, or -testFile <filename> to test your");
    out.println("# model using data in the given file.  Alternatively, you may specify");
    out.println("# -dump <filename> to dump the parameters stored in a model or ");
    out.println("# -convertToSingleFile <filename> to save an old, multi-file model (specified as -model)");
    out.println("# to the new single file format.  The new model will be saved in the file filename.");
    out.println("# If you choose to convert an old file, you must specify ");
    out.println("# the correct 'arch' parameter used to create the original model.");
    out.println("# trainFile = ");
    out.println();

    out.println("# Path to outputFile to write tagged output to.");
    out.println("# If empty, stdout is used.");
    out.println("# outputFile = " + OUTPUT_FILE);
    out.println();

    out.println("# Output format. One of: slashTags (default), xml, or tsv");
    out.println("# outputFormat = " + OUTPUT_FORMAT);
    out.println();

    out.println("# Output format options. Comma separated list.");
    out.println("# currently \"lemmatize\" and \"keepEmptySentences\" are supported.");
    out.println("# outputFormatOptions = " + OUTPUT_FORMAT_OPTIONS);
    out.println();

    out.println("# Tag separator character that separates word and pos tags");
    out.println("# (for both training and test data) and used for");
    out.println("# separating words and tags in slashTags format output.");
    out.println("# tagSeparator = " + TAG_SEPARATOR);
    out.println();

    out.println("# Encoding format in which files are stored.  If left blank, UTF-8 is assumed.");
    out.println("# encoding = " + ENCODING);
    out.println();

    out.println("# A couple flags for controlling the amount of output:");
    out.println("# - print extra debugging information:");
    out.println("# verbose = " + VERBOSE);
    out.println("# - print intermediate results:");
    out.println("# verboseResults = " + VERBOSE_RESULTS);

    out.println("######### parameters for tag and test operations #########");
    out.println();

    out.println("# Class to use for tokenization. Default blank value means Penn Treebank");
    out.println("# tokenization.  If you'd like to just assume that tokenization has been done,");
    out.println("# and the input is whitespace-tokenized, use");
    out.println("# edu.stanford.nlp.process.WhitespaceTokenizer or set tokenize to false.");
    out.println("# tokenizerFactory = ");
    out.println();

    out.println("# Options to the tokenizer.  A comma separated list.");
    out.println("# This depends on what the tokenizer supports.");
    out.println("# For PTBTokenizer, you might try options like americanize=false");
    out.println("# or asciiQuotes (for German!).");
    out.println("# tokenizerOptions = ");
    out.println();
    out.println("# Whether to tokenize text for tag and test operations. Default is true.");
    out.println("# If false, your text must already be whitespace tokenized.");
    out.println("# tokenize = " + TOKENIZE);
    out.println();

    out.println("# Write debugging information (words, top words, unknown words). Useful for");
    out.println("# error analysis. Default is false.");
    out.println("# debug = "+ DEBUG);
    out.println();

    out.println("# Prefix for debugging output (if debug == true). Default is to use the");
    out.println("# filename from 'file'");
    out.println("# debugPrefix = ");
    out.println();

    out.println("######### parameters for training  #########");
    out.println();

    out.println("# model architecture: This is one or more comma separated strings, which");
    out.println("# specify which extractors to use. Some of them take one or more integer");
    out.println("# or string ");
    out.println("# (file path) arguments in parentheses, written as m, n, and s below:");
    out.println("# 'left3words', 'left5words', 'bidirectional', 'bidirectional5words',");
    out.println("# 'generic', 'sighan2005', 'german', 'words(m,n)', 'wordshapes(m,n)',");
    out.println("# 'biwords(m,n)', 'lowercasewords(m,n)', 'vbn(n)', distsimconjunction(s,m,n)',");
    out.println("# 'naacl2003unknowns', 'naacl2003conjunctions', 'distsim(s,m,n)',");
    out.println("# 'suffix(n)', 'prefix(n)', 'prefixsuffix(n)', 'capitalizationsuffix(n)',");
    out.println("# 'wordshapes(m,n)', 'unicodeshapes(m,n)', 'unicodeshapeconjunction(m,n)',");
    out.println("# 'lctagfeatures', 'order(k)', 'chinesedictionaryfeatures(s)'.");
    out.println("# These keywords determines the features extracted.  'generic' is language independent.");
    out.println("# distsim: Distributional similarity classes can be an added source of information");
    out.println("# about your words. An English distsim file is included, or you can use your own.");
    out.println("# arch = ");
    out.println();
    out.println("# 'wordFunction'.  A function applied to the text before training or tagging.");
    out.println("# For example, edu.stanford.nlp.util.LowercaseFunction");
    out.println("# This function turns all the words into lowercase");
    out.println("# The function must implement java.util.function.Function<String, String>");
    out.println("# Blank means no preprocessing function");
    out.println("# wordFunction = ");
    out.println();


    out.println("# 'language'.  This is really the tag set which is used for the");
    out.println("# list of open-class tags, and perhaps deterministic  tag");
    out.println("# expansion). Currently we have 'english', 'arabic', 'german', 'chinese'");
    out.println("# or 'polish' predefined. For your own language, you can specify ");
    out.println("# the same information via openClassTags or closedClassTags below");
    out.println("# (only ONE of these three options may be specified). ");
    out.println("# 'english' means UPenn English treebank tags. 'german' is STTS");
    out.println("# 'chinese' is CTB, and Arabic is an expanded Bies mapping from the ATB");
    out.println("# 'polish' means some tags that some guy on the internet once used. ");
    out.println("# See the TTags class for more information.");
    out.println("# lang = ");
    out.println();

    out.println("# a space-delimited list of open-class parts of speech");
    out.println("# alternatively, you can specify language above to use a pre-defined list or specify the closed class tags (below)");
    out.println("# openClassTags = ");
    out.println();

    out.println("# a space-delimited list of closed-class parts of speech");
    out.println("# alternatively, you can specify language above to use a pre-defined list or specify the open class tags (above)");
    out.println("# closedClassTags = ");
    out.println();

    out.println("# A boolean indicating whether you would like the trained model to set POS tags as closed");
    out.println("# based on their frequency in training; default is false.  The frequency threshold can be set below. ");
    out.println("# This option is ignored if any of {openClassTags, closedClassTags, lang} are specified.");
    out.println("# learnClosedClassTags = ");
    out.println();

    out.println("# Used only if learnClosedClassTags=true.  Tags that have fewer tokens than this threshold are");
    out.println("# considered closed in the trained model.");
    out.println("# closedClassTagThreshold = ");
    out.println();

    out.println("# search method for optimization. Normally use the default 'qn'. choices: 'qn' (quasi-Newton),");
    out.println("# 'cg' (conjugate gradient, 'owlqn' (L1 regularization) or 'iis' (improved iterative scaling)");
    out.println("# search = " + SEARCH);
    out.println();

    out.println("# for conjugate gradient or quasi-Newton search, sigma-squared smoothing/regularization");
    out.println("# parameter. if left blank, the default is 0.5, which is usually okay");
    out.println("# sigmaSquared = " + SIGMA_SQUARED);
    out.println();

    out.println("# for OWLQN search, regularization");
    out.println("# parameter. if left blank, the default is 1.0, which is usually okay");
    out.println("# regL1 = " + DEFAULT_REG_L1);
    out.println();

    out.println("# For improved iterative scaling, the number of iterations, otherwise ignored");
    out.println("# iterations = " + ITERATIONS);
    out.println();

    out.println("# rare word threshold. words that occur less than this number of");
    out.println("# times are considered rare words.");
    out.println("# rareWordThresh = " + RARE_WORD_THRESH);
    out.println();

    out.println("# minimum feature threshold. features whose history appears less");
    out.println("# than this number of times are ignored.");
    out.println("# minFeatureThresh = " + MIN_FEATURE_THRESH);
    out.println();

    out.println("# current word feature threshold. words that occur more than this");
    out.println("# number of times will generate features with all of their occurring");
    out.println("# tags.");
    out.println("# curWordMinFeatureThresh = " + CUR_WORD_MIN_FEATURE_THRESH);
    out.println();

    out.println("# rare word minimum feature threshold. features of rare words whose histories");
    out.println("# appear less than this times will be ignored.");
    out.println("# rareWordMinFeatureThresh = " + RARE_WORD_MIN_FEATURE_THRESH);
    out.println();

    out.println("# very common word threshold. words that occur more than this number of");
    out.println("# times will form an equivalence class by themselves. ignored unless");
    out.println("# you are using equivalence classes.");
    out.println("# veryCommonWordThresh = " + VERY_COMMON_WORD_THRESH);
    out.println();

    out.println("# sgml = ");
    out.println("# tagInside = ");
    out.println();

    out.println("# testFile and textFile can use multiple threads to process text.");
    out.println("# nthreads = " + NTHREADS);
  }

  public Mode getMode() {
    if (!containsKey("mode")) {
      return null;
    }
    return Mode.valueOf(getProperty("mode"));
  }


  /** Serialize the TaggerConfig.
   *
   * @param os Where to write this TaggerConfig
   * @throws IOException If any IO problems
   */
  public void saveConfig(OutputStream os) throws IOException {
    ObjectOutputStream out = new ObjectOutputStream(os);
    out.writeObject(this);
  }


  /** Read in a TaggerConfig.
   *
   * @param stream Where to read from
   * @return The TaggerConfig
   * @throws IOException Misc IOError
   * @throws ClassNotFoundException Class error
   */
  public static TaggerConfig readConfig(DataInputStream stream)
    throws IOException, ClassNotFoundException
  {
    ObjectInputStream in = new ObjectInputStream(stream);
    return (TaggerConfig) in.readObject();
  }


}
TOP

Related Classes of edu.stanford.nlp.tagger.maxent.TaggerConfig

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.