//GlobalHolder -- StanfordMaxEnt, A Maximum Entropy Toolkit
//Copyright (c) 2002-2008 Leland Stanford Junior University
//This program is free software; you can redistribute it and/or
//modify it under the terms of the GNU General Public License
//as published by the Free Software Foundation; either version 2
//of the License, or (at your option) any later version.
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//For more information, bug reports, fixes, contact:
//Christopher Manning
//Dept of Computer Science, Gates 1A
//Stanford CA 94305-9010
//USA
// Support/Questions: java-nlp-user@lists.stanford.edu
// Licensing: java-nlp-support@lists.stanford.edu
//http://www-nlp.stanford.edu/software/tagger.shtml
package edu.stanford.nlp.tagger.maxent;
import edu.stanford.nlp.io.InDataStreamFile;
import edu.stanford.nlp.io.OutDataStreamFile;
import edu.stanford.nlp.io.PrintFile;
import edu.stanford.nlp.maxent.iis.LambdaSolve;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;
import java.io.*;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.HashMap;
/** This class holds many global variables and other things that are used by
* the Stanford MaxEnt Part-of-speech Tagger package.
*
* @author Kristina Toutanova
* @author Anna Rafferty
* @author Michel Galley
* @version 1.1
*/
public class GlobalHolder {
private GlobalHolder() {}
static TaggerExperiments domain;
static final Dictionary dict = new Dictionary();
static TTags tags;
static byte[][] fnumArr;
static LambdaSolveTagger prob;
static HashMap<FeatureKey,Integer> fAssociations = new HashMap<FeatureKey,Integer>();
static final TemplateHash tFeature = new TemplateHash();
//static PairsHolder pairs = new PairsHolder();
static Extractors extractors;
static Extractors extractorsRare;
static final AmbiguityClasses ambClasses = new AmbiguityClasses();
static final CollectionTaggerOutputs collectionTaggers = new CollectionTaggerOutputs(0);
static final boolean alltags = false;
static final HashMap<String, HashSet<String>> tagTokens = new HashMap<String, HashSet<String>>();
static final int RARE_WORD_THRESH = 5;
static final int MIN_FEATURE_THRESH = 5;
static final int CUR_WORD_MIN_FEATURE_THRESH = 2;
static final int RARE_WORD_MIN_FEATURE_THRESH = 10;
static final int VERY_COMMON_WORD_THRESH = 250;
static final boolean OCCURRING_TAGS_ONLY = false;
static final boolean POSSIBLE_TAGS_ONLY = false;
static double defaultScore;
/**
* Determines which words are considered rare. All words with count
* in the training data strictly less than this number (standardly, < 5) are
* considered rare.
*/
private static int rareWordThresh = RARE_WORD_THRESH;
/**
* Determines which features are included in the model. The model
* includes features that occurred strictly more times than this number
* (standardly, > 5) in the training data. Here I look only at the
* history (not the tag), so the history appearing this often is enough.
*/
static int minFeatureThresh = MIN_FEATURE_THRESH;
/**
* This is a special threshold for the current word feature.
* Only words that have occurred strictly > this number of times
* in total will generate word features with all of their occurring tags.
* The traditional default was 2.
*/
static int curWordMinFeatureThresh = CUR_WORD_MIN_FEATURE_THRESH;
/**
* Determines which rare word features are included in the model.
* The features for rare words have a strictly higher support than
* this number are included. Traditional default is 10.
*/
static int rareWordMinFeatureThresh = RARE_WORD_MIN_FEATURE_THRESH;
/**
* If using tag equivalence classes on following words, words that occur
* strictly more than this number of times (in total with any tag)
* are sufficiently frequent to form an equivalence class
* by themselves. (Not used unless using equivalence classes.)
*/
static int veryCommonWordThresh = VERY_COMMON_WORD_THRESH;
static int xSize;
static int ySize;
static boolean occuringTagsOnly = OCCURRING_TAGS_ONLY;
static boolean possibleTagsOnly = POSSIBLE_TAGS_ONLY;
private static boolean initted = false;
static final boolean VERBOSE = false;
public static LambdaSolve getLambdaSolve() {
return prob;
}
protected static void init() {
init(null);
}
protected static void init(TaggerConfig config) {
if (initted) return;
extractors = new Extractors();
extractorsRare = new Extractors();
String lang, arch;
String[] openClassTags, closedClassTags;
if (config == null) {
lang = "english";
arch = "left3words";
openClassTags = StringUtils.EMPTY_STRING_ARRAY;
closedClassTags = StringUtils.EMPTY_STRING_ARRAY;
} else {
lang = config.getLang();
arch = config.getArch();
openClassTags = config.getOpenClassTags();
closedClassTags = config.getClosedClassTags();
if (((openClassTags.length > 0) && !lang.equals("")) || ((closedClassTags.length > 0) && !lang.equals("")) || ((closedClassTags.length > 0) && (openClassTags.length > 0))) {
throw new RuntimeException("At least two of lang (\"" + lang + "\"), openClassTags (length " + openClassTags.length + ": " + Arrays.toString(openClassTags) + ")," +
"and closedClassTags (length " + closedClassTags.length + ": " + Arrays.toString(closedClassTags) + ") specified---you must choose one!");
} else if ((openClassTags.length == 0) && lang.equals("") && (closedClassTags.length == 0) && ! config.getLearnClosedClassTags()) {
System.err.println("warning: no language set, no open-class tags specified, and no closed-class tags specified; assuming ALL tags are open class tags");
}
}
if (openClassTags.length > 0) {
tags = new TTags();
tags.setOpenClassTags(openClassTags);
} else if (closedClassTags.length > 0) {
tags = new TTags();
tags.setClosedClassTags(closedClassTags);
} else {
tags = new TTags(lang);
}
defaultScore = lang.equals("english") ? 1.0 : 0.0;
if (config != null) {
rareWordThresh = config.getRareWordThresh();
minFeatureThresh = config.getMinFeatureThresh();
curWordMinFeatureThresh = config.getCurWordMinFeatureThresh();
rareWordMinFeatureThresh = config.getRareWordMinFeatureThresh();
veryCommonWordThresh = config.getVeryCommonWordThresh();
occuringTagsOnly = config.occuringTagsOnly();
possibleTagsOnly = config.possibleTagsOnly();
// System.err.println("occuringTagsOnly: "+occuringTagsOnly);
// System.err.println("possibleTagsOnly: "+possibleTagsOnly);
if(config.getDefaultScore() >= 0)
defaultScore = config.getDefaultScore();
}
if (config != null && config.getMode() == TaggerConfig.Mode.TRAIN) {
// initialize the extractors based on the arch variable
// you only need to do this when training; otherwise they will be
// restored from the serialized file
extractors.init(ExtractorFrames.getExtractorFrames(arch));
extractorsRare.init(ExtractorFramesRare.getExtractorFramesRare(arch));
}
initted = true;
}
protected static int getNum(FeatureKey s) {
return getNum(s, fAssociations);
}
private static int getNum(FeatureKey s, HashMap<FeatureKey, Integer> fAssocs) {
Integer num = fAssocs.get(s); // hprof: 15% effective running time
if (num == null) {
return -1;
} else {
return num;
}
}
// serialize the ExtractorFrames and ExtractorFramesRare in filename
private static void saveExtractors(OutputStream os) throws IOException {
ObjectOutputStream out = new ObjectOutputStream(os);
System.out.println(extractors.toString() + "\nrare" + extractorsRare.toString());
out.writeObject(extractors);
out.writeObject(extractorsRare);
}
// Read the extractors from a filename.
private static void readExtractors(String filename) throws Exception {
InputStream in = new BufferedInputStream(new FileInputStream(filename));
readExtractors(in);
in.close();
}
// Read the extractors from a stream.
private static void readExtractors(InputStream file) throws IOException, ClassNotFoundException {
ObjectInputStream in = new ObjectInputStream(file);
extractors = (Extractors) in.readObject();
extractorsRare = (Extractors) in.readObject();
extractors.initTypes();
extractorsRare.initTypes();
int left = extractors.leftContext();
int left_u = extractorsRare.leftContext();
if (left_u > left) {
left = left_u;
}
TestSentence.leftContext = left;
int right = extractors.rightContext();
int right_u = extractorsRare.rightContext();
if (right_u > right) {
right = right_u;
}
TestSentence.rightContext = right;
}
/* ----
* NOT USED
* This reads the .assoc file. It is only used by LambdaSolveTagger.java
* The same associations also appear in the main file, and are read by
* read(), read_prev().
*
* @param modelFilename The string .assoc is appended and feature
* associations are then read from this file
* @return The feature associations HashMap, or null if there is an error
*
protected static HashMap<FeatureKey, Integer> readAssociations(String modelFilename) {
try {
HashMap<FeatureKey, Integer> fAssocs = new HashMap<FeatureKey, Integer>();
String filename = modelFilename + ".assoc";
InDataStreamFile rf = new InDataStreamFile(filename);
int sizeAssoc = rf.readInt();
for (int i = 0; i < sizeAssoc; i++) {
int numF = rf.readInt();
FeatureKey fK = new FeatureKey();
fK.read(rf);
fAssocs.put(fK, numF);
}
rf.close();
return fAssocs;
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
---- */
/* ----------- unused
public static void release_mem() {
// release dict
dict.release();
pairs.release();
tHistories.release();
tFeature.release();
fAssociations.clear();
}
-------------- */
protected static void saveModel(String filename, TaggerConfig config) {
try {
OutDataStreamFile file = new OutDataStreamFile(filename);
config.saveConfig(file);
file.writeInt(xSize);
file.writeInt(ySize);
dict.save(file);
tags.save(file);
saveExtractors(file);
file.writeInt(fAssociations.size());
for (Map.Entry<FeatureKey,Integer> item : fAssociations.entrySet()) {
int numF = item.getValue();
file.writeInt(numF);
FeatureKey fk = item.getKey();
fk.save(file);
}
LambdaSolve.save_lambdas(file, prob.lambda);
file.close();
} catch (IOException ioe) {
System.err.println("Error saving tagger to file " + filename);
ioe.printStackTrace();
}
}
/**
* This method is provided for backwards compatibility with the old tagger. It reads
* a tagger that was saved as multiple files into the current format and saves it back
* out as a single file, newFilename.
*
* @param filename The name of the holder file, which is also used as a prefix for other filenames
* @param newFilename The name of the new one-file model that will be written
* @param config tagger configuration file
* @return true (whether this operation succeeded; always true
* @throws Exception ??
*/
protected static boolean convertMultifileTagger(String filename, String newFilename, TaggerConfig config) throws Exception {
InDataStreamFile rf = new InDataStreamFile(filename);
GlobalHolder.init(config);
if (VERBOSE) {
System.err.println(" length of holder " + new File(filename).length());
}
xSize = rf.readInt();
ySize = rf.readInt();
dict.read(filename + ".dict");
if (VERBOSE) {
System.err.println(" dictionary read ");
}
tags.read(filename + ".tags");
readExtractors(filename + ".ex");
dict.setAmbClasses();
int[] numFA = new int[extractors.getSize() + extractorsRare.getSize()];
int sizeAssoc = rf.readInt();
PrintFile pfVP = null;
if (VERBOSE) {
pfVP = new PrintFile("pairs.txt");
}
for (int i = 0; i < sizeAssoc; i++) {
int numF = rf.readInt();
FeatureKey fK = new FeatureKey();
fK.read(rf);
numFA[fK.num]++;
fAssociations.put(fK, numF);
}
if (VERBOSE) {
pfVP.close();
}
if (VERBOSE) {
for (int k = 0; k < numFA.length; k++) {
System.err.println(" Number of features of kind " + k + ' ' + numFA[k]);
}
}
prob = new LambdaSolveTagger(filename + ".prob");
if (VERBOSE) {
System.err.println(" prob read ");
}
saveModel(newFilename, config);
rf.close();
return true;
}
/** This reads the complete tagger from a single model stored in a file, at a URL,
* or as a resource
* in a jar file, and inits the tagger using a
* combination of the properties passed in and parameters from the file.
* <p>
* <i>Note for the future:</i> This assumes that the TaggerConfig in the file
* has already been read and used. This work is done inside the
* constructor of TaggerConfig. It might be better to refactor
* things so that is all done inside this method, but for the moment
* it seemed better to leave working code alone [cdm 2008].
*
* @param config The tagger config
* @param modelFileOrUrl The name of the model file. This routine opens and closes it.
* @throws Exception If I/O errors, etc.
*/
protected static TaggerConfig readModelAndInit(TaggerConfig config, String modelFileOrUrl, boolean printLoading) throws Exception {
// first check can open file ... or else leave with exception
DataInputStream rf = config.getTaggerDataInputStream(modelFileOrUrl);
// if (VERBOSE) {
// System.err.println(" length of model holder " + new File(modelFileOrUrl).length());
// }
TaggerConfig tc = readModelAndInit(config, rf, printLoading);
rf.close();
return tc;
}
/** This reads the complete tagger from a single model file, and inits
* the tagger using a combination of the properties passed in and
* parameters from the file.
* <p>
* <i>Note for the future: This assumes that the TaggerConfig in the file
* has already been read and used. It might be better to refactor
* things so that is all done inside this method, but for the moment
* it seemed better to leave working code alone [cdm 2008].</i>
*
* @param config The tagger config
* @param rf DataInputStream to read from. It's the caller's job to open and close this stream.
* @throws IOException If I/O errors
* @throws ClassNotFoundException If serialization errors
*/
protected static TaggerConfig readModelAndInit(TaggerConfig config, DataInputStream rf,
boolean printLoading) throws IOException, ClassNotFoundException {
Timing t = new Timing();
if (printLoading) t.doing("Reading POS tagger model from " + config.getModel());
// then init tagger
init(config);
TaggerConfig ret = TaggerConfig.readConfig(rf); // taggerconfig in file has already been put into config in constructor of TaggerConfig, so usually just read past it.
xSize = rf.readInt();
ySize = rf.readInt();
dict.read(rf);
if (VERBOSE) {
System.err.println(" dictionary read ");
}
tags.read(rf);
System.out.println(tags.index);
readExtractors(rf);
dict.setAmbClasses();
int[] numFA = new int[extractors.getSize() + extractorsRare.getSize()];
int sizeAssoc = rf.readInt();
// init the Hash at the right size for efficiency (avoid resizing ops)
// mg2008: sizeAssoc defines the number of keys, whereas specifying
// sizeAssoc as argument defines an initial size.
// Unless load factor is >= 1, fAssociations is guaranteed to resize at least once.
//fAssociations = new HashMap<FeatureKey,Integer>(sizeAssoc);
fAssociations = new HashMap<FeatureKey,Integer>(sizeAssoc*2);
if (VERBOSE) System.err.printf("Reading %d feature keys...\n",sizeAssoc);
PrintFile pfVP = null;
if (VERBOSE) {
pfVP = new PrintFile("pairs.txt");
}
for (int i = 0; i < sizeAssoc; i++) {
int numF = rf.readInt();
FeatureKey fK = new FeatureKey();
fK.read(rf);
numFA[fK.num]++;
fAssociations.put(fK, numF);
}
if (VERBOSE) {
pfVP.close();
}
if (VERBOSE) {
for (int k = 0; k < numFA.length; k++) {
System.err.println(" Number of features of kind " + k + ' ' + numFA[k]);
}
}
prob = new LambdaSolveTagger(rf);
if (VERBOSE) {
System.err.println(" prob read ");
}
if (printLoading) t.done();
return ret;
}
protected static void dumpModel() {
assert fAssociations.size() == prob.lambda.length;
for (Map.Entry<FeatureKey,Integer> fk : fAssociations.entrySet()) {
System.out.println(fk.getKey() + ": " + prob.lambda[fk.getValue()]);
}
}
protected static boolean isRare(String word) {
return dict.sum(word) < rareWordThresh;
}
protected static boolean isSeen(String word) {
return dict.sum(word) >= 1;
}
public static TTags getTags() {
return tags;
}
public static void main(String[] args) {
try {
TaggerConfig gh = readModelAndInit(new TaggerConfig(args), args[1], false);
//dumpModel();
//System.out.println(getTags());
getTags().show();
} catch (Exception e){
System.err.println(e.getMessage());
}
//
}
}