/**
* Copyright (c) 2009/09-2012/08, Regents of the University of Colorado
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Copyright 2012/09-2013/04, 2013/11-Present, University of Massachusetts Amherst
* Copyright 2013/05-2013/10, IPSoft Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.clearnlp.component.morph;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.w3c.dom.Element;
import com.clearnlp.constituent.CTLibEn;
import com.clearnlp.dependency.DEPNode;
import com.clearnlp.morphology.AbstractAffixMatcher;
import com.clearnlp.morphology.MPLib;
import com.clearnlp.morphology.MPLibEn;
import com.clearnlp.morphology.MPTag;
import com.clearnlp.morphology.english.EnglishAffixMatcherFactory;
import com.clearnlp.morphology.english.EnglishInflection;
import com.clearnlp.morphology.english.EnglishSuffixMatcher;
import com.clearnlp.pattern.PTLib;
import com.clearnlp.util.UTInput;
import com.clearnlp.util.UTOutput;
import com.clearnlp.util.UTXml;
import com.clearnlp.util.map.Prob2DMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
/**
* English morphological analyzer.
* @since 1.3.0
* @author Jinho D. Choi ({@code jdchoi77@gmail.com})
*/
public class EnglishMPAnalyzer extends AbstractMPAnalyzer
{
/** Prefix of every dictionary resource path (used both for classpath resources and for zip entries). */
final String PATH = "dictionary/english/";
/** Word-class name for verbs: the tag looked up in the inflection XML and the base name of the verb dictionary files. */
final String VERB = "verb";
/** Word-class name for nouns (same usage as {@link #VERB}). */
final String NOUN = "noun";
/** Word-class name for adjectives (same usage as {@link #VERB}). */
final String ADJECTIVE = "adjective";
/** Word-class name for adverbs (same usage as {@link #VERB}). */
final String ADVERB = "adverb";
/** Extension appended to {@code PATH + type} to locate the base-form list of a word class. */
final String EXT_BASE = ".base";
/** Extension appended to {@code PATH + type} to locate the exception map of a word class. */
final String EXT_EXCEPTION = ".exc";
/** Path of the XML file holding the inflection suffix rules; parsed by both constructors. */
final String INFLECTION_SUFFIX = PATH + "inflection_suffix.xml";
/** Path of the derivation suffix rules; not referenced by any active code visible in this class. */
final String DERIVATION_SUFFIX = PATH + "derivation_suffix.xml";
/**
 * Path of the abbreviation rule file read by {@code getAbbreviationMap}.
 * NOTE(review): the identifier is misspelled ("ABBREVIATOIN"); it is kept as-is because the constructors reference it by this name.
 */
final String ABBREVIATOIN_RULE = PATH + "abbreviation.rule";
/** Path of the cardinal base-form list loaded into {@link #base_cardinal}. */
final String CARDINAL_BASE = PATH + "cardinal.base";
/** Path of the ordinal base-form list loaded into {@link #base_ordinal}. */
final String ORDINAL_BASE = PATH + "ordinal.base";
/** Delimiter joining a word-form and its POS tag into a key of {@link #rule_abbreviation}. */
final String FIELD_DELIM = "_";
/** Inflection rules consulted when the POS tag is a verb tag. */
private EnglishInflection inf_verb;
/** Inflection rules consulted when the POS tag is a noun tag. */
private EnglishInflection inf_noun;
/** Inflection rules consulted when the POS tag is an adjective tag. */
private EnglishInflection inf_adjective;
/** Inflection rules consulted when the POS tag is an adverb tag. */
private EnglishInflection inf_adverb;
/** Abbreviation replacement rules: key = word-form + {@link #FIELD_DELIM} + POS tag, value = base form. */
private Map<String,String> rule_abbreviation;
/** Cardinal base-forms */
private Set<String> base_cardinal;
/** Ordinal base-forms */
private Set<String> base_ordinal;
// ====================================== CONSTRUCTORS ======================================
/**
 * Constructs an English morphological analyzer from the dictionary in a classpath.
 * Loads the inflection rules of the four word classes, the cardinal and ordinal
 * base-form sets, and the abbreviation rules.
 * @throws IllegalStateException if any dictionary resource cannot be read; the
 * original {@link IOException} is attached as the cause.
 */
public EnglishMPAnalyzer()
{
	Element inflection = UTXml.getDocumentElement(UTInput.getInputStreamsFromClasspath(INFLECTION_SUFFIX));

	try
	{
		inf_verb      = getInflectionRules(inflection, VERB     , CTLibEn.POS_VB, MPTag.IVX);
		inf_noun      = getInflectionRules(inflection, NOUN     , CTLibEn.POS_NN, MPTag.INX);
		inf_adjective = getInflectionRules(inflection, ADJECTIVE, CTLibEn.POS_JJ, MPTag.IJX);
		inf_adverb    = getInflectionRules(inflection, ADVERB   , CTLibEn.POS_RB, MPTag.IRX);

		base_cardinal = UTInput.getStringSet(UTInput.getInputStreamsFromClasspath(CARDINAL_BASE));
		base_ordinal  = UTInput.getStringSet(UTInput.getInputStreamsFromClasspath(ORDINAL_BASE));
		rule_abbreviation = getAbbreviationMap(UTInput.getInputStreamsFromClasspath(ABBREVIATOIN_RULE));
	}
	catch (IOException e)
	{
		// Fail fast: a partially loaded analyzer would only fail later with an
		// unrelated NullPointerException inside analyze().
		throw new IllegalStateException("Failed to load the English dictionary from the classpath: " + PATH, e);
	}
}
/**
 * Builds the inflection rules of one word class from classpath resources.
 * @param eInflection the root element of the inflection suffix XML.
 * @param type the word-class name; used as the XML tag name and as the dictionary file base name.
 * @param basePOS the POS tag handed to the resulting {@link EnglishInflection}.
 * @param irregularPOS currently unused.
 * @return the inflection rules combining base forms, exceptions (may be {@code null}) and affix matchers.
 * @throws IOException if a dictionary resource cannot be read.
 */
private EnglishInflection getInflectionRules(Element eInflection, String type, String basePOS, String irregularPOS) throws IOException
{
	String resourcePrefix = PATH + type;
	Element affixElement  = UTXml.getFirstElementByTagName(eInflection, type);
	InputStream bases      = UTInput.getInputStreamsFromClasspath(resourcePrefix + EXT_BASE);
	InputStream exceptions = UTInput.getInputStreamsFromClasspath(resourcePrefix + EXT_EXCEPTION);

	// the exception file is optional; without it the map stays null
	Map<String,String> exceptionMap = null;

	if (exceptions != null)
		exceptionMap = UTInput.getStringMap(exceptions, PTLib.SPACE);

	EnglishAffixMatcherFactory factory = new EnglishAffixMatcherFactory();
	List<AbstractAffixMatcher> matchers = factory.createAffixMatchers(affixElement);
	Set<String> baseForms = UTInput.getStringSet(bases);

	return new EnglishInflection(basePOS, baseForms, exceptionMap, matchers);
}
/**
 * Constructs an English morphological analyzer from the dictionary stored in a zip file.
 * The entries are looked up under the same paths as the classpath resources.
 * @param file the zip file containing the dictionary entries.
 * @throws IllegalStateException if any dictionary entry cannot be read; the
 * original {@link IOException} is attached as the cause.
 */
public EnglishMPAnalyzer(ZipFile file)
{
	try
	{
		Element inflection = UTXml.getDocumentElement(file.getInputStream(new ZipEntry(INFLECTION_SUFFIX)));

		inf_verb      = getInflectionRules(file, inflection, VERB     , CTLibEn.POS_VB, MPTag.IVX);
		inf_noun      = getInflectionRules(file, inflection, NOUN     , CTLibEn.POS_NN, MPTag.INX);
		inf_adjective = getInflectionRules(file, inflection, ADJECTIVE, CTLibEn.POS_JJ, MPTag.IJX);
		inf_adverb    = getInflectionRules(file, inflection, ADVERB   , CTLibEn.POS_RB, MPTag.IRX);

		base_cardinal = UTInput.getStringSet(file.getInputStream(new ZipEntry(CARDINAL_BASE)));
		base_ordinal  = UTInput.getStringSet(file.getInputStream(new ZipEntry(ORDINAL_BASE)));
		rule_abbreviation = getAbbreviationMap(file.getInputStream(new ZipEntry(ABBREVIATOIN_RULE)));
	}
	catch (IOException e)
	{
		// Fail fast: a partially loaded analyzer would only fail later with an
		// unrelated NullPointerException inside analyze().
		throw new IllegalStateException("Failed to load the English dictionary from the zip file: " + file.getName(), e);
	}
}
/**
 * Builds the inflection rules of one word class from entries of a zip file.
 * @param file the zip file containing the dictionary entries.
 * @param eInflection the root element of the inflection suffix XML.
 * @param type the word-class name; used as the XML tag name and as the dictionary entry base name.
 * @param basePOS the POS tag handed to the resulting {@link EnglishInflection}.
 * @param irregularPOS currently unused.
 * @return the inflection rules combining base forms, exceptions (may be {@code null}) and affix matchers.
 * @throws IOException if a dictionary entry cannot be read.
 */
private EnglishInflection getInflectionRules(ZipFile file, Element eInflection, String type, String basePOS, String irregularPOS) throws IOException
{
	String entryPrefix   = PATH + type;
	Element affixElement = UTXml.getFirstElementByTagName(eInflection, type);
	InputStream bases      = file.getInputStream(new ZipEntry(entryPrefix + EXT_BASE));
	InputStream exceptions = file.getInputStream(new ZipEntry(entryPrefix + EXT_EXCEPTION));

	// the exception entry is optional; without it the map stays null
	Map<String,String> exceptionMap = null;

	if (exceptions != null)
		exceptionMap = UTInput.getStringMap(exceptions, PTLib.SPACE);

	EnglishAffixMatcherFactory factory = new EnglishAffixMatcherFactory();
	List<AbstractAffixMatcher> matchers = factory.createAffixMatchers(affixElement);
	Set<String> baseForms = UTInput.getStringSet(bases);

	return new EnglishInflection(basePOS, baseForms, exceptionMap, matchers);
}
// private void initDerivationRules()
// {
// Element derivation = UTXml.getDocumentElement(UTInput.getInputStreamsFromClasspath(DTEnglish.DERIVATION_SUFFIX));
// Map<String,Set<String>> baseMap = getBaseMap();
//
// der_verb = getDerivationRules(derivation, DTEnglish.VERB , baseMap);
// der_noun = getDerivationRules(derivation, DTEnglish.NOUN , baseMap);
// der_adjective = getDerivationRules(derivation, DTEnglish.ADJECTIVE, baseMap);
// der_adverb = getDerivationRules(derivation, DTEnglish.ADVERB , baseMap);
// }
//
// private EnglishDerivation getDerivationRules(Element eDerivation, String type, Map<String,Set<String>> baseMap)
// {
// Element eAffixes = UTXml.getFirstElementByTagName(eDerivation, type);
// List<AbstractAffixMatcher> affixMatchers = new EnglishAffixMatcherFactory().createAffixMatchers(eAffixes);
//
// return new EnglishDerivation(baseMap, affixMatchers);
// }
//
// private Map<String,Set<String>> getBaseMap()
// {
// Map<String,Set<String>> baseMap = Maps.newHashMap();
//
// baseMap.put(inf_verb .getBasePOS(), inf_verb.getBaseSet());
// baseMap.put(inf_noun .getBasePOS(), inf_noun.getBaseSet());
// baseMap.put(inf_adjective.getBasePOS(), inf_adjective.getBaseSet());
// baseMap.put(inf_adverb .getBasePOS(), inf_adverb.getBaseSet());
//
// return baseMap;
// }
/**
 * Reads the abbreviation rules from the stream and closes it afterwards.
 * Each non-blank line is trimmed and split with {@link PTLib#splitSpace(String)} into
 * three fields: abbreviation, POS tag and base form. Blank lines are skipped.
 * @param stream the stream of the abbreviation rule file.
 * @return a map whose key is {@code abbreviation + FIELD_DELIM + POS tag} and whose value is the base form.
 * @throws IOException if the stream cannot be read or closed.
 */
private Map<String,String> getAbbreviationMap(InputStream stream) throws IOException
{
	BufferedReader fin = new BufferedReader(new InputStreamReader(stream));
	Map<String,String> map = Maps.newHashMap();
	String line, key;
	String[] tmp;

	try
	{
		while ((line = fin.readLine()) != null)
		{
			line = line.trim();

			// tolerate empty lines (e.g., a trailing newline) instead of indexing past the split result
			if (line.isEmpty())
				continue;

			tmp = PTLib.splitSpace(line);
			key = tmp[0] + FIELD_DELIM + tmp[1];
			map.put(key, tmp[2]);
		}
	}
	finally
	{
		// the stream is opened by the constructors solely for this call, so release it here
		fin.close();
	}

	return map;
}
/**
 * Assigns the lemma of the word-form in the specific node.
 * PRE: the word-form and the POS tag of the node.
 * <ol>
 * <li>Fills {@code lowerSimplifiedForm} if it is not set yet.</li>
 * <li>For {@code NNP}, the lemma is the lowercased word-form and nothing else is applied.</li>
 * <li>Otherwise an abbreviation rule matching the form and POS tag wins.</li>
 * <li>Otherwise the lemma is the base form found by the inflection rules, or the lower simplified form if none is found.</li>
 * <li>Unless the POS tag is {@code NNPS}, cardinal and ordinal lemmas are replaced by their generic lemma tags.</li>
 * </ol>
 * @param node the node to be analyzed; its {@code lemma} is overwritten.
 */
@Override
public void analyze(DEPNode node)
{
	if (node.lowerSimplifiedForm == null)
		node.lowerSimplifiedForm = MPLib.getSimplifiedLowercaseWordForm(node.form);

	if (node.pos.equals(CTLibEn.POS_NNP))
	{
		// explicit locale: the default-locale variant mis-cases letters under e.g. a Turkish locale ("I" -> dotless i)
		node.lemma = node.form.toLowerCase(Locale.ENGLISH);
		return;
	}

	if ((node.lemma = getAbbreviation(node.lowerSimplifiedForm, node.pos)) != null)
		return;

	if ((node.lemma = getBaseFormFromInflection(node.lowerSimplifiedForm, node.pos)) == null)
		node.lemma = node.lowerSimplifiedForm;

	if (!node.isPos(CTLibEn.POS_NNPS))
	{
		if      (isCardinal(node.lemma))	node.setLemma(MPTag.LEMMA_CARDINAL);
		else if (isOrdinal (node.lemma))	node.setLemma(MPTag.LEMMA_ORDINAL);
	}
}
/**
 * Looks up the abbreviation rule for a word-form and POS tag pair.
 * Called by {@link #analyze(DEPNode)}.
 * @return the base form of the abbreviation, or {@code null} if no rule matches.
 */
private String getAbbreviation(String form, String pos)
{
	return rule_abbreviation.get(form + FIELD_DELIM + pos);
}
/**
 * Picks the inflection rules matching the POS tag and asks them for the base form.
 * @param form the lower simplified word-form.
 * @param pos the POS tag of the word-form.
 * @return the result of the matching inflection rules, or {@code null} if the POS tag
 * is not a verb, noun, adjective or adverb tag.
 */
private String getBaseFormFromInflection(String form, String pos)
{
	EnglishInflection rules;

	if      (MPLibEn.isVerb(pos))		rules = inf_verb;
	else if (MPLibEn.isNoun(pos))		rules = inf_noun;
	else if (MPLibEn.isAdjective(pos))	rules = inf_adjective;
	else if (MPLibEn.isAdverb(pos))		rules = inf_adverb;
	else								return null;

	return rules.getBaseForm(form, pos);
}
/**
 * @param form the form to look up (called with the lemma by {@link #analyze(DEPNode)}).
 * @return {@code true} if the form is contained in the cardinal base-form set.
 */
private boolean isCardinal(String form)
{
return base_cardinal.contains(form);
}
/**
 * @param form the form to look up (called with the lemma by {@link #analyze(DEPNode)}).
 * @return {@code true} if the form is one of "0st", "0nd", "0rd", "0th" or is contained
 * in the ordinal base-form set.
 */
private boolean isOrdinal(String form)
{
	// presumably digits are normalized to '0' in the simplified form, so these cover numeric ordinals - TODO confirm
	boolean numericOrdinal = form.equals("0st") || form.equals("0nd") || form.equals("0rd") || form.equals("0th");

	return numericOrdinal || base_ordinal.contains(form);
}
// ------------------------------------ EVALUATION ------------------------------------
/**
 * Runs the per-word-class check for verbs, nouns, adjectives and adverbs, in that order.
 * An {@link IOException} aborts the remaining checks and is printed to standard error.
 * @param outputDir the directory passed on to each per-word-class check.
 */
public void check(String outputDir)
{
	final String[] types = {VERB, NOUN, ADJECTIVE, ADVERB};

	try
	{
		for (String type : types)
			check(outputDir, type);
	}
	catch (IOException e) {e.printStackTrace();}
}
/**
 * Per-word-class check invoked by {@link #check(String)}.
 * Currently a no-op: the whole body is commented out. The disabled code reads
 * {@code <pos>.exc.removed} from the output directory and writes the entries whose lemma
 * differs from {@code getLemma(f, p)} to {@code <pos>.exc.kept}.
 * @param outputDir the directory holding the files of the disabled code.
 * @param pos the word-class name used as the file base name.
 * @throws IOException declared for the disabled I/O; never thrown by the current body.
 */
private void check(String outputDir, String pos) throws IOException
{
// BufferedReader fin = UTInput.createBufferedFileReader(outputDir+"/"+pos+".exc.removed");
// PrintStream fout = UTOutput.createPrintBufferedFileStream(outputDir+"/"+pos+".exc.kept");
// String f, m, p;
// String[] tmp;
// String line;
//
// while ((line = fin.readLine()) != null)
// {
// tmp = PTLib.splitSpace(line);
// f = tmp[0];
// m = tmp[1];
// p = tmp[2];
//
// if (!m.equals(getLemma(f, p)))
// fout.println(f+" "+m);
// }
//
// fin.close();
// fout.close();
}
/**
 * Trims the dictionaries of verbs, nouns, adjectives and adverbs, in that order.
 * Any exception aborts the remaining word classes and is printed to standard error.
 * @param outputDir the directory to which the trimmed dictionary files are written.
 */
public void trim(String outputDir)
{
	try
	{
		final String[] types = {VERB, NOUN, ADJECTIVE, ADVERB};
		final EnglishInflection[] inflections = {inf_verb, inf_noun, inf_adjective, inf_adverb};

		for (int i = 0; i < types.length; i++)
			trim(outputDir, types[i], inflections[i]);
	}
	catch (Exception e) {e.printStackTrace();}
}
/**
 * Trims the base set and the exception map of one word class and writes the results to
 * {@code <pos>.base}, {@code <pos>.exc}, {@code <pos>.base.removed} and {@code <pos>.exc.removed}
 * under the output directory. Progress counts are printed to standard output.
 * <ol>
 * <li>All base forms in the exception map are added to the base set.</li>
 * <li>Forms that are keys of the exception map with a different base form are removed from
 * the base set, unless listed in the {@code <pos>.accept} classpath resource.</li>
 * <li>The removal of exceptions that can be produced by suffix rules is disabled, so
 * {@code <pos>.exc.removed} is created empty.</li>
 * </ol>
 * NOTE(review): the sets returned by the inflection are modified in place; if
 * {@code getBaseSet()} returns the live set, this changes the analyzer itself - confirm.
 * All output streams are closed even if a step fails.
 * @param outputDir the directory to which the files are written.
 * @param pos the word-class name used as the file base name.
 * @param inflection the inflection rules providing the base set and the exception map.
 * @throws Exception if a resource cannot be read or a file cannot be created.
 */
private void trim(String outputDir, String pos, EnglishInflection inflection) throws Exception
{
	PrintStream fBaseRemoved = null;
	PrintStream fExcRemoved  = null;
	PrintStream fBase        = null;
	PrintStream fExc         = null;

	try
	{
		fBaseRemoved = UTOutput.createPrintBufferedFileStream(outputDir+"/"+pos+".base.removed");
		fExcRemoved  = UTOutput.createPrintBufferedFileStream(outputDir+"/"+pos+".exc.removed");
		fBase        = UTOutput.createPrintBufferedFileStream(outputDir+"/"+pos+".base");
		fExc         = UTOutput.createPrintBufferedFileStream(outputDir+"/"+pos+".exc");

		Set<String> sAccept = UTInput.getStringSet(UTInput.getInputStreamsFromClasspath(PATH + pos + ".accept"));
		Set<String> baseSet = inflection.getBaseSet();
		Map<String,String> excMap = inflection.getExceptionMap();
//		Morpheme baseMorphem;
		List<String> list;
		String base;

		System.out.println(pos+":");
		System.out.println(" original : "+baseSet.size()+" "+excMap.size());

		// add base forms in the exception map to the base set
		baseSet.addAll(excMap.values());
		System.out.println("+ from exception: "+baseSet.size()+" "+excMap.size());

		// remove base forms in the exception map from the base set
		// (iterate over a copy because the base set is modified inside the loop)
		for (String form : Sets.newHashSet(baseSet))
		{
			if (!sAccept.contains(form) && (base = excMap.get(form)) != null && !base.equals(form))
			{
				baseSet.remove(form);
				fBaseRemoved.println(form);
			}
		}

		System.out.println("- from exception: "+baseSet.size()+" "+excMap.size());

		// remove exception forms in the exception map that can be inflected by rules
		// NOTE: currently a no-op because the rule-based filtering below is disabled
		for (String form : Sets.newHashSet(excMap.keySet()))
		{
			base = excMap.get(form);

//			for (EnglishMPToken token : inflection.getInflectionsFromSuffixes(form))
//			{
//				baseMorphem = token.getBaseMorpheme();
//
//				if (baseMorphem.isForm(base))
//				{
//					excMap.remove(form);
//					fExcRemoved.println(form+" "+base+" "+getPOS(pos, token.getInflectionMorpheme().getPOS()));
//					break;
//				}
//			}
		}

		System.out.println("- inflected excs: "+baseSet.size()+" "+excMap.size());

		// print a new base set
		list = Lists.newArrayList(baseSet);
		Collections.sort(list);

		for (String key : list)
			fBase.println(key);

		// print a new exception map
		list = Lists.newArrayList(excMap.keySet());
		Collections.sort(list);

		for (String key : list)
			fExc.println(key+" "+excMap.get(key));
	}
	finally
	{
		// release every stream that was opened, whether or not the steps above succeeded
		if (fBaseRemoved != null)	fBaseRemoved.close();
		if (fExcRemoved  != null)	fExcRemoved.close();
		if (fBase        != null)	fBase.close();
		if (fExc         != null)	fExc.close();
	}
}
/**
 * Evaluates the suffix matchers against tab-delimited data and prints the statistics
 * to standard output. Each line must provide at least three fields; the third field is
 * the POS tag, and lines whose POS tag is not a verb, noun, adjective or adverb tag are skipped.
 * The stream is not closed by this method.
 * @param in the stream of the evaluation data.
 * @throws Exception if the stream cannot be read.
 */
public void evaluateInflection(InputStream in) throws Exception
{
	BufferedReader reader = new BufferedReader(new InputStreamReader(in));
	Map<String,Map<String,Prob2DMap>> stats = Maps.newHashMap();

	for (String line = reader.readLine(); line != null; line = reader.readLine())
	{
		String[] fields = PTLib.splitTabs(line);
		// presumably the word-form and its expected lemma - verify against the data format
		String form  = fields[0];
		String lemma = fields[1];
		String tag   = fields[2];
		EnglishInflection rules;

		if      (MPLibEn.isVerb(tag))		rules = inf_verb;
		else if (MPLibEn.isNoun(tag))		rules = inf_noun;
		else if (MPLibEn.isAdjective(tag))	rules = inf_adjective;
		else if (MPLibEn.isAdverb(tag))		rules = inf_adverb;
		else								continue;

		for (AbstractAffixMatcher matcher : rules.getSuffixMatchers())
		{
			EnglishSuffixMatcher suffixMatcher = (EnglishSuffixMatcher)matcher;
			suffixMatcher.evaluateInflection(stats, rules.getBaseSet(), lemma, form, tag);
		}
	}

	printEvaluation(stats);
}
/**
 * Prints the evaluation statistics to standard output.
 * Outer keys are printed in sorted order, each on its own line, followed by one
 * tab-delimited line per entry of every inner map (inner keys also in sorted order).
 * @param smap the statistics collected by {@link #evaluateInflection(InputStream)}.
 */
private void printEvaluation(Map<String,Map<String,Prob2DMap>> smap)
{
	List<String> outerKeys = Lists.newArrayList(smap.keySet());
	Collections.sort(outerKeys);

	for (String outerKey : outerKeys)
	{
		System.out.println(outerKey);

		Map<String,Prob2DMap> inner = smap.get(outerKey);
		List<String> innerKeys = Lists.newArrayList(inner.keySet());
		Collections.sort(innerKeys);

		for (String innerKey : innerKeys)
		{
			Prob2DMap probs = inner.get(innerKey);

			for (String key : probs.keySet())
			{
				String distribution = Arrays.toString(probs.getProb1D(key));
				System.out.printf("%s\t%s\t%s\t%d\n", innerKey, key, distribution, probs.getTotal1D(key));
			}
		}
	}
}
// static public void main(String[] args)
// {
// EnglishMPAnalyzer morph = new EnglishMPAnalyzer();
//
// try
// {
// morph.trim(args[0]);
// morph.check(args[0]);
// morph.evaluateInflection(new FileInputStream(args[0]));
// }
// catch (Exception e) {e.printStackTrace();}
// }
}