package edu.stanford.nlp.patterns.surface;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.Execution;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Triple;
import java.nio.charset.Charset;
import java.util.*;
/**
 * Builds {@link SurfacePattern}s for a target token from the tokens to its
 * left and/or right in a sentence.
 *
 * <p>All options are static and are filled from a {@link Properties} object by
 * {@link #setUp(Properties)}. {@code setUp} also creates the {@link #fw} and
 * {@link #sw} tokens that {@link #getContext(List, int)} dereferences, so it
 * has to run before any pattern is generated.
 *
 * Created by sonalg on 10/27/14.
 */
public class SurfacePatternFactory extends PatternFactory {

  /**
   * Use POS tag restriction in the target term: One of this and
   * <code>addPatWithoutPOS</code> has to be true.
   */
  @Execution.Option(name = "usePOS4Pattern")
  public static boolean usePOS4Pattern = true;

  /**
   * Add patterns without POS restriction as well: One of this and
   * <code>usePOS4Pattern</code> has to be true.
   */
  @Execution.Option(name = "addPatWithoutPOS")
  public static boolean addPatWithoutPOS = true;

  /**
   * Consider contexts longer than or equal to this many tokens.
   */
  @Execution.Option(name = "minWindow4Pattern")
  public static int minWindow4Pattern = 2;

  /**
   * Consider contexts less than or equal to this many tokens -- the total of
   * the left and right contexts can be double this.
   */
  @Execution.Option(name = "maxWindow4Pattern")
  public static int maxWindow4Pattern = 4;

  /**
   * Consider contexts on the left of a token.
   */
  @Execution.Option(name = "usePreviousContext")
  public static boolean usePreviousContext = true;

  /**
   * Consider contexts on the right of a token.
   */
  @Execution.Option(name = "useNextContext")
  public static boolean useNextContext = false;

  /**
   * If the whole (either left or right) context is just stop words, add the
   * pattern only if the number of tokens is equal to or more than this. This
   * is to get patterns like "I am on X" but ignore "on X".
   */
  @Execution.Option(name = "numMinStopWordsToAdd")
  public static int numMinStopWordsToAdd = 3;

  /**
   * Adds the parent's tag from the parse tree to the target phrase in the patterns
   */
  @Execution.Option(name = "useTargetParserParentRestriction")
  public static boolean useTargetParserParentRestriction = false;

  /**
   * If the NER tag of the context tokens is not the background symbol,
   * generalize the token with the NER tag
   */
  @Execution.Option(name = "useContextNERRestriction")
  public static boolean useContextNERRestriction = false;

  /**
   * Ignore words like "a", "an", "the" when matching a pattern.
   */
  @Execution.Option(name = "useFillerWordsInPat")
  public static boolean useFillerWordsInPat = true;

  /** Which side(s) of the target token a pattern's context comes from. */
  public enum Genre {
    PREV, NEXT, PREVNEXT
  }

  /**
   * Filler-word ({@code fw}) and stop-word ({@code sw}) tokens that are
   * interleaved with the context tokens. Both are created by
   * {@link #setUp(Properties)} and are left without restrictions when the
   * corresponding option is off.
   */
  static Token fw, sw;

  /** Cached so that {@link #isASCII(String)} does not look the charset up on every call. */
  private static final Charset US_ASCII = Charset.forName("US-ASCII");

  /**
   * Fills the static options of this class and of {@link SurfacePattern} from
   * {@code props}, validates them, and (re)creates {@link #fw} and {@link #sw}.
   *
   * @param props properties to read the option values from
   * @throws RuntimeException if both {@code addPatWithoutPOS} and
   *         {@code usePOS4Pattern} are false
   */
  public static void setUp(Properties props) {
    Execution.fillOptions(SurfacePatternFactory.class, props);
    Execution.fillOptions(SurfacePattern.class, props);

    if (!addPatWithoutPOS && !usePOS4Pattern) {
      throw new RuntimeException(
          "addPatWithoutPOS and usePOS4Pattern both cannot be false ");
    }

    fw = new Token();
    if (useFillerWordsInPat) {
      fw.setEnvBindRestriction("$FILLER");
      fw.setNumOcc(0, 2);
    }

    sw = new Token();
    if (useStopWordsBeforeTerm) {
      sw.setEnvBindRestriction("$STOPWORD");
      sw.setNumOcc(0, 2);
    }
  }

  /**
   * Generates the surface patterns around the token at index {@code i}.
   *
   * <p>For every window size from 1 to {@code maxWindow4Pattern} the left
   * and/or right context is collected and, when it is long enough and not made
   * up of too few stop words only, turned into {@link Genre#PREV},
   * {@link Genre#NEXT} and {@link Genre#PREVNEXT} patterns. Each pattern is
   * produced with and/or without the POS restriction on the target, depending
   * on {@code usePOS4Pattern} and {@code addPatWithoutPOS}.
   *
   * <p>When {@code usePOS4Pattern} is set the target token must carry a POS
   * tag; its first two characters are used as the restriction.
   *
   * @param sent tokens of the sentence
   * @param i index of the target token in {@code sent}
   * @return the union of the previous, next and previous+next patterns
   */
  public static Set<SurfacePattern> getContext(List<CoreLabel> sent, int i) {
    Set<SurfacePattern> prevpatterns = new HashSet<SurfacePattern>();
    Set<SurfacePattern> nextpatterns = new HashSet<SurfacePattern>();
    Set<SurfacePattern> prevnextpatterns = new HashSet<SurfacePattern>();

    CoreLabel token = sent.get(i);
    String tag = null;
    if (usePOS4Pattern) {
      String fulltag = token.tag();
      // Only the coarse tag (at most the first two characters) is used.
      tag = fulltag.substring(0, Math.min(fulltag.length(), 2));
    }
    String nerTag = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

    for (int maxWin = 1; maxWin <= maxWindow4Pattern; maxWin++) {
      PatternToken twithoutPOS = null;
      if (addPatWithoutPOS) {
        twithoutPOS = new PatternToken(tag, false,
            numWordsCompound > 1, numWordsCompound,
            nerTag, useTargetNERRestriction, useTargetParserParentRestriction,
            token.get(CoreAnnotations.GrandparentAnnotation.class));
      }

      PatternToken twithPOS = null;
      if (usePOS4Pattern) {
        twithPOS = new PatternToken(tag, true,
            numWordsCompound > 1, numWordsCompound,
            nerTag, useTargetNERRestriction, useTargetParserParentRestriction,
            token.get(CoreAnnotations.GrandparentAnnotation.class));
      }

      ContextWindow prev = usePreviousContext
          ? collectContext(sent, i, maxWin, true) : new ContextWindow();
      ContextWindow next = useNextContext
          ? collectContext(sent, i, maxWin, false) : new ContextWindow();

      // ---- previous (left) context ----
      Token[] prevContext = null;
      boolean useprev = false;
      if (prev.tokens.size() >= minWindow4Pattern && prev.hasEnoughContent()) {
        List<Token> prevContextList = new ArrayList<Token>();
        List<String> prevOriginal = new ArrayList<String>();

        // Every context token is followed by the optional filler-word token.
        for (Token p : prev.tokens) {
          prevContextList.add(p);
          if (!fw.isEmpty()) {
            prevContextList.add(fw);
          }
        }
        for (String p : prev.originals) {
          prevOriginal.add(p);
          if (!fw.isEmpty()) {
            prevOriginal.add(" FW ");
          }
        }

        // Optional stop words directly before the target term.
        if (!sw.isEmpty()) {
          prevContextList.add(sw);
          prevOriginal.add(" SW ");
        }

        // Contexts with non-ASCII text are dropped on the left side.
        if (isASCII(StringUtils.join(prevOriginal))) {
          prevContext = prevContextList.toArray(new Token[0]);
          addPatterns(prevpatterns, prevContext, twithoutPOS, twithPOS, null, Genre.PREV);
          useprev = true;
        }
      }

      // ---- next (right) context ----
      // NOTE(review): unlike the previous context, the next context is not
      // ASCII-filtered; this keeps the existing behavior -- confirm whether
      // the asymmetry is intended.
      Token[] nextContext = null;
      boolean usenext = false;
      if (next.tokens.size() > 0 && next.hasEnoughContent()) {
        List<Token> nextContextList = new ArrayList<Token>();

        // Optional stop words directly after the target term.
        if (!sw.isEmpty()) {
          nextContextList.add(sw);
        }

        // Every context token is preceded by the optional filler-word token.
        for (Token n : next.tokens) {
          if (!fw.isEmpty()) {
            nextContextList.add(fw);
          }
          nextContextList.add(n);
        }

        // Assigned before the size check: a right context shorter than
        // minWindow4Pattern yields no NEXT pattern but still takes part in
        // PREVNEXT patterns, which must not be built with a null next context.
        nextContext = nextContextList.toArray(new Token[0]);

        if (next.tokens.size() >= minWindow4Pattern) {
          addPatterns(nextpatterns, null, twithoutPOS, twithPOS, nextContext, Genre.NEXT);
        }
        usenext = true;
      }

      // ---- both sides ----
      if (useprev && usenext
          && prev.tokens.size() + next.tokens.size() >= minWindow4Pattern) {
        addPatterns(prevnextpatterns, prevContext, twithoutPOS, twithPOS, nextContext,
            Genre.PREVNEXT);
      }
    }

    return CollectionUtils.unionAsSet(prevpatterns, nextpatterns, prevnextpatterns);
  }

  /**
   * Context tokens gathered on one side of the target token, in sentence
   * order, together with stop-word statistics.
   */
  private static final class ContextWindow {
    /** Pattern tokens of the context, in sentence order. */
    final List<Token> tokens = new ArrayList<Token>();
    /** Human-readable form of each entry of {@link #tokens}. */
    final List<String> originals = new ArrayList<String>();
    int numStopWords = 0;
    int numNonStopWords = 0;

    /** Adds a context token at the front (left scan) or at the end (right scan). */
    void add(Token token, String original, boolean atFront) {
      if (atFront) {
        tokens.add(0, token);
        originals.add(0, original);
      } else {
        tokens.add(token);
        originals.add(original);
      }
    }

    /**
     * True if the context has at least one non-stop-word token, or at least
     * {@code numMinStopWordsToAdd} stop words.
     */
    boolean hasEnoughContent() {
      return numNonStopWords > 0 || numStopWords >= numMinStopWordsToAdd;
    }
  }

  /**
   * Collects up to {@code maxWin} context tokens next to position {@code i},
   * walking left ({@code lookLeft}) or right.
   *
   * <p>Filler words are skipped without counting towards {@code maxWin} when
   * {@code useFillerWordsInPat} is set. If a token that is not generalized
   * starts with "http", the tokens collected so far are discarded and the scan
   * stops (the stop-word counters are left as they are).
   */
  private static ContextWindow collectContext(List<CoreLabel> sent, int i, int maxWin,
      boolean lookLeft) {
    ContextWindow window = new ContextWindow();
    int step = lookLeft ? -1 : 1;
    int numTokens = 0;

    for (int j = i + step; numTokens < maxWin && j >= 0 && j < sent.size(); j += step) {
      CoreLabel tokenj = sent.get(j);

      // Do not use filler words in context consideration. Locale.ROOT keeps
      // the lookup independent of the JVM's default locale.
      if (useFillerWordsInPat
          && fillerWords.contains(tokenj.word().toLowerCase(Locale.ROOT))) {
        continue;
      }

      Triple<Boolean, Token, String> tr = getContextTokenStr(tokenj);
      boolean isLabeledO = tr.first;

      if (!isLabeledO) {
        // The token is replaced by its generalized class / NER restriction.
        window.add(tr.second, tr.third, lookLeft);
        window.numNonStopWords++;
      } else if (tokenj.word().startsWith("http")) {
        window.tokens.clear();
        window.originals.clear();
        break;
      } else {
        String tokenjStr = useLemmaContextTokens ? tokenj.lemma() : tokenj.word();
        window.add(SurfacePattern.getContextToken(tokenj), tokenjStr, lookLeft);
        if (doNotUse(tokenjStr, ConstantsAndVariables.getStopWords())) {
          window.numStopWords++;
        } else {
          window.numNonStopWords++;
        }
      }
      numTokens++;
    }
    return window;
  }

  /**
   * Adds one pattern per non-null target token ({@code twithoutPOS},
   * {@code twithPOS}) with the given contexts and genre to {@code patterns}.
   */
  private static void addPatterns(Set<SurfacePattern> patterns, Token[] prevContext,
      PatternToken twithoutPOS, PatternToken twithPOS, Token[] nextContext, Genre genre) {
    if (twithoutPOS != null) {
      patterns.add(new SurfacePattern(prevContext, twithoutPOS, nextContext, genre));
    }
    if (twithPOS != null) {
      patterns.add(new SurfacePattern(prevContext, twithPOS, nextContext, genre));
    }
  }

  /**
   * Builds the generalized form of a context token.
   *
   * <p>A restriction is added for every class in
   * {@code ConstantsAndVariables.getGeneralizeClasses()} whose annotation on
   * the token is present and differs from the background symbol and, when
   * {@code useContextNERRestriction} is set, for a non-background NER tag.
   *
   * @param tokenj the context token
   * @return a triple of: whether the token carries no such label (i.e. is
   *         background everywhere), the token holding the OR-ed restrictions,
   *         and the matched labels joined with "|"
   */
  static Triple<Boolean, Token, String> getContextTokenStr(CoreLabel tokenj) {
    Token strgeneric = new Token();
    String strOriginal = "";
    boolean isLabeledO = true;

    for (Map.Entry<String, Class> e : ConstantsAndVariables.getGeneralizeClasses().entrySet()) {
      Object value = tokenj.get(e.getValue());
      // A missing annotation is treated like the background symbol, in line
      // with the null check on the NER tag below.
      if (value != null && !value.equals(ConstantsAndVariables.backgroundSymbol)) {
        isLabeledO = false;
        if (strOriginal.isEmpty()) {
          strOriginal = e.getKey();
        } else {
          strOriginal += "|" + e.getKey();
        }
        strgeneric.addORRestriction(e.getValue(), e.getKey());
      }
    }

    if (useContextNERRestriction) {
      String nerTag = tokenj.get(CoreAnnotations.NamedEntityTagAnnotation.class);
      if (nerTag != null
          && !nerTag.equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL)) {
        isLabeledO = false;
        if (strOriginal.isEmpty()) {
          strOriginal = nerTag;
        } else {
          strOriginal += "|" + nerTag;
        }
        strgeneric.addORRestriction(CoreAnnotations.NamedEntityTagAnnotation.class, nerTag);
      }
    }

    return new Triple<Boolean, Token, String>(isLabeledO, strgeneric, strOriginal);
  }

  /**
   * Tells whether {@code text} consists of US-ASCII characters only.
   *
   * @param text the text to check; must not be null
   * @return true if every character of {@code text} can be encoded in US-ASCII
   */
  public static boolean isASCII(String text) {
    // A fresh encoder per call: CharsetEncoder instances are not thread-safe.
    return US_ASCII.newEncoder().canEncode(text);
  }
}