/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.eval;
import org.apache.tika.io.IOUtils;
import org.languagetool.JLanguageTool;
import org.languagetool.language.BritishEnglish;
import org.languagetool.language.English;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.languagemodel.LuceneLanguageModel;
import org.languagetool.rules.ConfusionProbabilityRule;
import org.languagetool.rules.ConfusionSetLoader;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.en.EnglishConfusionProbabilityRule;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
/**
 * Runs LanguageTool's confusion rule on Wikipedia-extracted sentences that we assume to be correct.
 * Every match the rule reports on such a sentence is therefore counted as a false alarm.
 *
 * <p>The input is a directory of {@code xx.txt} files, where {@code xx} is the homophone whose
 * confusion set is activated while the lines of that file are checked. Results are printed to
 * standard output.
 *
 * @since 2.7
 */
class RealWordFalseAlarmEvaluator {

  private static final boolean EVAL_MODE = true;  // set to false to get data for homophones-info.txt
  private static final int MAX_SENTENCES = 1000;
  private static final int MAX_ERROR_DISPLAY = 50;
  // the minimum number of sentences in homophones-info.txt, items with fewer sentences will be ignored (eval mode only):
  private static final int MIN_SENTENCES = 0;
  // maximum error rate of a homophone in homophones-info.txt, items with a larger error rate will be ignored (eval mode only):
  private static final float MAX_ERROR_RATE = 10;
  // only files with this suffix are evaluated, the rest of the file name is the homophone:
  private static final String TEXT_FILE_SUFFIX = ".txt";

  private final JLanguageTool langTool;
  private final ConfusionProbabilityRule confusionRule;
  private final Map<String,ConfusionProbabilityRule.ConfusionSet> confusionSet;
  private final LanguageModel languageModel;

  private int globalSentenceCount;
  private int globalRuleMatches;

  /**
   * Loads the homophone confusion sets and sets up a LanguageTool instance in which the
   * confusion probability rule is the only active rule.
   *
   * @param languageModelIndexDir directory passed to {@link LuceneLanguageModel}
   * @throws IOException if the resources or the language model cannot be read
   */
  RealWordFalseAlarmEvaluator(File languageModelIndexDir) throws IOException {
    confusionSet = loadConfusionSet();
    langTool = new JLanguageTool(new BritishEnglish());
    // disable everything that is active by default so that only the confusion rule produces matches:
    List<Rule> rules = langTool.getAllActiveRules();
    for (Rule rule : rules) {
      langTool.disableRule(rule.getId());
    }
    languageModel = new LuceneLanguageModel(languageModelIndexDir);
    confusionRule = new EnglishConfusionProbabilityRule(JLanguageTool.getMessageBundle(), languageModel, new English());
    langTool.addRule(confusionRule);
  }

  /**
   * Reads {@code /en/homophones.txt} and, in eval mode, restricts it using
   * {@code /en/homophones-info.txt}. Both streams are closed before returning.
   */
  private static Map<String,ConfusionProbabilityRule.ConfusionSet> loadConfusionSet() throws IOException {
    try (InputStream homophoneStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/homophones.txt")) {
      if (EVAL_MODE) {
        try (InputStream infoStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/homophones-info.txt")) {
          ConfusionSetLoader filteringLoader = new ConfusionSetLoader(infoStream, MIN_SENTENCES, MAX_ERROR_RATE);
          return filteringLoader.loadConfusionSet(homophoneStream);
        }
      }
      return new ConfusionSetLoader().loadConfusionSet(homophoneStream);
    }
  }

  /** Closes the language model opened by the constructor. */
  void close() {
    languageModel.close();
  }

  /**
   * Checks every {@code *.txt} file in the given directory and prints per-file and overall
   * statistics. Files are processed in sorted order so that repeated runs are comparable.
   *
   * @param dir directory with files named {@code xx.txt} where {@code xx} is the homophone
   * @throws IOException if the directory cannot be listed or a file cannot be read
   */
  void run(File dir) throws IOException {
    if (EVAL_MODE) {
      System.out.println("Running in eval mode, no 'DATA' lines will be printed, only a subset of the homophones will be used.");
    } else {
      System.out.println("grep for '^DATA;' to get results in CSV format:");
      System.out.println("DATA;word;sentence_count;errors_found;errors_percent");
    }
    File[] files = dir.listFiles();
    if (files == null) {
      // listFiles() returns null if 'dir' is not a directory or an I/O error occurs
      throw new IOException("Could not list files in directory: " + dir);
    }
    Arrays.sort(files);
    // count the relevant files up-front so the progress display "(x/y)" can actually reach y:
    int textFileCount = 0;
    for (File file : files) {
      if (isTextFile(file)) {
        textFileCount++;
      }
    }
    int fileCount = 1;
    for (File file : files) {
      if (!isTextFile(file)) {
        System.out.println("Ignoring " + file + ", does not match *.txt");
        continue;
      }
      String fileName = file.getName();
      // strip only the trailing suffix, the remainder is the homophone:
      String homophone = fileName.substring(0, fileName.length() - TEXT_FILE_SUFFIX.length());
      try (FileInputStream fis = new FileInputStream(file)) {
        System.out.println("===== Working on " + fileName + " (" + fileCount + "/" + textFileCount + ") =====");
        // NOTE(review): readLines(InputStream) presumably decodes with the platform default charset - confirm
        checkLines(IOUtils.readLines(fis), homophone);
        fileCount++;
      }
    }
    System.out.println("==============================");
    printSummary(globalSentenceCount, globalRuleMatches);
  }

  private static boolean isTextFile(File file) {
    return file.getName().endsWith(TEXT_FILE_SUFFIX);
  }

  /**
   * Checks up to {@link #MAX_SENTENCES} lines with the confusion set of the given homophone
   * and prints the statistics for them. Does nothing but print a message if no confusion set
   * was loaded for the homophone.
   *
   * @param lines the sentences to check, one per item
   * @param name the homophone, used as key into the loaded confusion sets
   */
  private void checkLines(List<String> lines, String name) throws IOException {
    ConfusionProbabilityRule.ConfusionSet subConfusionSet = confusionSet.get(name);
    if (subConfusionSet == null) {
      System.out.println("Skipping '" + name + "', homophone not loaded");
      return;
    }
    confusionRule.setConfusionSet(subConfusionSet);
    int sentenceCount = 0;
    int ruleMatches = 0;
    for (String line : lines) {
      List<RuleMatch> matches = langTool.check(line);
      sentenceCount++;
      globalSentenceCount++;
      if (!matches.isEmpty()) {
        Set<String> suggestions = new HashSet<>();
        for (RuleMatch match : matches) {
          suggestions.addAll(match.getSuggestedReplacements());
          ruleMatches++;
          globalRuleMatches++;
        }
        // only show the first few false alarms per homophone to keep the output readable:
        if (ruleMatches <= MAX_ERROR_DISPLAY) {
          System.out.println("[" + name + "] " + line + " => " + suggestions);
        }
      }
      if (sentenceCount >= MAX_SENTENCES) {
        System.out.println("Max sentences (" + MAX_SENTENCES + ") reached, stopping");
        break;
      }
    }
    float percentage = printSummary(sentenceCount, ruleMatches);
    if (!EVAL_MODE) {
      System.out.printf(Locale.ENGLISH, "DATA;%s;%d;%d;%.2f\n\n", name, sentenceCount, ruleMatches, percentage);
    }
  }

  /**
   * Prints the sentence count, the match count and their ratio.
   *
   * @return the number of matches per 100 sentences, or 0 if no sentence was checked
   */
  private static float printSummary(int sentenceCount, int ruleMatches) {
    System.out.println(sentenceCount + " sentences checked");
    System.out.println(ruleMatches + " errors found");
    // every match is counted, so a sentence with two matches contributes twice to this value;
    // avoid NaN if there was nothing to check:
    float percentage = sentenceCount == 0 ? 0 : (float) ruleMatches / (float) sentenceCount * 100;
    System.out.printf("%.2f%% of sentences have a match\n", percentage);
    return percentage;
  }

  /**
   * Command line entry point, expects the language model index directory and the sentence directory.
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.out.println("Usage: " + RealWordFalseAlarmEvaluator.class.getSimpleName() + " <languageModel> <sentenceDirectory>");
      System.out.println("   <languageModel> is a Lucene index with ngram frequency information");
      System.out.println("   <sentenceDirectory> is a directory with filenames like 'xx.txt' where 'xx' is the homophone");
      System.exit(1);
    }
    // validate the cheap argument first so that no language model is opened in vain:
    File dir = new File(args[1]);
    if (!dir.isDirectory()) {
      throw new RuntimeException("Not a directory: " + dir);
    }
    RealWordFalseAlarmEvaluator evaluator = new RealWordFalseAlarmEvaluator(new File(args[0]));
    try {
      evaluator.run(dir);
    } finally {
      evaluator.close();
    }
  }

}