Examples of org.languagetool.rules.ConfusionSetLoader

org.languagetool.rules.ConfusionSetLoader
Loads a confusion set from a plain text file (UTF-8). Expects a file where there is one confusion set per line, words separated by commas. Also optionally loads information about the quality of the confusion sets from another file so confusion sets that might not produce good results can be ignored. @since 2.7

  /**
   * Creates a rule creator and stores the given value in the {@code minErrorProb} field.
   *
   * @param minErrorProb value kept in {@code minErrorProb}; presumably the minimum error
   *                     probability used when creating rules (NOTE(review): confirm against
   *                     the code that reads the field, which is not visible here)
   */
  public RuleCreator(float minErrorProb) {
    this.minErrorProb = minErrorProb;
  }


  private void run(File homophoneOccurrences, String homophonePath) throws IOException {
    ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
    InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(homophonePath);
    Map<String,ConfusionProbabilityRule.ConfusionSet> confusionSetMap = confusionSetLoader.loadConfusionSet(inputStream);
    initMaps(homophoneOccurrences);
    int groupCount = 0;
    if (XML_MODE) {
      System.out.println("<rules lang='en'>\n");
      System.out.println("<category name='Auto-generated rules'>\n");

View Full Code Here

    return result;
  }


  private void run(String homophonePath) throws IOException {
    System.err.println("Loading homophones from " + homophonePath + ", minimum occurrence: " + MIN_COUNT);
    ConfusionSetLoader confusionSetLoader = new ConfusionSetLoader();
    InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(homophonePath);
    Map<String,ConfusionProbabilityRule.ConfusionSet> map = confusionSetLoader.loadConfusionSet(inputStream);
    Set<String> confusionTerms = map.keySet();
    dumpOccurrences(confusionTerms);
  }

View Full Code Here

 * @since 2.7
 */
final class ConfusionSetUrlGenerator {


  public static void main(String[] args) throws IOException {
    ConfusionSetLoader confusionSetLoader =  new ConfusionSetLoader();
    InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/homophones.txt");
    Map<String,ConfusionProbabilityRule.ConfusionSet> map = confusionSetLoader.loadConfusionSet(inputStream);
    String url = "http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-2gram-20120701-<XX>.gz";
    Set<String> nameSet = new HashSet<>();
    for (String s : map.keySet()) {
      if (s.length() < 2) {
        nameSet.add(s.substring(0, 1).toLowerCase() + "_");

View Full Code Here

    System.out.println(" (Lookups is the number of lookups needed to see which word in the homophones set " +
            "is more common. Actually even more ngram lookups will be needed, depending on what ngrams we have.)");
  }


  private Map<String, ConfusionProbabilityRule.ConfusionSet> getConfusionSet() throws IOException {
    ConfusionSetLoader loader = new ConfusionSetLoader();
    InputStream homophoneStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/homophones.txt");
    return loader.loadConfusionSet(homophoneStream);
  }

View Full Code Here

  private int globalSentenceCount;
  private int globalRuleMatches;


  RealWordFalseAlarmEvaluator(File languageModelIndexDir) throws IOException {
    InputStream inputStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/homophones.txt");
    ConfusionSetLoader confusionSetLoader;
    if (EVAL_MODE) {
      InputStream infoStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/homophones-info.txt");
      confusionSetLoader =  new ConfusionSetLoader(infoStream, MIN_SENTENCES, MAX_ERROR_RATE);
    } else {
      confusionSetLoader =  new ConfusionSetLoader();
    }
    confusionSet = confusionSetLoader.loadConfusionSet(inputStream);
    langTool = new JLanguageTool(new BritishEnglish());
    //langTool.activateDefaultPatternRules();
    List<Rule> rules = langTool.getAllActiveRules();
    for (Rule rule : rules) {
      langTool.disableRule(rule.getId());

View Full Code Here

TOP

Related Classes of org.languagetool.rules.ConfusionSetLoader

org.languagetool.dev.ConfusionSetCoverage

org.languagetool.dev.ConfusionSetUrlGenerator

org.languagetool.dev.eval.RealWordFalseAlarmEvaluator

org.languagetool.dev.HomophoneOccurrenceDumper

org.languagetool.dev.RuleCreator

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.