Source Code of org.languagetool.rules.de.AgreementRule

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.de;


import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.language.German;
import org.languagetool.rules.Category;
import org.languagetool.rules.Example;
import org.languagetool.rules.RuleMatch;
import org.languagetool.tagging.de.AnalyzedGermanToken;
import org.languagetool.tagging.de.GermanTagger;
import org.languagetool.tagging.de.GermanToken;
import org.languagetool.tagging.de.GermanToken.POSType;
import org.languagetool.tools.StringTools;


import java.io.IOException;
import java.util.*;


/**
 * Simple agreement checker for German noun phrases. Checks agreement in:
 * 
 * <ul>
 *  <li>DET/PRO NOUN: e.g. "mein Auto", "der Mann", "die Frau" (correct), "die Haus" (incorrect)</li>
 *  <li>DET/PRO ADJ NOUN: e.g. "der riesige Tisch" (correct), "die riesigen Tisch" (incorrect)</li> 
 * </ul>
 * 
 * Note that this rule only checks agreement inside the noun phrase, not whether
 * e.g. the correct case is used. For example, "Es ist das Haus dem Mann" is not
 * detected as incorrect.
 *
 * <p>TODO: the implementation could use a re-write that first detects the relevant noun phrases and then checks agreement
 *  
 * @author Daniel Naber
 */
public class AgreementRule extends GermanRule {


  private final German language;


  private enum GrammarCategory {
    KASUS("Kasus (Fall: Wer/Was, Wessen, Wem, Wen/Was - Beispiel: 'das Fahrrads' statt 'des Fahrrads')"),
    GENUS("Genus (männlich, weiblich, sächlich - Beispiel: 'der Fahrrad' statt 'das Fahrrad')"),
    NUMERUS("Numerus (Einzahl, Mehrzahl - Beispiel: 'das Fahrräder' statt 'die Fahrräder')");
    
    private final String displayName;
    private GrammarCategory(String displayName) {
      this.displayName = displayName;
    }
  }


  /*
   * City names are incoherently tagged in the Morphy data. To avoid
   * false alarms on phrases like "das Berliner Auto" we have to
   * explicitly add these adjective readings to "Berliner" and to all
   * other potential city names:
   */
  private static final String[] ADJ_READINGS = {
    // singular:
    "ADJ:NOM:SIN:MAS:GRU", "ADJ:NOM:SIN:NEU:GRU", "ADJ:NOM:SIN:FEM:GRU",    // das Berliner Auto
    "ADJ:GEN:SIN:MAS:GRU", "ADJ:GEN:SIN:NEU:GRU", "ADJ:GEN:SIN:FEM:GRU",    // des Berliner Autos 
    "ADJ:DAT:SIN:MAS:GRU", "ADJ:DAT:SIN:NEU:GRU", "ADJ:DAT:SIN:FEM:GRU",    // dem Berliner Auto
    "ADJ:AKK:SIN:MAS:GRU", "ADJ:AKK:SIN:NEU:GRU", "ADJ:AKK:SIN:FEM:GRU",    // den Berliner Bewohner
    // plural:
    "ADJ:NOM:PLU:MAS:GRU", "ADJ:NOM:PLU:NEU:GRU", "ADJ:NOM:PLU:FEM:GRU",    // die Berliner Autos
    "ADJ:GEN:PLU:MAS:GRU", "ADJ:GEN:PLU:NEU:GRU", "ADJ:GEN:PLU:FEM:GRU",    // der Berliner Autos 
    "ADJ:DAT:PLU:MAS:GRU", "ADJ:DAT:PLU:NEU:GRU", "ADJ:DAT:PLU:FEM:GRU",    // den Berliner Autos
    "ADJ:AKK:PLU:MAS:GRU", "ADJ:AKK:PLU:NEU:GRU", "ADJ:AKK:PLU:FEM:GRU",    // den Berliner Bewohnern
  };
  
  /*
   * The heuristic of maybeAddAdjectiveReadings considers every noun ending with "er" as city name.
   * The nouns in this list are NOT considered as city names.
   * NOTE: Only nouns for which cutting off the final "er" produces a valid noun must be added to this list.
   */
  private static final Set<String> ER_TO_BE_IGNORED = new HashSet<>(Arrays.asList(
    "Alter",
    "Kinder",
    "Rinder"
  ));
  
  private static final Set<String> VIELE_WENIGE_LOWERCASE = new HashSet<>(Arrays.asList(
    "viele",
    "vieler",
    "wenige",
    "weniger",
    "einige",
    "einiger",
    "mehrerer",
    "mehrere"
  ));
  
  private static final Set<String> REL_PRONOUN = new HashSet<>();
  static {
    REL_PRONOUN.add("der");
    REL_PRONOUN.add("die");
    REL_PRONOUN.add("das");
    REL_PRONOUN.add("dessen");
    REL_PRONOUN.add("deren");
    REL_PRONOUN.add("dem");
    REL_PRONOUN.add("den");
    REL_PRONOUN.add("denen");
    REL_PRONOUN.add("welche");
    REL_PRONOUN.add("welcher");
    REL_PRONOUN.add("welchen");
    REL_PRONOUN.add("welchem");
    REL_PRONOUN.add("welches");
  }


  private static final Set<String> PREPOSITIONS = new HashSet<>();
  static {
    PREPOSITIONS.add("in");
    PREPOSITIONS.add("auf");
    PREPOSITIONS.add("an");
    PREPOSITIONS.add("ab");
    PREPOSITIONS.add("für");
    PREPOSITIONS.add("zu");
    PREPOSITIONS.add("bei");
    PREPOSITIONS.add("nach");
    PREPOSITIONS.add("über");
    PREPOSITIONS.add("von");
    PREPOSITIONS.add("mit");
    PREPOSITIONS.add("durch");
    // TODO: add more
  }
  
  private static final Set<String> PRONOUNS_TO_BE_IGNORED = new HashSet<>(Arrays.asList(
    "ich",
    "dir",
    "du",
    "er", "sie", "es",
    "wir",
    "mir",
    "uns",
    "ihnen",
    "euch",
    "ihm",
    "ihr",
    "ihn",
    "dessen",
    "deren",
    "denen",
    "sich",
    "unser",
    "aller",
    "man",
    "beide",
    "beiden",
    "beider",
    "wessen",
    "a",
    "alle",
    "etwas",
    "was",
    "wer",
    "jenen",      // "...und mit jenen anderer Arbeitsgruppen verwoben"
    "diejenigen",
    "jemand",
    "niemand"
  ));
  
  private static final Set<String> NOUNS_TO_BE_IGNORED = new HashSet<>(Arrays.asList(
    "Prozent",   // Plural "Prozente", trotzdem ist "mehrere Prozent" korrekt
    "Gramm",
    "Kilogramm"
  ));
    
  public AgreementRule(final ResourceBundle messages, German language) {
    this.language = language;
    if (messages != null) {
      super.setCategory(new Category(messages.getString("category_grammar")));
    }
    addExamplePair(Example.wrong("<marker>Der Haus</marker> wurde letztes Jahr gebaut."),
                   Example.fixed("<marker>Das Haus</marker> wurde letztes Jahr gebaut"));
  }
  
  @Override
  public String getId() {
    return "DE_AGREEMENT";
  }


  @Override
  public String getDescription() {
    return "Kongruenz von Nominalphrasen (unvollständig!), z.B. 'mein kleiner(kleines) Haus'";
  }


  @Override
  public RuleMatch[] match(final AnalyzedSentence sentence) {
    final List<RuleMatch> ruleMatches = new ArrayList<>();
    final AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();    
    for (int i = 0; i < tokens.length; i++) {
      //defaulting to the first reading
      //TODO: check for all readings
      //and replace GermanTokenReading
      final String posToken = tokens[i].getAnalyzedToken(0).getPOSTag();
      if (posToken != null && posToken.equals(JLanguageTool.SENTENCE_START_TAGNAME)) {
        continue;
      }
      //AnalyzedGermanToken analyzedToken = new AnalyzedGermanToken(tokens[i]);
      
      final AnalyzedTokenReadings tokenReadings = tokens[i];
      final boolean relevantPronoun = isRelevantPronoun(tokens, i);
     
      boolean ignore = couldBeRelativeClause(tokens, i);
      if (i > 0) {
        final String prevToken = tokens[i-1].getToken().toLowerCase();
        if ((tokens[i].getToken().equals("eine") || tokens[i].getToken().equals("einen"))
            && (prevToken.equals("der") || prevToken.equals("die") || prevToken.equals("das") || prevToken.equals("des") || prevToken.equals("dieses"))) {
          // TODO: "der eine Polizist" -> nicht ignorieren, sondern "der polizist" checken; "auf der einen Seite"
          ignore = true;
        }
      }
      
      // avoid false alarm on "nichts Gutes" and "alles Gute"
      if (tokenReadings.getToken().equals("nichts") || tokenReadings.getToken().equals("alles")
          || tokenReadings.getToken().equals("dies")) {
        ignore = true;
      }


      // avoid false alarm on "Art. 1" and "bisherigen Art. 1" (Art. = Artikel):
      boolean detAbbrev = i < tokens.length-2 && tokens[i+1].getToken().equals("Art") && tokens[i+2].getToken().equals(".");
      boolean detAdjAbbrev = i < tokens.length-3 && tokens[i+2].getToken().equals("Art") && tokens[i+3].getToken().equals(".");
      boolean followingParticiple = i < tokens.length-3 && tokens[i+2].hasPartialPosTag("PA1"); //  "einen Hochwasser führenden Fluss"
      if (detAbbrev || detAdjAbbrev || followingParticiple) {
        ignore = true;
      }


      if ((GermanHelper.hasReadingOfType(tokenReadings, POSType.DETERMINER) || relevantPronoun) && !ignore) {
        int tokenPos = i + 1; 
        if (tokenPos >= tokens.length) {
          break;
        }
        AnalyzedTokenReadings nextToken = tokens[tokenPos];
        nextToken = maybeAddAdjectiveReadings(nextToken, tokens, tokenPos);
        if (isNonPredicativeAdjective(nextToken) || isParticiple(nextToken)) {
          tokenPos = i + 2; 
          if (tokenPos >= tokens.length) {
            break;
          }
          if (GermanHelper.hasReadingOfType(tokens[tokenPos], POSType.NOMEN)) {
            // TODO: add a case (checkAdjNounAgreement) for special cases like "deren",
            // e.g. "deren komisches Geschenke" isn't yet detected as incorrect
            if (i >= 2 && GermanHelper.hasReadingOfType(tokens[i-2], POSType.ADJEKTIV)
                       && "als".equals(tokens[i-1].getToken())
                       && "das".equals(tokens[i].getToken())) {
              // avoid false alarm for e.g. "weniger farbenprächtig als das anderer Papageien"
              continue;
            }
            final RuleMatch ruleMatch = checkDetAdjNounAgreement(tokens[i],
                nextToken, tokens[i+2]);
            if (ruleMatch != null) {
              ruleMatches.add(ruleMatch);
            }
          }
        } else if (GermanHelper.hasReadingOfType(nextToken, POSType.NOMEN)) {
          final RuleMatch ruleMatch = checkDetNounAgreement(tokens[i], tokens[i+1]);
          if (ruleMatch != null) {
            ruleMatches.add(ruleMatch);
          }
        }
      }
           
    } // for each token
    return toRuleMatchArray(ruleMatches);
  }


  private boolean isNonPredicativeAdjective(AnalyzedTokenReadings tokensReadings) {
    for (AnalyzedToken reading : tokensReadings.getReadings()) {
      AnalyzedGermanToken germanReading = new AnalyzedGermanToken(reading);
      if (germanReading.getType() == POSType.ADJEKTIV && !germanReading.getPOSTag().contains("PRD")) {
        return true;
      }
    }
    return false;
  }


  private boolean isParticiple(AnalyzedTokenReadings tokensReadings) {
    for (AnalyzedToken reading : tokensReadings.getReadings()) {
      AnalyzedGermanToken germanReading = new AnalyzedGermanToken(reading);
      if (germanReading.getType() == POSType.PARTIZIP) {
        return true;
      }
    }
    return false;
  }


  private boolean isRelevantPronoun(AnalyzedTokenReadings[] tokens, int pos) {
    final AnalyzedTokenReadings analyzedToken = tokens[pos];
    boolean relevantPronoun = GermanHelper.hasReadingOfType(analyzedToken, POSType.PRONOMEN);
    // avoid false alarms:
    final String token = tokens[pos].getToken();
    if (pos > 0 && tokens[pos-1].getToken().equalsIgnoreCase("vor") && tokens[pos].getToken().equalsIgnoreCase("allem")) {
      relevantPronoun = false;
    } else if (PRONOUNS_TO_BE_IGNORED.contains(token.toLowerCase())) {
      relevantPronoun = false;
    }
    return relevantPronoun;
  }


  // see the comment at ADJ_READINGS:
  private AnalyzedTokenReadings maybeAddAdjectiveReadings(AnalyzedTokenReadings nextToken,
      AnalyzedTokenReadings[] tokens, int tokenPos) {
    final String nextTerm = nextToken.getToken();
    // Just a heuristic: nouns and proper nouns that end with "er" are considered
    // city names:
    if (nextTerm.endsWith("er") && tokens.length > tokenPos+1 && !ER_TO_BE_IGNORED.contains(nextTerm)) {
      final AnalyzedTokenReadings nextNextToken = tokens[tokenPos+1];
      final GermanTagger tagger = (GermanTagger)language.getTagger();
      try {
        final AnalyzedTokenReadings nextATR = tagger.lookup(nextTerm.substring(0, nextTerm.length()-2));
        final AnalyzedTokenReadings nextNextATR = tagger.lookup(nextNextToken.getToken());
        //System.err.println("nextATR: " + nextATR);
        //System.err.println("nextNextATR: " + nextNextATR);
        // "Münchner": special case as cutting off last two characters doesn't produce city name:
        if ("Münchner".equals(nextTerm) ||
            (nextATR != null &&
            // tagging in Morphy for cities is not coherent:
            (GermanHelper.hasReadingOfType(nextATR, POSType.PROPER_NOUN) || GermanHelper.hasReadingOfType(nextATR, POSType.NOMEN) &&
            nextNextATR != null && GermanHelper.hasReadingOfType(nextNextATR, POSType.NOMEN)))) {
          final AnalyzedToken[] adjReadings = new AnalyzedToken[ADJ_READINGS.length];
          for (int j = 0; j < ADJ_READINGS.length; j++) {
            adjReadings[j] = new AnalyzedToken(nextTerm, ADJ_READINGS[j], null);
          }
          nextToken = new AnalyzedTokenReadings(adjReadings, nextToken.getStartPos());
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
    return nextToken;
  }


  // TODO: improve this so it only returns true for real relative clauses
  private boolean couldBeRelativeClause(AnalyzedTokenReadings[] tokens, int pos) {
    boolean comma;
    boolean relPronoun;
    if (pos >= 1) {
      // avoid false alarm: "Das Wahlrecht, das Frauen zugesprochen bekamen." etc:
      comma = tokens[pos-1].getToken().equals(",");
      final String term = tokens[pos].getToken().toLowerCase();
      relPronoun = REL_PRONOUN.contains(term);
      if (comma && relPronoun) {
        return true;
      }
    }
    if (pos >= 2) {
      // avoid false alarm: "Der Mann, in dem quadratische Fische schwammen."
      comma = tokens[pos-2].getToken().equals(",");
      final String term1 = tokens[pos-1].getToken().toLowerCase();
      final String term2 = tokens[pos].getToken().toLowerCase();
      final boolean prep = PREPOSITIONS.contains(term1);
      relPronoun = REL_PRONOUN.contains(term2);
      return comma && prep && relPronoun;
    }
    return false;
  }


  private RuleMatch checkDetNounAgreement(final AnalyzedTokenReadings token1,
      final AnalyzedTokenReadings token2) {
    if (NOUNS_TO_BE_IGNORED.contains(token2.getToken())) {
      return null;
    }
    final Set<String> set1 = getAgreementCategories(token1);
    if (set1 == null) {
      return null;  // word not known, assume it's correct
    }
    final Set<String> set2 = getAgreementCategories(token2);
    if (set2 == null) {
      return null;
    }
    set1.retainAll(set2);
    RuleMatch ruleMatch = null;
    if (set1.size() == 0 && !isException(token1, token2)) {
      final List<String> errorCategories = getCategoriesCausingError(token1, token2);
      final String errorDetails = errorCategories.size() > 0 ?
              StringTools.listToString(errorCategories, " und ") : "Kasus, Genus oder Numerus";
      final String msg = "Möglicherweise fehlende grammatische Übereinstimmung zwischen Artikel und Nomen " +
            "bezüglich " + errorDetails + ".";
      final String shortMsg = "Möglicherweise keine Übereinstimmung bezüglich " + errorDetails;
      ruleMatch = new RuleMatch(this, token1.getStartPos(),
              token2.getStartPos() + token2.getToken().length(), msg, shortMsg);
      final AgreementSuggestor suggestor = new AgreementSuggestor(language.getSynthesizer(), token1, token2);
      final List<String> suggestions = suggestor.getSuggestions();
      ruleMatch.setSuggestedReplacements(suggestions);
    }
    return ruleMatch;
  }


  private boolean isException(AnalyzedTokenReadings token1, AnalyzedTokenReadings token2) {
    String phrase = token1.getToken() + " " + token2.getToken();
    return "allen Grund".equals(phrase); 
  }


  private List<String> getCategoriesCausingError(AnalyzedTokenReadings token1, AnalyzedTokenReadings token2) {
    final List<String> categories = new ArrayList<>();
    final List<GrammarCategory> categoriesToCheck = Arrays.asList(GrammarCategory.KASUS, GrammarCategory.GENUS, GrammarCategory.NUMERUS);
    for (GrammarCategory category : categoriesToCheck) {
      if (agreementWithCategoryRelaxation(token1, token2, category)) {
        categories.add(category.displayName);
      }
    }
    return categories;
  }


  private RuleMatch checkDetAdjNounAgreement(final AnalyzedTokenReadings token1,
      final AnalyzedTokenReadings token2, final AnalyzedTokenReadings token3) {
    final Set<String> set = retainCommonCategories(token1, token2, token3, null);
    RuleMatch ruleMatch = null;
    if (set.size() == 0) {
      // TODO: more detailed error message:
      final String msg = "Möglicherweise fehlende grammatische Übereinstimmung zwischen Artikel, Adjektiv und " +
            "Nomen bezüglich Kasus, Numerus oder Genus. Beispiel: 'mein kleiner Haus' " +
            "statt 'mein kleines Haus'";
      final String shortMsg = "Möglicherweise keine Übereinstimmung bezüglich Kasus, Numerus oder Genus";
      ruleMatch = new RuleMatch(this, token1.getStartPos(), 
          token3.getStartPos()+token3.getToken().length(), msg, shortMsg);
    }
    return ruleMatch;
  }


  private boolean agreementWithCategoryRelaxation(final AnalyzedTokenReadings token1,
                                                  final AnalyzedTokenReadings token2, final GrammarCategory categoryToRelax) {
    final Set<GrammarCategory> categoryToRelaxSet;
    if (categoryToRelax != null) {
      categoryToRelaxSet = Collections.singleton(categoryToRelax);
    } else {
      categoryToRelaxSet = Collections.emptySet();
    }
    final Set<String> set1 = getAgreementCategories(token1, categoryToRelaxSet, true);
    if (set1 == null) {
      return true;  // word not known, assume it's correct
    }
    final Set<String> set2 = getAgreementCategories(token2, categoryToRelaxSet, true);
    if (set2 == null) {
      return true;      
    }
    set1.retainAll(set2);
    return set1.size() > 0;
  }
  
  private Set<String> retainCommonCategories(final AnalyzedTokenReadings token1, 
      final AnalyzedTokenReadings token2, final AnalyzedTokenReadings token3,
      final GrammarCategory categoryToRelax) {
    final Set<GrammarCategory> categoryToRelaxSet;
    if (categoryToRelax == null) {
      categoryToRelaxSet = Collections.singleton(categoryToRelax);
    } else {
      categoryToRelaxSet = Collections.emptySet();
    }
    final Set<String> set1 = getAgreementCategories(token1, categoryToRelaxSet, true);
    if (set1 == null) {
      return null;  // word not known, assume it's correct
    }
    final boolean skipSol = !VIELE_WENIGE_LOWERCASE.contains(token1.getToken().toLowerCase());
    final Set<String> set2 = getAgreementCategories(token2, categoryToRelaxSet, skipSol);
    if (set2 == null) {
      return null;
    }
    final Set<String> set3 = getAgreementCategories(token3, categoryToRelaxSet, true);
    if (set3 == null) {
      return null;
    }
    set1.retainAll(set2);
    set1.retainAll(set3);
    return set1;
  }


  private Set<String> getAgreementCategories(final AnalyzedTokenReadings aToken) {
    return getAgreementCategories(aToken, new HashSet<GrammarCategory>(), false);
  }
  
  /** Return Kasus, Numerus, Genus of those forms with a determiner. */
  private Set<String> getAgreementCategories(final AnalyzedTokenReadings aToken, Set<GrammarCategory> omit, boolean skipSol) {
    final Set<String> set = new HashSet<>();
    final List<AnalyzedToken> readings = aToken.getReadings();
    for (AnalyzedToken tmpReading : readings) {
      if (skipSol && tmpReading.getPOSTag() != null && tmpReading.getPOSTag().endsWith(":SOL")) {
        // SOL = alleinstehend - needs to be skipped so we find errors like "An der roter Ampel."
        continue;
      }
      final AnalyzedGermanToken reading = new AnalyzedGermanToken(tmpReading);
      if (reading.getCasus() == null && reading.getNumerus() == null &&
          reading.getGenus() == null) {
        continue;
      }
      if (reading.getGenus() == GermanToken.Genus.ALLGEMEIN && 
              reading.getPOSTag() != null && !reading.getPOSTag().endsWith(":STV")) {  // STV: stellvertretend (!= begleitend)
        // genus=ALG in the original data. Not sure if this is allowed, but expand this so
        // e.g. "Ich Arbeiter" doesn't get flagged as incorrect:
        set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, omit));
        set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, omit));
        set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, omit));
      } else {
        set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), omit));
      }
    }
    return set;
  }


  private String makeString(GermanToken.Kasus casus, GermanToken.Numerus num, GermanToken.Genus gen,
      Set<GrammarCategory> omit) {
    final List<String> l = new ArrayList<>();
    if (casus != null && !omit.contains(GrammarCategory.KASUS)) {
      l.add(casus.toString());
    }
    if (num != null && !omit.contains(GrammarCategory.NUMERUS)) {
      l.add(num.toString());
    }
    if (gen != null && !omit.contains(GrammarCategory.GENUS)) {
      l.add(gen.toString());
    }
    return StringTools.listToString(l, "/");
  }


  @Override
  public void reset() {
  }


}
Source Code of org.languagetool.rules.de.AgreementRule

Related Classes of org.languagetool.rules.de.AgreementRule