/* LanguageTool, a natural language style checker
* Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.tagging.de;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import morfologik.stemming.Dictionary;
import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.IStemmer;
import morfologik.stemming.WordData;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.tagging.ManualTagger;
import org.languagetool.tagging.Tagger;
import org.languagetool.tokenizers.de.GermanCompoundTokenizer;
import org.languagetool.tools.StringTools;
/**
* German part-of-speech tagger, requires data file in <code>de/german.dict</code> in the classpath.
* The POS tagset is described in
* <a href="https://github.com/languagetool-org/languagetool/blob/master/languagetool-language-modules/de/src/main/resources/org/languagetool/resource/de/tagset.txt">tagset.txt</a>
*
* @author Marcin Milkowski, Daniel Naber
*/
public class GermanTagger implements Tagger {

  private static final String DICT_FILENAME = "/de/german.dict";
  private static final String USER_DICT_FILENAME = "/de/added.txt";

  // Lazily initialized; volatile + double-checked locking in initializeIfRequired()
  // makes the lazy initialization safe for concurrent callers of tag()/lookup().
  private volatile Dictionary dictionary;
  private volatile ManualTagger manualTagger;
  private volatile GermanCompoundTokenizer compoundTokenizer;

  public GermanTagger() {
  }

  /**
   * Loads the binary dictionary, the user-supplied additions, and the compound
   * tokenizer from the classpath. Called once, lazily, via {@link #initializeIfRequired()}.
   *
   * @throws IOException if one of the resource files cannot be read
   */
  protected void initialize() throws IOException {
    final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(DICT_FILENAME);
    dictionary = Dictionary.read(url);
    manualTagger = new ManualTagger(JLanguageTool.getDataBroker().getFromResourceDirAsStream(USER_DICT_FILENAME));
    compoundTokenizer = new GermanCompoundTokenizer();
  }

  /**
   * Lazily initializes all fields, exactly once, using double-checked locking
   * (the fields are volatile, so the unsynchronized first check is safe).
   *
   * @throws IOException if initialization fails
   */
  protected void initializeIfRequired() throws IOException {
    Dictionary dict = dictionary;
    ManualTagger mTagger = manualTagger;
    GermanCompoundTokenizer gTokenizer = compoundTokenizer;
    if (dict == null || mTagger == null || gTokenizer == null) {
      synchronized (this) {
        // Re-check under the lock: another thread may have initialized meanwhile.
        dict = dictionary;
        mTagger = manualTagger;
        gTokenizer = compoundTokenizer;
        if (dict == null || mTagger == null || gTokenizer == null) {
          initialize();
        }
      }
    }
  }

  /**
   * Tags a single word in isolation (case-sensitive, no sentence-start handling).
   *
   * @param word the word to look up
   * @return the readings for the word, or {@code null} if the word is unknown
   * @throws IOException if the dictionary cannot be loaded
   */
  public AnalyzedTokenReadings lookup(final String word) throws IOException {
    final List<AnalyzedTokenReadings> result = tag(Collections.singletonList(word), false);
    final AnalyzedTokenReadings atr = result.get(0);
    if (atr.getAnalyzedToken(0).getPOSTag() == null) {
      return null;
    }
    return atr;
  }

  @Override
  public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException {
    return tag(sentenceTokens, true);
  }

  /**
   * Tags the given sentence tokens.
   *
   * @param sentenceTokens the tokens of one sentence, including whitespace tokens
   * @param ignoreCase if {@code true}, an unknown sentence-initial word is retried
   *        in lowercase (e.g. "Das" at the start of a sentence is looked up as "das")
   * @return one {@link AnalyzedTokenReadings} per input token; unknown words get a
   *         single reading with a {@code null} POS tag
   * @throws IOException if the dictionary cannot be loaded
   */
  public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens, final boolean ignoreCase) throws IOException {
    initializeIfRequired();
    boolean firstWord = true;
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer morfologik = new DictionaryLookup(dictionary);
    for (String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      String[] taggerTokens = lexiconLookup(word, morfologik);
      if (firstWord && taggerTokens == null && ignoreCase) { // e.g. "Das" -> "das" at start of sentence
        taggerTokens = lexiconLookup(word.toLowerCase(), morfologik);
      }
      if (!StringTools.isEmpty(word.trim())) {
        // BUGFIX: the first *real* (non-whitespace) token ends the sentence-start
        // special case. Previously firstWord was only reset when the lowercase retry
        // actually ran, so the retry could fire on an arbitrary later unknown word,
        // or be consumed by a leading whitespace token before the real first word.
        firstWord = false;
      }
      if (taggerTokens != null) {
        tagWord(taggerTokens, word, l);
      } else {
        // word not known, try to decompose it and use the last part for POS tagging:
        if (!StringTools.isEmpty(word.trim())) {
          final List<String> compoundParts = compoundTokenizer.tokenize(word);
          if (compoundParts.size() <= 1) {
            // not decomposable either -> unknown word
            l.add(new AnalyzedToken(word, null, null));
          } else {
            // in German the last part governs a compound's POS:
            String lastPart = compoundParts.get(compoundParts.size() - 1);
            if (StringTools.startsWithUppercase(word)) {
              lastPart = StringTools.uppercaseFirstChar(lastPart);
            }
            taggerTokens = lexiconLookup(lastPart, morfologik);
            if (taggerTokens != null) {
              tagWord(taggerTokens, word, l, compoundParts);
            } else {
              l.add(new AnalyzedToken(word, null, null));
            }
          }
        } else {
          // whitespace-only token
          l.add(new AnalyzedToken(word, null, null));
        }
      }
      tokenReadings.add(new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), pos));
      pos += word.length();
    }
    return tokenReadings;
  }

  private void tagWord(String[] taggerTokens, String word, List<AnalyzedToken> l) {
    tagWord(taggerTokens, word, l, null);
  }

  /**
   * Turns the flat (lemma, POS tag) pairs returned by {@link #lexiconLookup} into
   * {@link AnalyzedToken}s and adds them to {@code l}.
   *
   * @param taggerTokens lookup result: even indexes hold lemmas, odd indexes the POS tags
   * @param word the original surface form
   * @param l the list the new readings are appended to
   * @param compoundParts all compound parts of the complete word or <code>null</code>,
   *        if the original input is not a compound
   */
  private void tagWord(String[] taggerTokens, String word, List<AnalyzedToken> l,
      List<String> compoundParts) {
    int i = 0;
    while (i < taggerTokens.length) {
      if (compoundParts != null) {
        // was originally a compound word: rebuild the lemma from the unchanged
        // leading parts plus the (lowercased) lemma of the last part
        final List<String> allButLastPart = compoundParts.subList(0, compoundParts.size() - 1);
        final String lemma = StringTools.listToString(allButLastPart, "")
            + StringTools.lowercaseFirstChar(taggerTokens[i]);
        l.add(new AnalyzedToken(word, taggerTokens[i + 1], lemma));
      } else {
        l.add(new AnalyzedToken(word, taggerTokens[i + 1], taggerTokens[i]));
      }
      i = i + 2;
    }
  }

  /**
   * Looks up a word in both the binary dictionary and the user dictionary.
   *
   * @return a flat array of (lemma, POS tag) pairs — binary-dictionary entries first,
   *         then user-dictionary entries — or {@code null} if the word is unknown
   */
  private String[] lexiconLookup(final String word, final IStemmer morfologik) {
    try {
      final String[] userDictTags = manualTagger.lookup(word);
      final List<WordData> dictMatches = morfologik.lookup(word);
      if (dictMatches.isEmpty()) {
        return userDictTags;  // may be null, meaning the word is unknown
      }
      final String[] dictTags = toStemTagPairs(dictMatches);
      if (userDictTags == null) {
        return dictTags;
      }
      // both sources matched: binary dictionary entries first, then user entries
      final String[] allPosTags = new String[dictTags.length + userDictTags.length];
      System.arraycopy(dictTags, 0, allPosTags, 0, dictTags.length);
      System.arraycopy(userDictTags, 0, allPosTags, dictTags.length, userDictTags.length);
      return allPosTags;
    } catch (Exception e) {
      throw new RuntimeException("Error looking up word '" + word + "'", e);
    }
  }

  /** Flattens morfologik lookup results into alternating (stem, tag) array entries. */
  private static String[] toStemTagPairs(final List<WordData> wordData) {
    final String[] pairs = new String[wordData.size() * 2];
    int i = 0;
    for (WordData wd : wordData) {
      pairs[i] = wd.getStem().toString();
      pairs[i + 1] = wd.getTag().toString();
      i = i + 2;
    }
    return pairs;
  }

  @Override
  public final AnalyzedTokenReadings createNullToken(final String token, final int startPos) {
    return new AnalyzedTokenReadings(new AnalyzedToken(token, null, null), startPos);
  }

  @Override
  public AnalyzedToken createToken(String token, String posTag) {
    return new AnalyzedToken(token, posTag, null);
  }
}