// Source code of org.languagetool.tagging.BaseTagger

/* LanguageTool, a natural language style checker
 * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.tagging;


import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;


import morfologik.stemming.Dictionary;
import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.IStemmer;
import morfologik.stemming.WordData;


import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.tools.StringTools;


/**
 * Base tagger using Morfologik binary dictionaries.
 *
 * @author Marcin Milkowski
 */
public abstract class BaseTagger implements Tagger {


  protected Locale conversionLocale = Locale.getDefault();


  private boolean tagLowercaseWithUppercase = true;
  private volatile Dictionary dictionary;


  /**
   * Get the filename, e.g., {@code /en/english.dict}.
   */
  public abstract String getFileName();


  public void setLocale(Locale locale) {
    conversionLocale = locale;
  }


  protected Dictionary getDictionary() throws IOException {
    Dictionary dict = dictionary;
    if (dict == null) {
      synchronized (this) {
        dict = dictionary;
        if (dict == null) {
          final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(getFileName());
          dictionary = dict = Dictionary.read(url);
        }
      }
    }
    return dict;
  }


  @Override
  public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens)
      throws IOException {
    List<AnalyzedToken> taggerTokens;
    List<AnalyzedToken> lowerTaggerTokens;
    List<AnalyzedToken> upperTaggerTokens;
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer dictLookup = new DictionaryLookup(getDictionary());


    for (String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      final String lowerWord = word.toLowerCase(conversionLocale);
      taggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(word));
      lowerTaggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(lowerWord));
      final boolean isLowercase = word.equals(lowerWord);
      final boolean isMixedCase = StringTools.isMixedCase(word);


      //normal case
      addTokens(taggerTokens, l);


      //tag non-lowercase (alluppercase or startuppercase), but not mixedcase word with lowercase word tags
      if (!isLowercase && !isMixedCase) {
        addTokens(lowerTaggerTokens, l);
      }


      //tag lowercase word with startuppercase word tags
      if (tagLowercaseWithUppercase) {
        if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) {
          if (isLowercase) {
            upperTaggerTokens = asAnalyzedTokenList(word,
                dictLookup.lookup(StringTools.uppercaseFirstChar(word)));
            if (!upperTaggerTokens.isEmpty()) {
              addTokens(upperTaggerTokens, l);
            }
          }
        }
      }


      // Additional language-dependent-tagging
      if (l.isEmpty()) {
        List<AnalyzedToken> additionalTaggedTokens = additionalTags(word);
        addTokens(additionalTaggedTokens, l);
      }


      if (l.isEmpty()) {
        l.add(new AnalyzedToken(word, null, null));
      }


      tokenReadings.add(new AnalyzedTokenReadings(l, pos));
      pos += word.length();
    }


    return tokenReadings;
  }


  protected List<AnalyzedToken> asAnalyzedTokenList(final String word, final List<WordData> wdList) {
    final List<AnalyzedToken> aTokenList = new ArrayList<>();
    for (WordData wd : wdList) {
      aTokenList.add(asAnalyzedToken(word, wd));
    }
    return aTokenList;
  }


  protected AnalyzedToken asAnalyzedToken(final String word, final WordData wd) {
    String tag = StringTools.asString(wd.getTag());
    // Remove frequency data from tags (if exists)
    // The frequency data is in the last byte after a separator
    if (dictionary.metadata.isFrequencyIncluded() && tag.length()>2) {
      tag = tag.substring(0, tag.length()-2);
    }
    return new AnalyzedToken(
        word,
        tag,
        StringTools.asString(wd.getStem()));
  }


  //please do not make protected, this breaks other languages
  private void addTokens(final List<AnalyzedToken> taggedTokens, final List<AnalyzedToken> l) {
    if (taggedTokens != null) {
      for (AnalyzedToken at : taggedTokens) {
        l.add(at);
      }
    }
  }


  @Override
  public final AnalyzedTokenReadings createNullToken(final String token, final int startPos) {
    return new AnalyzedTokenReadings(new AnalyzedToken(token, null, null), startPos);
  }


  @Override
  public AnalyzedToken createToken(String token, String posTag) {
    return new AnalyzedToken(token, posTag, null);
  }


  public void dontTagLowercaseWithUppercase() {
    tagLowercaseWithUppercase = false;
  }


  /*
   *  Additional tagging in some language-dependent circumstances
   */
  public List<AnalyzedToken> additionalTags(String word) {
    return null;
  }


}
// Source code of org.languagetool.tagging.BaseTagger
//
// Related classes of org.languagetool.tagging.BaseTagger