Source Code of org.languagetool.Language

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool;


import org.apache.commons.lang.StringUtils;
import org.languagetool.chunking.Chunker;
import org.languagetool.databroker.ResourceDataBroker;
import org.languagetool.language.Contributor;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.Rule;
import org.languagetool.rules.patterns.PatternRule;
import org.languagetool.rules.patterns.PatternRuleLoader;
import org.languagetool.rules.patterns.Unifier;
import org.languagetool.rules.patterns.UnifierConfiguration;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.tagging.Tagger;
import org.languagetool.tagging.disambiguation.Disambiguator;
import org.languagetool.tagging.disambiguation.xx.DemoDisambiguator;
import org.languagetool.tagging.xx.DemoTagger;
import org.languagetool.tokenizers.*;
import org.languagetool.tools.MultiKeyProperties;
import org.languagetool.tools.StringTools;


import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.net.URL;
import java.util.*;


/**
 * Base class for any supported language (English, German, etc). Language classes
 * are detected at runtime by searching the classpath for files named
 * {@code META-INF/org/languagetool/language-module.properties}. Those file(s)
 * need to contain a key {@code languageClasses} which specifies the fully qualified
 * class name(s), e.g. {@code org.languagetool.language.English}. Use commas to specify 
 * more than one class.
 */
public abstract class Language {


  // Classpath resource that lists the language classes of a language module.
  private static final String PROPERTIES_PATH = "META-INF/org/languagetool/language-module.properties";
  // Key inside that resource whose value is a comma-separated list of fully qualified class names.
  private static final String PROPERTIES_KEY = "languageClasses";
  
  // Languages registered via reInit(); empty until reInit() is called.
  private static List<Language> externalLanguages = new ArrayList<>();


  // Set by makeExternal(), reported by isExternal().
  private boolean isExternalLanguage = false;


  // Rule files registered via addExternalRuleFile(); listed first by getRuleFileNames().
  private List<String> externalRuleFiles = new ArrayList<>();
  // Cache for getPatternRules(); null until the rules have been loaded once.
  private List<PatternRule> patternRules;
  
  /**
   * All languages supported by LanguageTool. This includes at least a "demo" language
   * for testing. Initialized from the classpath when this class is loaded; the field
   * is not final because {@link #reInit(List)} replaces it with an array that also
   * contains the external languages.
   */
  public static Language[] LANGUAGES = getLanguages();


  private static Language[] getLanguages() {
    final List<Language> languages = new ArrayList<>();
    final Set<String> languageClassNames = new HashSet<>();
    try {
      final Enumeration<URL> propertyFiles = Language.class.getClassLoader().getResources(PROPERTIES_PATH);
      while (propertyFiles.hasMoreElements()) {
        final URL url = propertyFiles.nextElement();
        try (InputStream inputStream = url.openStream()) {
          // We want to be able to read properties file with duplicate key, as produced by
          // Maven when merging files:
          final MultiKeyProperties props = new MultiKeyProperties(inputStream);
          final List<String> classNamesStr = props.getProperty(PROPERTIES_KEY);
          if (classNamesStr == null) {
            throw new RuntimeException("Key '" + PROPERTIES_KEY + "' not found in " + url);
          }
          for (String classNames : classNamesStr) {
            final String[] classNamesSplit = classNames.split("\\s*,\\s*");
            for (String className : classNamesSplit) {
              if (languageClassNames.contains(className)) {
                // avoid duplicates - this way we are robust against problems with the maven assembly
                // plugin which aggregates files more than once (in case the deployment descriptor
                // contains both <format>zip</format> and <format>dir</format>):
                continue;
              }
              languages.add(createLanguageObjects(url, className));
              languageClassNames.add(className);
            }
          }
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return languages.toArray(new Language[languages.size()]);
  }


  private static Language createLanguageObjects(URL url, String className) {
    try {
      final Class<?> aClass = Class.forName(className);
      final Constructor<?> constructor = aClass.getConstructor();
      return (Language) constructor.newInstance();
    } catch (ClassNotFoundException e) {
      throw new RuntimeException("Class '" + className + "' specified in " + url + " could not be found in classpath", e);
    } catch (Exception e) {
      throw new RuntimeException("Object for class '" + className + "' specified in " + url + " could not created", e);
    }
  }


  /**
   * All languages supported by LanguageTool, but without the demo language.
   * This array is computed once when the class is loaded and is not changed by
   * {@link #reInit(List)}; use {@link #getRealLanguages()} for a current view.
   */
  public static final Language[] REAL_LANGUAGES = getRealLanguages();


  /**
   * Returns all languages supported by LanguageTool but without the demo language.
   * In contrast to Language.REAL_LANGUAGES contains external languages as well.
   * @return All supported languages.
   * @since 2.6
   */
  public static Language[] getRealLanguages() {
    List<Language> result = new ArrayList<>();
    for (Language lang : LANGUAGES) {
      if (!"xx".equals(lang.getShortName())) {  // skip demo language
        result.add(lang);
      }
    }
    return result.toArray(new Language[result.size()]);
  }


  // The array LANGUAGES referred to when this class was loaded; reInit() copies from it
  // so that repeated calls always start from the built-in languages.
  private static final Language[] BUILTIN_LANGUAGES = LANGUAGES;


  // Shared default implementations returned by the corresponding getters below
  // unless a language overrides them.
  private static final Disambiguator DEMO_DISAMBIGUATOR = new DemoDisambiguator();
  private static final Tagger DEMO_TAGGER = new DemoTagger();
  private static final SentenceTokenizer SENTENCE_TOKENIZER = new SimpleSentenceTokenizer();
  private static final WordTokenizer WORD_TOKENIZER = new WordTokenizer();
  
  // Per-language configurations from which getUnifier() and getDisambiguationUnifier()
  // create their unifiers.
  private UnifierConfiguration unifierConfiguration = new UnifierConfiguration();
  private UnifierConfiguration disambiguationUnifierConfiguration = new UnifierConfiguration();


  // -------------------------------------------------------------------------


  /**
   * Get this language's code, e.g. <code>en</code> for English. This is usually a
   * two-character code, but other parts of this class also expect three-character codes
   * and codes containing <code>-x-</code> (e.g. <code>de-DE-x-simple-language</code>).
   * The country parameter (e.g. "US"), if any, is not returned.
   * @return language code
   */
  public abstract String getShortName();


  /**
   * Get this language's name in English, e.g. <code>English</code> or
   * <code>German (Germany)</code>. This is also the value returned by {@link #toString()}
   * and the value compared by {@link #getLanguageForName(String)}.
   * @return language name
   */
  public abstract String getName();


  /**
   * Set this language's name in English.
   * @param name the new English name of this language
   * @since 2.6
   */
  public abstract void setName(final String name);
  
  /**
   * Get this language's country options, e.g. <code>US</code> (as in <code>en-US</code>) or
   * <code>PL</code> (as in <code>pl-PL</code>). Code in this class dereferences the result
   * without a null check, so implementations must not return <code>null</code>. A language
   * that returns exactly one country is treated as a single-country language, e.g. by
   * {@link #getShortNameWithCountryAndVariant()}.
   * @return String[] - array of country options for the language.
   */
  public abstract String[] getCountries();


  /**
   * Get this language's variant, e.g. <code>valencia</code> (as in <code>ca-ES-valencia</code>)
   * or <code>null</code>. This default implementation always returns <code>null</code>.
   * Attention: not to be confused with the "country" option, see {@link #getCountries()}.
   * @return String - variant for the language, or <code>null</code> if there is none.
   * @since 2.3
   */
  public String getVariant() {
    return null;
  }
  
  /**
   * Get enabled rules different from the default ones for this language variant.
   * This default implementation returns a new, empty list on every call.
   *
   * @return enabled rules for the language variant.
   * @since 2.4
   */
  public List<String> getDefaultEnabledRulesForVariant() {
    return new ArrayList<>();
  }


  /**
   * Get disabled rules different from the default ones for this language variant.
   * This default implementation returns a new, empty list on every call.
   *
   * @return disabled rules for the language variant.
   * @since 2.4
   */
  public List<String> getDefaultDisabledRulesForVariant() {
    return new ArrayList<>();
  }
  /**
   * Get the maintainer(s) for this language or <code>null</code>.
   * @return the maintainers, or <code>null</code> if none are known
   */
  public abstract Contributor[] getMaintainers();


  /**
   * Get the rules classes that should run for texts in this language.
   * @param messages resource bundle handed to the implementation; presumably the source
   *        of the localized rule messages (not visible from this class)
   * @return the rules for this language
   * @throws IOException declared for implementations; the condition is implementation-specific
   * @since 1.4 (signature modified in 2.7)
   */
  public abstract List<Rule> getRelevantRules(ResourceBundle messages) throws IOException;


  // -------------------------------------------------------------------------


  /**
   * Get the language model for this language. This default implementation ignores
   * {@code indexDir} and always returns {@code null}.
   * @param indexDir directory with a '3grams' sub directory which contains a Lucene index with 3gram occurrence counts
   * @return a LanguageModel or {@code null} if this language doesn't support one
   * @throws IOException declared for overriding implementations; never thrown by this default
   * @since 2.7
   */
  public LanguageModel getLanguageModel(File indexDir) throws IOException {
    return null;
  }


  /**
   * Get a list of rules that require a {@link LanguageModel}. Returns an empty list for
   * languages that don't have such rules. This default implementation ignores both
   * arguments and returns the immutable {@link Collections#emptyList()}.
   * @param messages resource bundle for overriding implementations
   * @param languageModel the language model for overriding implementations
   * @return the rules that need a language model, never {@code null} in this default
   * @throws IOException declared for overriding implementations; never thrown by this default
   * @since 2.7
   */
  public List<Rule> getRelevantLanguageModelRules(ResourceBundle messages, LanguageModel languageModel) throws IOException {
    return Collections.emptyList();
  }


  /**
   * Get this language's Java locale, not considering the country code.
   * A new {@link Locale} is built from {@link #getShortName()} on every call.
   * @return locale for the language code only
   */
  public Locale getLocale() {
    return new Locale(getShortName());
  }


  /**
   * Get this language's Java locale, considering language code and country code (if any).
   * @since 2.1
   */
  public Locale getLocaleWithCountryAndVariant() {
    if (getCountries().length > 0) {
      if (getVariant() != null) {
        return new Locale(getShortName(), getCountries()[0], getVariant());
      }
      else {
        return new Locale(getShortName(), getCountries()[0]);
      }
    } else {
      return getLocale();
    }
  }


  /**
   * Get the location of the rule file(s) in a form like {@code /org/languagetool/rules/de/grammar.xml}.
   */
  public List<String> getRuleFileNames() {
    final List<String> ruleFiles = new ArrayList<>();
    ruleFiles.addAll(getExternalRuleFiles());
    final ResourceDataBroker dataBroker = JLanguageTool.getDataBroker();
    ruleFiles.add(dataBroker.getRulesDir()
            + "/" + getShortName() + "/" + JLanguageTool.PATTERN_FILE);
    if (getShortNameWithCountryAndVariant().length() > 2) {
      final String fileName = getShortName() + "/"
              + getShortNameWithCountryAndVariant()
              + "/" + JLanguageTool.PATTERN_FILE;
      if (dataBroker.ruleFileExists(fileName)) {
        ruleFiles.add(dataBroker.getRulesDir() + "/" + fileName);
      }
    }
    return ruleFiles;
  }


  /**
   * Get the rule files added via {@link #addExternalRuleFile(String)}.
   * The internal list itself is returned, not a copy, so changes made by the
   * caller affect this language.
   * @return the external rule files, empty if none have been added
   * @since 2.6
   */
  public List<String> getExternalRuleFiles() {
    return externalRuleFiles;
  }


  /**
   * Adds an external rule file to the language. After running this method,
   * one has to run JLanguageTool.activateDefaultPatternRules() to make sure
   * that all external rules are activated.
   * Note that this method only appends to the list of file names; pattern rules
   * already cached by this object are not reloaded.
   *
   * @param externalRuleFile Absolute file path to rules.
   * @since 2.6
   */
  public void addExternalRuleFile(String externalRuleFile) {
    externalRuleFiles.add(externalRuleFile);
  }




  /**
   * Languages that have country variants need to override this to select their most common variant.
   * This default implementation always returns {@code null}.
   * @return default country variant or {@code null}
   * @since 1.8
   */
  public Language getDefaultLanguageVariant() {
    return null;
  }


  /**
   * Get this language's part-of-speech disambiguator implementation or {@code null}.
   * This default implementation returns a {@code DemoDisambiguator} instance that is
   * shared by all languages.
   * @return the disambiguator for this language
   */
  public Disambiguator getDisambiguator() {
    return DEMO_DISAMBIGUATOR;
  }


  /**
   * Get this language's part-of-speech tagger implementation. The tagger must not
   * be {@code null}, but it can be a trivial pseudo-tagger that only assigns {@code null} tags.
   * This default implementation returns a {@code DemoTagger} instance that is shared
   * by all languages.
   * @return the tagger for this language
   */
  public Tagger getTagger() {
    return DEMO_TAGGER;
  }


  /**
   * Get this language's sentence tokenizer implementation.
   * This default implementation returns a {@code SimpleSentenceTokenizer} instance
   * that is shared by all languages.
   * @return the sentence tokenizer for this language
   */
  public SentenceTokenizer getSentenceTokenizer() {
    return SENTENCE_TOKENIZER;
  }


  /**
   * Get this language's word tokenizer implementation.
   * This default implementation returns a {@code WordTokenizer} instance that is
   * shared by all languages.
   * @return the word tokenizer for this language
   */
  public Tokenizer getWordTokenizer() {
    return WORD_TOKENIZER;
  }


  /**
   * Get this language's chunker implementation or {@code null}.
   * This default implementation always returns {@code null}.
   * @return the chunker for this language, or {@code null} if there is none
   * @since 2.3
   */
  public Chunker getChunker() {
    return null;
  }


  /**
   * Get this language's part-of-speech synthesizer implementation or {@code null}.
   * This default implementation always returns {@code null}.
   * @return the synthesizer for this language, or {@code null} if there is none
   */
  public Synthesizer getSynthesizer() {
    return null;
  }


  /**
   * Get this language's feature unifier. The unifier is obtained from
   * {@link #getUnifierConfiguration()} on every call (presumably a new instance
   * each time - see {@code UnifierConfiguration.createUnifier()}).
   * @return Feature unifier for analyzed tokens.
   */
  public Unifier getUnifier() {
    return unifierConfiguration.createUnifier();
  }
  
  /**
   * Get this language's feature unifier used for disambiguation.
   * Note: it might be different from the normal rule unifier, as it is created
   * from a separate configuration, see {@link #getDisambiguationUnifierConfiguration()}.
   * @return Feature unifier for analyzed tokens.
   */
  public Unifier getDisambiguationUnifier() {
    return disambiguationUnifierConfiguration.createUnifier();
  }


  /**
   * Get the configuration that {@link #getUnifier()} creates its unifiers from.
   * The object held by this language is returned, not a copy.
   * @return the unifier configuration of this language
   * @since 2.3
   */
  public UnifierConfiguration getUnifierConfiguration() {
    return unifierConfiguration;
  }


  /**
   * Get the configuration that {@link #getDisambiguationUnifier()} creates its
   * unifiers from. The object held by this language is returned, not a copy.
   * @return the disambiguation unifier configuration of this language
   * @since 2.3
   */
  public UnifierConfiguration getDisambiguationUnifierConfiguration() {
    return disambiguationUnifierConfiguration;
  }
  
  /**
   * Get the name of the language translated to the current locale,
   * if available. Otherwise, get the untranslated name.
   */
  public final String getTranslatedName(final ResourceBundle messages) {
    try {
      return messages.getString(getShortNameWithCountryAndVariant());
    } catch (final MissingResourceException e) {
      try {
        return messages.getString(getShortName());
      } catch (final MissingResourceException e1) {
        return getName();
      }
    }
  }
  
  /**
   * Get the short name of the language with country and variant (if any), if it is
   * a single-country language. For generic language classes, get only a two- or
   * three-character code.
   * @since 1.8
   */
  public final String getShortNameWithCountryAndVariant() {
    String name = getShortName();
    if (getCountries().length == 1 
            && !name.contains("-x-")) {   // e.g. "de-DE-x-simple-language"
      name += "-" + getCountries()[0];
      if (getVariant() != null) {   // e.g. "ca-ES-valencia"
        name += "-" + getVariant();
      }
    }
    return name;
  }
  
  
  /**
   * Start symbols used by {@link org.languagetool.rules.GenericUnpairedBracketsRule}.
   * Note that the array must be of equal length as {@link #getUnpairedRuleEndSymbols()} and the sequence of
   * starting symbols must match exactly the sequence of ending symbols.
   * This default implementation returns a new array with the opening square bracket,
   * parenthesis and curly brace plus the double and single quote character.
   * @return the start symbols, in the same order as the end symbols
   * @deprecated will be moved to GenericUnpairedBracketsRule (deprecated since 2.8)
   */
  @Deprecated
  public String[] getUnpairedRuleStartSymbols() {
    return new String[]{ "[", "(", "{", "\"", "'" };
  }


  /**
   * End symbols used by {@link org.languagetool.rules.GenericUnpairedBracketsRule}.
   * This default implementation returns a new array with the closing square bracket,
   * parenthesis and curly brace plus the double and single quote character, i.e. the
   * counterparts of the default start symbols in the same order.
   * @return the end symbols, in the same order as the start symbols
   * @deprecated will be moved to GenericUnpairedBracketsRule (deprecated since 2.8)
   * @see #getUnpairedRuleStartSymbols()
   */
  @Deprecated
  public String[] getUnpairedRuleEndSymbols() {
    return new String[]{ "]", ")", "}", "\"", "'" };
  }
  
  // -------------------------------------------------------------------------


  /**
   * Get the pattern rules as defined in the files returned by {@link #getRuleFileNames()}.
   * @since 2.7
   */
  @Experimental
  List<PatternRule> getPatternRules() throws IOException {
    if (patternRules == null) {
      patternRules = new ArrayList<>();
      PatternRuleLoader ruleLoader = new PatternRuleLoader();
      for (String fileName : getRuleFileNames()) {
        InputStream is = this.getClass().getResourceAsStream(fileName);
        patternRules.addAll(ruleLoader.getRules(is, fileName));
      }
    }
    return patternRules;
  }
  
  // -------------------------------------------------------------------------
  
  /**
   * Re-inits the built-in languages and adds the specified ones.
   */
  public static void reInit(final List<Language> languages) {
    LANGUAGES = new Language[BUILTIN_LANGUAGES.length + languages.size()];
    int i = BUILTIN_LANGUAGES.length;
    System.arraycopy(BUILTIN_LANGUAGES, 0,
        LANGUAGES, 0, BUILTIN_LANGUAGES.length);
    for (final Language lang : languages) {
      LANGUAGES[i++] = lang;
    }
    externalLanguages = languages;
  }


  /**
   * Return languages that are not built-in but have been added manually.
   * This is the list that was passed to the last {@link #reInit(List)} call (the same
   * object, not a copy), or an empty list if {@code reInit} has not been called.
   * @return the external languages
   */
  public static List<Language> getExternalLanguages() {
    return externalLanguages;
  }
  
  /**
   * Return all languages supported by LanguageTool.
   * @return A list of all languages, including external ones and country variants (e.g. {@code en-US})
   */
  public static List<Language> getAllLanguages() {
    final List<Language> langList = new ArrayList<>();
    Collections.addAll(langList, LANGUAGES);
    langList.addAll(externalLanguages);
    return langList;
  }


  /**
   * Get the Language object for the given language name.
   *
   * @param languageName e.g. <code>English</code> or <code>German</code> (case is significant)
   * @return a Language object or {@code null} if there is no such language
   */
  public static Language getLanguageForName(final String languageName) {
    for (Language element : Language.LANGUAGES) {
      if (languageName.equals(element.getName())) {
        return element;
      }
    }
    return null;
  }


  /**
   * Get the Language object for the given short language name.
   *
   * @param langCode e.g. <code>en</code> or <code>es-US</code>
   * @return a Language object
   * @throws IllegalArgumentException if the language is not supported or if the language code is invalid
   */
  public static Language getLanguageForShortName(final String langCode) {
    final Language language = getLanguageForShortNameOrNull(langCode);
    if (language == null) {
      final List<String> codes = new ArrayList<>();
      for (Language realLanguage : LANGUAGES) {
        codes.add(realLanguage.getShortNameWithCountryAndVariant());
      }
      Collections.sort(codes);
      throw new IllegalArgumentException("'" + langCode + "' is not a language code known to LanguageTool." +
              " Supported language codes are: " + StringUtils.join(codes, ", ") + ". The list of languages is read from " + PROPERTIES_PATH +
              " in the Java classpath. See http://wiki.languagetool.org/java-api for details.");
    }
    return language;
  }


  /**
   * Return whether a language with the given language code is supported. Which languages
   * are supported depends on the classpath when the {@code Language} object is initialized.
   * The lookup itself is case-insensitive.
   *
   * @param langCode e.g. {@code en} or {@code en-US}
   * @return true if the language is supported
   * @throws IllegalArgumentException in some cases of an invalid language code format, e.g.
   *         a code without {@code -x-} that does not consist of one to three parts separated by hyphens
   * @since 2.1
   */
  public static boolean isLanguageSupported(final String langCode) {
    return getLanguageForShortNameOrNull(langCode) != null;
  }
  
  private static Language getLanguageForShortNameOrNull(final String langCode) {
    StringTools.assureSet(langCode, "langCode");
    Language result = null;
    if (langCode.contains("-x-")) {
      // e.g. "de-DE-x-simple-language"
      for (Language element : Language.LANGUAGES) {
        if (element.getShortName().equalsIgnoreCase(langCode)) {
          return element;
        }
      }
    } else if (langCode.contains("-")) {
      final String[] parts = langCode.split("-"); 
      if (parts.length == 2) { // e.g. en-US
        for (Language element : Language.LANGUAGES) {
          if (parts[0].equalsIgnoreCase(element.getShortName())
              && element.getCountries().length == 1
              && parts[1].equalsIgnoreCase(element.getCountries()[0])) {
            result = element;
            break;
          }
        }
      } else if (parts.length == 3) { // e.g. ca-ES-valencia
        for (Language element : Language.LANGUAGES) {
          if (parts[0].equalsIgnoreCase(element.getShortName())
              && element.getCountries().length == 1
              && parts[1].equalsIgnoreCase(element.getCountries()[0])
              && parts[2].equalsIgnoreCase(element.getVariant())) {
            result = element;
            break;
          }
        }
      }
      else { 
        throw new IllegalArgumentException("'" + langCode + "' isn't a valid language code");
      }
    } else {
      for (Language element : Language.LANGUAGES) {
        if (langCode.equalsIgnoreCase(element.getShortName())) {
          result = element;
          break;
        }
      }
    }
    return result;
  }
  
  /**
   * Get the best match for a locale, using American English as the final fallback if nothing
   * else fits. The returned language will be a country variant language (e.g. British English, not just English)
   * if available.
   * @since 1.8
   * @throws RuntimeException if no language was found and American English as a fallback is not available
   */
  public static Language getLanguageForLocale(final Locale locale) {
    final Language language = getLanguageForLanguageNameAndCountry(locale);
    if (language != null) {
      return language;
    } else {
      final Language firstFallbackLanguage = getLanguageForLanguageNameOnly(locale);
      if (firstFallbackLanguage != null) {
        return firstFallbackLanguage;
      }
    }
    for (Language aLanguage : REAL_LANGUAGES) {
      if (aLanguage.getShortNameWithCountryAndVariant().equals("en-US")) {
        return aLanguage;
      }
    }
    throw new RuntimeException("No appropriate language found, not even en-US. Supported languages: " + Arrays.toString(REAL_LANGUAGES));
  }


  private static Language getLanguageForLanguageNameAndCountry(Locale locale) {
    for (Language language : Language.REAL_LANGUAGES) {
      if (language.getShortName().equals(locale.getLanguage())) {
        final List<String> countryVariants = Arrays.asList(language.getCountries());
        if (countryVariants.contains(locale.getCountry())) {
          return language;
        }
      }
    }
    return null;
  }


  private static Language getLanguageForLanguageNameOnly(Locale locale) {
    // use default variant if available:
    for (Language language : Language.REAL_LANGUAGES) {
      if (language.getShortName().equals(locale.getLanguage()) && language.hasVariant()) {
        final Language defaultVariant = language.getDefaultLanguageVariant();
        if (defaultVariant != null) {
          return defaultVariant;
        }
      }
    }
    // use the first match otherwise (which should be the only match):
    for (Language language : Language.REAL_LANGUAGES) {
      if (language.getShortName().equals(locale.getLanguage()) && !language.hasVariant()) {
        return language;
      }
    }
    return null;
  }


  /**
   * Returns the English name of this language, i.e. the value of {@link #getName()}.
   */
  @Override
  public final String toString() {
    return getName();
  }


  /**
   * Whether this is a country variant of another language, i.e. whether it doesn't
   * directly extend {@link Language}, but a subclass of {@link Language}.
   * @since 1.8
   */
  public final boolean isVariant() {
    for (Language language : LANGUAGES) {
      final boolean skip = language.getShortNameWithCountryAndVariant().equals(getShortNameWithCountryAndVariant());
      if (!skip && language.getClass().isAssignableFrom(getClass())) {
        return true;
      }
    }
    return false;
  }


  /**
   * Whether this class has at least one subclass that implements variants of this language.
   * @since 1.8
   */
  public final boolean hasVariant() {
    for (Language language : LANGUAGES) {
      final boolean skip = language.getShortNameWithCountryAndVariant().equals(getShortNameWithCountryAndVariant());
      if (!skip && getClass().isAssignableFrom(language.getClass())) {
        return true;
      }
    }
    return false;
  }


  /**
   * Whether this language has been marked as external via {@link #makeExternal()}.
   * @return true if {@code makeExternal()} has been called on this object
   */
  public boolean isExternal() {
    return isExternalLanguage;
  }


  /**
   * Sets the language as external. Useful for
   * making a copy of an existing language.
   * This only sets the flag reported by {@link #isExternal()}; it cannot be
   * reset and it does not register the language anywhere.
   * @since 2.6
   */
  public void makeExternal() {
    isExternalLanguage = true;
  }


  /**
   * Return true if this is the same language as the given one, considering country
   * variants only if set for both languages. For example: en = en, en = en-GB, en-GB = en-GB,
   * but en-US != en-GB
   * @since 1.8
   */
  public boolean equalsConsiderVariantsIfSpecified(Language otherLanguage) {
    if (getShortName().equals(otherLanguage.getShortName())) {
      final boolean thisHasCountry = hasCountry();
      final boolean otherHasCountry = otherLanguage.hasCountry();
      return !(thisHasCountry && otherHasCountry) ||
              getShortNameWithCountryAndVariant().equals(otherLanguage.getShortNameWithCountryAndVariant());
    } else {
      return false;
    }
  }


  /**
   * Whether this is a single-country language, i.e. whether {@link #getCountries()}
   * returns exactly one country.
   */
  private boolean hasCountry() {
    return getCountries().length == 1;
  }


}
Source Code of org.languagetool.Language

Related Classes of org.languagetool.Language