Source Code of org.languagetool.synthesis.ca.CatalanSynthesizer

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2012 Jaume Ortolà i Font
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.synthesis.ca;


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import morfologik.stemming.IStemmer;
import morfologik.stemming.WordData;


import org.languagetool.AnalyzedToken;
import org.languagetool.synthesis.BaseSynthesizer;


/**
 * Catalan word form synthesizer.
 * 
 * There are special additions:
 * "DT" tag adds "el, la, l', els, les" according to the gender  
 * and the number of the word and the Catalan rules for apostrophation (l').
 * "DTa" adds "al, a la, a l', als, a les"
 * "DTde" adds "del, de la, de l', dels, de les"
 * "DTper" adds "pel, per la, per l', pels, per les"
 * "DTca" adds "cal, ca la, ca l', cals, ca les"
 *
 * @author Jaume Ortolà i Font
 */
public class CatalanSynthesizer extends BaseSynthesizer {


  private static final String RESOURCE_FILENAME = "/ca/catalan_synth.dict";
  private static final String TAGS_FILE_NAME = "/ca/catalan_tags.txt";


  /** A special tag to add determiner (el, la, l', els, les). **/
  // private static final String ADD_DETERMINER = "DT";


  /** Patterns for number and gender **/
  private static final Pattern pMS = Pattern.compile("(N|A.).[MC][SN].*|V.P.*SM.?");
  private static final Pattern pFS = Pattern.compile("(N|A.).[FC][SN].*|V.P.*SF.?");
  private static final Pattern pMP = Pattern.compile("(N|A.).[MC][PN].*|V.P.*PM.?");
  private static final Pattern pFP = Pattern.compile("(N|A.).[FC][PN].*|V.P.*PF.?");


  /** Pattern for previous preposition passed in the postag **/
  private static final Pattern pPrep = Pattern.compile("(DT)(.*)");


  /** Patterns for apostrophation **/
  private static final Pattern pMascYes = Pattern.compile("h?[aeiouàèéíòóú].*",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  private static final Pattern pMascNo = Pattern.compile("h?[ui][aeioàèéóò].+",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  private static final Pattern pFemYes = Pattern.compile("h?[aeoàèéíòóú].*|h?[ui][^aeiouàèéíòóúüï]+[aeiou][ns]?|urbs",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  private static final Pattern pFemNo = Pattern.compile("host|ira|inxa",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
  
  /** Patterns verb **/
  private static final Pattern pVerb = Pattern.compile("V.*[CVBXYZ0123456]");


  public CatalanSynthesizer() {
    super(RESOURCE_FILENAME, TAGS_FILE_NAME);
  }


  @Override
  public String[] synthesize(final AnalyzedToken token, final String posTag) throws IOException {
    initPossibleTags();
    Pattern p;
    boolean addDt = false; 
    String prep = ""; 
    final Matcher mPrep = pPrep.matcher(posTag);
    if (mPrep.matches()) {
      addDt=true; // add definite article before token
      if (mPrep.groupCount()>1) {
        prep=mPrep.group(2); // add preposition before article
      }
    }
    if (addDt) {
      p = Pattern.compile("N.*|A.*|V.P.*|PX.");
    } else {
      p = Pattern.compile(posTag);
    }
    final List<String> results = new ArrayList<>();
    final IStemmer synthesizer = createStemmer();
    
    for (final String tag : possibleTags) {
      final Matcher m = p.matcher(tag);
      if (m.matches()) {
        if (addDt) {
          lookupWithEl(token.getLemma(), tag, prep, results, synthesizer);
        } else {
          lookup(token.getLemma(), tag, results);
        }
      }
    }       
    
    // if not found, try verbs from any regional variant
    if ((results.size() == 0) && posTag.startsWith("V")) {
      if (!posTag.endsWith("0")) {
            lookup(token.getLemma(), posTag.substring(0, posTag.length() - 1).concat("0"), results);
      }
      if (results.size() == 0) { // another try
        return synthesize(token, posTag.substring(0, posTag.length() - 1).concat("."), true);
      }
    }
    return results.toArray(new String[results.size()]);
  }
  
  @Override
  public String[] synthesize(final AnalyzedToken token, final String posTag,
      final boolean posTagRegExp) throws IOException {
    if (posTagRegExp) {
      initPossibleTags();
      Pattern p = Pattern.compile(posTag);
      final List<String> results = new ArrayList<>();
      for (final String tag : possibleTags) {
        final Matcher m = p.matcher(tag);
        if (m.matches()) {
          lookup(token.getLemma(), tag, results);
        }
      }
      // if not found, try verbs from any regional variant
      if ((results.size() == 0)) {
        final Matcher mVerb = pVerb.matcher(posTag);
        if (mVerb.matches()) {
          if (!posTag.endsWith("0")) {
            p = Pattern.compile(posTag.substring(0, posTag.length() - 1)
                .concat("0"));
            for (final String tag : possibleTags) {
              final Matcher m = p.matcher(tag);
              if (m.matches()) {
                lookup(token.getLemma(), tag, results);
              }
            }
          }
          if (results.size() == 0) { // another try
            p = Pattern.compile(posTag.substring(0, posTag.length() - 1)
                .concat("."));
            for (final String tag : possibleTags) {
              final Matcher m = p.matcher(tag);
              if (m.matches()) {
                lookup(token.getLemma(), tag, results);
              }
            }
          }
        }
      }
      return results.toArray(new String[results.size()]);
    }
    return synthesize(token, posTag);
  }


  /**
   * Lookup the inflected forms of a lemma defined by a part-of-speech tag.
   * Adds determiner "el" properly inflected and preposition
   * (prep. +) det. + noun. / adj.
   * @param lemma the lemma to be inflected.
   * @param posTag the desired part-of-speech tag.
   * @param results the list to collect the inflected forms.
   * @param synthesizer the stemmer to use.
   */
  private void lookupWithEl(String lemma, String posTag, String prep, List<String> results, IStemmer synthesizer) {
    final List<WordData> wordForms = synthesizer.lookup(lemma + "|" + posTag);
    final Matcher mMS = pMS.matcher(posTag);
    final Matcher mFS = pFS.matcher(posTag);
    final Matcher mMP = pMP.matcher(posTag);
    final Matcher mFP = pFP.matcher(posTag);
    for (WordData wd : wordForms) {
      final String word = wd.getStem().toString();
      if (mMS.matches()) {
        final Matcher mMascYes = pMascYes.matcher(word);
        final Matcher mMascNo = pMascNo.matcher(word);
        if (prep.equals("per")) {  if (mMascYes.matches() && !mMascNo.matches()) {  results.add("per l'" + word);  }  else {results.add("pel " + word);  } }
        else if (prep.isEmpty()) {  if (mMascYes.matches() && !mMascNo.matches()) {  results.add("l'" + word);  }  else {results.add("el " + word);  } }
        else {  if (mMascYes.matches() && !mMascNo.matches()) {  results.add(prep+" l'" + word);  }  else {results.add(prep+"l " + word);  } }
        
      }
      if (mFS.matches()) {
        final Matcher mFemYes = pFemYes.matcher(word);
        final Matcher mFemNo = pFemNo.matcher(word);
        if (prep.equals("per")) {  if (mFemYes.matches() && !mFemNo.matches()) {  results.add("per l'" + word);  }  else {results.add("per la " + word);} }
        else if (prep.isEmpty()) {  if (mFemYes.matches() && !mFemNo.matches()) {  results.add("l'" + word);  }  else {results.add("la " + word);} }
        else {  if (mFemYes.matches() && !mFemNo.matches()) {  results.add(prep+" l'" + word);  }  else {results.add(prep+" la " + word);} }
      }
      if (mMP.matches()) {    
        if (prep.equals("per")) { results.add("pels " + word); }
        else if (prep.isEmpty()) { results.add("els " + word); }
        else { results.add(prep+"ls " + word); }
      }
      if (mFP.matches()) { 
        if (prep.isEmpty()) { results.add("les " + word);  } else {results.add(prep+" les " + word);  }
      }
    }


  }
}
Source Code of org.languagetool.synthesis.ca.CatalanSynthesizer

Related Classes of org.languagetool.synthesis.ca.CatalanSynthesizer