Examples of IStemmer

morfologik.stemming.IStemmer
org.carrot2.text.linguistic.IStemmer
Simple lemmatization engine transforming an inflected form of a word to its base form or some other unique token.

Examples of morfologik.stemming.IStemmer

    return tokenReadings;
  }


  @Override
  public List<AnalyzedToken> additionalTags(String word) {
    final IStemmer dictLookup;
    try {
      dictLookup = new DictionaryLookup(getDictionary());
    } catch (IOException e) {
      throw new RuntimeException("Could not load Catalan dictionary from " + getFileName(), e);
    }
    List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
    //Any well-formed adverb with suffix -ment is tagged as an adverb (RG)
    //Adjectiu femení singular o participi femení singular + -ment
    if (word.endsWith("ment")){
      final String lowerWord = word.toLowerCase(conversionLocale);
      final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
      List<AnalyzedToken> taggerTokens;
      taggerTokens = asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
      for (AnalyzedToken taggerToken : taggerTokens ) {
        final String posTag = taggerToken.getPOSTag();
        if (posTag != null) {
          final Matcher m = ADJ_PART_FS.matcher(posTag);
          if (m.matches()) {
            additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
            return additionalTaggedTokens;
          }
        }
      }
    }
    //Any well-formed verb with prefixes is tagged as a verb copying the original tags
    Matcher matcher=PREFIXES_FOR_VERBS.matcher(word);
    if (matcher.matches()) {
      final String possibleVerb = matcher.group(2).toLowerCase();
      List<AnalyzedToken> taggerTokens;
      taggerTokens = asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
      for (AnalyzedToken taggerToken : taggerTokens ) {
        final String posTag = taggerToken.getPOSTag();
        if (posTag != null) {
          final Matcher m = VERB.matcher(posTag);
          if (m.matches()) {
            String lemma=matcher.group(1).toLowerCase().concat(taggerToken.getLemma());
            additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
          }
        }
      }
      return additionalTaggedTokens;
    }
    // Any well-formed noun with prefix ex- is tagged as a noun copying the original tags
    /*if (word.startsWith("ex")) {
      final String lowerWord = word.toLowerCase(conversionLocale);
      final String possibleNoun = lowerWord.replaceAll("^ex(.+)$", "$1");
      List<AnalyzedToken> taggerTokens;
      taggerTokens = asAnalyzedTokenList(possibleNoun,dictLookup.lookup(possibleNoun));
      for (AnalyzedToken taggerToken : taggerTokens) {
        final String posTag = taggerToken.getPOSTag();
        if (posTag != null) {
          final Matcher m = NOUN.matcher(posTag);
          if (m.matches()) {
            String lemma = "ex".concat(taggerToken.getLemma());
            additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
          }
        }
      }
      return additionalTaggedTokens;
    }*/
    // Interpret deprecated characters of "ela geminada"
    // U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT
    // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
    if (word.contains("\u0140") || word.contains("\u013f")) {
      final String lowerWord = word.toLowerCase(conversionLocale);
      final String possibleWord = lowerWord.replaceAll("\u0140", "l·");
      List<AnalyzedToken> taggerTokens = asAnalyzedTokenList(word,dictLookup.lookup(possibleWord));
      return taggerTokens;
    }
    return null;
  }

View Full Code Here

Examples of morfologik.stemming.IStemmer

      p = Pattern.compile("N.*|A.*|V.P.*|PX.");
    } else {
      p = Pattern.compile(posTag);
    }
    final List<String> results = new ArrayList<>();
    final IStemmer synthesizer = createStemmer();
    
    for (final String tag : possibleTags) {
      final Matcher m = p.matcher(tag);
      if (m.matches()) {
        if (addDt) {

View Full Code Here

Examples of morfologik.stemming.IStemmer

  public final String[] synthesize(final AnalyzedToken token,
      final String posTag) throws IOException {
    if (posTag == null) {
      return null;
    }
    final IStemmer synthesizer = new DictionaryLookup(getDictionary());
    boolean isNegated = false;
    if (token.getPOSTag() != null) {
      isNegated = posTag.indexOf(NEGATION_TAG) > 0
          || token.getPOSTag().indexOf(NEGATION_TAG) > 0
          && !(posTag.indexOf(COMP_TAG) > 0) && !(posTag.indexOf(SUP_TAG) > 0);

View Full Code Here

Examples of morfologik.stemming.IStemmer

    if (posTagRegExp) {
      if (possibleTags == null) {
        possibleTags = SynthesizerTools.loadWords(JLanguageTool.getDataBroker().
            getFromResourceDirAsStream(TAGS_FILE_NAME));
      }
      final IStemmer synthesizer = new DictionaryLookup(getDictionary());
      final List<String> results = new ArrayList<>();


      boolean isNegated = false;
      if (token.getPOSTag() != null) {
        isNegated = posTag.indexOf(NEGATION_TAG) > 0

View Full Code Here

Examples of morfologik.stemming.IStemmer

        }
        return ret;        
    }
    
    private IStemmer loadDictionary() throws IOException {
        IStemmer dictLookup = new DictionaryLookup(Dictionary.read(dictFile));
        return dictLookup;
    }

View Full Code Here

Examples of morfologik.stemming.IStemmer

    List<AnalyzedToken> taggerTokens;
    List<AnalyzedToken> lowerTaggerTokens;
    List<AnalyzedToken> upperTaggerTokens;    
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer morfologik = new DictionaryLookup(getDictionary());


    for (String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      final String lowerWord = word.toLowerCase(plLocale);
      taggerTokens = asAnalyzedTokenList(word, morfologik.lookup(word));
      lowerTaggerTokens = asAnalyzedTokenList(word, morfologik.lookup(lowerWord));       
      final boolean isLowercase = word.equals(lowerWord);


      //normal case
      addTokens(taggerTokens, l);


      if (!isLowercase) {
        //lowercase
        addTokens(lowerTaggerTokens, l);
      }


      //uppercase
      if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) {
        if (isLowercase) {
          upperTaggerTokens = asAnalyzedTokenList(word, morfologik.lookup(StringTools
              .uppercaseFirstChar(word)));
          if (!upperTaggerTokens.isEmpty()) {
            addTokens(upperTaggerTokens, l);
          } else {
            l.add(new AnalyzedToken(word, null, null));

View Full Code Here

Examples of morfologik.stemming.IStemmer

    this.manualSynthesizer = manualSynthesizer;
  }


  @Override
  protected IStemmer createStemmer() {
    return new IStemmer() { // null synthesiser 
      @Override
      public List<WordData> lookup(CharSequence word) {
        return new ArrayList<>();
      }
    };

View Full Code Here

Examples of org.carrot2.text.linguistic.IStemmer

        {
            // Here we always return the same language model, regardless of the requested
            // language. In your implementation you may want to return different models
            // based on the language, if needed.
            System.out.println("stemmer");
            return new IStemmer()
            {
                public CharSequence stem(CharSequence word)
                {
                    // Some contrived stemming algorithm
                    return word.length() > 3 ? word.subSequence(0, word.length() - 2)

View Full Code Here

Examples of org.carrot2.text.linguistic.IStemmer

    /**
     * Performs stemming and saves the results to the <code>context</code>.
     */
    public void stem(PreprocessingContext context)
    {
        final IStemmer stemmer = context.language.getStemmer();


        final char [][] wordImages = context.allWords.image;
        final char [][] stemImages = new char [wordImages.length] [];


        final MutableCharArray mutableCharArray = new MutableCharArray(CharArrayUtils.EMPTY_ARRAY);
        char [] buffer = new char [128];


        for (int i = 0; i < wordImages.length; i++)
        {
            final char [] word = wordImages[i];
            if (buffer.length < word.length) buffer = new char [word.length];


            final boolean different = CharArrayUtils.toLowerCase(word, buffer);


            mutableCharArray.reset(buffer, 0, word.length);
            final CharSequence stemmed = stemmer.stem(mutableCharArray);
            if (stemmed != null)
            {
                mutableCharArray.reset(stemmed);
                stemImages[i] = context.intern(mutableCharArray);
            }

View Full Code Here

Examples of org.carrot2.text.linguistic.IStemmer

public final class TestStemmerFactory implements IStemmerFactory
{
    @Override
    public IStemmer getStemmer(LanguageCode language)
    {
        return new IStemmer()
        {
            public CharSequence stem(CharSequence word)
            {
                if (word.length() > 2)
                {

View Full Code Here

0 1 2

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.