Examples of ILexicalData


Examples of org.carrot2.text.linguistic.ILexicalData

    }
    return solrStopWords.get(fieldName);
  }

  public ILexicalData getLexicalData(LanguageCode languageCode) {
    final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
        .getLexicalData(languageCode);

    return new ILexicalData() {
      public boolean isStopLabel(CharSequence word) {
        // Nothing in Solr maps to the concept of a stop label,
        // so return Carrot2's default here.
        return carrot2LexicalData.isStopLabel(word);
      }

      public boolean isCommonWord(MutableCharArray word) {
        // Loop over the fields involved in clustering first
        for (String fieldName : fieldNames) {
          for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
            if (stopWords.contains(word)) {
              return true;
            }
          }
        }
        // Check default Carrot2 stop words too
        return carrot2LexicalData.isCommonWord(word);
      }
    };
  }
View Full Code Here

Examples of org.carrot2.text.linguistic.ILexicalData

      return;
    }

    // Test with Maltese so that the English clustering performed in other tests
    // is not affected by the test stopwords and stoplabels.
    ILexicalData lexicalData = preprocessing.lexicalDataFactory
        .getLexicalData(LanguageCode.MALTESE);

    for (String word : wordsToCheck.split(",")) {
      if (!lexicalData.isCommonWord(new MutableCharArray(word))
          && !lexicalData.isStopLabel(word)) {
        clusters.add(new Cluster(word));
      }
    }
  }
View Full Code Here

Examples of org.carrot2.text.linguistic.ILexicalData

        {
            // Here we always return the same language model, regardless of the requested
            // language. In your implementation you may want to return different models
            // based on the language, if needed.
            System.out.println("lexical data");
            return new ILexicalData()
            {
                @Override
                public boolean isStopLabel(CharSequence formattedLabel)
                {
                    return formattedLabel.length() <= 4;
View Full Code Here

Examples of org.carrot2.text.linguistic.ILexicalData

        final char [][] wordImages = context.allWords.image;
        final short [] types = context.allWords.type;

        final MutableCharArray mutableCharArray = new MutableCharArray("");
        char [] buffer = new char [128];
        final ILexicalData lexData = context.language.getLexicalData();

        for (int i = 0; i < wordImages.length; i++)
        {
            final char [] word = wordImages[i];
            if (buffer.length < word.length) buffer = new char [word.length];

            CharArrayUtils.toLowerCase(word, buffer);
            mutableCharArray.reset(buffer, 0, word.length);
            if (lexData.isCommonWord(mutableCharArray))
            {
                types[i] |= ITokenizer.TF_COMMON_WORD;
            }
        }
    }
View Full Code Here

Examples of org.carrot2.text.linguistic.ILexicalData

                return -Float.compare(c1.score, c2.score);
            }
        });

        j = 0;
        ILexicalData lexicalData = context.language.getLexicalData();
        for (int max = candidates.size(), i = 0; i < max && j < maxBaseClusters; i++)
        {
            ClusterCandidate cc = candidates.get(i);
            // Build the candidate cluster's label for filtering. This may be costly so
            // we only do this for base clusters which are promoted to merging phase.
            assert cc.phrases.size() == 1;
            if (!lexicalData.isStopLabel(buildLabel(cc.phrases.get(0))))
            {
                candidates.set(j++, cc);
            }
        }
View Full Code Here

Examples of org.carrot2.text.linguistic.ILexicalData

public final class TestLexicalDataFactory implements ILexicalDataFactory
{
    @Override
    public ILexicalData getLexicalData(LanguageCode language)
    {
        return new ILexicalData()
        {
            public boolean isCommonWord(MutableCharArray word)
            {
                return word.toString().contains("stop");
            }
View Full Code Here

Examples of org.carrot2.text.linguistic.ILexicalData

    }

    @Override
    public ILexicalData getLexicalData(LanguageCode language)
    {
        return new ILexicalData()
        {
            public boolean isCommonWord(MutableCharArray word)
            {
                return word.toString().contains("stop");
            }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.