Package morfologik.stemming

Examples of morfologik.stemming.DictionaryLookup


  }

  public void testDictionary() throws IOException {
    final Dictionary dictionary = Dictionary.read(
        JLanguageTool.getDataBroker().getFromResourceDirAsUrl("/de/german.dict"));
    final DictionaryLookup dl = new DictionaryLookup(dictionary);
    for (WordData wd : dl) {
      if (wd.getTag() == null || wd.getTag().length() == 0) {
        System.err.println("**** Warning: the word " + wd.getWord() + "/" + wd.getStem()
                + " lacks a POS tag in the dictionary.");
      }
View Full Code Here


    if (posTag == null) {
      return null;
    }
    if (synthesizer == null) {
      final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(RESOURCE_FILENAME);
      synthesizer = new DictionaryLookup(Dictionary.read(url));
    }
    boolean isNegated = false;
    if (token.getPOSTag() != null) {
      isNegated = posTag.indexOf(NEGATION_TAG) > 0
          || token.getPOSTag().indexOf(NEGATION_TAG) > 0
View Full Code Here

        possibleTags = SynthesizerTools.loadWords(JLanguageTool.getDataBroker().
            getFromResourceDirAsStream(TAGS_FILE_NAME));
      }
      if (synthesizer == null) {
        final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(RESOURCE_FILENAME);
        synthesizer = new DictionaryLookup(Dictionary.read(url));
      }
      final ArrayList<String> results = new ArrayList<>();

      boolean isNegated = false;
      if (token.getPOSTag() != null) {
View Full Code Here

          dictFile = new File(url.toURI());
        } catch (URISyntaxException e) {
          throw new RuntimeException("Could not load " + ENGLISH_DICT, e);
        }
        try {
          dictLookup = new DictionaryLookup(Dictionary.read(dictFile));
        } catch (IOException e) {
          throw new RuntimeException("Could not load " + dictFile, e);
        }
        return dictLookup;
    }
View Full Code Here

    boolean firstWord = true;
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;

    final IStemmer morfologik = new DictionaryLookup(dictionary);

    for (String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      String[] taggerTokens = lexiconLookup(word, morfologik);
      if (firstWord && taggerTokens == null && ignoreCase) { // e.g. "Das" -> "das" at start of sentence
View Full Code Here

  }

  public void testDictionary() throws IOException {
    final Dictionary dictionary = Dictionary.read(
        JLanguageTool.getDataBroker().getFromResourceDirAsUrl("/de/german.dict"));
    final DictionaryLookup dl = new DictionaryLookup(dictionary);
    for (WordData wd : dl) {
      if (wd.getTag() == null || wd.getTag().length() == 0) {
        System.err.println("**** Warning: the word " + wd.getWord() + "/" + wd.getStem()
                + " lacks a POS tag in the dictionary.");
      }
View Full Code Here

    return false;
  }

  public static void testDictionary(BaseTagger tagger, Language language) throws IOException {
    final Dictionary dictionary = Dictionary.read(JLanguageTool.getDataBroker().getFromResourceDirAsUrl(tagger.getFileName()));
    final DictionaryLookup lookup = new DictionaryLookup(dictionary);
    for (WordData wordData : lookup) {
      if (wordData.getTag() == null || wordData.getTag().length() == 0) {
        System.err.println("**** Warning: " + language + ": the word " + wordData.getWord() + "/" + wordData.getStem() + " lacks a POS tag in the dictionary.");
      }
    }
View Full Code Here

  @Override
  public final List<AnalyzedTokenReadings> tag(
      final List<String> sentenceTokens) throws IOException {
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer morfologik = new DictionaryLookup(getDictionary());
    if (manualTagger == null && plaintextDictPath != null) {
      manualTagger = new ManualTagger(JLanguageTool.getDataBroker().getFromResourceDirAsStream(plaintextDictPath));
    }

    for (final String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      final String lowerCaseWord = word.toLowerCase(RO_LOCALE);
      final List<WordData> taggerTokens = morfologik.lookup(lowerCaseWord);
      if (taggerTokens != null) {
        for (WordData wd : taggerTokens) {
          final String[] tagsArr = wd.getStem().toString().split("\\+");
          for (final String currTag : tagsArr) {
            l.add(new AnalyzedToken(word,
View Full Code Here

   * @since 2.3
   */
  protected IStemmer createStemmer() {
    try {
      final Dictionary dict = getDictionary();
      return new DictionaryLookup(dict);
    } catch (IOException e) {
      throw new RuntimeException("Could not load dictionary", e);
    }
  }
View Full Code Here

    List<AnalyzedToken> taggerTokens;
    List<AnalyzedToken> lowerTaggerTokens;
    List<AnalyzedToken> upperTaggerTokens;
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer dictLookup = new DictionaryLookup(getDictionary());

    for (String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      final String lowerWord = word.toLowerCase(conversionLocale);
      taggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(word));
      lowerTaggerTokens = asAnalyzedTokenList(word, dictLookup.lookup(lowerWord));
      final boolean isLowercase = word.equals(lowerWord);
      final boolean isMixedCase = StringTools.isMixedCase(word);

      //normal case
      addTokens(taggerTokens, l);

      //tag non-lowercase (alluppercase or startuppercase), but not mixedcase word with lowercase word tags
      if (!isLowercase && !isMixedCase) {
        addTokens(lowerTaggerTokens, l);
      }

      //tag lowercase word with startuppercase word tags
      if (tagLowercaseWithUppercase) {
        if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) {
          if (isLowercase) {
            upperTaggerTokens = asAnalyzedTokenList(word,
                dictLookup.lookup(StringTools.uppercaseFirstChar(word)));
            if (!upperTaggerTokens.isEmpty()) {
              addTokens(upperTaggerTokens, l);
            }
          }
        }
View Full Code Here

TOP

Related Classes of morfologik.stemming.DictionaryLookup

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.