// Package: org.apache.lucene.analysis.hebrew
//
// Source code of org.apache.lucene.analysis.hebrew.StreamLemmasFilter

/***************************************************************************
*   Copyright (C) 2010-2013 by                                            *
*      Itamar Syn-Hershko <itamar at code972 dot com>                     *
*    Ofer Fort <oferiko at gmail dot com> (initial Java port)           *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU Affero General Public License           *
*   version 3, as published by the Free Software Foundation.              *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU Affero General Public License for more details.                   *
*                                                                         *
*   You should have received a copy of the GNU Affero General Public      *
*   License along with this program; if not, see                          *
*   <http://www.gnu.org/licenses/>.                                       *
**************************************************************************/
package org.apache.lucene.analysis.hebrew;

import com.code972.hebmorph.*;
import com.code972.hebmorph.datastructures.DictHebMorph;
import com.code972.hebmorph.datastructures.DictRadix;
import com.code972.hebmorph.lemmafilters.LemmaFilterBase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.Reader;
import java.util.*;

public class StreamLemmasFilter extends Tokenizer {
    private final StreamLemmatizer _streamLemmatizer;
    private final CharArraySet commonWords;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

    private final CharacterUtils charUtils;

    private final LemmaFilterBase lemmaFilter;
    private final List<Token> stack = new ArrayList<Token>();
    private final List<Token> filterCache = new ArrayList<Token>();
    private int index = 0;
    private final Set<String> previousLemmas = new HashSet<String>();
    private boolean keepOriginalWord;

    public StreamLemmasFilter(final Reader input, final DictHebMorph dict) {
        this(input, dict, null, null, null);
    }


    public StreamLemmasFilter(final Reader input, final DictRadix<MorphData> dict, final HashMap<String,Integer> pref, DictRadix<Byte> specialTokenizationCases, final CharArraySet commonWords, final LemmaFilterBase lemmaFilter) {
        this(input, new DictHebMorph(dict,pref), null, commonWords, lemmaFilter);
    }

    public StreamLemmasFilter(final Reader input, final DictHebMorph dict, final CharArraySet commonWords, final LemmaFilterBase lemmaFilter) {
        this(input, dict, null, commonWords, lemmaFilter);
    }

    public StreamLemmasFilter(Reader _input, DictHebMorph dict,
                              DictRadix<Byte> specialTokenizationCases, CharArraySet commonWords, LemmaFilterBase lemmaFilter) {
        super(_input);
        _streamLemmatizer = new StreamLemmatizer(input, dict, specialTokenizationCases);
        this.commonWords = commonWords != null ? commonWords : CharArraySet.EMPTY_SET;
        this.lemmaFilter = lemmaFilter;
        charUtils = CharacterUtils.getInstance(Version.LUCENE_46);
    }

    public void setSuffixForExactMatch(Character c) {
        _streamLemmatizer.setSuffixForExactMatch(c);
    }

    public void setCustomWords(DictRadix<MorphData> customWords) {
        _streamLemmatizer.setCustomWords(customWords);
    }

    private final Reference<String> tempRefObject = new Reference<>("");

    private int currentStartOffset, currentEndOffset;

    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();

        // Index all unique lemmas at the same position
        while (index < stack.size()) {
            final HebrewToken res = (HebrewToken) ((stack.get(index) instanceof HebrewToken) ? stack.get(index) : null);
            index++;

            if ((res == null) || !previousLemmas.add(res.getLemma())) // Skip multiple lemmas (we will merge morph properties later)
                continue;

            createHebrewToken(res);
            offsetAtt.setOffset(currentStartOffset, currentEndOffset);
            typeAtt.setType(HebrewTokenizer.tokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew));
            posIncrAtt.setPositionIncrement(0);

            return true;
        }

        // Reset state
        index = 0;
        stack.clear();
        previousLemmas.clear();

        // Lemmatize next word in stream. The HebMorph lemmatizer will always return a token, unless
        // an unrecognized Hebrew word is hit, then an empty tokens array will be returned.
        final int tokenType = _streamLemmatizer.getLemmatizeNextToken(tempRefObject, stack);
        if (tokenType == 0) { // EOS
            return false;
        }

        // Store the location of the word in the original stream
        currentStartOffset = correctOffset(_streamLemmatizer.getStartOffset());
        currentEndOffset = correctOffset(_streamLemmatizer.getEndOffset());
        offsetAtt.setOffset(currentStartOffset, currentEndOffset);

        final String word = tempRefObject.ref;
        if (commonWords.contains(word)) { // common words should be treated later using dedicated filters
            termAtt.copyBuffer(word.toCharArray(), 0, word.length());
            typeAtt.setType(HebrewTokenizer.tokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew));
            stack.clear();

            if (!keepOriginalWord) {
                if ((tokenType & com.code972.hebmorph.Tokenizer.TokenType.Exact) > 0) {
                    keywordAtt.setKeyword(true);
                }
                return true;
            }

            keywordAtt.setKeyword(true);
            if ((tokenType & com.code972.hebmorph.Tokenizer.TokenType.Exact) == 0) {
                stack.add(new HebrewToken(word, (byte) 0, 0, word, 1.0f));
            }

            return true;
        }

        // Mark request for exact matches in queries, if configured in the tokenizer
        if ((tokenType & com.code972.hebmorph.Tokenizer.TokenType.Exact) > 0) {
            keywordAtt.setKeyword(true);
        }

        // A non-Hebrew word
        if (stack.size() == 1 && !(stack.get(0) instanceof HebrewToken)) {
            termAtt.copyBuffer(word.toCharArray(), 0, word.length());

            final Token tkn = stack.get(0);
            if (tkn.isNumeric()) {
                typeAtt.setType(HebrewTokenizer.tokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Numeric));
            } else {
                typeAtt.setType(HebrewTokenizer.tokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.NonHebrew));
            }

            applyLowercaseFilter();

            stack.clear();
            return true;
        }

        // If we arrived here, we hit a Hebrew word
        typeAtt.setType(HebrewTokenizer.tokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Hebrew));
        // TODO: typeAtt.SetType(TokenTypeSignature(TOKEN_TYPES.Acronym));

        // Do some filtering if requested...
        if (lemmaFilter != null && lemmaFilter.filterCollection(word, stack, filterCache) != null) {
            stack.clear();
            stack.addAll(filterCache);
        }

        // OOV case - store the word as-is, and also output a suffixed version of it
        if (stack.isEmpty()) {
            termAtt.copyBuffer(word.toCharArray(), 0, word.length());

            if (keepOriginalWord) {
                keywordAtt.setKeyword(true);
            }

            if ((tokenType & com.code972.hebmorph.Tokenizer.TokenType.Mixed) > 0) {
                typeAtt.setType(HebrewTokenizer.tokenTypeSignature(HebrewTokenizer.TOKEN_TYPES.Mixed));
                applyLowercaseFilter();
                return true;
            }
            if ((tokenType & com.code972.hebmorph.Tokenizer.TokenType.Exact) > 0) {
                applyLowercaseFilter();
                return true;
            }

            if (keepOriginalWord)
                stack.add(new HebrewToken(word, (byte) 0, 0, word, 1.0f));

            return true;
        }

        // Mark and store the original term to increase precision, while all lemmas
        // will be popped out of the stack and get stored at the next call to IncrementToken.
        if (keepOriginalWord) {
            termAtt.copyBuffer(word.toCharArray(), 0, word.length());
            keywordAtt.setKeyword(true);
            return true;
        }

        // If !keepOriginalWord
        final HebrewToken hebToken = (HebrewToken) stack.get(0);
        if (stack.size() == 1) { // only one lemma was found
            stack.clear();
        } else { // // more than one lemma exist.
            index = 1;
            previousLemmas.add(hebToken.getLemma());
        }
        createHebrewToken(hebToken);

        return true;
    }

    private void applyLowercaseFilter() {
        charUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
    }

    protected void createHebrewToken(HebrewToken hebToken) {
        String tokenVal = hebToken.getLemma() == null ? hebToken.getText().substring(hebToken.getPrefixLength()) : hebToken.getLemma();
        termAtt.copyBuffer(tokenVal.toCharArray(), 0, tokenVal.length());
    }

    @Override
    public final void end() throws IOException {
        super.end();
        // set final offset
        int finalOffset = correctOffset(_streamLemmatizer.getEndOffset());
        currentStartOffset = currentEndOffset = finalOffset;
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void close() throws IOException {
        super.close();
        stack.clear();
        filterCache.clear();
        previousLemmas.clear();
        index = 0;
        _streamLemmatizer.reset(input);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        stack.clear();
        filterCache.clear();
        previousLemmas.clear();
        index = 0;
        currentStartOffset = currentEndOffset = 0;
        _streamLemmatizer.reset(input);
    }

    public void setKeepOriginalWord(boolean keepOriginalWord) {
        this.keepOriginalWord = keepOriginalWord;
    }
}
// Related classes of org.apache.lucene.analysis.hebrew.StreamLemmasFilter
//
// Copyright (c) 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of
// Sun Microsystems, Inc. and owned by Oracle Inc. Contact: coftware#gmail.com.