Package org.carrot2.text.linguistic.lucene

Source Code of org.carrot2.text.linguistic.lucene.ThaiTokenizerAdapter

/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/

package org.carrot2.text.linguistic.lucene;

import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;
import java.util.Locale;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.th.ThaiWordFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.ExceptionUtils;

/**
* Thai tokenizer implemented using Lucene's {@link ThaiWordFilter}.
*/
public final class ThaiTokenizerAdapter implements ITokenizer
{
    private TokenStream wordTokenFilter;
    private CharTermAttribute term = null;
    private TypeAttribute type = null;

    private final MutableCharArray tempCharSequence;

    public ThaiTokenizerAdapter()
    {
        this.tempCharSequence = new MutableCharArray(new char [0]);
        if (!platformSupportsThai()) {
            throw new RuntimeException("Thai segmentation not supported on this platform.");
        }
    }

    public short nextToken() throws IOException
    {
        final boolean hasNextToken = wordTokenFilter.incrementToken();
        if (hasNextToken)
        {
            final char [] image = term.buffer();
            final int length = term.length();
            tempCharSequence.reset(image, 0, length);

            short flags = 0;
            final String typeString = type.type();
            if (typeString.equals("<SOUTHEAST_ASIAN>") || typeString.equals("<ALPHANUM>"))
            {
                flags = ITokenizer.TT_TERM;
            }
            else if (typeString.equals("<NUM>"))
            {
                flags = ITokenizer.TT_NUMERIC;
            }
            else
            {
                flags = ITokenizer.TT_PUNCTUATION;
            }
            return flags;
        }

        return ITokenizer.TT_EOF;
    }

    public void setTermBuffer(MutableCharArray array)
    {
        array.reset(term.buffer(), 0, term.length());
    }

    @SuppressWarnings("deprecation")
    public void reset(Reader input) throws IOException
    {
        assert input != null;
        try
        {
            this.wordTokenFilter = new ThaiWordFilter(Version.LUCENE_CURRENT,
                new StandardTokenizer(Version.LUCENE_CURRENT, input));
            this.term = wordTokenFilter.addAttribute(CharTermAttribute.class);
            this.type = wordTokenFilter.addAttribute(TypeAttribute.class);
            this.wordTokenFilter.reset();
        }
        catch (Exception e)
        {
            throw ExceptionUtils.wrapAsRuntimeException(e);
        }
    }
   
    /**
     * Check support for Thai.
     */
    public static boolean platformSupportsThai()
    {
        try {
            // Check if Thai break iteration is supported, code taken from Lucene's ThaiWordFilter.
            final BreakIterator proto = BreakIterator.getWordInstance(new Locale("th"));
            proto.setText("ภาษาไทย");
            return proto.isBoundary(4);
        } catch (Throwable e) {
            return false;
        }
    }   
}
TOP

Related Classes of org.carrot2.text.linguistic.lucene.ThaiTokenizerAdapter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.