Package com.senseidb.plugin.analyzer

Source Code of com.senseidb.plugin.analyzer.LucenePatternAnalyzerFactory

/**
* This software is licensed to you under the Apache License, Version 2.0 (the
* "Apache License").
*
* LinkedIn's contributions are made under the Apache License. If you contribute
* to the Software, the contributions will be deemed to have been made under the
* Apache License, unless you expressly indicate otherwise. Please do not make any
* contributions that would be inconsistent with the Apache License.
*
* You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, this software
* distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
* License for the specific language governing permissions and limitations for the
* software governed under the Apache License.
*
* © 2012 LinkedIn Corp. All Rights Reserved.
*/
package com.senseidb.plugin.analyzer;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.collections.MapUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
import org.apache.lucene.util.Version;

import com.senseidb.plugin.SenseiPluginFactory;
import com.senseidb.plugin.SenseiPluginRegistry;

/**
* A {@code SenseiPluginFactory} that instantiates a {@link PatternAnalyzer}. It can be configured through
* the following properties:
*
* <ul>
*     <li><code>matchVersion</code>: the Lucene version compatibility (default: LUCENE_35). See {@link Version}.
*     <li><code>pattern</code>: a regular expression delimiting tokens (default: "<code>\\s+</code>")
*     <li><code>toLowerCase</code>: if true, tokens are converted to lower case (default: true)
*     <li><code>stopWords</code>: a comma-separated list of stop words (defaults to an empty list)
* </ul>
*
* Example configuration:
*
* <pre>
* sensei.index.analyzer.class=com.senseidb.plugin.analyzer.LucenePatternAnalyzerFactory
* sensei.index.analyzer.pattern=[ -_./:]
* </pre>
*
* @author jgrande
*
*/
public class LucenePatternAnalyzerFactory implements SenseiPluginFactory<Analyzer> {

    @Override
    public Analyzer getBean(Map<String, String> initProperties, String fullPrefix, SenseiPluginRegistry pluginRegistry) {
        Version matchVersion = Version.valueOf(MapUtils.getString(initProperties, "matchVersion", "LUCENE_35"));
        Pattern pattern = Pattern.compile(MapUtils.getString(initProperties, "pattern", "\\s+"));
        boolean toLowerCase = MapUtils.getBoolean(initProperties, "toLowerCase", true);
        String stopWordsStr = MapUtils.getString(initProperties, "stopWords", "");
        Set<String> stopWords = new HashSet<String>(Arrays.asList(stopWordsStr.split("\\s*,\\s*")));
        return new PatternAnalyzer(matchVersion, pattern, toLowerCase, stopWords);
    }

}
TOP

Related Classes of com.senseidb.plugin.analyzer.LucenePatternAnalyzerFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.