Source Code of org.elasticsearch.index.analysis.SynonymTokenFilterFactory

/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */


package org.elasticsearch.index.analysis;


import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;


@AnalysisSettingsRequired
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {


    private final SynonymMap synonymMap;


    @Inject public SynonymTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, IndicesAnalysisService indicesAnalysisService, Map<String, TokenizerFactoryFactory> tokenizerFactories,
                                             @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);


        List<String> rules = Analysis.getWordList(env, settings, "synonyms");
        if (rules == null) {
            throw new ElasticSearchIllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
        }
        boolean ignoreCase = settings.getAsBoolean("ignore_case", false);
        boolean expand = settings.getAsBoolean("expand", true);


        String tokenizerName = settings.get("tokenizer", "whitespace");


        TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(tokenizerName);
        if (tokenizerFactoryFactory == null) {
            tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName);
        }
        if (tokenizerFactoryFactory == null) {
            throw new ElasticSearchIllegalArgumentException("failed to fine tokenizer [" + tokenizerName + "] for synonym token filter");
        }
        TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, settings);
        synonymMap = new SynonymMap(ignoreCase);
        parseRules(rules, synonymMap, "=>", ",", expand, tokenizerFactory);
    }


    @Override public TokenStream create(TokenStream tokenStream) {
        return new SynonymFilter(tokenStream, synonymMap);
    }


    static void parseRules(List<String> rules, SynonymMap map, String mappingSep,
                           String synSep, boolean expansion, TokenizerFactory tokFactory) {
//        int count = 0;
        for (String rule : rules) {
            // To use regexes, we need an expression that specifies an odd number of chars.
            // This can't really be done with string.split(), and since we need to
            // do unescaping at some point anyway, we wouldn't be saving any effort
            // by using regexes.


            List<String> mapping = Strings.splitSmart(rule, mappingSep, false);


            List<List<String>> source;
            List<List<String>> target;


            if (mapping.size() > 2) {
                throw new RuntimeException("Invalid Synonym Rule:" + rule);
            } else if (mapping.size() == 2) {
                source = getSynList(mapping.get(0), synSep, tokFactory);
                target = getSynList(mapping.get(1), synSep, tokFactory);
            } else {
                source = getSynList(mapping.get(0), synSep, tokFactory);
                if (expansion) {
                    // expand to all arguments
                    target = source;
                } else {
                    // reduce to first argument
                    target = new ArrayList<List<String>>(1);
                    target.add(source.get(0));
                }
            }


            boolean includeOrig = false;
            for (List<String> fromToks : source) {
//                count++;
                for (List<String> toToks : target) {
                    map.add(fromToks,
                            SynonymMap.makeTokens(toToks),
                            includeOrig,
                            true
                    );
                }
            }
        }
    }


    // a , b c , d e f => [[a],[b,c],[d,e,f]]
    private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
        List<String> strList = Strings.splitSmart(str, separator, false);
        // now split on whitespace to get a list of token strings
        List<List<String>> synList = new ArrayList<List<String>>();
        for (String toks : strList) {
            List<String> tokList = tokFactory == null ?
                    Strings.splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
            synList.add(tokList);
        }
        return synList;
    }


    private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
        TokenStream ts = tokFactory.create(new FastStringReader(source));
        List<String> tokList = new ArrayList<String>();
        try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                if (termAtt.length() > 0)
                    tokList.add(termAtt.toString());
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return tokList;
    }
}
Source Code of org.elasticsearch.index.analysis.SynonymTokenFilterFactory

Related Classes of org.elasticsearch.index.analysis.SynonymTokenFilterFactory