Package org.apache.lucene.search.spell

Source Code of org.apache.lucene.search.spell.JahiaExtendedSpellChecker

package org.apache.lucene.search.spell;

import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordQueue;
import org.apache.lucene.store.Directory;

public class JahiaExtendedSpellChecker extends SpellChecker {

    public static final String F_LANGUAGE = "language";

    public static final String F_SITE = "site";

    // minimum score for hits generated by the spell checker query
    private float minScore = 0.5f;

    /**
     * Boost value for start and end grams
     */
    private float bStart = 2.0f;

    private float bEnd = 1.0f;

    private IndexSearcher searcher;

    /**
     * Use the given directory as a spell checker index. The directory is
     * created if it doesn't exist yet.
     *
     * @param spellIndex
     * @throws IOException
     */
    public JahiaExtendedSpellChecker(Directory spellIndex, StringDistance sd) throws IOException {
        super(spellIndex, sd);
    }

    public JahiaExtendedSpellChecker(Directory spellIndex) throws IOException {
        super(spellIndex);
    }

    /**
     * Suggest similar words (optionally restricted to a field of an index).
     *
     * <p>
     * As the Lucene similarity that is used to fetch the most relevant
     * n-grammed terms is not the same as the edit distance strategy used to
     * calculate the best matching spell-checked word from the hits that Lucene
     * found, one usually has to retrieve a couple of numSug's in order to get
     * the true best match.
     *
     * <p>
     * I.e. if numSug == 1, don't count on that suggestion being the best one.
     * Thus, you should set this value to <b>at least</b> 5 for a good
     * suggestion.
     *
     * @param word
     *            the word you want a spell check done on
     * @param numSug
     *            the number of suggested words
     * @param ir
     *            the indexReader of the user index (can be null see field
     *            param)
     * @param field
     *            the field of the user index: if field is not null, the
     *            suggested words are restricted to the words present in this
     *            field.
     * @param morePopular
     *            return only the suggest words that are as frequent or more
     *            frequent than the searched word (only if restricted mode =
     *            (indexReader!=null and field!=null)
     * @throws IOException
     * @return String[] the sorted list of the suggest words with these 2
     *         criteria: first criteria: the edit distance, second criteria
     *         (only if restricted mode): the popularity of the suggest words in
     *         the field of the user index
     */
    public String[] suggestSimilar(String word, int numSug, IndexReader ir, String field, boolean morePopular,
            String site, String language) throws IOException {

        float min = this.minScore;
        final int lengthWord = word.length();

        final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
        final int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
        // if the word exists in the real index and we don't care for word
        // frequency, return the word itself
        if (!morePopular && freq > 0) {
            return new String[] { word };
        }

        BooleanQuery query = new BooleanQuery();
        String[] grams;
        String key;

        // ensure language
        if (language != null) {
            add(query, F_LANGUAGE, language, BooleanClause.Occur.MUST);
        }

        // ensure site
        if (site != null) {
            add(query, F_SITE, site, BooleanClause.Occur.MUST);
        }

        for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {

            key = "gram" + ng; // form key

            grams = formGrams(word, ng); // form word into ngrams (allow dups
            // too)

            if (grams.length == 0) {
                continue; // hmm
            }

            if (bStart > 0) { // should we boost prefixes?
                add(query, "start" + ng, grams[0], bStart); // matches start of
                // word

            }
            if (bEnd > 0) { // should we boost suffixes
                add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches
                // end of
                // word

            }
            for (int i = 0; i < grams.length; i++) {
                add(query, key, grams[i]);
            }
        }

        // System.out.println("Q: " + query);
        IndexSearcher usedSearcher = searcher;
        Hits hits = null;
        boolean retry = true;
        while (retry) {
            try {
                hits = usedSearcher.search(query);
            } catch (IOException e) {
                if (retry && usedSearcher != searcher) {
                    usedSearcher = searcher;
                } else {
                    throw e;
                }
            } finally {
                retry = false;
            }
        }
        // System.out.println("HITS: " + hits.length());
        SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

        // go thru more than 'maxr' matches in case the distance filter triggers
        int stop = Math.min(hits.length(), 10 * numSug);
        SuggestWord sugWord = new SuggestWord();
        for (int i = 0; i < stop; i++) {

            sugWord.string = hits.doc(i).get(F_WORD + (language != null ? "-" + language : "")); // get
            // orig
            // word

            // don't suggest a word for itself, that would be silly
            if (sugWord.string == null || word.equals(sugWord.string)) {
                continue;
            }

            // edit distance
            sugWord.score = getStringDistance().getDistance(word, sugWord.string);
            if (sugWord.score < min) {
                continue;
            }

            if (ir != null && field != null) { // use the user index
                sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq
                // in
                // the
                // index
                // don't suggest a word that is not present in the field
                if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
                    continue;
                }
            }
            sugQueue.insert(sugWord);
            if (sugQueue.size() == numSug) {
                // if queue full, maintain the minScore score
                min = ((SuggestWord) sugQueue.top()).score;
            }
            sugWord = new SuggestWord();
        }

        // convert to array string
        String[] list = new String[sugQueue.size()];
        for (int i = sugQueue.size() - 1; i >= 0; i--) {
            list[i] = ((SuggestWord) sugQueue.pop()).string;
        }

        return list;
    }

    private int getMin(int l) {
        if (l > 5) {
            return 3;
        }
        if (l == 5) {
            return 2;
        }
        return 1;
    }

    private int getMax(int l) {
        if (l > 5) {
            return 4;
        }
        if (l == 5) {
            return 3;
        }
        return 2;
    }

    /**
     * Sets the accuracy 0 &lt; minScore &lt; 1; default 0.5
     */
    public void setAccuracy(float minScore) {
        this.minScore = minScore;
        super.setAccuracy(minScore);
    }

    /**
     * Form all ngrams for a given word.
     *
     * @param text
     *            the word to parse
     * @param ng
     *            the ngram length e.g. 3
     * @return an array of all ngrams in the word and note that duplicates are
     *         not removed
     */
    private static String[] formGrams(String text, int ng) {
        int len = text.length();
        String[] res = new String[len - ng + 1];
        for (int i = 0; i < len - ng + 1; i++) {
            res[i] = text.substring(i, i + ng);
        }
        return res;
    }

    /**
     * Add a clause to a boolean query.
     */
    private static void add(BooleanQuery q, String name, String value, float boost) {
        Query tq = new TermQuery(new Term(name, value));
        tq.setBoost(boost);
        q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
    }

    private static void add(BooleanQuery q, String name, String value, Occur occur) {
        q.add(new BooleanClause(new TermQuery(new Term(name, value)), occur));
    }

    /**
     * Add a clause to a boolean query.
     */
    private static void add(BooleanQuery q, String name, String value) {
        q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
    }

    /**
     * Use a different index as the spell checker index or re-open the existing
     * index if <code>spellIndex</code> is the same value as given in the
     * constructor.
     *
     * @param spellIndex
     * @throws IOException
     */
    public void setSpellIndex(Directory spellIndex) throws IOException {
        this.spellIndex = spellIndex;
        if (!IndexReader.indexExists(spellIndex)) {
            IndexWriter writer = new IndexWriter(spellIndex, null, true);
            writer.close();
        }
        // close the old searcher, if there was one
        reopenSearcher();
    }
   
    private void reopenSearcher() throws IOException {
        // create a new and close the old searcher
        IndexSearcher oldSearcher = searcher;
        IndexSearcher newSearcher = new IndexSearcher(this.spellIndex);
        searcher = newSearcher;
        if (oldSearcher != null) {
            oldSearcher.close();     
        }
    }

    /**
     * Removes all terms from the spell check index.
     *
     * @throws IOException
     */
    public void clearIndex() throws IOException {
        IndexWriter writer = new IndexWriter(spellIndex, null, true);
        writer.close();

        reopenSearcher();
    }

    /**
     * Check whether the word exists in the index.
     *
     * @param word
     * @throws IOException
     * @return true iff the word exists in the index
     */
    public boolean exist(String word, String langCode, String site) throws IOException {
        BooleanQuery query = new BooleanQuery();
        add(query, F_WORD + (langCode != null ? "-" + langCode : ""), word, BooleanClause.Occur.MUST);
        add(query, F_SITE, site, BooleanClause.Occur.MUST);
        return searcher.search(query).length() > 0;
    }

    /**
     * Indexes the data from the given {@link Dictionary}.
     *
     * @param dict
     *            Dictionary to index
     * @param mergeFactor
     *            mergeFactor to use when indexing
     * @param ramMB
     *            the max amount or memory in MB to use
     * @throws IOException
     */
    public void indexDictionary(Dictionary dict, int mergeFactor, int ramMB, String site, String langCode)
            throws IOException {
        IndexWriter writer = new IndexWriter(spellIndex, true, new WhitespaceAnalyzer());
        writer.setMergeFactor(mergeFactor);
        writer.setRAMBufferSizeMB(ramMB);

        Iterator<?> iter = dict.getWordsIterator();
        while (iter.hasNext()) {
            String word = (String) iter.next();

            int len = word.length();
            if (len < 3) {
                continue; // too short we bail but "too long" is fine...
            }

            if (this.exist(word, langCode, site)) { // if the word already exist in
                // the gramindex
                continue;
            }

            // ok index the word
            Document doc = createDocument(word, getMin(len), getMax(len), site, langCode);
            writer.addDocument(doc);
        }
        // close writer
        writer.optimize();
        writer.close();
        // also re-open the spell index to see our own changes when the next
        // suggestion is fetched:
        // create a new and close the old searcher
        IndexSearcher oldSearcher = searcher;
        IndexSearcher newSearcher = new IndexSearcher(this.spellIndex);
        searcher = newSearcher;
        oldSearcher.close();
    }

    private static Document createDocument(String text, int ng1, int ng2, String site, String langCode) {
        Document doc = new Document();
        doc.add(new Field(F_WORD + (langCode != null ? "-" + langCode : ""), text, Field.Store.YES,
                Field.Index.NOT_ANALYZED)); // orig term
        doc.add(new Field(F_LANGUAGE, langCode, Field.Store.YES, Field.Index.NOT_ANALYZED)); // language       
        doc.add(new Field(F_SITE, site, Field.Store.YES, Field.Index.NOT_ANALYZED)); // site       
        addGram(text, doc, ng1, ng2);
        return doc;
    }

    private static void addGram(String text, Document doc, int ng1, int ng2) {
        int len = text.length();
        for (int ng = ng1; ng <= ng2; ng++) {
            String key = "gram" + ng;
            String end = null;
            for (int i = 0; i < len - ng + 1; i++) {
                String gram = text.substring(i, i + ng);
                doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
                if (i == 0) {
                    doc.add(new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
                }
                end = gram;
            }
            if (end != null) { // may not be present if len==ng1
                doc.add(new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED));
            }
        }
    }
}
TOP

Related Classes of org.apache.lucene.search.spell.JahiaExtendedSpellChecker

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.