Source Code of org.jahia.services.search.spell.CompositeSpellChecker$TwelveHoursRefreshInterval

/**
 * This file is part of Jahia, next-generation open source CMS:
 * Jahia's next-generation, open source CMS stems from a widely acknowledged vision
 * of enterprise application convergence - web, search, document, social and portal -
 * unified by the simplicity of web content management.
 *
 * For more information, please visit http://www.jahia.com.
 *
 * Copyright (C) 2002-2011 Jahia Solutions Group SA. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * As a special exception to the terms and conditions of version 2.0 of
 * the GPL (or any later version), you may redistribute this Program in connection
 * with Free/Libre and Open Source Software ("FLOSS") applications as described
 * in Jahia's FLOSS exception. You should have received a copy of the text
 * describing the FLOSS exception, and it is also available here:
 * http://www.jahia.com/license
 *
 * Commercial and Supported Versions of the program (dual licensing):
 * alternatively, commercial and supported versions of the program may be used
 * in accordance with the terms and conditions contained in a separate
 * written agreement between you and Jahia Solutions Group SA.
 *
 * If you are unsure which license is appropriate for your use,
 * please contact the sales department at sales@jahia.com.
 */


package org.jahia.services.search.spell;


import org.apache.jackrabbit.core.query.lucene.SearchIndex;
import org.apache.jackrabbit.core.query.lucene.FieldNames;
import org.apache.jackrabbit.core.query.QueryHandler;
import org.apache.jackrabbit.spi.Name;
import org.apache.jackrabbit.spi.commons.query.LocationStepQueryNode;
import org.apache.jackrabbit.spi.commons.query.PathQueryNode;
import org.apache.jackrabbit.spi.commons.query.QueryRootNode;
import org.apache.jackrabbit.spi.commons.query.RelationQueryNode;
import org.apache.jackrabbit.spi.commons.query.TraversingQueryNodeVisitor;
import org.slf4j.Logger;
import org.apache.lucene.search.spell.JahiaExtendedSpellChecker;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NativeFSLockFactory;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.jahia.api.Constants;
import org.jahia.bin.Jahia;
import org.jahia.exceptions.JahiaException;
import org.jahia.registries.ServicesRegistry;
import org.jahia.services.SpringContextSingleton;
import org.jahia.services.content.*;
import org.jahia.services.sites.JahiaSitesBaseService;


import java.io.IOException;
import java.io.StringReader;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;


import javax.jcr.Node;
import javax.jcr.NodeIterator;
import javax.jcr.RepositoryException;


/**
 * <code>CompositeSpellChecker</code> implements a spell checker based on the terms
 * present in a lucene index.
 */
public class CompositeSpellChecker implements org.apache.jackrabbit.core.query.lucene.SpellChecker {
    
    /**
     * Logger instance for this class.
     */
    private static final Logger logger = org.slf4j.LoggerFactory.getLogger(CompositeSpellChecker.class);


    public static final class FiveSecondsRefreshInterval extends CompositeSpellChecker {
        public FiveSecondsRefreshInterval() {
            super(5 * 1000);
        }
    }


    public static final class OneMinuteRefreshInterval extends CompositeSpellChecker {
        public OneMinuteRefreshInterval() {
            super(60 * 1000);
        }
    }


    public static final class FiveMinutesRefreshInterval extends CompositeSpellChecker {
        public FiveMinutesRefreshInterval() {
            super(5 * 60 * 1000);
        }
    }


    public static final class ThirtyMinutesRefreshInterval extends CompositeSpellChecker {
        public ThirtyMinutesRefreshInterval() {
            super(30 * 60 * 1000);
        }
    }


    public static final class OneHourRefreshInterval extends CompositeSpellChecker {
        public OneHourRefreshInterval() {
            super(60 * 60 * 1000);
        }
    }


    public static final class SixHoursRefreshInterval extends CompositeSpellChecker {
        public SixHoursRefreshInterval() {
            super(6 * 60 * 60 * 1000);
        }
    }


    public static final class TwelveHoursRefreshInterval extends CompositeSpellChecker {
        public TwelveHoursRefreshInterval() {
            super(12 * 60 * 60 * 1000);
        }
    }


    public static final class OneDayRefreshInterval extends CompositeSpellChecker {
        public OneDayRefreshInterval() {
            super(24 * 60 * 60 * 1000);
        }
    }


    /**
     * Registry of the internal spell checkers created by {@link #init(QueryHandler)},
     * keyed by the path of the search index they belong to. Entries are removed
     * again in {@link #close()}. This is a plain <code>HashMap</code>; no
     * synchronization is applied where it is read or modified.
     */
    private static Map<String, InternalSpellChecker> spellCheckers = new HashMap<String, InternalSpellChecker>(2);
    
    /**
     * Triggers update of the spell checker dictionary index.
     */
    public static void updateSpellCheckerIndex() {
        for (InternalSpellChecker checker : spellCheckers.values()) {
            checker.lastRefresh = 0;
            checker.refreshSpellChecker();
        }
    }
    
    /**
     * The internal spell checker that performs the actual work. It is assigned
     * in {@link #init(QueryHandler)} and is therefore <code>null</code> until
     * that method has completed successfully.
     */
    private InternalSpellChecker spellChecker;


    /**
     * The refresh interval in milliseconds: the minimum time that must have
     * passed since the last refresh before a new dictionary refresh is started.
     */
    private final long refreshInterval;


    /**
     * Spell checker with a default refresh interval of one hour.
     */
    public CompositeSpellChecker() {
        this(60 * 60 * 1000); // default refresh interval: one hour
    }


    protected CompositeSpellChecker(long refreshInterval) {
        this.refreshInterval = refreshInterval;
    }


    /**
     * Initializes this spell checker.
     * 
     * @param handler
     *            the query handler that created this spell checker.
     * @throws IOException
     *             if <code>handler</code> is not of type {@link SearchIndex}.
     */
    public void init(QueryHandler handler) throws IOException {
        if (handler instanceof SearchIndex) {
            this.spellChecker = new InternalSpellChecker((SearchIndex) handler);
            spellCheckers.put(((SearchIndex) handler).getPath(), spellChecker);
        } else {
            throw new IOException("CompositeSpellChecker only works with " + SearchIndex.class.getName());
        }
    }


    /**
     * {@inheritDoc}
     */
    public String check(QueryRootNode aqt) throws IOException {
        final Map<String, String> spellcheckInfo = new HashMap<String, String>();
        try {
            aqt.accept(new TraversingQueryNodeVisitor() {
                public Object visit(RelationQueryNode node, Object data) throws RepositoryException {
                    if (!spellcheckInfo.containsKey("statement")
                            && node.getOperation() == RelationQueryNode.OPERATION_SPELLCHECK) {
                        spellcheckInfo.put("statement", node.getStringValue());
                    } else if (!spellcheckInfo.containsKey("language") && node.getRelativePath() != null
                            && node.getRelativePath().getNumOperands() > 0) {
                        Name propertyName = ((LocationStepQueryNode) node.getRelativePath().getOperands()[0])
                                .getNameTest();
                        if ("language".equals(propertyName.getLocalName())) {
                            spellcheckInfo.put("language", node.getStringValue());
                        }
                    }
                    return super.visit(node, data);
                }


                public Object visit(PathQueryNode node, Object data) throws RepositoryException {
                    for (int i : new int[] { 0, 1 }) {
                        if (node.getPathSteps().length > i + 1
                                && "sites".equals(node.getPathSteps()[i].getNameTest().getLocalName())) {
                            spellcheckInfo.put("site", node.getPathSteps()[++i].getNameTest().getLocalName());
                        }
                    }
                    return super.visit(node, data);
                }
            }, null);
            if (!spellcheckInfo.containsKey("statement")) {
                // no spellcheck operation in query
                return null;
            }
            if (!spellcheckInfo.containsKey("language")) {
                Locale locale = JCRSessionFactory.getInstance().getCurrentLocale();
                if (locale != null) {
                    spellcheckInfo.put("language", locale.toString());
                }
            }       
        } catch (RepositoryException e) {
        }


        return spellChecker.suggest(spellcheckInfo.get("statement"), spellcheckInfo
                .get("site"), spellcheckInfo.get("language"));
    }


    public void close() {
        try {
            spellChecker.close();
        } finally {
            spellCheckers.remove(spellChecker.handler.getPath());
        }
    }


    // ------------------------------< internal >--------------------------------


    private final class InternalSpellChecker {


        /**
         * Timestamp (as returned by <code>System.currentTimeMillis()</code>)
         * of the moment the last refresh was triggered. It is initialized in
         * the constructor when a spell index already exists and stays
         * <code>0</code> otherwise; it is also reset to <code>0</code> by
         * <code>updateSpellCheckerIndex()</code> to force a refresh.
         */
        private long lastRefresh;


        /**
         * Set to true while a refresh is done in a separate thread. Read and
         * written inside blocks synchronized on this instance.
         */
        private boolean refreshing = false;


        /**
         * The query handler associated with this spell checker.
         */
        private final SearchIndex handler;


        /**
         * The directory where the spell index is stored: the
         * <code>spellchecker</code> sub directory of the handler's path.
         */
        private final Directory spellIndexDirectory;


        /**
         * The underlying spell checker. Set to <code>null</code> once
         * <code>close()</code> has been called.
         */
        private JahiaExtendedSpellChecker spellChecker;


        /**
         * Creates a new internal spell checker.
         * 
         * @param handler
         *            the associated query handler.
         */
        InternalSpellChecker(SearchIndex handler) throws IOException {
            this.handler = handler;
            String path = handler.getPath() + File.separatorChar + "spellchecker";
            this.spellIndexDirectory = FSDirectory.getDirectory(path, new NativeFSLockFactory(path));
            if (IndexReader.indexExists(spellIndexDirectory)) {
                this.lastRefresh = System.currentTimeMillis();
            }
            this.spellChecker = new JahiaExtendedSpellChecker(spellIndexDirectory);
            refreshSpellChecker();
        }


        /**
         * Checks a fulltext query statement and suggests a spell checked
         * version of the statement. If the spell checker thinks the spelling is
         * correct <code>null</code> is returned.
         * 
         * @param statement
         *            the fulltext query statement.
         * @return a suggestion or <code>null</code>.
         */
        String suggest(String statement, String site, String language) throws IOException {
            // tokenize the statement (field name doesn't matter actually...)
            List<String> words = new ArrayList<String>();
            List<Token> tokens = new ArrayList<Token>();
            tokenize(statement, words, tokens);


            String[] suggestions = check((String[]) words.toArray(new String[words.size()]), site, language);
            if (suggestions != null) {
                // replace words in statement in reverse order because length
                // of statement will change
                StringBuffer sb = new StringBuffer(statement);
                for (int i = suggestions.length - 1; i >= 0; i--) {
                    Token t = (Token) tokens.get(i);
                    // only replace if word actually changed
                    if (!t.termText().equalsIgnoreCase(suggestions[i])) {
                        sb.replace(t.startOffset(), t.endOffset(), suggestions[i]);
                    }
                }
                return sb.toString();
            } else {
                return null;
            }
        }


        void close() {
            try {
                spellIndexDirectory.close();
            } catch (IOException e) {
                // ignore
            }
            // urgh, the lucene spell checker cannot be closed explicitly.
            // finalize will close the reader...
            spellChecker = null;
        }


        /**
         * Tokenizes the statement into words and tokens.
         * 
         * @param statement
         *            the fulltext query statement.
         * @param words
         *            this list will be filled with the original words extracted
         *            from the statement.
         * @param tokens
         *            this list will be filled with the tokens parsed from the
         *            statement.
         * @throws IOException
         *             if an error occurs while parsing the statement.
         */
        private void tokenize(String statement, List<String> words, List<Token> tokens) throws IOException {
            TokenStream ts = handler.getTextAnalyzer().tokenStream(FieldNames.FULLTEXT, new StringReader(statement));
            try {
                Token t;
                while ((t = ts.next()) != null) {
                    String origWord = statement.substring(t.startOffset(), t.endOffset());
                    if (t.getPositionIncrement() > 0) {
                        words.add(t.termText());
                        tokens.add(t);
                    } else {
                        // very simple implementation: use termText with length
                        // closer to original word
                        Token current = (Token) tokens.get(tokens.size() - 1);
                        if (Math.abs(origWord.length() - current.termText().length()) > Math.abs(origWord.length()
                                - t.termText().length())) {
                            // replace current token and word
                            words.set(words.size() - 1, t.termText());
                            tokens.set(tokens.size() - 1, t);
                        }
                    }
                }
            } finally {
                ts.close();
            }
        }


        /**
         * Checks the spelling of the passed <code>words</code> and returns a
         * suggestion.
         * 
         * @param words
         *            the words to check.
         * @return a suggestion of correctly spelled <code>words</code> or
         *         <code>null</code> if this spell checker thinks
         *         <code>words</code> are spelled correctly.
         * @throws IOException
         *             if an error occurs while spell checking.
         */
        private String[] check(String words[], String site, String language) throws IOException {
            refreshSpellChecker();
            boolean hasSuggestion = false;
            IndexReader reader = handler.getIndexReader();
            try {
                for (int retries = 0; retries < 100; retries++) {
                    try {
                        String[] suggestion = new String[words.length];
                        for (int i = 0; i < words.length; i++) {
                            String[] similar = spellChecker.suggestSimilar(words[i], 5, reader, FieldNames.FULLTEXT,
                                    true, site, language);
                            if (similar.length > 0) {
                                suggestion[i] = similar[0];
                                hasSuggestion = true;
                            } else {
                                suggestion[i] = words[i];
                            }
                        }
                        if (hasSuggestion) {
                            logger.debug("Successful after {} retries " + retries);
                            return suggestion;
                        } else {
                            return null;
                        }
                    } catch (AlreadyClosedException e) {
                        // it may happen that the index reader inside the
                        // spell checker is closed while searching for
                        // suggestions. this is actually a design flaw in the
                        // lucene spell checker, but for now we simply retry
                    }
                }
                // unsuccessful after retries
                return null;
            } finally {
                reader.close();
            }
        }


        /**
         * Refreshes the underlying spell checker in a background thread.
         * Synchronization is done on this <code>CompositeSpellChecker</code>
         * instance. While the refresh takes place {@link #refreshing} is set to
         * <code>true</code>.
         */
        private void refreshSpellChecker() {
            if (lastRefresh + refreshInterval < System.currentTimeMillis()) {
                synchronized (this) {
                    if (refreshing) {
                        return;
                    } else {
                        refreshing = true;
                        Runnable refresh = new Runnable() {
                            public void run() {
                                while (!SpringContextSingleton.getInstance().isInitialized()) {
                                    // wait until services are started
                                    try {
                                        Thread.sleep(5000);
                                    } catch (InterruptedException ex) {
                                        // do nothing
                                    }
                                }
                                try {
                                    Set<String> sites = JCRTemplate.getInstance().doExecuteWithSystemSession(
                                            new JCRCallback<Set<String>>() {
                                                public Set<String> doInJCR(JCRSessionWrapper session)
                                                        throws RepositoryException {
                                                    Set<String> sites = new HashSet<String>();
                                                    if (session.nodeExists("/sites")) {
                                                        Node sitesRoot = session.getNode("/sites");
                                                        for (NodeIterator it = sitesRoot.getNodes(); it.hasNext();) {
                                                            Node child = (Node) it.next();
                                                            if (child.isNodeType(Constants.JAHIANT_VIRTUALSITE)) {
                                                                sites.add(child.getName());
                                                            }
                                                        }
                                                    }
                                                    return sites;
                                                }
                                            });


                                    if (!sites.isEmpty()) {
                                        IndexReader reader = handler.getIndexReader();
                                        try {


                                            long time = System.currentTimeMillis();
                                            logger.debug("Starting spell checker index refresh");
                                            for (String site : sites) {
                                                for (String language : JahiaSitesBaseService.getInstance()
                                                        .getSiteByKey(site).getLanguages()) {
                                                    StringBuilder fullTextName = new StringBuilder(FieldNames.FULLTEXT);
                                                    if (site != null) {
                                                        fullTextName.append("-").append(site);
                                                    }
                                                    // add language independend
                                                    // fulltext values first
                                                    spellChecker.indexDictionary(new LuceneDictionary(reader,
                                                            fullTextName.toString()), 300, 10, site, language);


                                                    // add language dependend
                                                    // fulltext values
                                                    if (language != null) {
                                                        fullTextName.append("-").append(language);
                                                    }
                                                    spellChecker.indexDictionary(new LuceneDictionary(reader,
                                                            fullTextName.toString()), 300, 10, site, language);
                                                }
                                            }
                                            time = System.currentTimeMillis() - time;
                                            time = time / 1000;
                                            logger.info("Spell checker index refreshed in: {} s." + time);
                                        } finally {
                                            reader.close();
                                        }
                                    }
                                } catch (RepositoryException e) {
                                    logger.warn("Error creating spellcheck index", e);
                                } catch (JahiaException e) {
                                    logger.warn("Error creating spellcheck index", e);
                                } catch (IOException e) {
                                    // ignore
                                } finally {
                                    synchronized (InternalSpellChecker.this) {
                                        refreshing = false;
                                    }
                                }
                            }
                        };
                        new Thread(refresh, "SpellChecker Refresh").start();
                        lastRefresh = System.currentTimeMillis();
                    }
                }
            }
        }
    }
}
Source Code of org.jahia.services.search.spell.CompositeSpellChecker$TwelveHoursRefreshInterval

Related Classes of org.jahia.services.search.spell.CompositeSpellChecker$TwelveHoursRefreshInterval