Package org.exist.xquery.modules.ngram

Source Code of org.exist.xquery.modules.ngram.NGramSearch

/*
*  eXist Open Source Native XML Database
*  Copyright (C) 2007-09 The eXist Project
*  http://exist-db.org
*
*  This program is free software; you can redistribute it and/or
*  modify it under the terms of the GNU Lesser General Public License
*  as published by the Free Software Foundation; either version 2
*  of the License, or (at your option) any later version.
*
*  This program is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*
*  $Id$
*/
package org.exist.xquery.modules.ngram;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.exist.dom.DocumentSet;
import org.exist.dom.EmptyNodeSet;
import org.exist.dom.Match;
import org.exist.dom.NodeProxy;
import org.exist.dom.NodeSet;
import org.exist.dom.QName;
import org.exist.indexing.ngram.NGramIndex;
import org.exist.indexing.ngram.NGramIndexWorker;
import org.exist.storage.ElementValue;
import org.exist.xquery.*;
import org.exist.xquery.modules.ngram.query.AlternativeStrings;
import org.exist.xquery.modules.ngram.query.EmptyExpression;
import org.exist.xquery.modules.ngram.query.EndAnchor;
import org.exist.xquery.modules.ngram.query.EvaluatableExpression;
import org.exist.xquery.modules.ngram.query.FixedString;
import org.exist.xquery.modules.ngram.query.StartAnchor;
import org.exist.xquery.modules.ngram.query.Wildcard;
import org.exist.xquery.modules.ngram.query.WildcardedExpression;
import org.exist.xquery.modules.ngram.query.WildcardedExpressionSequence;
import org.exist.xquery.modules.ngram.utils.F;
import org.exist.xquery.modules.ngram.utils.NodeProxies;
import org.exist.xquery.modules.ngram.utils.NodeSets;
import org.exist.xquery.util.Error;
import org.exist.xquery.value.*;

public class NGramSearch extends Function implements Optimizable {

    private static final String INTERVAL_QUALIFIER_PATTERN = "\\{([0-9]+),([0-9]+)\\}";

    private static final String SEARCH_DESCRIPTION = "Searches the given $queryString in the index "
        + "defined on the input node set $nodes. "
        + "String comparison is case insensitive. Nodes need to have an ngram index to be searched.";

    private static final String WILDCARD_PATTERN_DESCRIPTION = "The string to search for."
        + "A full stop, '.', (not between brackets), without any qualifiers: Matches a single arbitrary character."
        + "A full stop, '.', (not between brackets), immediately followed by a single question mark, '?': Matches either no characters or one character."
        + "A full stop, '.', (not between brackets), immediately followed by a single asterisk, '*': Matches zero or more characters."
        + "A full stop, '.', (not between brackets), immediately followed by a single plus sign, '+': Matches one or more characters."
        + "A full stop, '.', immediately followed by a sequence of characters that matches the regular expression {[0-9]+,[0-9]+}: Matches a number of characters, where the number is no less than the number represented by the series of digits before the comma, and no greater than the number represented by the series of digits following the comma."
        + "An  expression  \"[…]\"  matches a single character, namely any of the characters"
        + "enclosed by the brackets.  The string enclosed by the brackets cannot be empty; "
        + "therefore ']' can be allowed between  the brackets, provided that it is the first character."
        + "(Thus, \"[][?]\" matches the three characters '[', ']' and '?'.)"
        + "A circumflex accent, '^', at the start of the search string matches the start of the element content."
        + "A dollar sign, '$', at the end of the search string matches the end of the element content."
        + "One can remove the special meaning of any character mentioned above by preceding them by a backslash."
        + "Between brackets these characters stand for themselves.  Thus, \"[[?*\\]\" matches"
        + "the four characters '[', '?', '*' and '\\'."
        + "'?', '*', '+' and character sequences matching the regular expression {[0-9]+,[0-9]+} not immediately preceeded by an unescaped period, '.', stand for themselves."
        + "'^' and '$' not at the very beginning or end of the search string, respectively, stand for themselves.";

    protected static Logger LOG = Logger.getLogger(NGramSearch.class);

    public final static FunctionSignature signatures[] = {
        new FunctionSignature(new QName("contains", NGramModule.NAMESPACE_URI, NGramModule.PREFIX),
            "Similar to the standard XQuery fn:contains function, but based on the NGram index. " + SEARCH_DESCRIPTION
                + "The string may appear at any position within the node content.", new SequenceType[] {
                new FunctionParameterSequenceType("nodes", Type.NODE, Cardinality.ZERO_OR_MORE,
                    "The input node set to search"),
                new FunctionParameterSequenceType("queryString", Type.STRING, Cardinality.ZERO_OR_ONE,
                    "The exact string to search for") }, new FunctionReturnSequenceType(Type.NODE,
                Cardinality.ZERO_OR_MORE, "a set of nodes from the input node set $nodes containing the query string "
                    + "or the empty sequence")),
        new FunctionSignature(new QName("ends-with", NGramModule.NAMESPACE_URI, NGramModule.PREFIX),
            "Similar to the standard XQuery fn:ends-with function, but based on the NGram index. " + SEARCH_DESCRIPTION
                + "The string has to appear at the end of the node's content.", new SequenceType[] {
                new FunctionParameterSequenceType("nodes", Type.NODE, Cardinality.ZERO_OR_MORE,
                    "The input node set to search"),
                new FunctionParameterSequenceType("queryString", Type.STRING, Cardinality.ZERO_OR_ONE,
                    "The exact string to search for") }, new FunctionReturnSequenceType(Type.NODE,
                Cardinality.ZERO_OR_MORE, "a set of nodes from the input node set $nodes ending with the query string "
                    + "or the empty sequence")),
        new FunctionSignature(new QName("starts-with", NGramModule.NAMESPACE_URI, NGramModule.PREFIX),
            "Similar to the standard XQuery fn:starts-with function, but based on the NGram index. "
                + SEARCH_DESCRIPTION + "The string has to appear at the start of the node's content.",
            new SequenceType[] {
                new FunctionParameterSequenceType("nodes", Type.NODE, Cardinality.ZERO_OR_MORE,
                    "The input node set to search"),
                new FunctionParameterSequenceType("queryString", Type.STRING, Cardinality.ZERO_OR_ONE,
                    "The exact string to search for") }, new FunctionReturnSequenceType(Type.NODE,
                Cardinality.ZERO_OR_MORE,
                "a set of nodes from the input node set $nodes starting with the query string "
                    + "or the empty sequence")),
        new FunctionSignature(new QName("wildcard-contains", NGramModule.NAMESPACE_URI, NGramModule.PREFIX),
            "Similar to the standard XQuery fn:matches function, but based on the NGram index and "
                + "allowing wildcards in the query string. " + SEARCH_DESCRIPTION
                + "The string has to match the whole node's content.", new SequenceType[] {
                new FunctionParameterSequenceType("nodes", Type.NODE, Cardinality.ZERO_OR_MORE,
                    "The input node set to search"),
                new FunctionParameterSequenceType("queryString", Type.STRING, Cardinality.ZERO_OR_ONE,
                    WILDCARD_PATTERN_DESCRIPTION) }, new FunctionReturnSequenceType(Type.NODE,
                Cardinality.ZERO_OR_MORE, "a set of nodes from the input node set $nodes matching the query string "
                    + "or the empty sequence")) };

    private LocationStep contextStep = null;
    protected QName contextQName = null;
    protected int axis = Constants.UNKNOWN_AXIS;
    private NodeSet preselectResult = null;
    protected boolean optimizeSelf = false;
    protected boolean optimizeChild = false;
   
    public NGramSearch(XQueryContext context, FunctionSignature signature) {
        super(context, signature);
    }

    @Override
    public void setArguments(List<Expression> arguments) throws XPathException {
        steps.clear();
        Expression path = arguments.get(0);
        steps.add(path);

        Expression arg = arguments.get(1);
        arg = new DynamicCardinalityCheck(context, Cardinality.ZERO_OR_ONE, arg, new org.exist.xquery.util.Error(
            Error.FUNC_PARAM_CARDINALITY, "2", mySignature));
        if(!Type.subTypeOf(arg.returnsType(), Type.ATOMIC))
            arg = new Atomize(context, arg);
        steps.add(arg);
    }

    /*
     * (non-Javadoc)
     *
    * @see org.exist.xquery.PathExpr#analyze(org.exist.xquery.Expression)
    */
    @Override
    public void analyze(AnalyzeContextInfo contextInfo) throws XPathException {
        super.analyze(contextInfo);
        List<LocationStep> steps = BasicExpressionVisitor.findLocationSteps(getArgument(0));
        if (!steps.isEmpty()) {
            LocationStep firstStep = steps.get(0);
            LocationStep lastStep = steps.get(steps.size() - 1);
            if (firstStep != null && steps.size() == 1 && firstStep.getAxis() == Constants.SELF_AXIS) {
                Expression outerExpr = contextInfo.getContextStep();
                if (outerExpr != null && outerExpr instanceof LocationStep) {
                    LocationStep outerStep = (LocationStep) outerExpr;
                    NodeTest test = outerStep.getTest();
                    if (!test.isWildcardTest() && test.getName() != null) {
                        contextQName = new QName(test.getName());
                        if (outerStep.getAxis() == Constants.ATTRIBUTE_AXIS
                            || outerStep.getAxis() == Constants.DESCENDANT_ATTRIBUTE_AXIS)
                            contextQName.setNameType(ElementValue.ATTRIBUTE);
                        contextStep = firstStep;
                        axis = outerStep.getAxis();
                        optimizeSelf = true;
                    }
                }
            } else if (lastStep != null && firstStep != null) {
                NodeTest test = lastStep.getTest();
                if (!test.isWildcardTest() && test.getName() != null) {
                    contextQName = new QName(test.getName());
                    if (lastStep.getAxis() == Constants.ATTRIBUTE_AXIS
                        || lastStep.getAxis() == Constants.DESCENDANT_ATTRIBUTE_AXIS)
                        contextQName.setNameType(ElementValue.ATTRIBUTE);
                    axis = firstStep.getAxis();
                    optimizeChild = steps.size() == 1 &&
                        (axis == Constants.CHILD_AXIS || axis == Constants.ATTRIBUTE_AXIS);
                    contextStep = lastStep;
                }
            }
        }
    }

    @Override
    public boolean canOptimize(Sequence contextSequence) {
        return contextQName != null;
    }

    @Override
    public boolean optimizeOnSelf() {
        return optimizeSelf;
    }

    public boolean optimizeOnChild() {
        return optimizeChild;
    }

    @Override
    public int getOptimizeAxis() {
        return axis;
    }

    @Override
    public NodeSet preSelect(Sequence contextSequence, boolean useContext) throws XPathException {
        // the expression can be called multiple times, so we need to clear the previous preselectResult
        preselectResult = null;

        long start = System.currentTimeMillis();

        NGramIndexWorker index = (NGramIndexWorker) context.getBroker().getIndexController().getWorkerByIndexId(
            NGramIndex.ID);
        DocumentSet docs = contextSequence.getDocumentSet();
        String key = getArgument(1).eval(contextSequence).getStringValue();
        List<QName> qnames = new ArrayList<QName>(1);
        qnames.add(contextQName);
        preselectResult = processMatches(index, docs, qnames, key, useContext ? contextSequence.toNodeSet() : null,
            NodeSet.DESCENDANT);

        if( context.getProfiler().traceFunctions() ) {
            // report index use
            context.getProfiler().traceIndexUsage( context, "ngram", this, PerformanceStats.OPTIMIZED_INDEX, System.currentTimeMillis() - start );
        }
        return preselectResult;
    }

    @Override
    public Sequence eval(Sequence contextSequence, Item contextItem) throws XPathException {
        if (contextItem != null)
      contextSequence = contextItem.toSequence();

        NodeSet result;
        if (preselectResult == null) {
            Sequence input = getArgument(0).eval(contextSequence, contextItem);
            if (input.isEmpty())
                result = NodeSet.EMPTY_SET;
            else {
                long start = System.currentTimeMillis();
                NodeSet inNodes = input.toNodeSet();
                DocumentSet docs = inNodes.getDocumentSet();
                NGramIndexWorker index = (NGramIndexWorker) context.getBroker().getIndexController()
                    .getWorkerByIndexId(NGramIndex.ID);
                //Alternate design
                // NGramIndexWorker index =
                // (NGramIndexWorker)context.getBroker().getBrokerPool().getIndexManager().getIndexById(NGramIndex.ID).getWorker();

                String key = getArgument(1).eval(contextSequence, contextItem).getStringValue();
                List<QName> qnames = null;
                if (contextQName != null) {
                    qnames = new ArrayList<QName>(1);
                    qnames.add(contextQName);
                }
                result = processMatches(index, docs, qnames, key, inNodes, NodeSet.ANCESTOR);
                if( context.getProfiler().traceFunctions() ) {
                    // report index use
                    context.getProfiler().traceIndexUsage( context, "ngram", this, PerformanceStats.BASIC_INDEX, System.currentTimeMillis() - start );
                }
            }
        } else {
            contextStep.setPreloadedData(contextSequence.getDocumentSet(), preselectResult);
            result = getArgument(0).eval(contextSequence).toNodeSet();
        }
        return result;
    }

    private String getLocalName() {
        return getSignature().getName().getLocalName();
    }

    private NodeSet processMatches(
        NGramIndexWorker index, DocumentSet docs, List<QName> qnames, String query, NodeSet nodeSet, int axis)
        throws XPathException {

        EvaluatableExpression parsedQuery = null;

        if (getLocalName().equals("wildcard-contains"))
            parsedQuery = parseQuery(query);
              else
            parsedQuery = new FixedString(this, query);

        LOG.debug("Parsed Query: " + parsedQuery);
        NodeSet result = parsedQuery.eval(index, docs, qnames, nodeSet, axis, this
                .getExpressionId());

        if (getLocalName().startsWith("starts-with"))
            result = NodeSets.getNodesMatchingAtStart(result, getExpressionId());
        else if (getLocalName().startsWith("ends-with"))
            result = NodeSets.getNodesMatchingAtEnd(result, getExpressionId());

        result = NodeSets.fmapNodes(result, new F<NodeProxy, NodeProxy>() {

            @Override
            public NodeProxy f(NodeProxy a) {
                return NodeProxies.fmapOwnMatches(a, new F<Match, Match>() {

                    @Override
                    public Match f(Match a) {
                        return a.filterOutOverlappingOffsets();
                    }
                }, getExpressionId());
            }
        });

        return result;
    }

    private EvaluatableExpression parseQuery(final String query) throws XPathException {
        List<String> queryTokens = tokenizeQuery(query);

        LOG.trace("Tokenized query: " + queryTokens);

        if (queryTokens.isEmpty())
            return new EmptyExpression();

        List<WildcardedExpression> expressions = new ArrayList<WildcardedExpression>();

        if (queryTokens.get(0).equals("^")) {
            expressions.add(new StartAnchor());
            queryTokens.remove(0);
        }

        if (queryTokens.isEmpty())
            return new EmptyExpression();

        boolean endAnchorPresent = false;
        if (queryTokens.get(queryTokens.size() - 1).equals("$")) {
            endAnchorPresent = true;
            queryTokens.remove(queryTokens.size() - 1);
        }

        if (queryTokens.isEmpty())
            return new EmptyExpression();

        for (String token : queryTokens) {
            if (token.startsWith(".")) {
                Wildcard wildcard = null;
                if (token.length() == 1) {
                    wildcard = new Wildcard(1, 1);
            } else {
                    String qualifier = token.substring(1);
                    if (qualifier.equals("?"))
                        wildcard = new Wildcard(0, 1);
                    else if (qualifier.equals("*"))
                        wildcard = new Wildcard(0, Integer.MAX_VALUE);
                    else if (qualifier.equals("+"))
                        wildcard = new Wildcard(1, Integer.MAX_VALUE);
                    else {
                        Pattern p = Pattern.compile(INTERVAL_QUALIFIER_PATTERN);
                        Matcher m = p.matcher(qualifier);
                        if (!m.matches()) // Should not happen
                            throw new XPathException(
                            this,
                            ErrorCodes.FTDY0020,
                            "query string violates wildcard qualifier syntax"
                        );
                        try {
                          wildcard = new Wildcard(Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2)));
                        } catch (NumberFormatException nfe) {
                            throw new XPathException(this,
                            ErrorCodes.FTDY0020,
                            "query string violates wildcard qualifier syntax",
                            new StringValue(query),
                            nfe
                        );
                        }
                    }
                }
                expressions.add(wildcard);
            } else {
                if (token.startsWith("[")) {
                    Set<String> strings = new HashSet<String>(token.length() - 2);
                    for (int i = 1; i < token.length() - 1; i++)
                        strings.add(Character.toString(token.charAt(i)));
                    expressions.add(new AlternativeStrings(this, strings));
                } else {
                    expressions.add(new FixedString(this, unescape(token)));
                }
            }
        }

        if (endAnchorPresent)
            expressions.add(new EndAnchor());

        return new WildcardedExpressionSequence(expressions);
    }

    private static String unescape(final String s) {
        return s.replaceAll("\\\\(.)", "$1");
    }

    private static List<String> tokenizeQuery(final String query) throws XPathException {
        List<String> result = new ArrayList<String>();

        StringBuilder token = new StringBuilder();

        for (int i = 0; i < query.length(); i++) {
            char currentChar = query.charAt(i);
            if (currentChar == '\\') {
                // Escape sequence
                if ((i + 1) < query.length()) {
                    token.append(query.substring(i, i + 2));
                    i++;
                } else {
                    throw new XPathException("err:FTDY0020: query string is terminated by an unescaped backslash");
                }
            } else {
                if (currentChar == '.') {
                    int wildcardEnd = i;
                    if (token.length() > 0) {
                        result.add(token.toString());
                        token = new StringBuilder();
                    }
                    if ((i + 1) < query.length()) {
                        char peek = query.charAt(i + 1);
                        if (peek == '?' || peek == '*' || peek == '+')
                            wildcardEnd = i + 1;
                        if (peek == '{') {
                            wildcardEnd = query.indexOf('}', i + 2);
                            if (wildcardEnd == -1)
                                throw new XPathException(
                                    "err:FTDY0020: query string violates wildcard syntax: Unmatched qualifier start { in query string; marked by <-- HERE in \""
                                        + query.substring(0, i + 2) + " <-- HERE " + query.substring(i + 2) + "\"");
                            if (!query.substring(i + 1, wildcardEnd + 1).matches(INTERVAL_QUALIFIER_PATTERN))
                                throw new XPathException(
                                    "err:FTDY0020: query string violates wildcard qualifier syntax;  marked by <-- HERE in \""
                                        + query.substring(0, wildcardEnd + 1) + " <-- HERE "
                                        + query.substring(wildcardEnd + 1) + "\"");
                        }
                    }
                    result.add(query.substring(i, wildcardEnd + 1));
                    i = wildcardEnd;
                } else {
                    if (currentChar == '[') {
                        int characterClassEnd = query.indexOf(']', i + 2); // Character classses can not be empty, thus
                        // start search for end at i+2
                        if (characterClassEnd == -1)
                            throw new XPathException(
                                "err:FTDY0020: query string violates wildcard syntax: Unmatched [ in query string; marked by <-- HERE in \""
                                    + query.substring(0, i + 1) + " <-- HERE " + query.substring(i + 1) + "\"");
                        if (token.length() > 0) {
                            result.add(token.toString());
                            token = new StringBuilder();
                        }
                        result.add(query.substring(i, characterClassEnd + 1));
                        i = characterClassEnd;
                    } else {
                        if (currentChar == '^') {
                            if (token.length() > 0) {
                                result.add(token.toString());
                                token = new StringBuilder();

                            }
                            result.add("^");
                        } else if (currentChar == '$') {
                            if (token.length() > 0) {
                                result.add(token.toString());
                                token = new StringBuilder();
                            }
                            result.add("$");
                        } else
                            // default case
                            token.append(currentChar);
                        }
                    }
                }
            }

        if (token.length() > 0) {
            result.add(token.toString());
        }

        return result;
    }

    public NodeSet fixedStringSearch(
        final NGramIndexWorker index, final DocumentSet docs, final List<QName> qnames, final String query,
        final NodeSet nodeSet, final int axis) throws XPathException {
        String[] ngrams = NGramSearch.getDistinctNGrams(query, index.getN());

        // Nothing to search for? The find nothing.
        if (ngrams.length == 0)
            return new EmptyNodeSet();

        String firstNgramm = ngrams[0];
        LOG.trace("First NGRAM: " + firstNgramm);
        NodeSet result = index.search(getExpressionId(), docs, qnames, firstNgramm, firstNgramm, context, nodeSet, axis);

        for (int i = 1; i < ngrams.length; i++) {
            String ngram = ngrams[i];
            int len = ngram.codePointCount(0, ngram.length());
            int fillSize = index.getN() - len;
            String filledNgram = ngram;

            // if this ngram is shorter than n,
            // fill it up with characters from the previous ngram. too short
            // ngrams lead to a considerable performance loss.
            if (fillSize > 0) {
                String filler = ngrams[i - 1];
                StringBuilder buf = new StringBuilder();
                int pos = filler.offsetByCodePoints(0, len);
                for (int j = 0; j < fillSize; j++) {
                    int codepoint = filler.codePointAt(pos);
                    pos += Character.charCount(codepoint);
                    buf.appendCodePoint(codepoint);
                }
                buf.append(ngram);
                filledNgram = buf.toString();
                LOG.debug("Filled: " + filledNgram);
            }

            NodeSet nodes = index.search(getExpressionId(), docs, qnames, filledNgram, ngram, context, nodeSet, axis);

            final NodeSet nodesContainingFirstINgrams = result;

            result = NodeSets.fmapNodes(nodes, new F<NodeProxy, NodeProxy>() {

                @Override
                public NodeProxy f(NodeProxy a) {
                    NodeProxy before = nodesContainingFirstINgrams.get(a);
                    if (before != null) {
                        return getContinuousMatches(before, a);
                    } else {
                        return null;
                    }
                }
            });
        }
        return result;
  }

    /**
     * Finds all matches in head which are followed by matches in tail in the specified distance.
     *
     * @param head
     *            a nodeset with matches
     * @param tail
     *            another nodeset with matches
     * @return a nodeset containing all matches from the head which are directly followed by matches in the tail
     */
    private NodeProxy getContinuousMatches(final NodeProxy head, final NodeProxy tail) {
        // NodeSet result = new ExtArrayNodeSet();
        Match continuousMatch = null;
       
        Match headMatch = head.getMatches();
        while (headMatch != null && continuousMatch == null) {
            Match tailMatch = tail.getMatches();
            while (tailMatch != null && continuousMatch == null) {
                continuousMatch = headMatch.continuedBy(tailMatch);
                tailMatch = tailMatch.getNextMatch();
              }
            headMatch = headMatch.getNextMatch();
            }
        if (continuousMatch != null) {
            NodeProxies.filterMatches(tail, new F<Match, Boolean>() {

                @Override
                public Boolean f(Match a) {
                    return (a.getContextId() != getExpressionId());
        }
            });

            tail.addMatch(continuousMatch);
            return tail;
        }
        return null;
    }
   
    @Override
  public int getDependencies() {
        final Expression stringArg = getArgument(0);
        if (Type.subTypeOf(stringArg.returnsType(), Type.NODE)
            && !Dependency.dependsOn(stringArg, Dependency.CONTEXT_ITEM)) {
            return Dependency.CONTEXT_SET;
        } else {
            return Dependency.CONTEXT_SET + Dependency.CONTEXT_ITEM;
        }
    }

    @Override
    public int returnsType() {
        return Type.NODE;
    }

    /**
     * Split the specified string into a sequence of ngrams to be used for querying the index. For example, if we have a
     * 3-gram index, the string 'distinct' will be split into the ngrams 'dis', 'tin' and 'ct'.
     *
     * @param text
     *            the character sequence to split
     * @return a sequence of ngrams. the last item might be shorter than n.
     */
    private static String[] getDistinctNGrams(final String text, final int ngramSize) {
        int len = text.codePointCount(0, text.length());
        int count = len / ngramSize;
        int remainder = len % ngramSize;

        String[] n = new String[(remainder > 0 ? count + 1 : count)];
        int pos = 0;
        for (int i = 0; i < count; i++) {
            StringBuilder bld = new StringBuilder(ngramSize);
            for (int j = 0; j < ngramSize; j++) {
                int next = Character.toLowerCase(text.codePointAt(pos));
                pos += Character.charCount(next);
                bld.appendCodePoint(next);
            }
            n[i] = bld.toString();
        }
        if (remainder > 0) {
            StringBuilder bld = new StringBuilder(remainder);
            for (int j = 0; j < remainder; j++) {
                int next = Character.toLowerCase(text.codePointAt(pos));
                pos += Character.charCount(next);
                bld.appendCodePoint(next);
            }
            n[count] = bld.toString();
        }
        return n;
    }
}
TOP

Related Classes of org.exist.xquery.modules.ngram.NGramSearch

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.