Package org.exist.storage.analysis

Examples of org.exist.storage.analysis.TextToken
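
All of the snippets below follow the same basic pattern: hand a string to a tokenizer with setText(...), then pull TextToken objects out of nextToken() in a loop and inspect each one with getText(), getType(), startOffset() and endOffset(). Below is a minimal, self-contained sketch of that loop, modeled on the SimpleTokenizer example at the end of this page; the wrapper class TextTokenDemo and the input string are invented for illustration.

import org.exist.storage.analysis.SimpleTokenizer; // assumed to live in the same package as TextToken
import org.exist.storage.analysis.TextToken;

public class TextTokenDemo {

    public static void main(String[] args) {
        // Hypothetical input; in eXist this would be node or attribute text.
        final String input = "The quick brown fox, 42 times";

        final SimpleTokenizer tokenizer = new SimpleTokenizer();
        tokenizer.setText(input);

        // nextToken(false) mirrors the call in the last example on this page;
        // iteration stops when the tokenizer returns null or an EOF token.
        TextToken token = tokenizer.nextToken(false);
        while (token != null && token.getType() != TextToken.EOF) {
            System.out.println(token.getText()
                + " [" + token.startOffset() + ".." + token.endOffset() + "]");
            token = tokenizer.nextToken(false);
        }
    }
}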


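                // Excerpt: scanning tokens against an ordered list of search terms; the term index j is reset whenever the distance between matches exceeds max_distance.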
                term = terms[j];
            } else {
                break;
            }
            int current_distance = -1;
            TextToken token;
            while ((token = tok.nextToken()) != null) {
                final String word = token.getText().toLowerCase();
                if (current_distance > max_distance) {
                    // reset
                    j = 0;
                    term = terms[j];
                    current_distance = -1;
                    // ... [truncated]


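        // Excerpt: the same scan using compiled regular-expression matchers; each node value in the result set is tokenized and checked pattern by pattern.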
        }
        //Walk through hits and calculate term-distances
        final ExtArrayNodeSet r = new ExtArrayNodeSet(100);
        final Tokenizer tok = context.getBroker().getTextEngine().getTokenizer();
        Matcher matcher;
        TextToken token;
        for (final NodeProxy current : result) {
            final String value = current.getNodeValueSeparated();
            tok.setText(value);
            int j = 0;
            if (j < patterns.length) {
                matcher = matchers[j];
            } else {
                break;
            }
            int current_distance = -1;
            while ((token = tok.nextToken()) != null) {
                final String word = token.getText().toLowerCase();
                if (current_distance > max_distance) {
                    //Reset
                    j = 0;
                    matcher = matchers[j];
                    current_distance = -1;
                    // ... [truncated]

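    // Excerpt: indexing an attribute value. The lower-cased value is tokenized and every token that passes the length, stop-list and alpha-numeric filters is added to the inverted index.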
    public void storeAttribute(AttrImpl node, NodePath currentPath, int indexingHint, FulltextIndexSpec indexSpec, boolean remove) {
        if ((indexingHint & ATTRIBUTE_BY_QNAME) == ATTRIBUTE_BY_QNAME ||
                (indexingHint & ATTRIBUTE_NOT_BY_QNAME) == ATTRIBUTE_NOT_BY_QNAME) {
            //TODO : case conversion should be handled by the tokenizer -pb
            tokenizer.setText(node.getValue().toLowerCase());
            TextToken token;
            while (null != (token = tokenizer.nextToken())) {
                if (token.length() > MAX_TOKEN_LENGTH) {
                    LOG.warn("Token length exceeded " + MAX_TOKEN_LENGTH + ": " +
                        token.getText().substring(0,20) + "...");
                    continue;
                }
                if (stoplist.contains(token)) {
                    continue;
                }
                //TODO : the tokenizer should strip unwanted token types itself -pb
                if (!token.isAlpha() && indexSpec != null && !indexSpec.getIncludeAlphaNum()) {
                    continue;
                }
                if (indexingHint == ATTRIBUTE_BY_QNAME)
                    {invertedIndex.addAttribute(token, node, remove);}
                else
                    // ... [truncated]

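    // Excerpt: indexing character data. With DO_NOT_TOKENIZE the whole lower-cased text becomes a single ALPHA token; with TOKENIZE the text is tokenized and filtered token by token before being added to the inverted index.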
    //TODO : use an indexSpec member in order to get rid of <code>noTokenizing</code>
    public void storeText(CharacterDataImpl node, int indexingHint, FulltextIndexSpec indexSpec, boolean remove) {
        if (indexingHint == TOKENIZE || indexingHint == DO_NOT_TOKENIZE) {
            //TODO : case conversion should be handled by the tokenizer -pb
            final XMLString t = node.getXMLString().transformToLower();
            TextToken token;
            if (indexingHint == DO_NOT_TOKENIZE) {
                token = new TextToken(TextToken.ALPHA, t, 0, t.length());
                invertedIndex.addText(token, node.getNodeId(), remove);
            } else if (indexingHint == TOKENIZE){
                tokenizer.setText(t);
                while (null != (token = tokenizer.nextToken())) {
                    if (token.length() > MAX_TOKEN_LENGTH) {
                        LOG.warn("Token length exceeded " + MAX_TOKEN_LENGTH +
                            ": " + token.getText().substring(0,20) + "...");
                        continue;
                    }
                    if (stoplist.contains(token)) {
                        continue;
                    }
                    if (indexSpec != null) {
                        //TODO : the tokenizer should strip unwanted token types itself -pb
                        if (!indexSpec.getIncludeAlphaNum() && !token.isAlpha()) {
                            continue;
                        }
                    }
                    invertedIndex.addText(token, node.getNodeId(), remove);
                }
            // ... [truncated]

        }
    }

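    // Excerpt: indexing element content. Each text span is lower-cased, appended to a shared buffer, and tokenized from its offset within that buffer.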
    public void storeText(StoredNode parent, ElementContent text, int indexingHint, FulltextIndexSpec indexSpec, boolean remove) {
        //TODO : case conversion should be handled by the tokenizer -pb
        TextToken token;
        ElementContent.TextSpan span = text.getFirst();
        XMLString data = null;
        int currentOffset = 0;
        while (span != null) {
            if (data == null)
                {data = span.getContent().transformToLower();}
            else {
                currentOffset = data.length();
                data.append(span.getContent().transformToLower());
            }
            tokenizer.setText(data, currentOffset);
            while (null != (token = tokenizer.nextToken())) {
                if (token.length() > MAX_TOKEN_LENGTH) {
                    LOG.warn("Token length exceeded " + MAX_TOKEN_LENGTH + ": " + token.getText().substring(0,20) + "...");
                    continue;
                }
                if (stoplist.contains(token)) {
                    continue;
                }
                if (indexSpec != null) {
                    //TODO : the tokenizer should strip unwanted token types itself -pb
                    if (!indexSpec.getIncludeAlphaNum() && !token.isAlpha()) {
                        continue;
                    }
                }
                if (indexingHint == TEXT_BY_QNAME)
                    {invertedIndex.addText(token, (ElementImpl) parent, remove);}
                // ... [truncated]

     * @param domIterator iterator over the stored node values whose text is tokenized
     */
    //TODO : unify functionalities with storeText -pb
    private void collect(Set words, Iterator domIterator) {

        TextToken token;
        int readOffset;

        final byte[] data = ((Value) domIterator.next()).getData();
        final short type = Signatures.getType(data[OFFSET_NODE_TYPE]);
        switch (type) {
        case Node.ELEMENT_NODE :
            final int childrenCount = ByteConversion.byteToInt(data, OFFSET_ELEMENT_CHILDREN_COUNT);
            for (int i = 0; i < childrenCount; i++)
                //recursive call on children
                collect(words, domIterator);
            break;
        case Node.TEXT_NODE :
            int dlnLen = ByteConversion.byteToShort(data, OFFSET_TEXT_DLN_LENGTH);
            int nodeIdLen = broker.getBrokerPool().getNodeFactory().lengthInBytes(dlnLen, data, OFFSET_DLN);

            readOffset = nodeIdLen + OFFSET_DLN;
            final String s = new String(data, readOffset, data.length - readOffset, UTF_8);
            tokenizer.setText(s);

            while (null != (token = tokenizer.nextToken())) {
                final String word = token.getText();
                if (stoplist.contains(word))
                    {continue;}
                words.add(word.toLowerCase());
            }

            break;
        case Node.ATTRIBUTE_NODE :
            final byte idSizeType = (byte) (data[OFFSET_NODE_TYPE] & 0x3);
            final boolean hasNamespace = (data[OFFSET_NODE_TYPE] & 0x10) == 0x10;
            dlnLen = ByteConversion.byteToShort(data, OFFSET_ATTRIBUTE_DLN_LENGTH);
            nodeIdLen = broker.getBrokerPool().getNodeFactory().lengthInBytes(dlnLen, data, OFFSET_DLN);

            readOffset = Signatures.getLength(idSizeType) + nodeIdLen + OFFSET_DLN;
            if (hasNamespace) {
                //TODO : check the order in which both values are read (and discarded)
                readOffset += SymbolTable.LENGTH_LOCAL_NAME; // skip namespace id
                final short prefixLen = ByteConversion.byteToShort(data, readOffset);
                readOffset += prefixLen + SymbolTable.LENGTH_NS_URI; // skip prefix
            }

            final String val = new String(data, readOffset, data.length - readOffset, UTF_8);
            tokenizer.setText(val);

            while (null != (token = tokenizer.nextToken())) {
                final String word = token.getText();
                if (stoplist.contains(word))
                    {continue;}
                words.add(word.toLowerCase());
            }

            // ... [truncated]

     *
     * @param context the current XQuery context
     * @param result the node set of candidate matches
     */
    private Sequence exactMatch(XQueryContext context, String[] terms, NodeSet result) {
        TextToken token;
        final NodeSet r = new ExtArrayNodeSet();
        final Tokenizer tok = context.getBroker().getTextEngine().getTokenizer();
        //Define search phrase for matches
        String matchTerm = "";
        for (int k = 0; k < terms.length ; k++) {
            matchTerm = matchTerm + terms[k];
            if (k != terms.length - 1)
                {matchTerm = matchTerm + "\\W*";}
        }
        //Iterate on results
        for (final NodeProxy current : result) {
            final Vector<NodeId> matchNodeIDs = new Vector<NodeId>();
            //Get first match
            Match nextMatch = current.getMatches();
            //Remove previously found matches on current
            current.setMatches(null);
            //Iterate over the attached matches, processing each related NodeProxy node id only once
            String term;
            while(nextMatch != null) {
                final NodeId nodeId= nextMatch.getNodeId();
                //If current node id has not been previously processed
                if (!matchNodeIDs.contains(nodeId)) {
                    final NodeProxy mcurrent = new NodeProxy(current.getDocument(), nodeId);
                    Match match = null;
                    int firstOffset = -1;
                    matchNodeIDs.add(nodeId);
                    final String value = mcurrent.getNodeValue();
                    tok.setText(value);
                    int j = 0;
                    if (j < terms.length)
                        {term = terms[j];}
                    else
                        {break;}
                    int frequency = 0;
                    while ((token = tok.nextToken()) != null) {
                        final String word = token.getText().toLowerCase();
                        if (word.equalsIgnoreCase(term)) {
                            j++;
                            if (j == terms.length) {
                                //All terms found
                                if (match == null)
                                    {match = nextMatch.createInstance(getExpressionId(),
                                        nodeId, matchTerm);}
                                if (firstOffset < 0)
                                    {firstOffset = token.startOffset();}
                                match.addOffset(firstOffset, token.endOffset() - firstOffset);
                                frequency++;
                                //Start again at the first term
                                j = 0;
                                term = terms[j];
                                continue;
                            } else {
                                term = terms[j];
                                if (firstOffset < 0)
                                    {firstOffset = token.startOffset();}
                            }
                        } else if (j > 0 && word.equalsIgnoreCase(terms[0])) {
                            //First search term found: start again
                            j = 1;
                            term = terms[j];
                            firstOffset = token.startOffset();
                            continue;
                        } else {
                            //Reset
                            j = 0;
                            firstOffset = -1;
                            // ... [truncated]

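                    // Excerpt: regular-expression phrase matching. Consecutively matched words are joined with "\\W*" to build matchTerm, and a Match recording the token offset is stored once every pattern has matched.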
                    if (j < patterns.length) {
                        matcher = matchers[j];
                    } else
                        {break;}
                    String matchTerm = null;
                    TextToken token;
                    while ((token = tok.nextToken()) != null) {
                        String word = token.getText().toLowerCase();
                        matcher.reset(word);
                        matchers[0].reset(word);
                        if (matcher.matches()) {
                            j++;
                            if (matchTerm == null)
                                {matchTerm=word;}
                            else
                                {matchTerm = matchTerm + "\\W*" + word;}
                            if (j == patterns.length) {
                                //All terms found
                                if (matchTable.containsKey(matchTerm)) {
                                    //Previously found matchTerm
                                    final Match match = matchTable.get(matchTerm);
                                    match.addOffset(token.startOffset(), matchTerm.length());
                                } else {
                                    final Match match = nextMatch.createInstance(getExpressionId(),
                                        nodeId, matchTerm);
                                    match.addOffset(token.startOffset(), matchTerm.length());
                                    matchTable.put(matchTerm,match);
                                }
                                //Start again at the first term
                                j = 0;
                                matcher = matchers[j];
                                // ... [truncated]

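    // Excerpt: splitting a string into tokens. The argument string is handed to a SimpleTokenizer and each token's text is added to the result sequence as a StringValue.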
      {return Sequence.EMPTY_SEQUENCE;}
   
    final ValueSequence result = new ValueSequence();
    final SimpleTokenizer tokenizer = new SimpleTokenizer();
    tokenizer.setText(args[0].getStringValue());
    TextToken token = tokenizer.nextToken(false);
    while(token != null && token.getType() != TextToken.EOF) {
      result.add(new StringValue(token.getText()));
      token = tokenizer.nextToken(false);
    }
    return result;
  }
  // ... [truncated]

