Package org.exist.xquery.functions.text

Source Code of org.exist.xquery.functions.text.KWICDisplay

/*
*  eXist Open Source Native XML Database
*  Copyright (C) 2001-09 The eXist Project
*  http://exist-db.org
*  This program is free software; you can redistribute it and/or
*  modify it under the terms of the GNU Lesser General Public License
*  as published by the Free Software Foundation; either version 2
*  of the License, or (at your option) any later version.
*  This program is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*  $Id$
*/
package org.exist.xquery.functions.text;

import java.util.ArrayList;
import java.util.List;

import org.exist.dom.Match;
import org.exist.dom.NodeProxy;
import org.exist.dom.QName;
import org.exist.dom.TextImpl;
import org.exist.memtree.DocumentBuilderReceiver;
import org.exist.memtree.MemTreeBuilder;
import org.exist.util.FastQSort;
import org.exist.xquery.BasicFunction;
import org.exist.xquery.Cardinality;
import org.exist.xquery.FunctionCall;
import org.exist.xquery.FunctionSignature;
import org.exist.xquery.XPathException;
import org.exist.xquery.XQueryContext;
import org.exist.xquery.value.FunctionParameterSequenceType;
import org.exist.xquery.value.FunctionReference;
import org.exist.xquery.value.FunctionReturnSequenceType;
import org.exist.xquery.value.IntegerValue;
import org.exist.xquery.value.Item;
import org.exist.xquery.value.NodeValue;
import org.exist.xquery.value.Sequence;
import org.exist.xquery.value.SequenceIterator;
import org.exist.xquery.value.SequenceType;
import org.exist.xquery.value.StringValue;
import org.exist.xquery.value.Type;
import org.exist.xquery.value.ValueSequence;
import org.xml.sax.SAXException;

public class KWICDisplay extends BasicFunction {

  protected static final FunctionParameterSequenceType TEXT_ARG = new FunctionParameterSequenceType("text", Type.TEXT, Cardinality.ZERO_OR_MORE, "The text nodes");
  protected static final FunctionParameterSequenceType WIDTH_ARG = new FunctionParameterSequenceType("width", Type.POSITIVE_INTEGER, Cardinality.EXACTLY_ONE, "The width");
  protected static final FunctionParameterSequenceType CALLBACK_ARG = new FunctionParameterSequenceType("callback-function", Type.FUNCTION_REFERENCE, Cardinality.EXACTLY_ONE, "The callback function");
    protected static final FunctionParameterSequenceType RESULT_CALLBACK_ARG = new FunctionParameterSequenceType("result-callback", Type.FUNCTION_REFERENCE, Cardinality.EXACTLY_ONE, "The result callback function");
  protected static final FunctionParameterSequenceType PARAMETERS_ARG = new FunctionParameterSequenceType("parameters", Type.ITEM, Cardinality.ZERO_OR_MORE, "The parameters passed into the last argument of the callback function");

  public final static FunctionSignature signatures[] = {
        new FunctionSignature(
            new QName("kwic-display", TextModule.NAMESPACE_URI, TextModule.PREFIX),
            "Deprecated: kwic functionality is now provided by an XQuery module, see " +
            "http://exist-org/kwic.html." +
            "This function takes a sequence of text nodes in $a, containing matches from a fulltext search. " +
            "It highlights matching strings within those text nodes in the same way as the text:highlight-matches " +
            "function. However, only a defined portion of the text surrounding the first match (and maybe following matches) " +
            "is returned. If the text preceding the first match is larger than the width specified in the second argument $b, " +
            "it will be truncated to fill no more than (width - keyword-length) / 2 characters. Likewise, the text following " +
            "the match will be truncated in such a way that the whole string sequence fits into width characters. " +
            "The third parameter $c is a callback function (defined with util:function). $d may contain an additional sequence of " +
            "values that will be passed to the last parameter of the callback function. Any matching character sequence is reported " +
            "to the callback function, and the " +
            "result of the function call is inserted into the resulting node set where the matching sequence occurred. " +
            "For example, you can use this to mark all matching terms with a <span class=\"highlight\">abc</span>. " +
            "The callback function should take 3 or 4 arguments: 1) the text sequence corresponding to the match as xs:string, " +
            "2) the text node to which this match belongs, 3) the sequence passed as last argument to kwic-display. " +
            "If the callback function accepts 4 arguments, the last argument will contain additional " +
            "information on the match as a sequence of 4 integers: a) the number of the match if there's more than " +
            "one match in a text node - the first match will be numbered 1; b) the offset of the match into the original text node " +
            "string; c) the length of the match as reported by the index.",
            new SequenceType[]{ TEXT_ARG, WIDTH_ARG, CALLBACK_ARG, PARAMETERS_ARG },
            new FunctionReturnSequenceType(Type.NODE, Cardinality.ZERO_OR_MORE, "the results"),
            "Improved kwic functionality is now provided by a separate XQuery module, see " +
            "http://www.exist-db.org/exist/apps/doc/kwic.xml."),
        new FunctionSignature(
                new QName("kwic-display", TextModule.NAMESPACE_URI, TextModule.PREFIX),
                "This function takes a sequence of text nodes in $a, containing matches from a fulltext search. " +
                "It highlights matching strings within those text nodes in the same way as the text:highlight-matches " +
                "function. However, only a defined portion of the text surrounding the first match (and maybe following matches) " +
                "is returned. If the text preceding the first match is larger than the width specified in the second argument $b, " +
                "it will be truncated to fill no more than (width - keyword-length) / 2 characters. Likewise, the text following " +
                "the match will be truncated in such a way that the whole string sequence fits into width characters. " +
                "The third parameter $c is a callback function (defined with util:function). $d may contain an additional sequence of " +
                "values that will be passed to the last parameter of the callback function. Any matching character sequence is reported " +
                "to the callback function, and the " +
                "result of the function call is inserted into the resulting node set where the matching sequence occurred. " +
                "For example, you can use this to mark all matching terms with a <span class=\"highlight\">abc</span>. " +
                "The callback function should take 3 or 4 arguments: 1) the text sequence corresponding to the match as xs:string, " +
                "2) the text node to which this match belongs, 3) the sequence passed as last argument to kwic-display. " +
                "If the callback function accepts 4 arguments, the last argument will contain additional " +
                "information on the match as a sequence of 4 integers: a) the number of the match if there's more than " +
                "one match in a text node - the first match will be numbered 1; b) the offset of the match into the original text node " +
                "string; c) the length of the match as reported by the index.",
                new SequenceType[]{ TEXT_ARG, WIDTH_ARG, CALLBACK_ARG, RESULT_CALLBACK_ARG, PARAMETERS_ARG },
                new FunctionReturnSequenceType(Type.NODE, Cardinality.ZERO_OR_MORE, "the results"),
                "Improved kwic functionality is now provided by a separate XQuery module, see " +
                "http://www.exist-db.org/exist/apps/doc/kwic.xml.")
    };
   
    public KWICDisplay(XQueryContext context, FunctionSignature signature) {
        super(context, signature);
    }

    public Sequence eval(Sequence[] args, Sequence contextSequence)
            throws XPathException {
        if (args[0].isEmpty())
            {return Sequence.EMPTY_SEQUENCE;}
       
        final FunctionReference call = (FunctionReference) args[2].itemAt(0);
       
        FunctionReference resultCallback = null;
        if (getArgumentCount() == 5) {
            resultCallback = (FunctionReference) args[3].itemAt(0);
        }
       
        final int width = ((IntegerValue)args[1].itemAt(0)).getInt();
       
        context.pushDocumentContext();
       
        final MemTreeBuilder builder = context.getDocumentBuilder();
        final Sequence result = processText(builder, args[0], width, call, resultCallback, args[getArgumentCount() - 1]);
        context.popDocumentContext();
        return result;
    }

    private final Sequence processText(MemTreeBuilder builder, Sequence nodes, int width,
            FunctionReference callback, FunctionReference resultCallback, Sequence extraArgs) throws XPathException {
        final StringBuilder str = new StringBuilder();
        NodeValue node;
        List<Match.Offset> offsets = null;
        NodeProxy firstProxy = null;
       
        // First step: scan the passed node sequence and collect the string values of all nodes.
        // Translate the relative offsets into absolute offsets.
        for (final SequenceIterator i = nodes.iterate(); i.hasNext(); ) {
            node = (NodeValue) i.nextItem();
            if (node.getImplementationType() == NodeValue.IN_MEMORY_NODE)
                {throw new XPathException(this, "Function kwic-display" +
                        " can not be invoked on constructed nodes");}
            NodeProxy proxy = (NodeProxy) node;
            // remember the first node, we need it later
            if (firstProxy == null)
                {firstProxy = proxy;}
            final TextImpl text = (TextImpl) proxy.getNode();
           
            Match next = proxy.getMatches();
            while (next != null) {
                if (next.getNodeId().equals(text.getNodeId())) {
                    if (offsets == null)
                        {offsets = new ArrayList<Match.Offset>();}
                    final int freq = next.getFrequency();
                    for (int j = 0; j < freq; j++) {
                        // translate the relative offset into an absolute offset and add it to the list
                        final Match.Offset offset = next.getOffset(j);
                        offset.setOffset(str.length() + offset.getOffset());
                        offsets.add(offset);
                    }
                }
                next = next.getNextMatch();
            }
           
            // append the string value of the node to the buffer
            str.append(text.getData());
        }
       
        // Second step: output the text
        ValueSequence result = new ValueSequence();
        final DocumentBuilderReceiver receiver = new DocumentBuilderReceiver(builder);
        int nodeNr;
        int currentWidth = 0;
        if (offsets == null) {
            // no matches: just output the entire text
            if (width > str.length())
                {width = str.length();}
            nodeNr = builder.characters(str.substring(0, width));
            result.add(builder.getDocument().getNode(nodeNr));
            currentWidth += width;
        } else {
            // sort the offsets
            FastQSort.sort(offsets, 0, offsets.size() - 1);
           
            int nextOffset = 0;
            int pos = 0;
            int lastNodeNr = -1;
           
            // prepare array for callback function arguments
            final Sequence params[] = new Sequence[callback.getSignature().getArgumentCount()];
            params[1] = firstProxy;
            params[2] = extraArgs;
           
            // handle the first match: if the text to the left of the match
            // is larger than half of the width, truncate it.
            if (str.length() > width) {
                final Match.Offset firstMatch = offsets.get(nextOffset++);
                if (firstMatch.getOffset() > 0) {
                    int leftWidth = (width - firstMatch.getLength()) / 2;
                    if (firstMatch.getOffset() > leftWidth) {
                        pos = truncateStart(str, firstMatch.getOffset() - leftWidth, firstMatch.getOffset());
                        leftWidth = firstMatch.getOffset() - pos;
                    } else
                        {leftWidth = firstMatch.getOffset();}
                    nodeNr = builder.characters(str.substring(pos, pos + leftWidth));
                    // adjacent chunks of text will be merged into one text node. we may
                    // thus get duplicate nodes here. check the nodeNr to avoid adding
                    // the same node twice.
                    if (lastNodeNr != nodeNr)
                      {result.add(builder.getDocument().getNode(nodeNr));}
                    lastNodeNr = nodeNr;
                    currentWidth += leftWidth;
                    pos += leftWidth;
                }
   
                // put the matching term into argument 0 of the callback function
                params[0] = new StringValue(str.substring(firstMatch.getOffset(), firstMatch.getOffset() + firstMatch.getLength()));
                // if the callback function accepts 4 arguments, the last argument should contain additional
                // information on the match:
                if (callback.getSignature().getArgumentCount() == 4) {
                  params[3] = new ValueSequence();
                  params[3].add(new IntegerValue(nextOffset - 1));
                  params[3].add(new IntegerValue(firstMatch.getOffset()));
                  params[3].add(new IntegerValue(firstMatch.getLength()));
                }
                // now execute the callback func.
                final Sequence callbackResult = callback.evalFunction(null, null, params);
                // iterate through the result of the callback
                for (final SequenceIterator iter = callbackResult.iterate(); iter.hasNext(); ) {
                  final Item next = iter.nextItem();
                  if (Type.subTypeOf(next.getType(), Type.NODE)) {
                    nodeNr = builder.getDocument().getLastNode();
                    try {
              next.copyTo(context.getBroker(), receiver);
              result.add(builder.getDocument().getNode(++nodeNr));
              lastNodeNr = nodeNr;
            } catch (final SAXException e) {
              throw new XPathException(this, "Internal error while copying nodes: " + e.getMessage(), e);
            }
                  }
                }
                currentWidth += firstMatch.getLength();
                pos += firstMatch.getLength();
            } else
                {width = str.length();}
           
            // output the rest of the text and matches
            Match.Offset offset;
            for (int i = nextOffset; i < offsets.size() && currentWidth < width; i++) {
                offset = offsets.get(i);
                if (offset.getOffset() > pos) {
                    int len = offset.getOffset() - pos;
                    if (currentWidth + len > width)
                        {len = width - currentWidth;}
                    nodeNr = builder.characters(str.substring(pos, pos + len));
                    if (lastNodeNr != nodeNr)
                      {result.add(builder.getDocument().getNode(nodeNr));}
                    currentWidth += len;
                    pos += len;
                }
               
                if (currentWidth + offset.getLength() < width) {
                  // put the matching term into argument 0 of the callback function
                    params[0] = new StringValue(str.substring(offset.getOffset(), offset.getOffset() + offset.getLength()));
                    // if the callback function accepts 4 arguments, the last argument should contain additional
                    // information on the match:
                    if (callback.getSignature().getArgumentCount() == 4) {
                      params[3] = new ValueSequence();
                      params[3].add(new IntegerValue(i));
                      params[3].add(new IntegerValue(offset.getOffset()));
                      params[3].add(new IntegerValue(offset.getLength()));
                    }
                    // execute the callback function
                    final Sequence callbackResult = callback.evalFunction(null, null, params);
                    for (final SequenceIterator iter = callbackResult.iterate(); iter.hasNext(); ) {
                      final Item next = iter.nextItem();
                      if (Type.subTypeOf(next.getType(), Type.NODE)) {
                        nodeNr = builder.getDocument().getLastNode();
                        try {
                  next.copyTo(context.getBroker(), receiver);
                  result.add(builder.getDocument().getNode(++nodeNr));
                  lastNodeNr = nodeNr;
                } catch (final SAXException e) {
                  throw new XPathException(this, "Internal error while copying nodes: " + e.getMessage(), e);
                }
                      }
                    }
                    currentWidth += offset.getLength();
                    pos += offset.getLength();
                } else
                    {break;}
            }
            // print the final text chunk if more space is available
            if (currentWidth < width && pos < str.length()) {
                boolean truncated = false;
                int len = str.length() - pos;
                if (len > width - currentWidth) {
                    truncated = true;
                    len = width - currentWidth;
                }
                nodeNr = builder.characters(str.substring(pos, pos + len));
                if (lastNodeNr != nodeNr)
                  {result.add(builder.getDocument().getNode(nodeNr));}
                lastNodeNr = nodeNr;
                currentWidth += len;
               
                if (truncated) {
                    nodeNr = builder.characters(" ...");
                    if (lastNodeNr != nodeNr)
                      {result.add(builder.getDocument().getNode(nodeNr));}
                    lastNodeNr = nodeNr;
                }
            }
        }
       
        // if the user specified a result callback function, call it now
        if (resultCallback != null) {
            final Sequence params[] = new Sequence[3];
            params[0] = result;
            params[1] = new IntegerValue(currentWidth);
            params[2] = extraArgs;
            return resultCallback.evalFunction(null, null, params);
        } else
            {return result;}
    }
   
    private final static int truncateStart(StringBuilder buf, int start, int end) {
        if (start > 0 && !Character.isLetterOrDigit(buf.charAt(start - 1)))
            {return start;}
        while (start < end && Character.isLetterOrDigit(buf.charAt(start))) {
            start++;
        }
       
        while (start < end && !Character.isLetterOrDigit(buf.charAt(start))) {
            start++;
        }
        return start;
    }
}
TOP

Related Classes of org.exist.xquery.functions.text.KWICDisplay

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.