Package org.apache.cocoon.transformation

Source Code of org.apache.cocoon.transformation.LuceneIndexTransformerOptimized$IndexHelperField

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cocoon.transformation;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import java.util.Stack;

import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.context.Context;
import org.apache.avalon.framework.context.ContextException;
import org.apache.avalon.framework.context.Contextualizable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.avalon.framework.service.ServiceException;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.avalon.framework.service.Serviceable;
import org.apache.cocoon.Constants;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.caching.CacheableProcessingComponent;
import org.apache.cocoon.components.search.IndexException;
import org.apache.cocoon.components.search.LuceneCocoonHelper;
import org.apache.cocoon.components.search.LuceneXMLIndexer;
import org.apache.cocoon.components.search.components.Indexer;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.commons.lang.BooleanUtils;
import org.apache.excalibur.source.SourceValidity;
import org.apache.excalibur.source.impl.validity.NOPValidity;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
* A lucene index creation transformer.
* <p>
* See <a
* href="http://wiki.cocoondev.org/Wiki.jsp?page=LuceneIndexTransformer">LuceneIndexTransformer
* </a> documentation on the Cocoon Wiki.
* </p>
* <p>
* TODO: Write more documentation.
* </p>
*
* @author <a href="mailto:vgritsenko@apache.org">Vadim Gritsenko </a>
* @author <a href="mailto:conal@nzetc.org">Conal Tuohy </a>
* @author Nicolas Maisonneuve
*/
public class LuceneIndexTransformerOptimized extends AbstractTransformer implements
        CacheableProcessingComponent, Configurable, Contextualizable,
        Serviceable {

    public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";

    public static final String ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname";

    public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";

    public static final String DIRECTORY_CONFIG = "directory";

    public static final String DIRECTORY_PARAMETER = "directory";

    public static final String DIRECTORY_DEFAULT = "index";

    public static final String MERGE_FACTOR_CONFIG = "merge-factor";

    public static final String MERGE_FACTOR_PARAMETER = "merge-factor";

    public static final int MERGE_FACTOR_DEFAULT = 20;

    public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";

    public static final String LUCENE_QUERY_ELEMENT = "index";

    public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";

    public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";

    public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create";

    public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor";

    public static final String LUCENE_DOCUMENT_ELEMENT = "document";

    public static final String LUCENE_DOCUMENT_URL_ATTRIBUTE = "url";

    public static final String LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";

    public static final String LUCENE_ELEMENT_ATTR_STORE_VALUE = "store";

    public static final String LUCENE_ELAPSED_TIME_ATTRIBUTE = "elapsed-time";

    public static final String CDATA = "CDATA";

    // The 3 states of the state machine
    private static final int STATE_GROUND = 0; // initial or "ground" state

    private static final int STATE_QUERY = 1; // processing a lucene:index
                                                // (Query) element

    private static final int STATE_DOCUMENT = 2; // processing a
                                                    // lucene:document element

    // Initialization time variables
    protected File workDir = null;

    // service manager
    private ServiceManager manager;

    private Indexer indexer;

    // Declaration time parameters values (specified in sitemap component
    // config)
    private IndexerConfiguration configureConfiguration;

    // Invocation time parameters values (specified in sitemap transform
    // parameters)
    private IndexerConfiguration setupConfiguration;

    // Parameters specified in the input document
    private IndexerConfiguration queryConfiguration;

    // Runtime variables
    private int processing;

    private boolean createIndex = false;

    private StringBuffer bodyText;

    private Document bodyDocument;

    private String bodyDocumentURL;

    private Stack elementStack = new Stack();

    /**
     * Storage for the document element's attributes until the document has been
     * indexed, so that they can be copied to the output along with a boolean
     * <code>indexed</code> attribute.
     */
    private AttributesImpl documentAttributes;

    private long documentStartTime;

    private static String uid(String url) {
        return url.replace('/', '\u0000'); // + "\u0000" +
                                            // DateField.timeToString(urlConnection.getLastModified());
    }

    /**
     * Store the service manager. It is used later to look up the
     * {@link Indexer} component and to release it again on recycle.
     *
     * @param manager
     *            the service manager supplied by the container
     */
    public void service(ServiceManager manager) throws ServiceException {
        this.manager = manager;
    }

    /**
     * Configure the transformer. The configuration parameters are stored as
     * general defaults, which may be over-ridden by parameters specified as
     * parameters in the sitemap pipeline, or by attributes of the query
     * element(s) in the XML input document.
     */
    public void configure(Configuration conf) throws ConfigurationException {
        this.configureConfiguration = new IndexerConfiguration(
                conf.getChild(ANALYZER_CLASSNAME_CONFIG).getValue(
                        ANALYZER_CLASSNAME_DEFAULT), conf.getChild(
                        DIRECTORY_CONFIG).getValue(DIRECTORY_DEFAULT), conf
                        .getChild(MERGE_FACTOR_CONFIG).getValueAsInteger(
                                MERGE_FACTOR_DEFAULT));
    }

    /**
     * Setup the transformer. Called when the pipeline is assembled. The
     * parameters are those specified as child elements of the
     * <code>&lt;map:transform&gt;</code> element in the sitemap. These
     * parameters are optional: If no parameters are specified here then the
     * defaults are supplied by the component configuration. Any parameters
     * specified here may be over-ridden by attributes of the lucene:index
     * element in the input document.
     */
    public void setup(SourceResolver resolver, Map objectModel, String src,
            Parameters parameters) throws ProcessingException, SAXException,
            IOException {
        setupConfiguration = new IndexerConfiguration(parameters.getParameter(
                ANALYZER_CLASSNAME_PARAMETER,
                configureConfiguration.analyzerClassname), parameters
                .getParameter(DIRECTORY_PARAMETER,
                        configureConfiguration.indexDirectory), parameters
                .getParameterAsInteger(MERGE_FACTOR_PARAMETER,
                        configureConfiguration.mergeFactor));
    }

    /**
     * Contextualize this class: fetch the work directory from the context
     * (key {@link Constants#CONTEXT_WORK_DIR}). Relative index directory
     * names are later resolved against this directory.
     *
     * @param context
     *            the component context
     */
    public void contextualize(Context context) throws ContextException {
        this.workDir = (File) context.get(Constants.CONTEXT_WORK_DIR);
    }

    public void recycle() {
        this.processing = STATE_GROUND;
        if (this.indexer != null) {
            manager.release(indexer);
            indexer = null;
        }

        this.bodyText = null;
        this.bodyDocument = null;
        this.bodyDocumentURL = null;
        this.elementStack.clear();
        super.recycle();
    }

    /**
     * Generate the unique key. This key must be unique inside the space of this
     * component. This implementation always returns the same constant,
     * independent of configuration or parameters.
     *
     * @return The generated key (the constant string <code>"1"</code>)
     */
    public Serializable getKey() {
        return "1";
    }

    /**
     * Generate the validity object.
     *
     * @return always the shared {@link NOPValidity} instance; this
     *         implementation never returns <code>null</code>.
     */
    public SourceValidity getValidity() {
        return NOPValidity.SHARED_INSTANCE;
    }

    /**
     * Receive notification of the beginning of the document. No processing
     * is done here; the event is simply passed on to the superclass.
     */
    public void startDocument() throws SAXException {
        super.startDocument();
    }

    /**
     * Receive notification of the end of the document. No processing is done
     * here; the event is simply passed on to the superclass.
     */
    public void endDocument() throws SAXException {
        super.endDocument();
    }

    /**
     * Begin the scope of a prefix-URI Namespace mapping.
     *
     * @param prefix
     *            The Namespace prefix being declared.
     * @param uri
     *            The Namespace URI the prefix is mapped to.
     */
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
        if (processing == STATE_GROUND) {
            super.startPrefixMapping(prefix, uri);
        }
    }

    /**
     * End the scope of a prefix-URI mapping.
     *
     * @param prefix
     *            The prefix that was being mapping.
     */
    public void endPrefixMapping(String prefix) throws SAXException {
        if (processing == STATE_GROUND) {
            super.endPrefixMapping(prefix);
        }
    }

    public void startElement(String namespaceURI, String localName,
            String qName, Attributes atts) throws SAXException {

        if (processing == STATE_GROUND) {
            if (LUCENE_URI.equals(namespaceURI)
                    && LUCENE_QUERY_ELEMENT.equals(localName)) {
                String sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE);
                createIndex = BooleanUtils.toBoolean(sCreate);

                String analyzerClassname = atts
                        .getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
                String indexDirectory = atts
                        .getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
                String mergeFactor = atts
                        .getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);

                queryConfiguration = new IndexerConfiguration(
                        analyzerClassname != null ? analyzerClassname
                                : setupConfiguration.analyzerClassname,
                        indexDirectory != null ? indexDirectory
                                : setupConfiguration.indexDirectory,
                        mergeFactor != null ? Integer.parseInt(mergeFactor)
                                : setupConfiguration.mergeFactor);

                // propagate the lucene:index to the next stage in the pipeline
                super.startElement(namespaceURI, localName, qName, atts);
                processing = STATE_QUERY;
            } else {
                super.startElement(namespaceURI, localName, qName, atts);
            }
        } else if (processing == STATE_QUERY) {
            // processing a lucene:index - expecting a lucene:document
            if (LUCENE_URI.equals(namespaceURI)
                    && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
                this.bodyDocumentURL = atts
                        .getValue(LUCENE_DOCUMENT_URL_ATTRIBUTE);
                if (this.bodyDocumentURL == null) {
                    throw new SAXException(
                            "<lucene:document> must have @url attribute");
                }

                // Remember the time the document indexing began
                this.documentStartTime = System.currentTimeMillis();
                // remember these attributes so they can be passed on to the
                // next stage in the pipeline,
                // when this document element is ended.
                this.documentAttributes = new AttributesImpl(atts);
                this.bodyText = new StringBuffer();
                this.bodyDocument = new Document();
                this.elementStack.clear();
                processing = STATE_DOCUMENT;
            } else {
                throw new SAXException(
                        "<lucene:index> element can contain only <lucene:document> elements!");
            }
        } else if (processing == STATE_DOCUMENT) {
            elementStack.push(new IndexHelperField(localName,
                    new AttributesImpl(atts)));
        }
    }

    public void endElement(String namespaceURI, String localName, String qName)
            throws SAXException {

        if (processing == STATE_QUERY) {
            if (LUCENE_URI.equals(namespaceURI)
                    && LUCENE_QUERY_ELEMENT.equals(localName)) {
                // propagate the query element to the next stage in the pipeline
                super.endElement(namespaceURI, localName, qName);
                this.processing = STATE_GROUND;
            } else {
                throw new SAXException("</lucene:index> was expected!");
            }
        } else if (processing == STATE_DOCUMENT) {
            if (LUCENE_URI.equals(namespaceURI)
                    && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
                // End document processing
                this.bodyDocument.add(Field.UnStored(
                        LuceneXMLIndexer.BODY_FIELD, this.bodyText.toString()));
                this.bodyText = null;

                this.bodyDocument.add(Field.UnIndexed(
                        LuceneXMLIndexer.URL_FIELD, this.bodyDocumentURL));
                // store: false, index: true, tokenize: false
                this.bodyDocument.add(new Field(LuceneXMLIndexer.UID_FIELD,
                        uid(this.bodyDocumentURL), false, true, false));
                try {
                    reindexDocument();
                } catch (IndexException e) {
                    throw new SAXException(e);
                }
                this.bodyDocumentURL = null;

                // propagate the lucene:document element to the next stage in
                // the pipeline
                long elapsedTime = System.currentTimeMillis()
                        - this.documentStartTime;
                // documentAttributes = new AttributesImpl();
                this.documentAttributes.addAttribute("",
                        LUCENE_ELAPSED_TIME_ATTRIBUTE,
                        LUCENE_ELAPSED_TIME_ATTRIBUTE, CDATA, String
                                .valueOf(elapsedTime));
                super.startElement(namespaceURI, localName, qName,
                        this.documentAttributes);
                super.endElement(namespaceURI, localName, qName);
                this.processing = STATE_QUERY;
            } else {
                // End element processing
                IndexHelperField tos = (IndexHelperField) elementStack.pop();
                StringBuffer text = tos.getText();

                Attributes atts = tos.getAttributes();
                boolean attributesToText = atts.getIndex(LUCENE_URI,
                        LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE) != -1;
                for (int i = 0; i < atts.getLength(); i++) {
                    // Ignore Lucene attributes
                    if (LUCENE_URI.equals(atts.getURI(i))) {
                        continue;
                    }

                    String atts_lname = atts.getLocalName(i);
                    String atts_value = atts.getValue(i);
                    bodyDocument.add(Field.UnStored(localName + "@"
                            + atts_lname, atts_value));
                    if (attributesToText) {
                        text.append(atts_value);
                        text.append(' ');
                        bodyText.append(atts_value);
                        bodyText.append(' ');
                    }
                }

                boolean store = atts.getIndex(LUCENE_URI,
                        LUCENE_ELEMENT_ATTR_STORE_VALUE) != -1;
                if (text != null && text.length() > 0) {
                    if (store) {
                        bodyDocument
                                .add(Field.Text(localName, text.toString()));
                    } else {
                        bodyDocument.add(Field.UnStored(localName, text
                                .toString()));
                    }
                }
            }
        } else {
            // All other tags
            super.endElement(namespaceURI, localName, qName);
        }
    }

    public void characters(char[] ch, int start, int length)
            throws SAXException {

        if (processing == STATE_DOCUMENT && ch.length > 0 && start >= 0
                && length > 1 && elementStack.size() > 0) {
            String text = new String(ch, start, length);
            ((IndexHelperField) elementStack.peek()).append(text);
            bodyText.append(text);
            bodyText.append(' ');
        } else if (processing == STATE_GROUND) {
            super.characters(ch, start, length);
        }
    }

    private void openWriter() throws IndexException {
        getLogger().debug("use luceneIndexTransformer with indexer component");
        // lookup the indexer
        try {
            indexer = (Indexer) this.manager.lookup(Indexer.ROLE+"/default");
        } catch (ServiceException e) {
            throw new IndexException(e);
        }

        File indexDirectory = new File(queryConfiguration.indexDirectory);
        if (!indexDirectory.isAbsolute()) {
            indexDirectory = new File(workDir,
                    queryConfiguration.indexDirectory);
        }
        // If the index directory doesn't exist, then always create it.
        boolean indexExists = IndexReader.indexExists(indexDirectory);
        if (!indexExists) {
            createIndex = true;
        }
        // Get the index directory, creating it if necessary
        try {
            Directory directory = LuceneCocoonHelper.getDirectory(
                    indexDirectory, createIndex);
            indexer.setIndex(directory);
        } catch (IOException e) {
            throw new IndexException("set directory " + indexDirectory
                    + " error", e);
        }
        // Get the analyzer
        Analyzer analyzer = LuceneCocoonHelper
                .getAnalyzer(queryConfiguration.analyzerClassname);
        indexer.setAnalyzer(analyzer);

        this.indexer.setMergeFactor(queryConfiguration.mergeFactor);
        if (this.createIndex) {
            this.indexer.clearIndex();
        }
    }

    private void reindexDocument() throws IndexException {
        // The index is being created, so there's no need to delete the doc from
        // an existing index.
        // This means we can keep a single IndexWriter open throughout the
        // process.
        if (this.indexer == null) {
            openWriter();
        }
        this.indexer.index(this.bodyDocument);
        this.bodyDocument = null;
    }

    static class IndexHelperField {
        String localName;

        StringBuffer text;

        Attributes attributes;

        IndexHelperField(String localName, Attributes atts) {
            this.localName = localName;
            this.attributes = atts;
            this.text = new StringBuffer();
        }

        public Attributes getAttributes() {
            return attributes;
        }

        public StringBuffer getText() {
            return text;
        }

        public void append(String text) {
            this.text.append(text);
        }

        public void append(char[] str, int offset, int length) {
            this.text.append(str, offset, length);
        }
    }

    static class IndexerConfiguration {
        String analyzerClassname;

        String indexDirectory;

        int mergeFactor;

        public IndexerConfiguration(String analyzerClassname,
                String indexDirectory, int mergeFactor) {
            this.analyzerClassname = analyzerClassname;
            this.indexDirectory = indexDirectory;
            this.mergeFactor = mergeFactor;
        }
    }

}
TOP

Related Classes of org.apache.cocoon.transformation.LuceneIndexTransformerOptimized$IndexHelperField

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.