Package org.apache.cocoon.components.search

Source Code of org.apache.cocoon.components.search.SimpleLuceneXMLIndexerImpl

/*

============================================================================
                   The Apache Software License, Version 1.1
============================================================================

Copyright (C) 1999-2002 The Apache Software Foundation. All rights reserved.

Redistribution and use in source and binary forms, with or without modifica-
tion, are permitted provided that the following conditions are met:

1. Redistributions of  source code must  retain the above copyright  notice,
    this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions and the following disclaimer in the documentation
    and/or other materials provided with the distribution.

3. The end-user documentation included with the redistribution, if any, must
    include  the following  acknowledgment:  "This product includes  software
    developed  by the  Apache Software Foundation  (http://www.apache.org/)."
    Alternately, this  acknowledgment may  appear in the software itself,  if
    and wherever such third-party acknowledgments normally appear.

4. The names "Apache Cocoon" and  "Apache Software Foundation" must  not  be
    used to  endorse or promote  products derived from  this software without
    prior written permission. For written permission, please contact
    apache@apache.org.

5. Products  derived from this software may not  be called "Apache", nor may
    "Apache" appear  in their name,  without prior written permission  of the
    Apache Software Foundation.

THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS  FOR A PARTICULAR  PURPOSE ARE  DISCLAIMED.  IN NO  EVENT SHALL  THE
APACHE SOFTWARE  FOUNDATION  OR ITS CONTRIBUTORS  BE LIABLE FOR  ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL,  EXEMPLARY, OR CONSEQUENTIAL  DAMAGES (INCLU-
DING, BUT NOT LIMITED TO, PROCUREMENT  OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR  PROFITS; OR BUSINESS  INTERRUPTION)  HOWEVER CAUSED AND ON
ANY  THEORY OF LIABILITY,  WHETHER  IN CONTRACT,  STRICT LIABILITY,  OR TORT
(INCLUDING  NEGLIGENCE OR  OTHERWISE) ARISING IN  ANY WAY OUT OF THE  USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

This software  consists of voluntary contributions made  by many individuals
on  behalf of the Apache Software  Foundation and was  originally created by
Stefano Mazzocchi  <stefano@apache.org>. For more  information on the Apache
Software Foundation, please see <http://www.apache.org/>.

*/
package org.apache.cocoon.components.search;

import java.io.*;

import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;

import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.*;

import org.apache.avalon.excalibur.xml.Parser;
import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.component.ComponentException;
import org.apache.avalon.framework.component.ComponentManager;
import org.apache.avalon.framework.component.Composable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.AbstractLoggable;

import org.apache.avalon.framework.logger.AbstractLoggable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.avalon.framework.thread.ThreadSafe;
import org.apache.cocoon.ProcessingException;

import org.apache.cocoon.environment.Source;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.lucene.document.DateField;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;

/**
* A simple class building lucene documents from xml content.
*
* @author     <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
* @version CVS $Id: SimpleLuceneXMLIndexerImpl.java,v 1.9.2.2 2002/08/03 02:33:35 vgritsenko Exp $
*/
public class SimpleLuceneXMLIndexerImpl extends AbstractLoggable
         implements LuceneXMLIndexer, Configurable, Composable, ThreadSafe
{

    /**
     * The component manager instance
     *
     * @since
     */
    protected ComponentManager manager = null;

    /**
     * append this string to the url in order to get the
     * content view of the url
     *
     * @since
     */
    final String CONTENT_QUERY = "cocoon-view=content";

    /**
     * set of allowed content types
     *
     * @since
     */
    final HashSet allowedContentType;


    /**
     * @since
     */
    public SimpleLuceneXMLIndexerImpl() {
        allowedContentType = new HashSet();
        allowedContentType.add("text/xml");
        allowedContentType.add("text/xhtml");
    }


    /**
     * configure
     *
     * @param  conf                        Description of Parameter
     * @exception  ConfigurationException  Description of Exception
     * @since
     */
    public void configure(Configuration conf) throws ConfigurationException { }


    /**
     * Set the current <code>ComponentManager</code> instance used by this
     * <code>Composable</code>.
     *
     * @param  manager                 Description of Parameter
     * @exception  ComponentException  Description of Exception
     * @since
     */
    public void compose(ComponentManager manager) throws ComponentException {
        this.manager = manager;
    }


    /**
     * Build lucenen documents from a URL
     *
     * @param  url                      the content of this url gets indexed.
     * @exception  ProcessingException  Description of Exception
     * @since
     */
    public List build(URL url)
             throws ProcessingException {

        try {
            URL contentURL = new URL(url, url.getFile()
                + ((url.getFile().indexOf("?") == -1) ? "?" : "&")
                + CONTENT_QUERY);
            URLConnection contentURLConnection = contentURL.openConnection();
            if (contentURLConnection == null) {
                throw new ProcessingException("Can not open connection to URL "
                        + contentURL + " (null connection)");
            }

            String contentType = contentURLConnection.getContentType();
            if (contentType == null) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Ignoring " + contentURL + " (no content type)");
                }

                return Collections.EMPTY_LIST;
            }

            int index = contentType.indexOf(';');
            if (index != -1) {
                contentType = contentType.substring(0, index);
            }

            if (allowedContentType.contains(contentType)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Indexing " + contentURL + " (" + contentType + ")");
                }

                LuceneIndexContentHandler luceneIndexContentHandler = new LuceneIndexContentHandler();
                indexDocument(contentURLConnection, luceneIndexContentHandler);
                //
                // document is parsed
                //
                Iterator it = luceneIndexContentHandler.iterator();
                while (it.hasNext()) {
                    Document d = (Document) it.next();
                    d.add(Field.UnIndexed(URL_FIELD, url.toString()));
                    // store ... false, index ... true, token ... false
                    d.add(new Field(UID_FIELD, uid(contentURLConnection), false, true, false));
                }

                return luceneIndexContentHandler.allDocuments();
            } else {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Ignoring " + contentURL + " (" + contentType + ")");
                }

                return Collections.EMPTY_LIST;
            }
        } catch (IOException ioe) {
            throw new ProcessingException("Cannot read URL " + url, ioe);
        }
    }


    /**
     * index input stream producing lucene Documents
     *
     * @param  contentURLConnection       the xml content which should get indexed.
     * @param  luceneIndexContentHandler  ContentHandler for generating
     *   a lucene Document from XML content.
     * @exception  ProcessingException    Description of Exception
     * @since
     */
    private void indexDocument(URLConnection contentURLConnection,
            LuceneIndexContentHandler luceneIndexContentHandler)
             throws ProcessingException {

        InputStream is = null;
        InputSource in = null;
        Parser parser = null;

        try {
            is = contentURLConnection.getInputStream();
            in = new InputSource(is);

            // get an XML parser
            parser = (Parser) this.manager.lookup(Parser.ROLE);
            //reader.setErrorHandler(new CocoonErrorHandler());
            parser.parse(in, luceneIndexContentHandler);
            //
            // document is parsed
            //
        } catch (IOException ioe) {
            throw new ProcessingException("Cannot read!", ioe);
        } catch (SAXException saxe) {
            throw new ProcessingException("Cannot parse!", saxe);
        } catch (ComponentException ce) {
            throw new ProcessingException("Cannot lookup xml parser!", ce);
        } finally {
            if (parser != null) {
                this.manager.release(parser);
            }
        }
    }


    /**
     * return a unique uid of a url connection
     *
     * @param  urlConnection  Description of Parameter
     * @return                String unique uid of a urlConnection
     * @since
     */
    private String uid(URLConnection urlConnection) {
        // Append path and date into a string in such a way that lexicographic
        // sorting gives the same results as a walk of the file hierarchy.  Thus
        // null (\u0000) is used both to separate directory components and to
        // separate the path from the date.
        return urlConnection.toString().replace('/', '\u0000') +
                "\u0000" +
                DateField.timeToString(urlConnection.getLastModified());
    }
}
TOP

Related Classes of org.apache.cocoon.components.search.SimpleLuceneXMLIndexerImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.