// Source code of net.fp.rp.search.back.extractor.XmlDataExtractor

/*
 * Copyright (C) 2004 Paul Browne, http://www.firstpartners.net,
 * built with the help of Fast-Soft (fastsoftdev@yahoo.com)
 *
 * released under terms of the GPL license
 * http://www.opensource.org/licenses/gpl-license.php
 *
 * This product includes software developed by the
 * Apache Software Foundation (http://www.apache.org)."
 *
 * This product includes software developed by the
 * Spring Framework Project (http://www.springframework.org)."
 *
 */
package net.fp.rp.search.back.extractor;


import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;
import java.util.Stack;


import javax.xml.parsers.SAXParserFactory;


import net.fp.rp.common.exception.RpException;
import net.fp.rp.search.back.extractor.util.UtilExtract;
import net.fp.rp.search.back.extractor.xml.IXMLController;
import net.fp.rp.search.back.extractor.xml.SaxXMLBuilder;
import net.fp.rp.search.back.struct.DocumStruct;
import net.fp.rp.search.back.struct.NodeStruct;
import net.fp.rp.search.back.struct.TupleStruct;
import net.fp.rp.search.common.AddInfo;
import net.fp.rp.search.mid.global.PluginManager;
import net.fp.rp.search.plugins.IDataExtractor;
import net.fp.rp.search.plugins.INewInformation;


import org.apache.log4j.Logger;
import org.xml.sax.AttributeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;




/**
 * Extracts information from an XML file in a format that can be added to an
 * Index.
 *
 * @author brownpa
 * Copyright @link www.firstpartners.net/red
 */
public class XmlDataExtractor extends GenericDataExtractor {
    /** Logger for this class and subclasses */
    protected final Logger logger = Logger.getLogger(getClass());


    /** Links are required */
    private boolean linkRequired;


    /** Link tags */
    private String linkTags;


    /**
     * The original place where we got this data
     *
     * @return pointer
     */
    public String getOriginalUri() {
        return "";
    }


    /**
     * Carry out any initiation tasks
     */
    public void onLoad() {
    }


    /**
     * How well the plugin thinks it can handle a new piece of information
     *
     * @param info
     *
     * @return Integer value (0 if cann't handle the data)
     */
    public int canHandle(INewInformation info) {
        logger.debug(
            "XMLExtractor - validate handling of the information from " +
            info.getUri());


        int returnInt = 0;
        String extension = UtilExtract.getLocationExtension(info.getUri());
        logger.debug("GenericExtractor extension : " + extension);


        //validate if the extension is supported by the extractor 
        if (UtilExtract.isExtesionSupported(extension, getListExtensions())) {
            returnInt = 10;
        }


        return returnInt;
    }


    /**
     * Comvert the xml file information into a list of documents
     *
     * @param info Information to be converted
     *
     * @throws RpException If an error occur in xml processing
     */
    public void convert(INewInformation info) throws RpException {
        logger.info("XMLExtractor handling location :" + info.getUri() +
            " with level " + info.getLevel());


        InputStream in = UtilExtract.getStream(info.getUri());


        final LinkedList links = new LinkedList();


        //define an DocNode
        final DocumStruct doc = new DocumStruct();


        //use SAX-Parser instead of DOM-Parser, Performance issue
        //get a reader to the data using sax
        try {
            // Create a JAXP "parser factory" for creating SAX parsers
            javax.xml.parsers.SAXParserFactory saxFactory = SAXParserFactory.newInstance();


            // Configure the parser factory for the type of parsers we require
            saxFactory.setValidating(false); // No validation required


            // Now use the parser factory to create a SAXParser object
            // Note that SAXParser is a JAXP class, not a SAX class
            javax.xml.parsers.SAXParser saxParser = saxFactory.newSAXParser();


            // Create a SAX input source for the file argument
            org.xml.sax.InputSource input = new InputSource(in);


            //create the stack
            final Stack nodeStack = new Stack();


            //final NodeStruct actual = new NodeStruct();
          //  final DefaultMutableTreeNode tree = new DefaultMutableTreeNode();


            //define an internal stack
            IXMLController controller = new IXMLController() {
                    // (non-Javadoc)
                    // @see net.fp.rp.back.extractor.xml.IXMLController#handleContent(java.lang.String)
                    //
                    public void handleContent(final String name,
                        final String content) throws SAXException {
                        //split and add the content
                        logger.debug("XMLProcessing - handle the content " +
                            content + " for the name " + name);


                        NodeStruct actual = (NodeStruct) nodeStack.pop();
                        boolean isTupleValueEmpty = false;


                        //if the node has only one tuple (special case) 
                        if (actual.getTuples().size() == 1) {
                            TupleStruct tuple = (TupleStruct) actual.getTuples()
                                                                    .get(0);


                            //last element with empty value ?? 
                            if ((tuple.getKeyword().equals(name)) &&
                                    ("".equals(tuple.getValue()))) {
                                //update the value
                                logger.debug("XMLProcessing - handle the node " +
                                    name + "which has before empty value");
                                isTupleValueEmpty = true;
                            }
                        }


                        //ignore the empty contents
                        if (content.length() > 0) {
                            if (isTupleValueEmpty) {
                                //update the value
                                ((TupleStruct) actual.getTuples().get(0)).setValue(content);
                            } else {
                                actual.addTuple(name, content);
                            }


                            logger.debug("Tag name/value is " + name + "/" +
                                content);


                            //validate if the specified tuple is a link
                            if (linkRequired && (linkTags.indexOf(name) != -1)) {
                                //add the specified link to the list
                                logger.debug("Tag name is a link" + name + "/" +
                                    content);
                                links.add(content);
                            }
                        }


                        if (nodeStack.isEmpty()) {
                            logger.debug(
                                "XMLProcessing - set the content actual node as content for document");
                            doc.setContent(actual);
                        }
                    }


                    // (non-Javadoc)
                    // @see net.fp.rp.back.extractor.xml.IXMLController#handleElemAttributes(java.lang.String, org.xml.sax.AttributeList)
                    //
                    public void handleElemAttributes(String name,
                        AttributeList attributes) throws SAXException {
                        logger.debug(
                            "XMLProcessing - handle the element attributes for name " +
                            name);


                        //create the node struct
                        NodeStruct actual = new NodeStruct();


                        //if exists attributes handle as value for the element:name
                        if (attributes.getLength() > 0) {
                            //iterate on attributes and added as value
                           // StringBuffer buf = new StringBuffer();


                            for (int i = 0; i < attributes.getLength(); i++) {
                                //Encode the attrib. buffer (for the attributes maybe is not necessarilly)
                                actual.addTuple(attributes.getName(i),
                                    UtilExtract.encode(attributes.getValue(i)));
                            }
                        }


                        //add the actual node
                        actual.addTuple(name, "");


                        if (!nodeStack.isEmpty()) {
                            //get the parent 
                            ((NodeStruct) nodeStack.get(nodeStack.size() - 1)).addChild(actual);
                        }


                        nodeStack.push(actual);
                    }
                };


            SaxXMLBuilder builder = new SaxXMLBuilder(controller);


            //parse the input and notify the handler
            saxParser.parse(input, builder);
        } catch (SAXException e) {
            logger.debug("SAXException in processing location" + info.getUri(),
                e);
            throw new RpException("extractor.xml.filenotvalid",
                new Object[] { info.getUri() });
        } catch (Throwable t) {
            logger.debug("Exception in processing the location" +
                info.getUri(), t);
            throw new RpException("app.extract.error",
                new Object[] { info.getUri() });
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (IOException e) {
            }
        }


        /*
        try
        {
            DOMParser parser = new DOMParser();
            parser.parse( new InputSource( in ) );
            Document xmldoc = parser.getDocument();




            //parse the document and generate the conent nodes
            doc.setContent( Translator.translate( xmldoc.getDocumentElement() ) );
        }
        catch ( SAXException e )
        {
            e.printStackTrace(System.out);
        }
        catch ( IOException e )
        {
            e.printStackTrace(System.out);
        }
        */
        //add the document to the list
        doc.setPath(info.getUri());
        doc.setTitle(UtilExtract.getFilenameTitle(info.getUri()));


        //get the summary of the document
        StringBuffer summary = new StringBuffer("");
        boolean isMaxReached = false;


        NodeStruct node = doc.getContent();


        for (int i = 0; (i < node.getTuples().size()) && (!isMaxReached);
                i++) {
            TupleStruct tuple = (TupleStruct) node.getTuples().get(i);


            //add to the summary 
            if (summary.length() <= getMaxLengthSummary()) {
                summary.append(tuple.getValue());
                summary.append(" ");
            }


            if (summary.length() > getMaxLengthSummary()) {
                isMaxReached = true;
            }
        }


        if (isMaxReached) {
            doc.setDescription(summary.toString().substring(0,
                    getMaxLengthSummary()));
        } else {
            doc.setDescription(summary.toString());
        }


        doc.setCategoryName(info.getCategoryName());
        doc.setCategoryLocation(info.getCategoryLocation());


        //store and reindex document
        PluginManager.storeAndAddDocument(doc);


        logger.debug("Level of the information is " + info.getLevel());


        //if the links tags are specified and level is > 0
        if (linkRequired && (info.getLevel() > 0)) {
            //iterate on the links list and process it 
            for (int i = 0; i < links.size(); i++) {
                String location = (String) links.get(i);


                //spider the tuple value
                logger.info("Extract the information from the location " +
                    location);


                AddInfo addInfo = new AddInfo(info.getCategoryLocation(),
                        info.getCategoryName(), location, info.getLevel() - 1);


                IDataExtractor extractor = PluginManager.getBestExtractor(addInfo);


                if (extractor != null) {
                    logger.debug(
                        "Best extractor for handling the information is :" +
                        extractor.getClass().getName());


                    try {
                        extractor.convert(addInfo);
                    } catch (RpException e) {
                        //no exception to be thrown -> continue the add
                        logger.debug("Error in extract the data " +
                            e.getMessage(), e);
                    }
                } else {
                    logger.warn(
                        "No extractor is available for extract the data  " +
                        location);
                }
            }
        }
    }


    /**
     * @return Returns the linkRequired.
     */
    public boolean isLinkRequired() {
        return linkRequired;
    }


    /**
     * @param linkRequired The linkRequired to set.
     */
    public void setLinkRequired(boolean linkRequired) {
        this.linkRequired = linkRequired;
    }


    /**
     * @return Returns the linkTags.
     */
    public String getLinkTags() {
        return linkTags;
    }


    /**
     * @param linkTags The linkTags to set.
     */
    public void setLinkTags(String linkTags) {
        this.linkTags = linkTags;
    }
}
// Source code of net.fp.rp.search.back.extractor.XmlDataExtractor

// Related classes of net.fp.rp.search.back.extractor.XmlDataExtractor