/*
* Copyright (C) 2004 Paul Browne, http://www.firstpartners.net,
* built with the help of Fast-Soft (fastsoftdev@yahoo.com)
*
* released under terms of the GPL license
* http://www.opensource.org/licenses/gpl-license.php
*
* This product includes software developed by the
* Apache Software Foundation (http://www.apache.org)."
*
* This product includes software developed by the
* Spring Framework Project (http://www.springframework.org)."
*
*/
package net.fp.rp.search.back.extractor;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;
import java.util.Stack;
import javax.xml.parsers.SAXParserFactory;
import net.fp.rp.common.exception.RpException;
import net.fp.rp.search.back.extractor.util.UtilExtract;
import net.fp.rp.search.back.extractor.xml.IXMLController;
import net.fp.rp.search.back.extractor.xml.SaxXMLBuilder;
import net.fp.rp.search.back.struct.DocumStruct;
import net.fp.rp.search.back.struct.NodeStruct;
import net.fp.rp.search.back.struct.TupleStruct;
import net.fp.rp.search.common.AddInfo;
import net.fp.rp.search.mid.global.PluginManager;
import net.fp.rp.search.plugins.IDataExtractor;
import net.fp.rp.search.plugins.INewInformation;
import org.apache.log4j.Logger;
import org.xml.sax.AttributeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Extracts information from an XML file in a format that can be added to an
 * Index.
 *
 * <p>The XML input is read with a non-validating SAX parser and turned into a
 * tree of {@link NodeStruct} objects: every element becomes a node, every
 * attribute and every non-empty element content becomes a tuple of that node.
 * The resulting document is handed to {@link PluginManager} for storing and
 * indexing. When link following is enabled, the content of elements whose
 * name matches the configured link tags is treated as a further location and
 * is extracted as well, with the level decreased by one.</p>
 *
 * @author brownpa
 * Copyright @link www.firstpartners.net/red
 */
public class XmlDataExtractor extends GenericDataExtractor {
    /**
     * SAX features switched off before parsing so that a document cannot pull
     * in external entities (local files, remote URLs) while it is indexed.
     */
    private static final String[] EXTERNAL_ENTITY_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities"
    };

    /** Logger for this class and subclasses */
    protected final Logger logger = Logger.getLogger(getClass());

    /** Whether the locations found in link tags must be extracted as well */
    private boolean linkRequired;

    /**
     * Names of the elements whose content is a link. The value is searched
     * with a plain substring match (see {@link #isLinkTag(String)}).
     */
    private String linkTags;

    /**
     * The original place where we got this data.
     *
     * @return pointer; this implementation always returns the empty string
     */
    public String getOriginalUri() {
        return "";
    }

    /**
     * Carry out any initiation tasks; this extractor has none.
     */
    public void onLoad() {
    }

    /**
     * How well the plugin thinks it can handle a new piece of information.
     *
     * @param info Information to be evaluated
     *
     * @return 10 if the extension of the location is one of the extensions
     *         configured for this extractor, 0 if the data can't be handled
     */
    public int canHandle(INewInformation info) {
        logger.debug(
            "XMLExtractor - validate handling of the information from " +
            info.getUri());

        String extension = UtilExtract.getLocationExtension(info.getUri());
        logger.debug("GenericExtractor extension : " + extension);

        //validate if the extension is supported by the extractor
        if (UtilExtract.isExtesionSupported(extension, getListExtensions())) {
            return 10;
        }

        return 0;
    }

    /**
     * Convert the xml file information into a document, store and index it,
     * and (if links are required and the level allows it) extract the linked
     * locations as well.
     *
     * @param info Information to be converted
     *
     * @throws RpException If an error occur in xml processing, or if the
     *         parsing produced no content for the document
     */
    public void convert(INewInformation info) throws RpException {
        logger.info("XMLExtractor handling location :" + info.getUri() +
            " with level " + info.getLevel());

        InputStream in = UtilExtract.getStream(info.getUri());
        final LinkedList links = new LinkedList();
        final DocumStruct doc = new DocumStruct();

        //use SAX-Parser instead of DOM-Parser, Performance issue
        parseDocument(in, info, doc, links);

        NodeStruct content = doc.getContent();

        if (content == null) {
            // NOTE(review): whether this can happen depends on when
            // SaxXMLBuilder invokes the controller - confirm. Without this
            // guard the summary code below failed with an unchecked
            // NullPointerException instead of the documented RpException.
            logger.debug("XMLProcessing - no content was produced for location" +
                info.getUri());
            throw new RpException("app.extract.error",
                new Object[] { info.getUri() });
        }

        doc.setPath(info.getUri());
        doc.setTitle(UtilExtract.getFilenameTitle(info.getUri()));
        doc.setDescription(buildSummary(content));
        doc.setCategoryName(info.getCategoryName());
        doc.setCategoryLocation(info.getCategoryLocation());

        //store and reindex document
        PluginManager.storeAndAddDocument(doc);

        logger.debug("Level of the information is " + info.getLevel());

        //follow the links only if they are required and level is > 0
        if (linkRequired && (info.getLevel() > 0)) {
            followLinks(info, links);
        }
    }

    /**
     * Parse the stream with a non-validating SAX parser, filling the document
     * content and collecting the link locations. The stream is always closed.
     *
     * @param in Stream to be parsed
     * @param info Information being converted (used for its location)
     * @param doc Document receiving the root node as content
     * @param links List receiving the content of the link tags
     *
     * @throws RpException If the xml is not valid or any other error occurs
     */
    private void parseDocument(InputStream in, INewInformation info,
        final DocumStruct doc, final LinkedList links) throws RpException {
        try {
            // Create a JAXP "parser factory" for creating SAX parsers
            SAXParserFactory saxFactory = SAXParserFactory.newInstance();
            saxFactory.setValidating(false); // No validation required
            disableExternalEntities(saxFactory);

            // Note that SAXParser is a JAXP class, not a SAX class
            javax.xml.parsers.SAXParser saxParser = saxFactory.newSAXParser();
            InputSource input = new InputSource(in);
            SaxXMLBuilder builder = new SaxXMLBuilder(createController(doc,
                        links));

            //parse the input and notify the handler
            saxParser.parse(input, builder);
        } catch (SAXException e) {
            logger.debug("SAXException in processing location" + info.getUri(),
                e);
            throw new RpException("extractor.xml.filenotvalid",
                new Object[] { info.getUri() });
        } catch (Throwable t) {
            // Deliberately broad: any failure while parsing one location is
            // reported to the caller as an extraction error.
            logger.debug("Exception in processing the location" +
                info.getUri(), t);
            throw new RpException("app.extract.error",
                new Object[] { info.getUri() });
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    // Nothing can be done about it, but leave a trace
                    logger.debug("Unable to close the stream of location" +
                        info.getUri(), e);
                }
            }
        }
    }

    /**
     * Switch off the resolution of external entities, as far as the parser
     * supports it. Documents which rely on external entities lose the
     * content of those entities. A parser which does not know a feature is
     * used as it is (best effort).
     *
     * @param saxFactory Factory to be configured
     */
    private void disableExternalEntities(SAXParserFactory saxFactory) {
        for (int i = 0; i < EXTERNAL_ENTITY_FEATURES.length; i++) {
            try {
                saxFactory.setFeature(EXTERNAL_ENTITY_FEATURES[i], false);
            } catch (Exception e) {
                logger.debug("XMLProcessing - parser feature not supported " +
                    EXTERNAL_ENTITY_FEATURES[i], e);
            }
        }
    }

    /**
     * Create the controller which builds the node tree from the parser
     * notifications. A stack holds the nodes of the elements which were
     * announced through handleElemAttributes and not yet completed through
     * handleContent (presumably start and end of an element - confirm
     * against SaxXMLBuilder).
     *
     * @param doc Document receiving the root node as content
     * @param links List receiving the content of the link tags
     *
     * @return Controller to be passed to the SaxXMLBuilder
     */
    private IXMLController createController(final DocumStruct doc,
        final LinkedList links) {
        final Stack nodeStack = new Stack();

        return new IXMLController() {
                // (non-Javadoc)
                // @see net.fp.rp.back.extractor.xml.IXMLController#handleContent(java.lang.String)
                //
                public void handleContent(final String name,
                    final String content) throws SAXException {
                    logger.debug("XMLProcessing - handle the content " +
                        content + " for the name " + name);

                    NodeStruct actual = (NodeStruct) nodeStack.pop();

                    //special case: the node holds only the placeholder tuple
                    //(element name, empty value) added when it was created
                    boolean isTupleValueEmpty = false;

                    if (actual.getTuples().size() == 1) {
                        TupleStruct tuple = (TupleStruct) actual.getTuples()
                                                                .get(0);

                        if ((tuple.getKeyword().equals(name)) &&
                                ("".equals(tuple.getValue()))) {
                            logger.debug("XMLProcessing - handle the node " +
                                name + "which has before empty value");
                            isTupleValueEmpty = true;
                        }
                    }

                    //ignore the empty contents
                    if ((content != null) && (content.length() > 0)) {
                        if (isTupleValueEmpty) {
                            //fill the placeholder instead of adding a tuple
                            ((TupleStruct) actual.getTuples().get(0)).setValue(content);
                        } else {
                            actual.addTuple(name, content);
                        }

                        logger.debug("Tag name/value is " + name + "/" +
                            content);

                        //remember the content if the tag is a link
                        if (isLinkTag(name)) {
                            logger.debug("Tag name is a link" + name + "/" +
                                content);
                            links.add(content);
                        }
                    }

                    //no open node left: this node is the root
                    if (nodeStack.isEmpty()) {
                        logger.debug(
                            "XMLProcessing - set the content actual node as content for document");
                        doc.setContent(actual);
                    }
                }

                // (non-Javadoc)
                // @see net.fp.rp.back.extractor.xml.IXMLController#handleElemAttributes(java.lang.String, org.xml.sax.AttributeList)
                //
                public void handleElemAttributes(String name,
                    AttributeList attributes) throws SAXException {
                    logger.debug(
                        "XMLProcessing - handle the element attributes for name " +
                        name);

                    NodeStruct actual = new NodeStruct();

                    //every attribute becomes a tuple of the node
                    //Encode the attrib. buffer (for the attributes maybe is not necessarilly)
                    for (int i = 0; i < attributes.getLength(); i++) {
                        actual.addTuple(attributes.getName(i),
                            UtilExtract.encode(attributes.getValue(i)));
                    }

                    //placeholder for the content of the element
                    actual.addTuple(name, "");

                    //attach the node to its parent (the top of the stack)
                    if (!nodeStack.isEmpty()) {
                        ((NodeStruct) nodeStack.peek()).addChild(actual);
                    }

                    nodeStack.push(actual);
                }
            };
    }

    /**
     * Validate if the content of the specified tag is a link to be followed.
     * The tag name is looked up as a substring of the link tags, so a name
     * contained in a longer configured name matches as well.
     *
     * @param name Tag name
     *
     * @return true if links are required and the name is found in the link
     *         tags; false if the link tags or the name are not set
     */
    private boolean isLinkTag(String name) {
        return linkRequired && (linkTags != null) && (name != null) &&
        (linkTags.indexOf(name) != -1);
    }

    /**
     * Build the summary of the document from the values of the tuples of the
     * specified node, separated by blanks and cut to the maximum length of
     * the summary. A negative maximum length is handled as 0.
     *
     * @param node Node supplying the tuples (the root of the document)
     *
     * @return Summary, never longer than the maximum length
     */
    private String buildSummary(NodeStruct node) {
        int maxLength = Math.max(0, getMaxLengthSummary());
        StringBuffer summary = new StringBuffer();

        //stop as soon as the summary exceeds the maximum
        for (int i = 0;
                (i < node.getTuples().size()) &&
                (summary.length() <= maxLength); i++) {
            TupleStruct tuple = (TupleStruct) node.getTuples().get(i);
            summary.append(tuple.getValue());
            summary.append(" ");
        }

        if (summary.length() > maxLength) {
            return summary.substring(0, maxLength);
        }

        return summary.toString();
    }

    /**
     * Extract the information from every collected link, using the best
     * extractor available for it and the level decreased by one. A failure
     * on one link is logged and does not stop the processing of the others.
     *
     * @param info Information whose document supplied the links
     * @param links Locations (String) to be extracted
     */
    private void followLinks(INewInformation info, LinkedList links) {
        for (java.util.Iterator it = links.iterator(); it.hasNext();) {
            String location = (String) it.next();

            //spider the tuple value
            logger.info("Extract the information from the location " +
                location);

            AddInfo addInfo = new AddInfo(info.getCategoryLocation(),
                    info.getCategoryName(), location, info.getLevel() - 1);
            IDataExtractor extractor = PluginManager.getBestExtractor(addInfo);

            if (extractor == null) {
                logger.warn(
                    "No extractor is available for extract the data " +
                    location);

                continue;
            }

            logger.debug(
                "Best extractor for handling the information is :" +
                extractor.getClass().getName());

            try {
                extractor.convert(addInfo);
            } catch (RpException e) {
                //no exception to be thrown -> continue the add
                logger.debug("Error in extract the data " +
                    e.getMessage(), e);
            }
        }
    }

    /**
     * @return Returns the linkRequired.
     */
    public boolean isLinkRequired() {
        return linkRequired;
    }

    /**
     * @param linkRequired The linkRequired to set.
     */
    public void setLinkRequired(boolean linkRequired) {
        this.linkRequired = linkRequired;
    }

    /**
     * @return Returns the linkTags.
     */
    public String getLinkTags() {
        return linkTags;
    }

    /**
     * @param linkTags The linkTags to set.
     */
    public void setLinkTags(String linkTags) {
        this.linkTags = linkTags;
    }
}