/*
* Copyright (C) 2004 Paul Browne, http://www.firstpartners.net,
* built with the help of Fast-Soft (fastsoftdev@yahoo.com)
*
* released under terms of the GPL license
* http://www.opensource.org/licenses/gpl-license.php
*
* This product includes software developed by the
* Apache Software Foundation (http://www.apache.org).
*
* This product includes software developed by the
* Spring Framework Project (http://www.springframework.org).
*
*/
package net.fp.rp.search.back.extractor;
import net.fp.rp.common.exception.RpException;
import net.fp.rp.search.back.extractor.util.Spider;
import net.fp.rp.search.back.extractor.util.UtilExtract;
import net.fp.rp.search.back.struct.DocumStruct;
import net.fp.rp.search.back.struct.NodeStruct;
import net.fp.rp.search.back.struct.TupleStruct;
import net.fp.rp.search.common.AddInfo;
import net.fp.rp.search.mid.global.PluginManager;
import net.fp.rp.search.plugins.IDataExtractor;
import net.fp.rp.search.plugins.INewInformation;
import org.apache.log4j.Logger;
import java.net.URL;
import java.util.LinkedList;
import java.util.regex.Pattern;
/**
* Extracts information from web locations by spidering them and
* following their links (in the manner of a search-engine crawler such as Google's)
*
* @author brownpa
* Copyright @link www.firstpartners.net/red
*/
public class WebDataExtractor extends GenericDataExtractor {
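/*
 * Typical usage (a hedged sketch: the AddInfo constructor signature is taken
 * from convert() below; the category values, URL and crawl level are
 * illustrative only):
 *
 *   WebDataExtractor extractor = new WebDataExtractor();
 *   extractor.onLoad();
 *   INewInformation info = new AddInfo("categoryLocation", "categoryName",
 *           "http://www.example.com/", 1);
 *   if (extractor.canHandle(info) > 0) {
 *       try {
 *           extractor.convert(info);
 *       } catch (RpException e) {
 *           // handle the conversion failure
 *       }
 *   }
 */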
/** Logger for this class and subclasses */
protected final Logger logger = Logger.getLogger(getClass());
/**
* The original place where we got this data
*
* @return pointer to the original location (always an empty string here)
*/
public String getOriginalUri() {
return "";
}
/**
* Carry out any initialization tasks
*/
public void onLoad() {
}
/**
* How well the plugin thinks it can handle a new piece of information
*
* @param info the new piece of information to evaluate
*
* @return int saying how well this plugin thinks it can handle this new
* piece of information (0 means it cannot handle it)
*/
public int canHandle(INewInformation info) {
logger.debug(
"WebExtractor - validate handling of the information from " +
info.getUri());
int returnInt = 0;
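//a return of 0 opts this plugin out; the positive score (12) below is
//assumed to be compared against the other extractors' canHandle() scores
//by PluginManager.getBestExtractor()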
String extension = UtilExtract.getLocationExtension(info.getUri());
logger.debug("GenericExtractor extension : " + extension);
//validate if the extension is supported by the extractor
if (UtilExtract.isExtesionSupported(extension, getListExtensions())) {
returnInt = 12;
}
//if the location starts with http/https or ftp, check whether any other
//extractor supports its extension; if none does, this extractor handles it
else if ((info.getUri().startsWith(UtilExtract.HTTP_BASE) ||
info.getUri().startsWith(UtilExtract.HTTPS_BASE) ||
info.getUri().startsWith(UtilExtract.FTP_BASE))) {
//check whether any registered extractor supports the location extension
boolean found = false;
try {
IDataExtractor[] extractors = PluginManager.getDataExtractors();
for (int j = 0; (j < extractors.length) && (!found); j++) {
if (UtilExtract.isExtesionSupported(extension,
extractors[j].getListExtensions())) {
found = true;
logger.debug("extractor which can handle the extension" +
extractors[j].getClass().getName());
}
}
} catch (RpException e) {
logger.warn(
"Could not retrieve the list of data extractors: " + e.getMessage(), e);
}
//if no other extractor claims it, then handle it with this extractor
if (!found) {
returnInt = 12;
}
}
return returnInt;
}
/**
* Convert the web information into a list of documents
*
* @param info Information to be converted
*
* @throws RpException If an error occurs while processing the data
*/
public void convert(INewInformation info) throws RpException {
logger.info("WebExtractor handling location :" + info.getUri() +
" with level " + info.getLevel());
Spider spider = new Spider(info.getUri(), getMaxLengthSummary());
spider.start();
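//the loops below assume that once start() returns, the spider has fully
//populated its links, values, title and description collections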
//process the content of the spidered document:
//record every link found as a keyword tuple
NodeStruct node = new NodeStruct();
for (int i = 0; (i < spider.getLinks().size()); i++) {
String uri = ((URL) spider.getLinks().get(i)).toString();
node.addTuple(TupleStruct.KEYWORD_NAME, uri);
}
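//compile the word-filtering patterns once, outside the loop over values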
Pattern notIgnorePattern = Pattern.compile(getNotIgnoreChars());
Pattern replacePattern = Pattern.compile(getReplaceChars());
for (int i = 0; (i < spider.getValues().size()); i++) {
String value = ((String) spider.getValues().get(i));
//generate the list of words for the spidered value
LinkedList listWords = UtilExtract.getValueList(value,
getMinLengthWord(), notIgnorePattern, replacePattern);
for (int j = 0; j < listWords.size(); j++) {
node.addTuple(TupleStruct.KEYWORD_GENERIC,
(String) listWords.get(j));
}
}
//populate a DocumStruct object with the spidered data
DocumStruct doc = new DocumStruct();
doc.setTitle(spider.getTitle());
doc.setPath(spider.getUri());
doc.setDescription(spider.getDescription());
doc.setContent(node);
doc.setCategoryName(info.getCategoryName());
doc.setCategoryLocation(info.getCategoryLocation());
//store and reindex document
PluginManager.storeAndAddDocument(doc);
logger.debug("Level of the information is " + info.getLevel());
//follow the links only if the remaining crawl depth is positive (> 0)
if (info.getLevel() > 0) {
//process the links
for (int i = 0; (i < spider.getLinks().size()); i++) {
String uriLink = ((URL) spider.getLinks().get(i)).toString();
logger.debug("Process the link :" + uriLink);
AddInfo addInfo = new AddInfo(info.getCategoryLocation(),
info.getCategoryName(), uriLink, info.getLevel() - 1);
IDataExtractor extractor = PluginManager.getBestExtractor(addInfo);
if (extractor != null) {
logger.debug(
"Best extractor for handling the information is: " +
extractor.getClass().getName());
try {
extractor.convert(addInfo);
} catch (RpException e) {
//no exception is re-thrown -> continue adding the remaining links
logger.debug("Error extracting the data: " +
e.getMessage(), e);
}
} else {
logger.warn(
"No extractor is available to extract the data from " +
uriLink);
}
}
} else {
logger.debug(
"Current level for the information is already 0 -> no further processing for this location");
}
}
}