/*
* Copyright (C) 2004 Paul Browne, http://www.firstpartners.net,
* built with the help of Fast-Soft (fastsoftdev@yahoo.com)
*
* released under terms of the GPL license
* http://www.opensource.org/licenses/gpl-license.php
*
* This product includes software developed by the
* Apache Software Foundation (http://www.apache.org).
*
* This product includes software developed by the
* Spring Framework Project (http://www.springframework.org).
*
*/
package net.fp.rp.search.back.extractor;
import net.fp.rp.common.exception.RpException;
import net.fp.rp.search.back.extractor.util.Spider;
import net.fp.rp.search.back.extractor.util.UtilExtract;
import net.fp.rp.search.back.struct.DocumStruct;
import net.fp.rp.search.back.struct.NodeStruct;
import net.fp.rp.search.back.struct.TupleStruct;
import net.fp.rp.search.common.AddInfo;
import net.fp.rp.search.mid.global.PluginManager;
import net.fp.rp.search.plugins.IDataExtractor;
import net.fp.rp.search.plugins.INewInformation;
import org.apache.log4j.Logger;
import java.net.URL;
import java.util.LinkedList;
import java.util.regex.Pattern;
/**
* Extracts information from web locations by spidering them and
* following their links (in the manner of a search-engine crawler such as Google's)
*
* @author brownpa
* Copyright @link www.firstpartners.net/red
*/
public class WebDataExtractor extends GenericDataExtractor {
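/*
 * Typical usage (a hedged sketch: the AddInfo constructor signature is taken
 * from convert() below; the category values, URL and crawl level are
 * illustrative only):
 *
 *   WebDataExtractor extractor = new WebDataExtractor();
 *   extractor.onLoad();
 *   INewInformation info = new AddInfo("categoryLocation", "categoryName",
 *           "http://www.example.com/", 1);
 *   if (extractor.canHandle(info) > 0) {
 *       try {
 *           extractor.convert(info);
 *       } catch (RpException e) {
 *           // handle the conversion failure
 *       }
 *   }
 */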
/** Logger for this class and subclasses */
protected final Logger logger = Logger.getLogger(getClass());
/**
* The original place where we got this data
*
* @return pointer to the original location (always an empty string here)
*/
public String getOriginalUri() {
return "";
}
/**
* Carry out any initialization tasks
*/
public void onLoad() {
}
/**
* How well the plugin thinks it can handle a new piece of information
*
* @param info the new piece of information to evaluate
*
* @return int saying how well this plugin thinks it can handle this new
* piece of information (0 means it cannot handle it)
*/
public int canHandle(INewInformation info) {
logger.debug(
"WebExtractor - validate handling of the information from " +
info.getUri());
int returnInt = 0;
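//a return of 0 opts this plugin out; the positive score (12) below is
//assumed to be compared against the other extractors' canHandle() scores
//by PluginManager.getBestExtractor()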
String extension = UtilExtract.getLocationExtension(info.getUri());
logger.debug("GenericExtractor extension : " + extension);
//validate if the extension is supported by the extractor
if (UtilExtract.isExtesionSupported(extension, getListExtensions())) {
returnInt = 12;
}
//if the location starts with http/https or ftp, check whether any other
//extractor supports its extension; if none does, this extractor handles it
else if ((info.getUri().startsWith(UtilExtract.HTTP_BASE) ||
info.getUri().startsWith(UtilExtract.HTTPS_BASE) ||
info.getUri().startsWith(UtilExtract.FTP_BASE))) {
//check whether any registered extractor supports the location extension
boolean found = false;
try {
IDataExtractor[] extractors = PluginManager.getDataExtractors();
for (int j = 0; (j < extractors.length) && (!found); j++) {
if (UtilExtract.isExtesionSupported(extension,
extractors[j].getListExtensions())) {
found = true;
logger.debug("extractor which can handle the extension" +
extractors[j].getClass().getName());
}
}
} catch (RpException e) {
logger.warn(
"Could not retrieve the list of data extractors: " + e.getMessage(), e);
}
//if no other extractor claims it, then handle it with this extractor
if (!found) {
returnInt = 12;
}
}
return returnInt;
}
/**
* Convert the web information into a list of documents
*
* @param info Information to be converted
*
* @throws RpException If an error occurs while processing the data
*/
public void convert(INewInformation info) throws RpException {
logger.info("WebExtractor handling location :" + info.getUri() +
" with level " + info.getLevel());
Spider spider = new Spider(info.getUri(), getMaxLengthSummary());
spider.start();
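//the loops below assume that once start() returns, the spider has fully
//populated its links, values, title and description collections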
//process the content of the spidered document:
//record every link found as a keyword tuple
NodeStruct node = new NodeStruct();
for (int i = 0; (i < spider.getLinks().size()); i++) {
String uri = ((URL) spider.getLinks().get(i)).toString();
node.addTuple(TupleStruct.KEYWORD_NAME, uri);
}
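//compile the word-filtering patterns once, outside the loop over values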
Pattern notIgnorePattern = Pattern.compile(getNotIgnoreChars());
Pattern replacePattern = Pattern.compile(getReplaceChars());
for (int i = 0; (i < spider.getValues().size()); i++) {
String value = ((String) spider.getValues().get(i));
//generate the list of words for the spidered value
LinkedList listWords = UtilExtract.getValueList(value,
getMinLengthWord(), notIgnorePattern, replacePattern);
for (int j = 0; j < listWords.size(); j++) {
node.addTuple(TupleStruct.KEYWORD_GENERIC,
(String) listWords.get(j));
}
}
//populate a DocumStruct object with the spidered data
DocumStruct doc = new DocumStruct();
doc.setTitle(spider.getTitle());
doc.setPath(spider.getUri());
doc.setDescription(spider.getDescription());
doc.setContent(node);
doc.setCategoryName(info.getCategoryName());
doc.setCategoryLocation(info.getCategoryLocation());
//store and reindex document
PluginManager.storeAndAddDocument(doc);
logger.debug("Level of the information is " + info.getLevel());
//follow the links only if the remaining crawl depth is positive (> 0)
if (info.getLevel() > 0) {
//process the links
for (int i = 0; (i < spider.getLinks().size()); i++) {
String uriLink = ((URL) spider.getLinks().get(i)).toString();
logger.debug("Process the link :" + uriLink);
AddInfo addInfo = new AddInfo(info.getCategoryLocation(),
info.getCategoryName(), uriLink, info.getLevel() - 1);
IDataExtractor extractor = PluginManager.getBestExtractor(addInfo);
if (extractor != null) {
logger.debug(
"Best extractor for handling the information is: " +
extractor.getClass().getName());
try {
extractor.convert(addInfo);
} catch (RpException e) {
//no exception is re-thrown -> continue adding the remaining links
logger.debug("Error extracting the data: " +
e.getMessage(), e);
}
} else {
logger.warn(
"No extractor is available to extract the data from " +
uriLink);
}
}
} else {
logger.debug(
"Current level for the information is already 0 -> no further processing for this location");
}
}
}