Source Code of net.fp.rp.search.back.extractor.WebDataExtractor

/*
* Copyright (C) 2004 Paul Browne, http://www.firstpartners.net,
* built with the help of Fast-Soft (fastsoftdev@yahoo.com)
*
* released under terms of the GPL license
* http://www.opensource.org/licenses/gpl-license.php
*
* This product includes software developed by the
* Apache Software Foundation (http://www.apache.org).
*
* This product includes software developed by the
* Spring Framework Project (http://www.springframework.org).
*
*/
package net.fp.rp.search.back.extractor;

import net.fp.rp.common.exception.RpException;
import net.fp.rp.search.back.extractor.util.Spider;
import net.fp.rp.search.back.extractor.util.UtilExtract;
import net.fp.rp.search.back.struct.DocumStruct;
import net.fp.rp.search.back.struct.NodeStruct;
import net.fp.rp.search.back.struct.TupleStruct;
import net.fp.rp.search.common.AddInfo;
import net.fp.rp.search.mid.global.PluginManager;
import net.fp.rp.search.plugins.IDataExtractor;
import net.fp.rp.search.plugins.INewInformation;

import org.apache.log4j.Logger;

import java.net.URL;

import java.util.LinkedList;
import java.util.regex.Pattern;


/**
* Extracts information from web locations by spidering them (Google-style)
*
* @author brownpa
* Copyright @link www.firstpartners.net/red
*/
public class WebDataExtractor extends GenericDataExtractor {
    /** Logger for this class and subclasses */
    protected final Logger logger = Logger.getLogger(getClass());

    /**
     * The original place where we got this data
     *
     * @return pointer to the original location (empty string for this extractor)
     */
    public String getOriginalUri() {
        return "";
    }

    /**
     * Carry out any initialisation tasks
     */
    public void onLoad() {
    }

    /**
     * How well the plugin thinks it can handle a new piece of information
     *
     * @param info the new piece of information to assess
     *
     * @return int saying how well this plugin thinks it can handle this new
     *         piece of information
     */
    public int canHandle(INewInformation info) {
        logger.debug(
            "WebExtractor - validate handling of the information from " +
            info.getUri());

        int returnInt = 0;
        String extension = UtilExtract.getLocationExtension(info.getUri());
        logger.debug("GenericExtractor extension : " + extension);

        //validate if the extension is supported by the extractor
        if (UtilExtract.isExtesionSupported(extension, getListExtensions())) {
            returnInt = 12;
        }
        //if the location starts with http/https or ftp and extension is unknown (not a file)
        else if ((info.getUri().startsWith(UtilExtract.HTTP_BASE) ||
                info.getUri().startsWith(UtilExtract.HTTPS_BASE) ||
                info.getUri().startsWith(UtilExtract.FTP_BASE))) {
            //check whether another extractor exists that supports the location extension
            boolean found = false;

            try {
                IDataExtractor[] extractors = PluginManager.getDataExtractors();

                for (int j = 0; (j < extractors.length) && (!found); j++) {
                    if (UtilExtract.isExtesionSupported(extension,
                                extractors[j].getListExtensions())) {
                        found = true;
                        logger.debug("extractor which can handle the extension" +
                            extractors[j].getClass().getName());
                    }
                }
            } catch (RpException e) {
                logger.warn(
                    "No extractor is available in the system to process the web location");
            }

            //if no other extractor was found, then handle it with this extractor
            if (!found) {
                returnInt = 12;
            }
        }

        return returnInt;
    }

    /**
     * Convert the web information into a list of documents. The location is
     * spidered, its content is indexed, and any outgoing links are followed
     * recursively while the information level is greater than zero.
     *
     * @param info Information to be converted
     *
     * @throws RpException If an error occurs in processing the data
     */
    public void convert(INewInformation info) throws RpException {
        logger.info("WebExtractor handling location :" + info.getUri() +
            " with level " + info.getLevel());

        Spider spider = new Spider(info.getUri(), getMaxLengthSummary());
        spider.start();
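        //running the spider collects the page title, description, outgoing
        //links and text values that are consumed below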

        //process the content from the actual document
        //collect the outgoing links as keyword tuples
        NodeStruct node = new NodeStruct();

        for (int i = 0; (i < spider.getLinks().size()); i++) {
            String uri = ((URL) spider.getLinks().get(i)).toString();
            node.addTuple(TupleStruct.KEYWORD_NAME, uri);
        }

        Pattern notIgnorePattern = Pattern.compile(getNotIgnoreChars());
        Pattern replacePattern = Pattern.compile(getReplaceChars());

        for (int i = 0; (i < spider.getValues().size()); i++) {
            String value = ((String) spider.getValues().get(i));

            //generate the list of words for the spidered value
            LinkedList listWords = UtilExtract.getValueList(value,
                    getMinLengthWord(), notIgnorePattern, replacePattern);

            for (int j = 0; j < listWords.size(); j++)
                node.addTuple(TupleStruct.KEYWORD_GENERIC,
                    (String) listWords.get(j));
        }

        //define a DocumStruct object
        DocumStruct doc = new DocumStruct();
        doc.setTitle(spider.getTitle());
        doc.setPath(spider.getUri());
        doc.setDescription(spider.getDescription());
        doc.setContent(node);
        doc.setCategoryName(info.getCategoryName());
        doc.setCategoryLocation(info.getCategoryLocation());

        //store and reindex document
        PluginManager.storeAndAddDocument(doc);

        logger.debug("Level of the information is " + info.getLevel());

        //spider the linked locations only if the level is positive (>0)
        if (info.getLevel() > 0) {
            //process the links 
            for (int i = 0; (i < spider.getLinks().size()); i++) {
                String uriLink = ((URL) spider.getLinks().get(i)).toString();
                logger.debug("Process the link :" + uriLink);

                AddInfo addInfo = new AddInfo(info.getCategoryLocation(),
                        info.getCategoryName(), uriLink, info.getLevel() - 1);
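                //the new AddInfo carries level - 1, so the spidering depth is bounded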

                IDataExtractor extractor = PluginManager.getBestExtractor(addInfo);

                if (extractor != null) {
                    logger.debug(
                        "Best extractor for handling the information is :" +
                        extractor.getClass().getName());

                    try {
                        extractor.convert(addInfo);
                    } catch (RpException e) {
                        //no exception is rethrown -> continue adding the remaining links
                        logger.debug("Error in extracting the data " +
                            e.getMessage(), e);
                    }
                } else {
                    logger.warn(
                        "No extractor is available to extract the data " +
                        uriLink);
                }
            }
        } else {
            logger.debug(
                "Current level for the information is already 0 -> no further processing for this location");
        }
    }
}
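
A minimal usage sketch (not part of the original source): it drives the extractor directly rather than through the plugin system, and assumes AddInfo's four-argument constructor (categoryLocation, categoryName, uri, level) seen in convert() above; the category values and URL are illustrative.

/*
 * Hypothetical example - not part of the original source.
 */
package net.fp.rp.search.back.extractor;

import net.fp.rp.common.exception.RpException;
import net.fp.rp.search.common.AddInfo;

public class WebDataExtractorExample {
    public static void main(String[] args) {
        WebDataExtractor extractor = new WebDataExtractor();
        extractor.onLoad();

        //level 1 = index the page itself, then follow its links one hop deep
        AddInfo info = new AddInfo("/categories/web", "web",
                "http://www.firstpartners.net/", 1);

        //only convert if the extractor claims the location (score > 0)
        if (extractor.canHandle(info) > 0) {
            try {
                extractor.convert(info);
            } catch (RpException e) {
                System.err.println("Extraction failed: " + e.getMessage());
            }
        }
    }
}

In the real system the extractor would normally be resolved via PluginManager.getBestExtractor(addInfo), exactly as convert() does when following links, and its limits (supported extensions, summary length, minimum word length) would be configured through the Spring context.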