Source Code of net.fp.rp.search.back.extractor.FileTreeDataExtractor

/*
 * Copyright (C) 2004 Paul Browne, http://www.firstpartners.net,
 * built with the help of Fast-Soft (fastsoftdev@yahoo.com)
 *
 * released under terms of the GPL license
 * http://www.opensource.org/licenses/gpl-license.php
 *
 * This product includes software developed by the
 * Apache Software Foundation (http://www.apache.org)."
 *
 * This product includes software developed by the
 * Spring Framework Project (http://www.springframework.org)."
 *
 */
package net.fp.rp.search.back.extractor;


import net.fp.rp.common.exception.RpException;
import net.fp.rp.search.back.extractor.util.UtilExtract;
import net.fp.rp.search.common.AddInfo;
import net.fp.rp.search.mid.global.PluginManager;
import net.fp.rp.search.plugins.IDataExtractor;
import net.fp.rp.search.plugins.INewInformation;


import org.apache.log4j.Logger;


import java.io.File;


import java.net.MalformedURLException;
import java.net.URL;




/**
 * Extracts information from a File Tree (Directory) in a format that can be
 * added to an Index.
 * 
 * @author brownpa
 * Copyright @link www.firstpartners.net/red
 */
public class FileTreeDataExtractor implements IDataExtractor {
    /** Logger for this class and subclasses */
    protected final Logger logger = Logger.getLogger(getClass());


    /** Extractor type */
    private String type;


    /** Maxim process document size */
    private int maxFileSize;


    /**
     * The original place where we got this data
     *
     * @return pointer
     */
    public String getOriginalUri() {
        return "";
    }


    /**
     * Carry out any initiation tasks
     */
    public void onLoad() {
    }


    /**
     * How well the plugin thinks it can handle a new piece of information
     *
     * @param info
     *
     * @return
     */
    public int canHandle(INewInformation info) {
        logger.debug(
            "TreeExtractor - validate handling of the information from " +
            info.getUri());


        int returnInt = 0;


        //if the specified content is a directory (file protocol) then -> maxim value
        if (info.getUri().startsWith(UtilExtract.FILE_BASE)) {
            File f = new File(info.getUri().substring(UtilExtract.FILE_BASE.length()));


            if (f.isDirectory()) {
                returnInt = 14;
            }
        }


        return returnInt;
    }


    /**
     * Comvert the tree information into a list of documents
     *
     * @param info Information to be converted
     *
     * @throws RpException If an error occur in xml processing
     */
    public void convert(INewInformation info) throws RpException {
        logger.info("TreeExtractor handling location :" + info.getUri() +
            " with level " + info.getLevel());


        //extraction the information from the files only if the level is present (>0)
        if (info.getLevel() > 0) {
            try {
                URL url = new URL(info.getUri());


                //if the protocol is file and is directory 
                File folder = new File(url.getPath());


                if ((info.getUri().startsWith(UtilExtract.FILE_BASE)) &&
                        folder.isDirectory()) {
                    logger.debug("Process the files inside folder");


                    //get the list of the 
                    File[] list = folder.listFiles();


                    //for each item call recursively the DataExtractor
                    for (int i = 0; i < list.length; i++) {
                        logger.debug("Process the file :" + list[i].getPath());


                        //validate if the size is too big
                        if (list[i].length() > maxFileSize) {
                            logger.warn("File " + list[i].getPath() +
                                " is too big and will not be process");
                        } else {
                            //must specify the category location, category name from the actual information
                            INewInformation childInfo = new AddInfo(info.getCategoryLocation(),
                                    info.getCategoryName(), list[i].getPath(),
                                    info.getLevel() - 1);


                            IDataExtractor extractor = PluginManager.getBestExtractor(childInfo);


                            if (extractor != null) {
                                logger.debug("Best extractor for location " +
                                    list[i].getPath() + " is :" +
                                    extractor.getClass().getName());


                                //process the information with level =1
                                try {
                                    extractor.convert(childInfo);
                                } catch (RpException e) {
                                    //no exception to be thrown -> continue the add
                                    logger.debug("Error in extracting the data " +
                                        e.getMessage(), e);
                                }
                            } else {
                                logger.warn(
                                    "No extractor is available for extract the data  " +
                                    list[i].getPath());
                            }
                        }
                    }
                } else {
                    throw new RpException("treeextractor.folder.notexists",
                        new Object[] { info.getUri() });
                }
            } catch (MalformedURLException e) {
                logger.warn("Specified uri is not valid " + e.getMessage(), e);
                throw new RpException("extractor.tree.foldernotexists",
                    new Object[] { info.getUri() });
            }
        } else {
            logger.debug(
                "Current level for the information is already 0-> no futher process for this location");
        }
    }


    /**
     * Return the list of supported extensions
     *
     * @return List of supported extesnsions
     */
    public String[] getListExtensions() {
        //no extension supported
        return new String[] {  };
    }


    /**
     * Get the maxim file size which can be proces
     *
     * @return Maxim file size
     */
    public int getMaxFileSize() {
        return maxFileSize;
    }


    /**
     * Set the maxim file size which can be process
     *
     * @param maxFileSize Maxim file size allowed.
     */
    public void setMaxFileSize(int maxFileSize) {
        this.maxFileSize = maxFileSize;
    }


    /**
     * Get the extractor type
     *
     * @return Type of the extractor
     */
    public String getType() {
        return type;
    }


    /**
     * Set the extractor type
     *
     * @param type Extractor type
     */
    public void setType(String type) {
        this.type = type;
    }
}
Source Code of net.fp.rp.search.back.extractor.FileTreeDataExtractor

Related Classes of net.fp.rp.search.back.extractor.FileTreeDataExtractor