Package: net.fp.rp.search.back.extractor.util

Examples of net.fp.rp.search.back.extractor.util.Spider


    /**
     * Test the spider
     */
    public void test() {
        try {
            Spider spider = new Spider(UtilExtract.getUri(
                        "http://www.google.ie/search?hl=en&lr=&q=www.cnn.com&btnG=Search"),
                    40);
            spider.start();
        } catch (Throwable e) {
            e.printStackTrace(System.out);
            fail();
        }
    }
View Full Code Here


     */
    public void convert(INewInformation info) throws RpException {
        logger.info("WebExtractor handling location :" + info.getUri() +
            " with level " + info.getLevel());

        Spider spider = new Spider(info.getUri(), getMaxLengthSummary());
        spider.start();

        //process the content from the actual document
        //iterate on the links
        NodeStruct node = new NodeStruct();

        for (int i = 0; (i < spider.getLinks().size()); i++) {
            String uri = ((URL) spider.getLinks().get(i)).toString();
            node.addTuple(TupleStruct.KEYWORD_NAME, uri);
        }

        Pattern notIgnorePattern = Pattern.compile(getNotIgnoreChars());
        Pattern replacePattern = Pattern.compile(getReplaceChars());

        for (int i = 0; (i < spider.getValues().size()); i++) {
            String value = ((String) spider.getValues().get(i));

            //generate the list of the words for the spidered values
            LinkedList listWords = UtilExtract.getValueList(value,
                    getMinLengthWord(), notIgnorePattern, replacePattern);

            for (int j = 0; j < listWords.size(); j++)
                node.addTuple(TupleStruct.KEYWORD_GENERIC,
                    (String) listWords.get(j));
        }

        //define an DocumentStruct object
        DocumStruct doc = new DocumStruct();
        doc.setTitle(spider.getTitle());
        doc.setPath(spider.getUri());
        doc.setDescription(spider.getDescription());
        doc.setContent(node);
        doc.setCategoryName(info.getCategoryName());
        doc.setCategoryLocation(info.getCategoryLocation());

        //store and reindex document
        PluginManager.storeAndAddDocument(doc);

        logger.debug("Level of the information is " + info.getLevel());

        //spider the location only if the level is present (>0)
        if (info.getLevel() > 0) {
            //process the links 
            for (int i = 0; (i < spider.getLinks().size()); i++) {
                String uriLink = ((URL) spider.getLinks().get(i)).toString();
                logger.debug("Process the link :" + uriLink);

                AddInfo addInfo = new AddInfo(info.getCategoryLocation(),
                        info.getCategoryName(), uriLink, info.getLevel() - 1);

View Full Code Here

TOP

Related Classes of net.fp.rp.search.back.extractor.util.Spider

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.