Package com.flaptor.hounder.crawler.pagedb

Examples of com.flaptor.hounder.crawler.pagedb.Page
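
The snippets below show how the Hounder crawler creates, stores, iterates over, and indexes Page objects. Each excerpt is a partial listing taken from a different class in the crawler. As a quick orientation, here is a minimal sketch of the Page / PageDB usage pattern the excerpts share; the constructor and accessors appear in the examples, while the pagedb path, URL, and score value are illustrative assumptions.

    // Minimal sketch of the Page / PageDB usage pattern seen in the excerpts below.
    // The pagedb path, URL, and score are illustrative assumptions; exception handling is omitted.
    Page page = new Page("http://example.com/index.html", 1f);   // url and initial score

    PageDB db = new PageDB("/tmp/example-pagedb");
    db.open(PageDB.WRITE);
    db.addPage(page);
    db.close();

    db.open(PageDB.READ);
    for (Iterator<Page> pages = db.iterator(); pages.hasNext();) {
        Page p = pages.next();
        System.out.println(p.getUrl() + " emitted=" + p.isEmitted());
    }
    db.close();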


    private void selectPagesToFetch (IPageStore toFetch, IPageStore toPass) {
        logger.debug("Selecting pages for the fetchlist");
        int pagesSeen = 0;
        int pagesAdded = 0;
        while (pagesAdded < fetchlistSize && pages.hasNext()) {
            Page page = pages.next();
            pagesSeen++;
            // if it is a valid web page (not an image, etc.), it is considered; otherwise it is discarded
            if (null == Crawler.urlFilter(page.getUrl())) {
                logger.debug("  discarding page " + page.getUrl() + ", url filters consider it not interesting");
            } else {
                // if the page should be fetched, it goes to the fetchlist; otherwise it is passed on
                boolean added = false;
                if (shouldFetch(page)) {
                    try {
                        logger.debug("  adding page " + page.getUrl());
                        toFetch.addPage(page);
                        pagesAdded++;
                        added = true;
                    } catch (IOException e) {
                        logger.error("Trying to add a page to the IPageStore " + toFetch + ": " + e, e);
                    }
                }
                if (!added && null != toPass) {
                    try {
                        logger.debug("  passing page " + page.getUrl());
                        toPass.addPage(page);
                    } catch (IOException e) {
                        logger.error("Trying to add a page to the IPageStore " + toPass + ": " + e, e);
                    }
                }


        if (args.length < 2) {
            System.err.println ("\nMissing arguments.\n\nUsage:\n  "+mapperClass.getName()+" <nodeCount> <url>\n");
            System.exit(-1);
        }
        int nodes = Integer.parseInt(args[0]);
        Page page = new Page(args[1],0);
        APageMapper mapper = (APageMapper)mapperClass.getConstructor(new Class<?>[]{Config.class, Integer.TYPE}).newInstance(new Object[]{null,nodes});
        System.out.println("  "+args[1]+" -> "+mapper.mapPage(page));
    }

        }

        private void advance () {
            doc = null;
            while (null == doc && pages.hasNext()) {
                Page page = pages.next();
                String url = page.getUrl();
                if (cache.hasItem(url)) {
                    byte[] content = cache.getItem(url).getContent();
                    Map<String,String> header = new HashMap<String,String>(); // this info is lost; it should be stored in the cache along with the page contents.
                    boolean success = true;
                    boolean recoverable = true;

     */
    @SuppressWarnings("unchecked")
    @Override
    protected void internalProcess(FetchDocument doc) {

        Page page = doc.getPage();
        if (null == page) {
            logger.warn("Page is null. Ignoring this document.");
            return;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Doc has tags: "+doc.getTags().toString());
        }
        if (doc.hasTag(EMIT_DOC_TAG)) {
            addToIndex(doc);
        } else {
            if (page.isEmitted()) {
                deleteFromIndex(page);
            }
        }
    }

        Map<String,Object> indexableAttributes = doc.getIndexableAttributes();

        // build xml doc
        org.dom4j.Document dom = DocumentHelper.createDocument();
        Element root = dom.addElement("documentAdd");
        Page page = doc.getPage();
        String text = doc.getText();
        String url = page.getUrl();
        String host = getHost(url);
        String title = doc.getTitle(titleLengthLimit);
        String tokenizedHost = tokenizeHost(host);
        String anchorText = getAnchorText(page);

        float categoryBoost = calculateCategoryBoost(attributes);
        float pagerankBoost = calculatePagerankBoost(page);
        float spamrankBoost = calculateSpamrankBoost(page);
        float logBoost = calculateLogBoost(page);
        float freshnessBoost = calculateFreshnessBoost(page);

        // add overall score
        float f1 = factor("category",categoryBoost,categoryBoostDamp);
        float f2 = factor("pagerank",pagerankBoost,pagerankBoostDamp);
        float f3 = factor("spamrank",spamrankBoost,spamrankBoostDamp);
        float f4 = factor("log",logBoost,logBoostDamp);
        float f5 = factor("freshness",freshnessBoost,freshnessBoostDamp);
        float f6 = ((Double)attributes.get("boost")).floatValue(); // as calculated by the boost module, or 1.0 if no boost module is defined.
        float boost = f1 * f2 * f3 * f4 * f5 * f6;

        // System.out.println("BOOST url=["+url+"]  category="+f1+" ("+categoryBoost+":"+categoryBoostDamp+")  pagerank="+f2+" ("+pagerankBoost+":"+pagerankBoostDamp+")  log="+f3+" ("+logBoost+":"+logBoostDamp+")  freshness="+f4+" ("+freshnessBoost+":"+freshnessBoostDamp+") moduleBoost="+f5+"  Boost="+boost);

        if (boost < 1e-6f) {
            logger.warn("Boost too low! ("+boost+")  category="+f1+" ("+categoryBoost+":"+categoryBoostDamp+")  pagerank="+f2+" ("+pagerankBoost+":"+pagerankBoostDamp+")  spamrank="+f3+" ("+spamrankBoost+":"+spamrankBoostDamp+")  log="+f4+" ("+logBoost+":"+logBoostDamp+")  freshness="+f5+" ("+freshnessBoost+":"+freshnessBoostDamp+") moduleBoost="+f6);
            boost = 1e-6f;
        }
       
        if (null == title || "".equals(title)) {
            title = "Untitled";
        }

        root.addElement("boost").addText(String.valueOf(boost));
        root.addElement("documentId").addText(getDocumentId(page));

        Map<String,Double> boostMap = (Map<String,Double>)attributes.get("field_boost");

        // add the search fields
        addField(root, "url", url, true, true, true, boostMap);
        addField(root, "site", host, true, true, false, boostMap);
        addField(root, "tokenizedHost", tokenizedHost, false, true, true, boostMap);
        addField(root, "title", title, true, true, true, boostMap);
        addField(root, "text", text, true, true, true, boostMap);
        addField(root, "anchor", anchorText, false, true, true, boostMap);
        addField(root, "crawl", crawlName, false, true, true, boostMap);

        if (sendContent) {
            addBody(root,doc,content);
        }

        // for debugging only
        //addField(root, "boostinfo", "category="+f1+" ("+categoryBoost+":"+categoryBoostDamp+")  pagerank="+f2+" ("+pagerankBoost+":"+pagerankBoostDamp+")  log="+f3+" ("+logBoost+":"+logBoostDamp+")  freshness="+f4+" ("+freshnessBoost+":"+freshnessBoostDamp+") moduleBoost="+f5+"  Boost="+boost, true, false, false, null);

        addAdditionalFields(root, page, boostMap);

        // Adding metainfo from attributes
        Set<Entry<String,Object>> attributeSet = indexableAttributes.entrySet();
        for (Entry<String,Object> attribute : attributeSet) {
            addField(root, attribute.getKey(), attribute.getValue() == null ? "" : attribute.getValue().toString(), true, true, true, boostMap);
        }

        StringBuffer assignedCategories = new StringBuffer();
        if (null != categories) {
            // iterate through the classes the page belongs to, adding each category and its score
            for (Iterator<String> iter = categories.iterator(); iter.hasNext();) {
                assignedCategories.append(iter.next());
                assignedCategories.append(" ");

                // repeat the field a number of times proportional to the score (this is a way to boost the document by category)
                //for (int rep = 0; rep < score*10; rep++) {
                //    addField(root, "categoryBoost", categ, false, true, false);
                //}
            }
            addField(root, "categories", assignedCategories.toString().trim(), true, true, true, boostMap);
        }

        if (logger.isDebugEnabled()) {
            logger.debug("Indexing dom: " + DomUtil.domToString(dom));
        }
        // Send the document to the indexer. If the queue is full, wait and retry.
        try {
            int i = pageMapper.mapPage(page);
            while (indexers[i].index(dom) == IndexerReturnCode.RETRY_QUEUE_FULL) {
                try {
                    Thread.sleep(indexerBusyRetryTime*1000);
                } catch (InterruptedException e) {
                    logger.debug("Sleep interrupted: " + e, e);
                }
            }
            page.setEmitted(true);
        } catch (Exception e) {
            logger.error(e,e);
        }
    }

            } catch (Exception e) {
                logger.error(e,e);
            }
        } else if ("delete".equals(command.toString())) {
            FetchDocument doc = ((CommandWithDoc)command).getDoc();
            Page page = doc.getPage();
            deleteFromIndex(page);
        } else if ("startCycle".equals(command.toString())) {
            PageDB pagedb = ((CommandWithPageDB)command).getPageDB();
            scoreThreshold = new float[11];
            for (int i = 0; i < 11; i++) {

        return doc.getPage().getUrl();
    }
   
    @Override
    protected void internalProcess (FetchDocument doc) {   
        Page page = doc.getPage();
        if (null == page) {
            logger.warn("Fetchdata does not have a page");
            return;
        }
        String text = getDocumentText(doc);

        config.set("discovery.front.size", "0");
        config.set("keep.original.url.on.redirect", "true");

        TestUtils.writeFile(tmpDir+"/web/test.html", "<html><head><title>title</title></head><body>"+TestUtils.randomText(25,25)+"</body></html>");

        Page out, in;
        in = PageTest.randomPage();
        in.setUrl(url);

        WebServer server = null;
        Crawler crawler = null;
        try {
            server = new WebServer(8085);
            server.addResourceHandler("/", tmpDir+"/web");
            server.start();

            PageDB db = new PageDB(tmpDir+"/testdb");
            db.open(PageDB.WRITE);
            db.addPage(in);
            db.close();

            crawler = new Crawler();

            int tries = 0;
            int maxTries = 10;
            do {
                tries++;

                crawler.crawl(1);

                db.open(PageDB.READ);
                Iterator<Page> pages = db.iterator();
                assertTrue("The crawler lost or discarded the test page", pages.hasNext());
                out = pages.next();
                assertFalse("The crawler has more than the test page", pages.hasNext());
                db.close();
            } while (out.getRetries() > 0 && tries <= maxTries);

        } finally {
            if (null != crawler) {
                crawler.cleanup();
            }
            if (null != server) { // the server may be null if it failed to start
                server.requestStop();
                while (!server.isStopped()) {
                    Execute.sleep(20);
                }
            }
        }

        assertTrue("Test page url changed", in.getUrl().equals(out.getUrl()));
        assertTrue("Test hotspot page distance is not 0", out.getDistance() == 0);
        assertTrue("Test page retries is not 0", out.getRetries() == 0);
        assertTrue("Test page fetch time is off by more than one minute", Math.abs(System.currentTimeMillis() - out.getLastAttempt()) <= 1000*60*60);
        assertTrue("Test page success time is off by more than one minute", Math.abs(System.currentTimeMillis() - out.getLastSuccess()) <= 1000*60*60);
        assertTrue("Test page change time is off by more than one minute", Math.abs(System.currentTimeMillis() - out.getLastChange()) <= 1000*60*60);
        assertTrue("Test page score changed", in.getScore() == out.getScore());
        assertTrue("Test page url hash changed", in.getUrlHash().equals(out.getUrlHash()));
        assertTrue("Test page content hash did not change", ! in.getSignature().equals(out.getSignature()));
        assertTrue("Test page doesn't know it has been emitted", out.isEmitted());
        String[] anchorsIn = in.getAnchors();
        String[] anchorsOut = out.getAnchors();
        Arrays.sort(anchorsIn);
        Arrays.sort(anchorsOut);
        assertTrue("Test page anchors changed (in=["+Arrays.toString(anchorsIn)+"] out=["+Arrays.toString(anchorsOut)+"])", Arrays.equals(anchorsIn, anchorsOut));
    }

        TestUtils.writeFile(tmpDir+"/web/page two.html", "content");
       
        WebServer server = null;
        Crawler crawler = null;

        Page in, one, two;
        in = PageTest.randomPage();
        in.setUrl(url);
       
        try {
            server = new WebServer(8087);
            server.addResourceHandler("/", tmpDir+"/web");
            server.start();

        cbv = new ConstantBoostValue(config);
    }

    @TestInfo(testType = TestInfo.TestType.UNIT)
    public void testHasValue() throws Exception {
        FetchDocument doc = new FetchDocument(new Page("",1f));
        assertTrue(cbv.hasValue(doc));
    }
