Package com.flaptor.hounder.crawler.pagedb

Examples of com.flaptor.hounder.crawler.pagedb.Page
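The excerpts below come from the Hounder crawler and show Page objects being created, scored, linked to their parents, and written into a PageDB. As a minimal sketch of the pattern they share (the PageDB constructor argument and the close() call are assumptions; only open(), addPage() and the Page setters actually appear in the excerpts):

    import com.flaptor.hounder.crawler.pagedb.Page;
    import com.flaptor.hounder.crawler.pagedb.PageDB;

    // A minimal sketch of the pattern shared by the excerpts below, assuming
    // the Page(url, score) constructor and the PageDB open/addPage calls shown
    // there; this is an illustration, not code from the Hounder sources.
    public class PageDBSketch {
        public static void main(String[] args) throws Exception {
            PageDB pagedb = new PageDB("tmp-pagedb"); // hypothetical path argument
            pagedb.open(PageDB.WRITE + PageDB.UNSORTED);

            Page page = new Page("http://example.com/", 1.0f);
            page.setRetries(0);        // never attempted yet
            page.setLastAttempt(0L);
            page.setLastSuccess(0L);
            page.setDistance(0);       // a seed page, distance 0 from the hotspots
            page.addAnchor("example"); // anchor text of the link that discovered it

            pagedb.addPage(page);
            pagedb.close();            // assumed to exist; the excerpts only show open/addPage
        }
    }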


        }
       
        public void run() {
            try {
                boolean wasHotspot = false;
                Page page = doc.getPage();
                String pageurl = page.getUrl();
                if (!"main".equals(Thread.currentThread().getName())) {
                    Thread.currentThread().setName("FetchdataProcessorJob("+pageurl+")");
                }
               
                String text = doc.getText(100);
                String title = doc.getTitle(100);
                Link[] links = doc.getLinks();
                String[] anchors = page.getAnchors();
                boolean success = doc.success()
                        && (   (null != title && title.length() > 0)
                            || (null != text && text.length() > 0)
                            || (null != anchors && anchors.length > 0)
                            || (null != links && links.length > 0)
                           );

                if (!success) { // the page could not be fetched

                    logger.debug("  page " + pageurl + " could not be fetched");
                    boolean keep = false;
                    if (doc.recoverable()) {
                        if (doc.internalError()) { // the recoverable error is our own fault
                            keep = true;
                        } else {
                            page.setRetries(page.getRetries() + 1);
                            if (!PageDBTrimmer.tooManyRetries(page)) {
                                keep = true;
                            }
                        }
                    } else {
                        logger.debug("  discarding page " + pageurl);
                    }

                    if (keep) {
                        // if the page is to be kept, store it
                        newPageDB.addPage(page);
                    } else {
                        // otherwise announce it to the modules so they can take appropriate action
                        CommandWithDoc cmd = new CommandWithDoc("delete", doc);
                        modules.applyCommand(cmd);
                    }

                } else { // the page could be fetched.

                    page.setRetries(0); // the page has been successfully fetched, so no retries.
                    if (page.getLastSuccess() == 0) {
                        page.setLastChange(page.getLastAttempt()); // first fetch is considered a change
                    }
                    page.setLastSuccess(page.getLastAttempt()); // this is the time of the last successful fetch: now

                    if (hotspots.match(pageurl)) {
                        doc.setTag(IS_HOTSPOT);
                        wasHotspot = true;
                    }

                    // send it to modules manager
                    modules.process(doc);

                    // propagate the antiscore back to its parents
                    float antiScore = page.getAntiScore();
                    if ((antiScore > 0f) && recordParents) {
                        int numParents = page.getParents().length;
                        for (String parentUrl : page.getParents()) {
                            Page badParent = new Page(parentUrl, -1.0f);
                            badParent.setAntiScore(PageRank.parentContribution(antiScore, numParents));
                            newPageDB.addPage(badParent);
                        }
                    }
                   
                    if (null != links) {
                        // Now add the page's outlinks to the next pagedb,
                        // so they can be fetched in the next cycle
                        if (links.length == 0) {
                            // We need to avoid dangling nodes.
                            // A simple way is to add a link to itself
                            links = new Link[1];
                            links[0] = new Link(pageurl, "");
                        }
                        for (Link link : links) {
                            try {
                                if (!(page.getDistance() > maxDistance && pageurl.equals(link.getUrl()))) { // don't add self-links on a discovery front page
                                    if (Crawler.urlFilter(link.getUrl()) != null) { // if the url is a valid web page (not an image, etc)
                                        logger.debug("    Adding link to " + link + " to the pagedb");
                                        Page child = new Page(CleanURL(link.getUrl()), 1.0f);
                                        child.setRetries(0);
                                        child.setLastAttempt(0L);
                                        child.setLastSuccess(0L);
                                        if (recordParents) {
                                            child.addParent(pageurl);
                                        }
                                        child.addAnchor(link.getAnchor()); // at this point there can be only one anchor
                                        child.setScore(PageRank.parentContribution(page.getScore(), links.length));
                              
                                        // unless the child is a hotspot, it ends up one level farther from the fetched page
                                        child.setDistance(page.getDistance() + 1);

                                        if (!hotspots.matchAll() && (maxDistance == 0)) {
                                            // If hotspots is "*", all links are set at distance>0 and the trimmer
                                            // will keep those that make it into the front line.
                                            // If hotspots restricts the crawl and maxDistance is 0, we want to make
                                            // sure the distance is 0 when the child is a hotspot, so it will not be
                                            // dropped by the trimmer.
                                            // TODO: check if this is true, or if the presence of a frontline should
                                            // be checked before setting distances to 0.
                                            if (hotspots.match(link.getUrl())) {
                                                child.setDistance(0);
                                                logger.debug("    child hotspot: " + link);
                                            } else {
                                                logger.debug("    child not hotspot: " + link);
                                            }

                                        }
                                        newPageDB.addPage(child);
                                        if (child.getDistance() > maxDistance) {
                                            newDiscoveryPage();
                                        }
                                    } else {
                                        logger.debug("    Dropping uninteresting url " + link);
                                    }
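Both the score and the anti-score propagate through PageRank.parentContribution(value, fanout), seen above. The excerpts never show its body; a plausible stand-in, assuming a classic damped split of the parent's value among its outlinks (not the actual Hounder formula):

    // Illustrative stand-in only, NOT the Hounder implementation:
    // a damped PageRank-style split of the parent's value among its outlinks.
    public static float parentContribution(float parentValue, int outlinks) {
        final float DAMPING = 0.85f; // assumed damping factor
        return DAMPING * parentValue / Math.max(1, outlinks);
    }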


    }
   
   
    public synchronized void internalProcess (FetchDocument doc) {
        out.println("--------------------------------------------------");
        Page page = doc.getPage();
        if (null == page) {
            out.println("Null Page");
        } else {
            out.println("Url="+page.getUrl());
            logIt(doc);
            if (logText){
                out.println("TEXT="+doc.getText(80));
            }
            if (logEmmited){
                out.println("Emmited="+page.isEmitted());
            }
        }
        out.println("--------------------------------------------------");
        out.flush();
    }

    private boolean fetchPage(String url){
        float page_similarity_threshold=0f;
        float score= 0f;
        int textLengthLimit= 50000;
        try {
            Page page= new Page(url, score, page_similarity_threshold);
            FetchList fetchlist = new FetchList();
            fetchlist.addPage(page);
            fetchlist.close();
            FetchData fetchdata = fetcher.fetch(fetchlist);
            boolean fetched = false;

        float score= 0f;
        int textLengthLimit= 50000;
        try {
            FetchList fetchlist = new FetchList();
            for (String url: urls){
                Page page= new Page(url, score, page_similarity_threshold);               
                fetchlist.addPage(page);
            }
            fetchlist.close();
            FetchData fetchdata = fetcher.fetch(fetchlist);
            for (FetchDocument doc : fetchdata) {

    }

    //    @Override
    public void internalProcess (FetchDocument doc) {
        try {
            Page page = doc.getPage();
            float spamValue = 0;
           
            if (titleSpamActive) spamValue += titleSpamValue(doc.getTitle());
            if (urlMatchSpamActive) spamValue += urlMatchSpamValue(page);

            page.setAntiScore(spamValue);
        } catch (NullPointerException e) {
            logger.error(e,e);
        }
    }

        Execute.close(patterns);
    }
   
    @Override
    public Boolean tfInternalProcess (FetchDocument doc) {
        Page page = doc.getPage();
        if (null == page) {
            logger.warn("Page is null. Ignoring document.");
            return null;
        }
        return patterns.match(page.getUrl());
    }

        Execute.close(patterns);
    }


    public Boolean tfInternalProcess(FetchDocument doc) {
        Page page = doc.getPage();
        if (null == page) {
            logger.warn("Page is null. Ignoring document.");
            return false;
        }
        String url = page.getUrl();

 
        Set<String> foundCategories = patterns.getTokens(url);
        boolean foundSomething = (null != foundCategories);
        if (foundSomething) {

    }


    @Override
    public Boolean tfInternalProcess (FetchDocument doc) {
        Page page = doc.getPage();
        if (null == page) {
            logger.warn("Page is null. Ignoring document.");
            return null;
        }
        boolean patternMatched = false; // for debugging/logging
        if (patterns.match(page.getUrl())) {
            doc.setTag(IS_HOTSPOT_TAG);
            patternMatched= true;
        }
        logger.debug("  " + (patternMatched?"":"not ") + "hotspot: "+ page.getUrl());
        return patternMatched;
    }

     * @param origPageDB the input pagedb.
     * @param destPageDB the output pagedb that will hold the trimmed result.
     * @param progress the progress tracker used to report trimming progress.
     */
    public void trimPageDB (PageDB origPageDB, PageDB destPageDB, CrawlerProgress progress) throws IOException, MalformedURLException {
        Page bestPage;
        bestPage = new Page("",0);
        bestPage.setDistance(Integer.MAX_VALUE);
        bestPage.setRetries(0);
        bestPage.setLastAttempt(0);
        bestPage.setLastSuccess(0);
        bestPage.setLastChange(0);
        bestPage.setPriority(-Float.MAX_VALUE);
        bestPage.setEmitted(false);
        bestPage.setSignature(new TextSignature(""));
        bestPage.setAntiScore(0);
        boolean hasAntiScore = false;
        boolean unfetched = false;
        int inlinks = 0;


        logger.debug("Trimming the pagedb");
        origPageDB.open(PageDB.READ);
        destPageDB.open(PageDB.WRITE + PageDB.UNSORTED);
        destPageDB.setSameCycleAs(origPageDB);
        long dbSize = origPageDB.getSize();
        long dbFetchedSize = origPageDB.getFetchedSize();
        if (null != progress) {
            progress.startTrim(dbSize);
            origPageDB.setProgressHandler(progress);
        }
        int counter = 0;

        PageRank pageRank = new PageRank(dbSize);
        PageRank badRank = new PageRank(dbSize);
        PageFilter pageFilter = new PageFilter(maxDistance, maxRetries, dbFetchedSize, discoveryFrontSize,
                (null != progress) ? progress.discovered() : 0); // progress may be null, see the check above

        // This code produces one page for each block of same-url pages.
        // The produced page has the best properties of the block.
        // Unfetched pages in the block contribute to the pagerank of the
        // resulting page. If there are no unfetched pages in the block,
        // the fetched page is simply copied.

        for (Page page : origPageDB) {

            if (!Crawler.running()) break;
           
            if (null != progress) {
                counter++;
                if (counter >= 1000) {
                    progress.addTrimmed(counter);
                    progress.report();
                    counter = 0;
                }
            }


            // get data for this page
            String url = page.getUrl().toLowerCase();
            logger.debug("  reading page " + url);
            long lastAttempt = page.getLastAttempt();
            long lastSuccess = page.getLastSuccess();
            long lastChange = page.getLastChange();

            if (url.equals(bestPage.getUrl().toLowerCase())) { // both pages have the same url
                logger.debug("    same page, will keep reading.");

                if (page.getScore() < 0) {
                    // this is not a real link but a back-propagation vector for the anti-score
                    hasAntiScore = true;
                    badRank.addContribution(page.getAntiScore());
                   
                } else {

                    // add the anchor to the list of anchors of this block
                    bestPage.addAnchors(page.getAnchors());
                    // add the urls of the pages linking to this block
                    bestPage.addParents(page.getParents());

                    // if this page has not been fetched, mark the block as unfetched,
                    // add its score to the block score and count it as an incoming link
                    if (lastSuccess == 0L) {
                        unfetched = true;
                        pageRank.addContribution(page.getScore());
                        inlinks++;
                    }

                    // keep the shortest distance
                    int distance = page.getDistance();
                    if (distance < bestPage.getDistance()) {
                        bestPage.setDistance(distance);
                    }

                    // keep the latest fetch
                    if (lastAttempt > bestPage.getLastAttempt()) {
                        bestPage.setLastAttempt(lastAttempt);
                    }

                    // keep the latest success
                    if (lastSuccess > bestPage.getLastSuccess()) {
                        bestPage.setLastSuccess(lastSuccess);
                    }

                    // keep the latest change
                    if (lastChange > bestPage.getLastChange()) {
                        bestPage.setLastChange(lastChange);
                    }

                    // keep the least retries
                    int retries = page.getRetries();
                    if (lastSuccess < lastAttempt || lastSuccess == 0) {
                        // if this page has not been successfully fetched keep the most retries
                        // (one will be for the actual attempt, the rest will be unattempted links)
                        if (retries > bestPage.getRetries()) {
                            bestPage.setRetries(retries);
                        }
                    }
                    // keep the old priority, antiscore, hash and emitted
                    if (lastSuccess > 0) {
                        bestPage.setSignature(page.getSignature());
                        bestPage.setEmitted(page.isEmitted());
                        bestPage.setPriority(page.getPriority());
                        bestPage.setAntiScore(page.getAntiScore());
                    }

                }
               
            } else { // The page is not a duplicate

                if (bestPage.getUrl().length() > 0) {
                    // if this is not the first page, write the best of the last block of similar pages
                    logger.debug("    new page, will write previous one: " + bestPage.getUrl());
                    bestPage.setNumInlinks(inlinks);
                    if (unfetched) {
                        bestPage.setScore(pageRank.getPageScore());
                    }
                    if (hasAntiScore) {
                        // the new antiscore is the average of the original value and the children's contributions
                        float antiScore = (badRank.getPageScore() + bestPage.getAntiScore()) / 2f;
                        bestPage.setAntiScore(antiScore);
                    }
                    if (pageFilter.shouldWrite (destPageDB, bestPage)) {
                        updatePriority(bestPage);
                        destPageDB.addPage(bestPage);
                    }
                }
                // this page starts a new block of similar pages, record its properties
                bestPage.setUrl(page.getUrl());
                bestPage.setDistance(page.getDistance());
                bestPage.setLastAttempt(page.getLastAttempt());
                bestPage.setLastSuccess(page.getLastSuccess());
                bestPage.setLastChange(page.getLastChange());
                bestPage.setRetries(page.getRetries());
                bestPage.setAnchors(page.getAnchors());
                bestPage.setParents(page.getParents());
                bestPage.setScore(page.getScore());
                bestPage.setAntiScore(page.getAntiScore());
                bestPage.setPriority(page.getPriority());
                bestPage.setSignature(page.getSignature());
                bestPage.setEmitted(page.isEmitted());
                unfetched = (page.getLastSuccess() == 0L);
                hasAntiScore = (page.getScore() < 0);
                inlinks = 0;
                pageRank.reset();
                badRank.reset();
                if (hasAntiScore) {
                    badRank.addContribution(page.getAntiScore());
                } else if (unfetched) {
                    pageRank.addContribution(page.getScore());
                    inlinks++;
                }
            }
        }
        if (null != progress) {
            progress.addTrimmed(counter);
            progress.report();
        }
        if (bestPage.getUrl().length() > 0) {
            // if the orig pagedb is not empty, write the best of the last block of similar pages
            logger.debug("    pagedb is over, will write last one: " + bestPage.getUrl());
            bestPage.setNumInlinks(inlinks);
            if (unfetched) {
                bestPage.setScore(pageRank.getPageScore());
            }
            if (hasAntiScore) {
                // the new antiscore is the average of the original value and the children's contributions
                float antiScore = (badRank.getPageScore() + bestPage.getAntiScore()) / 2f;
                bestPage.setAntiScore(antiScore);
            }
            if (pageFilter.shouldWrite (destPageDB, bestPage)) {
                updatePriority(bestPage);
                destPageDB.addPage(bestPage);
            }
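To make the merge rule concrete, take a hypothetical block of three entries for the same URL (numbers invented for illustration): a fetched copy with score 0.5 and distance 2, and two unfetched link records with scores 0.3 and 0.2 and distances 1 and 3. The trimmer would emit a single page with distance min(2, 1, 3) = 1, two inlinks, a score computed by pageRank from the 0.3 and 0.2 contributions, and the signature, priority and emitted flag of the fetched copy.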

     * Check the parsed text against the list of filter words.
     * If any word matches, return true, otherwise return false.
     */
    @Override
    public Boolean tfInternalProcess (FetchDocument doc) {
        Page page = doc.getPage();
        if (null == page) {
            logger.warn("Page is null. Ignoring document.");
            return null;
        }
        return phrases.matchText(doc.getText().toLowerCase());
