Package bixo.datum

Examples of bixo.datum.ScoredUrlDatum
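
ScoredUrlDatum wraps a Cascading Tuple that carries a URL together with its grouping key, fetch status, score, and an optional payload. As a minimal sketch of the API, using only constructors and accessors that appear in the snippets below (the grouping-key value is illustrative):

    // Build a datum the way the scoring code below does, then round-trip it
    // through its underlying Cascading tuple.
    ScoredUrlDatum datum = new ScoredUrlDatum("http://example.com/page",
                                              "example.com-key",   // illustrative grouping key
                                              UrlStatus.UNFETCHED,
                                              1.0);
    String url = datum.getUrl();
    String groupKey = datum.getGroupKey();
    Tuple tuple = datum.getTuple();               // underlying Cascading tuple
    ScoredUrlDatum roundTripped = new ScoredUrlDatum(tuple);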


    // Mockito matcher that accepts any Tuple whose ScoredUrlDatum carries the
    // "blocked by robots.txt" grouping key.
    private static class MatchBlockedByRobotsKey extends ArgumentMatcher<Tuple> {

        @Override
        public boolean matches(Object argument) {
            ScoredUrlDatum datum = new ScoredUrlDatum((Tuple)argument);
            return datum.getGroupKey().equals(GroupingKey.BLOCKED_GROUPING_KEY);
        }
    }
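
In a test, a matcher like this would typically be handed to Mockito's argThat() when verifying emitted tuples. A hedged sketch (the mocked TupleEntryCollector named collector is assumed):

    // Verify the mocked collector received a tuple carrying the blocked key.
    verify(collector).add(argThat(new MatchBlockedByRobotsKey()));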


    @Override
    public void run() {
        String redirectedUrl = _url;

        try {
            FetchedDatum fd = _fetcher.get(new ScoredUrlDatum(_url));
            redirectedUrl = fd.getFetchedUrl();
            LOGGER.debug(String.format("No redirection of %s to %s", _url, redirectedUrl));
        } catch (RedirectFetchException e) {
            // We'll get this exception if a link-shortening site redirects us to
            // a URL that is itself redirected again.
            // ... (remainder of the handler elided)
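
A minimal sketch of the same link-resolution idea as a standalone helper, using only calls visible in this snippet (BaseFetcher.get() and FetchedDatum.getFetchedUrl(); the method name resolveUrl is hypothetical):

    // Resolve a possibly-shortened URL to its final location.
    public static String resolveUrl(BaseFetcher fetcher, String url) {
        try {
            FetchedDatum fd = fetcher.get(new ScoredUrlDatum(url));
            return fd.getFetchedUrl();  // final URL after any redirects
        } catch (BaseFetchException e) {
            return url;                 // fall back to the original URL
        }
    }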

    /**
     * Drain the queue, emitting each URL as an unfetched ScoredUrlDatum with
     * the given grouping key.
     *
     * @param urls queue of grouped URLs to emit
     * @param groupingKey grouping key to assign to every emitted datum
     * @param collector tuple output collector
     * @param process flow process, used when cloning tuples for the platform
     */
    public static void emptyQueue(Queue<GroupedUrlDatum> urls, String groupingKey, TupleEntryCollector collector, FlowProcess process) {
        GroupedUrlDatum datum;
        while ((datum = urls.poll()) != null) {
            ScoredUrlDatum scoreUrl = new ScoredUrlDatum(datum.getUrl(), groupingKey, UrlStatus.UNFETCHED, 1.0);
            scoreUrl.setPayload(datum.getPayload());
            // TODO KKr - move synchronization up, to avoid lots of contention with other threads?
            synchronized (collector) {
                collector.add(BixoPlatform.clone(scoreUrl.getTuple(), process));
            }
        }
    }

    /**
     * Fetch and parse the robots.txt file at the given URL.
     *
     * @param fetcher fetcher used to download the robots.txt file
     * @param parser parser used to interpret its content
     * @param robotsUrl full URL of the robots.txt file
     * @return the parsed robot rules
     */
    public static BaseRobotRules getRobotRules(BaseFetcher fetcher, BaseRobotsParser parser, URL robotsUrl) {
        try {
            String urlToFetch = robotsUrl.toExternalForm();
            ScoredUrlDatum scoredUrl = new ScoredUrlDatum(urlToFetch);
            FetchedDatum result = fetcher.get(scoredUrl);

            // HACK! DANGER! Some sites redirect the robots.txt request to their
            // top-level domain page instead of returning a 404. So look for a
            // response that was redirected and whose fetched content is not plain
            // text, and assume it's one of these cases...
            // ... (remainder elided)
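
Putting it together, a hedged usage sketch (only calls that appear in these snippets are used; fetcher and parser are assumed to be configured instances, and the call assumes the class defining getRobotRules() is in scope):

    URL robotsUrl = new URL("http://example.com/robots.txt");
    BaseRobotRules rules = getRobotRules(fetcher, parser, robotsUrl);
    if (rules.isAllowed("http://example.com/some/page.html")) {
        // safe to queue the page for fetching
    }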

                }

                // Use the same key for every URL from this domain
                GroupedUrlDatum datum;
                while ((datum = _urls.poll()) != null) {
                    ScoredUrlDatum scoreUrl;
                    FetchCounters counter;
                    String url = datum.getUrl();

                    if (isDeferred) {
                        counter = FetchCounters.URLS_DEFERRED;
                        scoreUrl = new ScoredUrlDatum(url, GroupingKey.DEFERRED_GROUPING_KEY, UrlStatus.SKIPPED_DEFERRED, 0.0);
                    } else if (!robotRules.isAllowed(url)) {
                        counter = FetchCounters.URLS_BLOCKED;
                        scoreUrl = new ScoredUrlDatum(url, GroupingKey.BLOCKED_GROUPING_KEY, UrlStatus.SKIPPED_BLOCKED, 0.0);
                    } else {
                        double score = _scorer.generateScore(domain, pld, datum);
                        if (score == BaseScoreGenerator.SKIP_SCORE) {
                            counter = FetchCounters.URLS_SKIPPED;
                            scoreUrl = new ScoredUrlDatum(url, GroupingKey.SKIPPED_GROUPING_KEY, UrlStatus.UNFETCHED, score);
                        } else {
                            counter = FetchCounters.URLS_ACCEPTED;
                            scoreUrl = new ScoredUrlDatum(url, validKey, UrlStatus.UNFETCHED, score);
                        }
                    }
                   
                    scoreUrl.setPayload(datum.getPayload());
                    _flowProcess.increment(counter, 1);

                    // collectors aren't thread safe
                    synchronized (_collector) {
                        _collector.add(BixoPlatform.clone(scoreUrl.getTuple(), _flowProcess));
                    }
                }
            }
        } catch (UnknownHostException e) {
            LOGGER.debug("Unknown host: " + _protocolAndDomain);

        try {
            // TODO KKr - when fetching the last item, send a Connection: close
            // header to let the server know it doesn't need to keep the socket open.
            Iterator<ScoredUrlDatum> iter = _items.iterator();
            while (!Thread.interrupted() && iter.hasNext()) {
                ScoredUrlDatum item = iter.next();
                FetchedDatum result = new FetchedDatum(item);
               
                // We use status as an extra field on the end of the FetchedDatum.
                // It lets us generate a full status pipe, plus a content pipe that
                // only has entries which were actually fetched. By keying off the
                // type (String == OK, BaseFetchException == failure) the FetchPipe
                // can do this magic.
                Comparable status = null;

                long fetchStartTime = System.currentTimeMillis();
               
                try {
                    process.increment(FetchCounters.URLS_FETCHING, 1);
                    result = _httpFetcher.get(item);
                    long deltaTime = System.currentTimeMillis() - fetchStartTime;

                    process.increment(FetchCounters.FETCHED_TIME, (int)deltaTime);
                    process.increment(FetchCounters.URLS_FETCHED, 1);
                    process.increment(FetchCounters.FETCHED_BYTES, result.getContentLength());
                    process.setStatus(Level.SLF4J_TRACE, "Fetched " + result);

                    status = UrlStatus.FETCHED.toString();
                   
                    // TODO - check keep-alive response (if present), and close the connection/delay
                    // for some amount of time if we exceed this limit.
                } catch (AbortedFetchException e) {
                    LOGGER.info("Aborted while fetching " + item.getUrl() + " due to " + e.getAbortReason());
                    if (e.getAbortReason() == AbortedFetchReason.INTERRUPTED) {
                        process.increment(FetchCounters.URLS_SKIPPED, 1);
                       
                        // Make sure our loop terminates.
                        Thread.currentThread().interrupt();
                    } else {
                        process.increment(FetchCounters.URLS_FAILED, 1);
                    }
                   
                    status = (Comparable)e;
                } catch (BaseFetchException e) {
                    LOGGER.info("Fetch exception while fetching " + item.getUrl(), e);
                    process.increment(FetchCounters.URLS_FAILED, 1);

                    // We can do this because each of the concrete subclasses of BaseFetchException implements
                    // WritableComparable
                    status = (Comparable)e;
                } catch (Exception e) {
                    LOGGER.warn("Unexpected exception while fetching " + item.getUrl(), e);

                    process.increment(FetchCounters.URLS_FAILED, 1);
                    status = new IOFetchException(item.getUrl(), new IOException(e));
                } finally {
                    process.decrement(FetchCounters.URLS_FETCHING, 1);

                    Tuple tuple = result.getTuple();
                    tuple.add(status);
                    _fetchMgr.collect(tuple);

                    // Figure out how long it's been since the start of the request.
                    long fetchInterval = System.currentTimeMillis() - fetchStartTime;

                    // We want to avoid fetching faster than a maximum acceptable
                    // rate. Note that we always do this, even when there's no next
                    // page, so the setting has an effect even if the next fetch set
                    // is ready right away.
                    if (fetchInterval < minPageFetchInterval) {
                        long delay = minPageFetchInterval - fetchInterval;
                        LOGGER.trace(String.format("FetchTask: sleeping for %dms", delay));

                        try {
                            Thread.sleep(delay);
                        } catch (InterruptedException e) {
                            LOGGER.warn("FetchTask interrupted!");
                            Thread.currentThread().interrupt();
                            continue;
                        }
                    }
                }
            }
           
            // While we still have entries, we need to write them out to avoid losing them.
            while (iter.hasNext()) {
                ScoredUrlDatum item = iter.next();
                FetchedDatum result = new FetchedDatum(item);
                process.increment(FetchCounters.URLS_SKIPPED, 1);
                AbortedFetchException status = new AbortedFetchException(item.getUrl(), AbortedFetchReason.INTERRUPTED);
               
                Tuple tuple = result.getTuple();
                tuple.add(status);
                _fetchMgr.collect(tuple);
            }
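
The throttling in the finally block above reduces to a simple pattern; a self-contained sketch using only JDK calls (fetchOnePage() is a hypothetical placeholder):

    // Never start two fetches less than minPageFetchInterval ms apart.
    long fetchStartTime = System.currentTimeMillis();
    fetchOnePage();  // placeholder for the actual fetch
    long fetchInterval = System.currentTimeMillis() - fetchStartTime;
    if (fetchInterval < minPageFetchInterval) {
        try {
            Thread.sleep(minPageFetchInterval - fetchInterval);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();  // preserve interrupt status
        }
    }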

            return "special grouping key";
        }

        @Override
        public boolean isLHS(TupleEntry tuple) {
            ScoredUrlDatum datum = new ScoredUrlDatum(tuple);
            return GroupingKey.isSpecialKey(datum.getGroupKey());
        }

        }

        @SuppressWarnings("rawtypes")
        @Override
        public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
            ScoredUrlDatum sd = new ScoredUrlDatum(funcCall.getArguments());
           
            String key = sd.getGroupKey();
            if (!GroupingKey.isSpecialKey(key)) {
                throw new RuntimeException("Can't make skipped status for regular grouping key: " + key);
            }
           
            // Note: here we share the payload of the ScoredUrlDatum with the
            // StatusDatum we're about to emit, but since we drop our reference as
            // soon as we emit, this sharing shouldn't be an issue.
            StatusDatum status = new StatusDatum(sd.getUrl(), GroupingKey.makeUrlStatusFromKey(key), sd.getPayload());
            status.setPayload(sd);
           
            funcCall.getOutputCollector().add(BixoPlatform.clone(status.getTuple(), process));
        }
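
The GroupingKey helpers used here also appear in the scoring snippet above; a small sketch of the mapping (the exact status returned is presumed from the pairing of keys and statuses in that snippet):

    // Map a special grouping key back to a UrlStatus, as the function above does.
    String key = GroupingKey.BLOCKED_GROUPING_KEY;
    if (GroupingKey.isSpecialKey(key)) {
        UrlStatus status = GroupingKey.makeUrlStatusFromKey(key);  // presumably SKIPPED_BLOCKED
    }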

        TupleEntryCollector collector = buffCall.getOutputCollector();

        PartitioningKey newKey = new PartitioningKey(key, _numReduceTasks);
       
        while (safeHasNext()) {
            ScoredUrlDatum scoredDatum = new ScoredUrlDatum(new TupleEntry(values.next()));
            FetchSetInfo setInfo = _policy.nextFetchSet(scoredDatum);
            if (setInfo != null) {
                FetchSetDatum result = makeFetchSetDatum(setInfo, newKey, safeHasNext());
                collector.add(BixoPlatform.clone(result.getTuple(), process));
            }

        policy.startFetchSet("groupingKey", crawlDelay);
       
        // Should be nothing yet.
        assertNull(policy.endFetchSet());
       
        assertNull(policy.nextFetchSet(new ScoredUrlDatum("url1")));
       
        FetchSetInfo setInfo = policy.nextFetchSet(new ScoredUrlDatum("url2"));
        assertNotNull(setInfo);
        assertEquals(2, setInfo.getUrls().size());
        assertEquals("url1", setInfo.getUrls().get(0).getUrl());
        assertEquals("url2", setInfo.getUrls().get(1).getUrl());
        assertEquals(crawlDelay * 2, setInfo.getFetchDelay());
       
        FetchSetInfo setInfo2 = policy.nextFetchSet(new ScoredUrlDatum("url3"));
        assertNotNull(setInfo2);
        assertEquals(1, setInfo2.getUrls().size());
        assertEquals("url3", setInfo2.getUrls().get(0).getUrl());
       
        assertNull(policy.endFetchSet());
