Package: bixo.datum

Examples of bixo.datum.UrlStatus


    @Override
    public void operate(FlowProcess flowProcess, FunctionCall<NullContext> funcCall) {
        UrlDatum datum = new UrlDatum(funcCall.getArguments());
        Long lastFetched = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
        Long lastUpdated = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_UPDATED_FIELD);
        UrlStatus status = UrlStatus.valueOf((String) (datum.getPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD)));
        Integer crawlDepth = (Integer) datum.getPayloadValue(CrawlDbDatum.CRAWL_DEPTH);

        CrawlDbDatum crawldbDatum = new CrawlDbDatum(datum.getUrl(), lastFetched, lastUpdated, status, crawlDepth);

        funcCall.getOutputCollector().add(crawldbDatum.getTuple());
View Full Code Here


            totalEntries += 1;
   
            String statusLine = entry.getString("line");
            String[] pieces = statusLine.split("\t");
            int pos = StatusDatum.FIELDS.getPos(StatusDatum.STATUS_FN);
            UrlStatus status = UrlStatus.valueOf(pieces[pos]);
            statusCounts[status.ordinal()] += 1;
        }
       
       
        for (int i = 0; i < statusCounts.length; i++) {
          if (statusCounts[i] != 0) {
View Full Code Here

        // [C | S | A | L] [C | S | A | L] [C | S | A | L] [C | S | A | L]

        CrawlDbDatum crawlDbDatum = null;
        StatusDatum statusDatum = null;
        AnalyzedDatum analyzedDatum = null;
        UrlStatus status = null;
        float pageScore = 0;
        float linkScore = 0;

        String url = null;
View Full Code Here

            int numFetched = 0;
            int numPending = 0;
            while (iter.hasNext()) {
                CrawlDbDatum datum = new CrawlDbDatum(iter.next());
                UrlStatus status = datum.getLastStatus();
                int crawlDepth = datum.getCrawlDepth();
                if (datum.getLastFetched() != 0) {
                    numFetched += 1;

                    assertEquals(UrlStatus.FETCHED, status);
                    assertEquals(0, crawlDepth);
                } else {
                    numPending += 1;
                    assertEquals(UrlStatus.UNFETCHED, status);
                    assertEquals(1, crawlDepth);
                }
            }

            assertEquals(1, numFetched);
            assertEquals(10, numPending);

            // Do it one more time, to verify status gets propagated forward.
            curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, baseDirPath, 2);

            flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
            flow.complete();
            // Update crawldb path
            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
            iter = crawldbTap.openForRead(platform.makeFlowProcess());

            numFetched = 0;
            numPending = 0;
            int numDepth0 = 0;
            int numDepth1 = 0;
            int numDepth2 = 0;
            while (iter.hasNext()) {
                CrawlDbDatum datum = new CrawlDbDatum(iter.next());
                UrlStatus status = datum.getLastStatus();
                int depth = datum.getCrawlDepth();

                if (datum.getLastFetched() != 0) {
                    numFetched += 1;
                    assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.FETCHED, status);
View Full Code Here

    }

    @Override
    public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
        StatusDatum datum = new StatusDatum(funcCall.getArguments());
        UrlStatus status = datum.getStatus();
        String url = datum.getUrl();
        long statusTime = datum.getStatusTime();
       
        long fetchTime = statusTime; // Not exactly true... since in some cases we
                    // may not have fetched the url. But because we are sharing this logic
                    // between the JDBCCrawlTool and the DemoCrawlTool, we use the value
                    // of the fetchTime while selecting the "latest" url. Newly added urls
                    // have a fetchTime of 0, so in order to preserve say a SKIPPED status
                    // we set the fetch time here.

        _numCreated += 1;

        UrlDatum urlDatum = new UrlDatum(url);
        urlDatum.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, fetchTime);
        urlDatum.setPayloadValue(CrawlDbDatum.LAST_UPDATED_FIELD, statusTime);
        urlDatum.setPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD, status.name());
        // Don't change the crawl depth here - we do that only in the case of a
        // successful parse
        urlDatum.setPayloadValue(CrawlDbDatum.CRAWL_DEPTH, datum.getPayloadValue(CrawlDbDatum.CRAWL_DEPTH));

        funcCall.getOutputCollector().add(urlDatum.getTuple());
View Full Code Here

        @Override
        // LHS represents unfetched tuples
        public boolean isLHS(TupleEntry tupleEntry) {
            CrawlDbDatum datum = new CrawlDbDatum(tupleEntry);
            UrlStatus status = datum.getLastStatus();
            if (status == UrlStatus.UNFETCHED               
                            || status == UrlStatus.SKIPPED_DEFERRED
                            || status == UrlStatus.SKIPPED_BY_SCORER
                            || status == UrlStatus.SKIPPED_BY_SCORE
                            || status == UrlStatus.SKIPPED_TIME_LIMIT
View Full Code Here

       
        // Skip all URLs that we've got left.
        if (!values.isEmpty()) {
            trace("Found unprocessed URLs");
           
            UrlStatus status = Thread.interrupted() ? UrlStatus.SKIPPED_INTERRUPTED : UrlStatus.SKIPPED_TIME_LIMIT;
           
            while (!values.isEmpty()) {
                FetchSetDatum datum = values.drain();
                List<ScoredUrlDatum> urls = datum.getUrls();
                trace("Skipping %d urls from %s (e.g. %s) ", urls.size(), datum.getGroupingRef(), urls.get(0).getUrl());
View Full Code Here

           
            // Note: Here we share the payload of the FetchedDatum with the
            // StatusDatum we're about to emit, but since we let go after we
            // emit, there shouldn't be an issue with this sharing.
            if (result instanceof String) {
                UrlStatus urlStatus = UrlStatus.valueOf((String)result);
                if (urlStatus == UrlStatus.FETCHED) {
                    status = new StatusDatum(fd.getUrl(), fd.getHeaders(), fd.getHostAddress(), fd.getPayload());
                } else {
                    status = new StatusDatum(fd.getUrl(), urlStatus, fd.getPayload());
                }
View Full Code Here

            throw new RuntimeException("Invalid crawl delay in grouping key: " + key);
        }
    }
   
    public static UrlStatus makeUrlStatusFromKey(String key) {
        UrlStatus status;
       
        if (!isSpecialKey(key)) {
            status = UrlStatus.UNFETCHED;
        } else if (key.equals(GroupingKey.BLOCKED_GROUPING_KEY)) {
            status = UrlStatus.SKIPPED_BLOCKED;
View Full Code Here

TOP

Related Classes of bixo.datum.UrlStatus

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact: coftware@gmail.com.