Package bixo.datum

Examples of bixo.datum.UrlDatum


public class SimpleUrlFilterTest {

    @Test
    public void testValidUrls() {
        BaseUrlFilter urlFilter = new SimpleUrlFilter();
        Assert.assertFalse(urlFilter.isRemove(new UrlDatum("http://domain.com")));
    }
View Full Code Here


    }
   
    @Test
    public void testInvalidUrls() {
        BaseUrlFilter urlFilter = new SimpleUrlFilter();
        Assert.assertTrue("No protocol", urlFilter.isRemove(new UrlDatum("www.domain.com")));
        Assert.assertTrue("Unknown protocol", urlFilter.isRemove(new UrlDatum("mdata://www.domain.com")));
        Assert.assertTrue("Invalid port", urlFilter.isRemove(new UrlDatum("http://www.domain.com:a")));
    }
View Full Code Here

        LOGGER.info("Crawldb datums created : " + _numCreated);
    }

    @Override
    public void operate(FlowProcess flowProcess, FunctionCall<NullContext> funcCall) {
        UrlDatum datum = new UrlDatum(funcCall.getArguments());
        Long lastFetched = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
        Long lastUpdated = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_UPDATED_FIELD);
        UrlStatus status = UrlStatus.valueOf((String) (datum.getPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD)));
        Integer crawlDepth = (Integer) datum.getPayloadValue(CrawlDbDatum.CRAWL_DEPTH);

        CrawlDbDatum crawldbDatum = new CrawlDbDatum(datum.getUrl(), lastFetched, lastUpdated, status, crawlDepth);

        funcCall.getOutputCollector().add(crawldbDatum.getTuple());
        _numCreated++;
    }
View Full Code Here

            _parser.parse(is, _handler, metadata, new ParseContext());

            // _ids now has a list of the mailbox IDs that we use to create URLs.
            for (String id : _ids) {
              String url = String.format("%s/%s.mbox", fetchedDatum.getUrl(), id);
              UrlDatum datum = new UrlDatum(url);
              functionCall.getOutputCollector().add(datum.getTuple());
            }
          } catch (Exception e) {
        LOGGER.error("Exception parsing mod_mbox page", e);
      }
        }
View Full Code Here

                        "-(?i)^http://([a-z0-9]*\\.)*foo.com/bar/",
                        "+(?i)^http://([a-z0-9]*\\.)*foo.com/accept",
        };

        RegexUrlFilter filter = new RegexUrlFilter(patterns);
        UrlDatum datum = new UrlDatum("http://my.foo.com/accept");
        assertFalse(filter.isRemove(datum));

        datum.setUrl("http://my.foo.com/bar/");
        assertTrue(filter.isRemove(datum));

        datum.setUrl("http://my.foo.com/accept/shouldstillberemoved.gif");
        assertTrue(filter.isRemove(datum));

    }
View Full Code Here

        List<String> defaultUrlFilterPatterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
        String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + "foo.com";
        defaultUrlFilterPatterns.add(domainPatterStr);

        RegexUrlFilter filter = new RegexUrlFilter(defaultUrlFilterPatterns.toArray(new String[defaultUrlFilterPatterns.size()]));
        UrlDatum datum = new UrlDatum("http://my.foo.com/");
        assertFalse(filter.isRemove(datum));
  
        datum.setUrl("http://my.foo.com/accept/shouldberemoved.exe");
        assertTrue(filter.isRemove(datum));

    }
View Full Code Here

        }

        @Override
        public void operate(FlowProcess flowProcess, FunctionCall<NullContext> funcCall) {
            CrawlDbDatum datum = new CrawlDbDatum(funcCall.getArguments());
            UrlDatum urlDatum = new UrlDatum(datum.getUrl());
            urlDatum.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, datum.getLastFetched());
            urlDatum.setPayloadValue(CrawlDbDatum.LAST_UPDATED_FIELD, datum.getLastUpdated());
            urlDatum.setPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD, datum.getLastStatus().name());
            urlDatum.setPayloadValue(CrawlDbDatum.CRAWL_DEPTH, datum.getCrawlDepth());
           
            funcCall.getOutputCollector().add(urlDatum.getTuple());
        }
View Full Code Here

            }
           
          try {
              // Validate the URL
                new URL(url);
                UrlDatum urlDatum = new UrlDatum(url);
                funcCall.getOutputCollector().add(urlDatum.getTuple());
            } catch (MalformedURLException e) {
                LOGGER.error("Invalid URL in input data file: " + url);
            }
    }
View Full Code Here

        for (Outlink outlink : outlinks) {
            String url = outlink.getToUrl();
            url = url.replaceAll("[\n\r]", "");
            url = _normalizer.normalize(url);
            if (_validator.isValid(url)) {
                UrlDatum urlDatum = new UrlDatum(url);
                urlDatum.setPayload(datum.getPayload());
                collector.add(urlDatum.getTuple());
            }
        }
    }
View Full Code Here

        }
    }

    @Override
    public void operate(FlowProcess process, BufferCall<NullContext> bufferCall) {
        UrlDatum bestDatum = null;

        int ignoredUrls = 0;
        long bestFetched = 0;
        Iterator<TupleEntry> iter = bufferCall.getArgumentsIterator();
        while (iter.hasNext()) {
            UrlDatum datum = new UrlDatum(iter.next());
            if (bestDatum == null) {
                bestDatum = new UrlDatum(datum);
                bestFetched = (Long) bestDatum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
            } else if ((Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD) > bestFetched) {
                if (bestFetched != 0) {
                    _numLater += 1;
                    // Should never happen that we double-fetch a page
                    LOGGER.warn("Using URL with later fetch time: " + datum.getUrl());
                }

                bestDatum.setUrl(datum.getUrl());   // There's really no need to set the url since it should be same
                bestDatum.setPayload(datum.getPayload());
                bestFetched = (Long) bestDatum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
            } else {
                ignoredUrls += 1;
            }
        }
View Full Code Here

TOP

Related Classes of bixo.datum.UrlDatum

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.