Package bixo.urls

Examples of bixo.urls.SimpleUrlNormalizer


        Assert.assertEquals(testName + ": " + weird, normal, normalizer.normalize(weird));
    }

    // JUnit fixture: runs before each test so every test method gets a fresh,
    // default-configuration SimpleUrlNormalizer and no state leaks between tests.
    @Before
    public void setupNormalizer() {
        _normalizer = new SimpleUrlNormalizer();
    }
View Full Code Here


    @Test
    public void testAggressive() {
        normalizeTest"http://www.foo.com/foo.php?x=1&user_id=7&something=1",
                        "http://www.foo.com/foo.php?x=1&user_id=7&something=1",
                        "Leave user_id in middle of valid query parameters");
        SimpleUrlNormalizer aggresiveNormalizer = new SimpleUrlNormalizer(false, true);
        normalizeTestaggresiveNormalizer,
                        "http://www.foo.com/foo.php?user=usa",
                        "http://www.foo.com/foo.php",
                        "Remove single user query parameter");
        normalizeTestaggresiveNormalizer,
View Full Code Here

                        "Remove default page");
    }
   
    @Test
    public void testStumbleUponURLs() {
      SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer(true);

        normalizeTest(normalizer, "http://www.stumbleupon.com/toolbar/#url=http%3A//links.flashdance.cx/misc-pix/fjortisfangelse.jpg",
                        "http://www.stumbleupon.com/toolbar/#url=http%3a//links.flashdance.cx/misc-pix/fjortisfangelse.jpg",
        "Preserve pseudo-anchor used as qeury");
View Full Code Here

    public void importUrls(boolean debug) throws Exception {


        try {
            Tap urlSource = _platform.makeTap(_platform.makeTextScheme(), _inputFilePath);
            Pipe importPipe = new Each("url importer", new Fields("line"), new CreateUrlFromTextFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));

            Tap urlSink = _platform.makeTap(_platform.makeBinaryScheme(CrawlDbDatum.FIELDS), _destDirPath, SinkMode.REPLACE);

            FlowConnector flowConnector = _platform.makeFlowConnector();
            Flow flow = flowConnector.connect(urlSource, urlSink, importPipe);
View Full Code Here

    public static void importOneDomain(BasePlatform platform, String targetDomain, BasePath crawlDbPath) throws Exception {
       
        try {
            Tap urlSink = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath, SinkMode.REPLACE);
            TupleEntryCollector writer = urlSink.openForWrite(platform.makeFlowProcess());
            SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();

            CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize("http://" + targetDomain), 0, 0, UrlStatus.UNFETCHED, 0);

            writer.add(datum.getTuple());
            writer.close();
        } catch (Exception e) {
            throw e;
View Full Code Here

                WritableSequenceFile writableSeqScheme = new WritableSequenceFile(new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN), Text.class, Text.class);
                writableSeqFileSink = platform.makeTap(writableSeqScheme, writableSeqFileDataPath, SinkMode.REPLACE);
            }
       
        Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
        if (urlFilter != null) {
            urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
        }
       
        urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);
View Full Code Here

        String curUrl = null;
       
        try {
            List<String> lines = FileUtils.readLines(new File(args[0]));

            BaseUrlNormalizer urlNormalizer = new SimpleUrlNormalizer();
            for (String url : lines) {
                curUrl = url;
                String normalized = urlNormalizer.normalize(curUrl);
                if (!normalized.equalsIgnoreCase(curUrl)) {
                    System.out.println(curUrl + " ==> " + normalized);
                }
            }
        } catch (Throwable t) {
View Full Code Here

    @SuppressWarnings("rawtypes")
    @Override
    public void prepare(FlowProcess process, OperationCall<NullContext> operationCall) {
        LOGGER.info("Starting creation of outlink URLs");
        _normalizer = new SimpleUrlNormalizer();
        _validator = new SimpleUrlValidator();
    }
View Full Code Here


    @SuppressWarnings({ "unchecked", "rawtypes" })
    public static void importSeedUrls(BasePlatform platform, BasePath crawlDbPath, String fileName) throws Exception  {
       
        SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
       
        InputStream is = null;
        TupleEntryCollector writer = null;
        try {
            Tap urlSink = platform.makeTap(platform.makeTextScheme(), crawlDbPath, SinkMode.REPLACE);
            writer = urlSink.openForWrite(platform.makeFlowProcess());

            is = DemoWebMiningWorkflow.class.getResourceAsStream(fileName);
            if (is == null) {
                throw new FileNotFoundException("The seed urls file doesn't exist");
            }

            List<String> lines = IOUtils.readLines(is);
            for (String line : lines) {
                line = line.trim();
                if (line.startsWith("#")) {
                    continue;
                }

                CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize(line), 0, UrlStatus.UNFETCHED, 0.0f, 0.0f);
                writer.add(datum.getTuple());
            }

        } catch (IOException e) {
            crawlDbPath.delete(true);
View Full Code Here

TOP

Related Classes of bixo.urls.SimpleUrlNormalizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., which is owned by Oracle Inc. Contact coftware#gmail.com.