Package com.alibaba.taobao.main

Source Code of com.alibaba.taobao.main.LinearURLMiningMain

package com.alibaba.taobao.main;

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.TimeUnit;

import com.alibaba.taobao.worker.ConfigurableWorker;
import com.alibaba.taobao.worker.SimpleURLComparator;
import com.alibaba.taobao.worker.WorkerEvent;
import com.alibaba.taobao.worker.WorkerListener;
import com.alibaba.taobao.worker.WorkerTask;
import com.alibaba.taobao.worker.linear.PageURLMiningProcessor;
import com.alibaba.taobao.worker.linear.PageURLMiningTask;

/**
* Linear version of page URL mining. It's slow but simple.
* Average time cost for 1000 URLs is: 3800ms
*
* @author xuanyin.zy E-mail:xuanyin.zy@taobao.com
* @since Sep 16, 2012 5:35:40 PM
*/
public class LinearURLMiningMain implements WorkerListener {
    private static final String EMPTY_STRING = "";

    private static final int URL_SIZE_TO_MINE = 10000;

    private static ConcurrentHashMap<String, WorkerTask<?>> taskID2TaskMap = new ConcurrentHashMap<String, WorkerTask<?>>();

    private static ConcurrentSkipListSet<String> foundURLs = new ConcurrentSkipListSet<String>(new SimpleURLComparator());

    public static void main(String[] args) throws InterruptedException {
        long startTime = System.currentTimeMillis();

        ConfigurableWorker worker = new ConfigurableWorker("W001");
        worker.setTaskProcessor(new PageURLMiningProcessor());

        addTask2Worker(worker, new PageURLMiningTask("http://www.taobao.com"));
        addTask2Worker(worker, new PageURLMiningTask("http://www.xinhuanet.com"));
        addTask2Worker(worker, new PageURLMiningTask("http://www.zol.com.cn"));
        addTask2Worker(worker, new PageURLMiningTask("http://www.163.com"));

        LinearURLMiningMain mainListener = new LinearURLMiningMain();
        worker.addListener(mainListener);

        worker.start();

        String targetURL = EMPTY_STRING;
        while (foundURLs.size() < URL_SIZE_TO_MINE) {
            targetURL = foundURLs.pollFirst();

            if (targetURL == null) {
                TimeUnit.MILLISECONDS.sleep(50);
                continue;
            }

            PageURLMiningTask task = new PageURLMiningTask(targetURL);
            taskID2TaskMap.putIfAbsent(worker.addTask(task), task);

            TimeUnit.MILLISECONDS.sleep(100);
        }

        worker.stop();

        for (String string : foundURLs) {
            System.out.println(string);
        }

        System.out.println("Time Cost: " + (System.currentTimeMillis() - startTime) + "ms");
    }

    private static void addTask2Worker(ConfigurableWorker mapWorker_1, PageURLMiningTask task) {
        String taskID = mapWorker_1.addTask(task);
        taskID2TaskMap.put(taskID, task);
    }

    @Override
    public List<WorkerEvent> intrests() {
        return Arrays.asList(WorkerEvent.TASK_COMPLETE, WorkerEvent.TASK_FAILED);
    }

    @Override
    public void onEvent(WorkerEvent event, Object... args) {
        if (WorkerEvent.TASK_FAILED == event) {
            System.err.println("Error while extracting URLs");
            return;
        }

        if (WorkerEvent.TASK_COMPLETE != event)
            return;

        PageURLMiningTask task = (PageURLMiningTask) args[0];
        if (!taskID2TaskMap.containsKey(task.getTaskID()))
            return;

        foundURLs.addAll(task.getMinedURLs());

        System.out.println("Found URL size: " + foundURLs.size());

        taskID2TaskMap.remove(task.getTaskID());
    }
}
TOP

Related Classes of com.alibaba.taobao.main.LinearURLMiningMain

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.