Package bixo.examples.webmining

Source Code of bixo.examples.webmining.DemoWebMiningWorkflowTest$DirectoryResponseHandler

/*
* Copyright (c) 2009-2012 Scale Unlimited
*
* All rights reserved.
*
*/
package bixo.examples.webmining;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.HashSet;
import java.util.Set;

import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.io.FileUtils;
import org.eclipse.jetty.http.HttpException;
import org.eclipse.jetty.server.Handler;
import org.eclipse.jetty.server.Request;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.server.handler.AbstractHandler;
import org.junit.Before;
import org.junit.Test;

import bixo.config.BixoPlatform;
import bixo.config.FetcherPolicy;
import bixo.config.FetcherPolicy.FetcherMode;
import bixo.config.UserAgent;
import bixo.datum.FetchedDatum;
import bixo.utils.CrawlDirUtils;
import cascading.flow.Flow;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryIterator;

import com.scaleunlimited.cascading.BasePath;
import com.scaleunlimited.cascading.BasePlatform;


@SuppressWarnings("deprecation")
public class DemoWebMiningWorkflowTest {

    private static final String WORKING_DIR = "build/test/WebMiningWorkflowTest";

    private static final String PAGE1_URL = "http://127.0.0.1:8089/page-1.html";
    private static final String PAGE1_SCORE = "0.021978023";
    private static final String PAGE2_URL = "http://127.0.0.1:8089/page-2.html";
    private static final String PAGE2_SCORE = "0.0";

    /**
     * Handler we use with embedded Jetty to "serve" web pages that are files found in a base
     * directory.
     *
     */
    private class DirectoryResponseHandler extends AbstractHandler {

        private File _baseDir;

        public DirectoryResponseHandler (String baseDir) {
            _baseDir = new File(baseDir);
            if (!_baseDir.exists() || !_baseDir.isDirectory()) {
                throw new RuntimeException("Base dir doesn't exist:" + _baseDir.getAbsolutePath());
            }
        }
       
        @Override
        public void handle(String pathInContext, Request baseReques, HttpServletRequest request, HttpServletResponse response) throws HttpException, IOException {
           
            File file = new File(_baseDir, pathInContext);
            if (!file.exists()) {
                throw new HttpException(404, "Resource not found: " + pathInContext);
            }

            try {
                byte[] bytes = new byte[(int) file.length()];
                DataInputStream in = new DataInputStream(new FileInputStream(file));
                in.readFully(bytes);
               
                response.setContentLength(bytes.length);
                response.setContentType("text/html");
                response.setStatus(200);
               
                OutputStream os = response.getOutputStream();
                os.write(bytes);
            } catch (Exception e) {
                throw new HttpException(500, e.getMessage());
            }
        }

    }

    private File _workingDir;
   
    @Before
    public void setup() {
        _workingDir = new File(WORKING_DIR);
        if (_workingDir.exists()) {
            FileUtils.deleteQuietly(_workingDir);
        }
       
        _workingDir.mkdirs();
    }
   
    @SuppressWarnings("rawtypes")
    @Test
    public void testDemoWebMiningWorkflow() throws Exception {
        DemoWebMiningOptions options = new DemoWebMiningOptions();
        options.setWorkingDir(WORKING_DIR);
        options.setAgentName("test-agent");
        options.setLocalPlatformMode(true);
       
        BixoPlatform platform = new BixoPlatform(DemoWebMiningWorkflowTest.class, options.getPlatformMode());
        BasePath workingDirPath = platform.makePath(WORKING_DIR);
        DemoWebMiningTool.setupWorkingDir(platform, workingDirPath, "/test-seedurls.txt");
       
        BasePath latestDirPath = CrawlDirUtils.findLatestLoopDir(platform, workingDirPath);
        BasePath crawlDbPath = platform.makePath(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
       
        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);
        Set<String> validMimeTypes = new HashSet<String>();
        validMimeTypes.add("text/plain");
        validMimeTypes.add("text/html");
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);

        Server server = null;
        try {
            server = startServer(new DirectoryResponseHandler("src/test/resources/test-pages"), 8089);
           
            BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, 1);

            Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
            flow.complete();
       
            // validate
            BasePath statusPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
            validateEntryCount(platform, statusPath, null, 1, "status", true);
   
            BasePath contentPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
            validateEntryCount(platform, contentPath, FetchedDatum.FIELDS, 1, "content", false);

            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            validateEntryCount(platform, crawlDbPath, null, 3, "crawldb", true);
           
            // run the second loop
            curLoopDirPath =  CrawlDirUtils.makeLoopDir(platform, workingDirPath, 2);
            flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
            flow.complete();
           
            // validate
            statusPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
            validateEntryCount(platform, statusPath, null, 2, "status", true);
   
            contentPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
            validateEntryCount(platform, contentPath, FetchedDatum.FIELDS, 2, "content", false);

            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            validateEntryCount(platform, crawlDbPath, null, 8, "crawldb", true);
            assertTrue(validatePageScores(platform, crawlDbPath));
           
            BasePath resultsPath = platform.makePath(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
            validateEntryCount(platform, resultsPath, null, 3, "page results", true);
        finally {
            if (server != null) {
                server.stop();
            }
        }
    }
   
    private Server startServer(Handler handler, int port) throws Exception {
        Server server = new Server(port);
        server.setHandler(handler);
        server.start();
        return server;
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    private void validateEntryCount(BasePlatform platform, BasePath dataPath, Fields fields, int expected, String msgStr, boolean isTextLine) throws Exception {
        Tap sourceTap;
       
        if (isTextLine) {
            sourceTap = platform.makeTap(platform.makeTextScheme(), dataPath);
        } else {
            sourceTap = platform.makeTap(platform.makeBinaryScheme(fields), dataPath);
        }
       
        TupleEntryIterator tupleEntryIterator = sourceTap.openForRead(platform.makeFlowProcess());
        int numEntries = 0;
        while (tupleEntryIterator.hasNext()) {
          tupleEntryIterator.next();
          numEntries++;
        }
       
        tupleEntryIterator.close();
        assertEquals(msgStr, expected, numEntries);
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    private boolean validatePageScores(BasePlatform platform, BasePath  dataPath) throws Exception {
        boolean allOK = false;
        int verifiedCnt = 0;
        Tap sourceTap = platform.makeTap(platform.makeTextScheme(), dataPath);
        TupleEntryIterator tupleEntryIterator = sourceTap.openForRead(platform.makeFlowProcess());
        while (tupleEntryIterator.hasNext()) {
          TupleEntry next = tupleEntryIterator.next();
          String line = next.getString("line");
          String[] split = line.split("\t");
          if (split[0].equals(PAGE1_URL)) {
              allOK |= split[3].equals(PAGE1_SCORE);
              verifiedCnt++;
          } else if (split[0].equals(PAGE2_URL)) {
              allOK |= split[3].equals(PAGE2_SCORE);
              verifiedCnt++;
          }
        }
       
        tupleEntryIterator.close();
        return allOK && (verifiedCnt==2);
     }
}
TOP

Related Classes of bixo.examples.webmining.DemoWebMiningWorkflowTest$DirectoryResponseHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.