Package org.archive.crawler.selftest

Source Code of org.archive.crawler.selftest.SelfTestBase

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package org.archive.crawler.selftest;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.math.BigInteger;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Logger;

import org.apache.commons.io.FileUtils;
import org.archive.crawler.Heritrix;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.TmpDirTestCase;
import org.mortbay.jetty.Handler;
import org.mortbay.jetty.Server;
import org.mortbay.jetty.bio.SocketConnector;
import org.mortbay.jetty.handler.DefaultHandler;
import org.mortbay.jetty.handler.HandlerList;
import org.mortbay.jetty.handler.ResourceHandler;

/**
* Base class for 'self tests', integrations tests formatted as unit
* tests, where the crawler launches an entire crawl exercising multiple
* features against a test harness website.
*
* @contributor pjack
* @contributor gojomo
*/
public abstract class SelfTestBase extends TmpDirTestCase {

    final private Logger LOGGER =
        Logger.getLogger(SelfTestBase.class.getName());
   
    protected Heritrix heritrix;
    protected Server httpServer;
   
    protected void open() throws Exception {
        // We expect to be run from the project directory.
        // (Both eclipse and maven run junit tests from there).
        String name = getSelfTestName();
       
        // Make sure the project directory contains a selftest profile
        // and content for the self test.
        File src = getTestDataDir();
        if (!src.exists()) {
            throw new Exception("No selftest directory for " + name);
        }
       
        // Create temporary directories for Heritrix to run in.
        File tmpDir = new File(getTmpDir(), "selftest");
        File tmpTestDir = new File(tmpDir, name);
       
        // If we have an old job lying around from a previous run, delete it.
        File tmpJobs = new File(tmpTestDir, "jobs");
        if (tmpJobs.exists()) {
            FileUtils.deleteDirectory(tmpJobs);
        }
       
        // Copy the selftest's profile in the project directory to the
        // default profile in the temporary Heritrix directory.
        File tmpDefProfile = new File(tmpJobs, "selftest-job");
        File profileTemplate = new File(src, "profile");
        if(profileTemplate.exists()) {
            org.apache.commons.io.FileUtils.copyDirectory(profileTemplate, tmpDefProfile);
        } else {
            org.archive.util.FileUtils.ensureWriteableDirectory(tmpDefProfile);
        }
       
        // Start up a Jetty that serves the selftest's content directory.
        startHttpServer();
       
        // Copy configuration for eg Logging over
        File tmpConfDir = new File(tmpTestDir, "conf");
        org.archive.util.FileUtils.ensureWriteableDirectory(tmpConfDir);
        File srcConf = new File(src.getParentFile(), "conf");
        FileUtils.copyDirectory(srcConf, tmpConfDir);

        String crawlerBeansText = FileUtils.readFileToString(
                new File(srcConf, "selftest-crawler-beans.cxml"));
        crawlerBeansText = changeGlobalConfig(crawlerBeansText);
        File crawlerBeans = new File(tmpDefProfile, "selftest-crawler-beans.cxml");
        FileWriter fw = new FileWriter(crawlerBeans);
        fw.write(crawlerBeansText);
        fw.close();
       
        startHeritrix(tmpTestDir.getAbsolutePath());
       
        waitForCrawlFinish();
    }
   
   
    protected String changeGlobalConfig(String config) {
        config = config.replace(
                "@@URL_VALUE@@","http://crawler.archive.org/selftestcrawl");
        // if not already changed, used default self-test start URL
        config = config.replace(
                "@@SEEDS_VALUE@@", getSeedsString());
        // if not already replaced, remove other placeholder
        config = config.replace("@@MORE_PROPERTIES@@","");
        return config;
    }
   
    /**
     * Get seeds for this test. Should be in form that can be
     * spliced into a Java properties-format string (any internal
     * lineends escaped with '\').
     * @return String seeds to use
     */
    protected String getSeedsString() {
        // default barring overrides
        return "http://127.0.0.1:7777/index.html";
    }
   

    protected void close() throws Exception {
        stopHttpServer();
        stopHeritrix();
    }

    public void testSomething() throws Exception {
        try {
            boolean fail = false;
            try {
                open();
                verifyCommon();
                verify();
            } finally {
                try {
                    close();
                } catch (Exception e) {
                    e.printStackTrace();
                    fail = true;
                }
            }
            assertFalse(fail);
        } catch (Exception e) {
            // I hate maven.
            e.printStackTrace();
            throw e;
        }
    }
   
   
    protected abstract void verify() throws Exception;
   
   
    protected void stopHttpServer() throws Exception {
        try {
            httpServer.stop()
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
   
   
    protected void startHttpServer() throws Exception {
        Server server = new Server();
        SocketConnector sc = new SocketConnector();
        sc.setHost("127.0.0.1");
        sc.setPort(7777);
        server.addConnector(sc);
        ResourceHandler rhandler = new ResourceHandler();
        rhandler.setResourceBase(getSrcHtdocs().getAbsolutePath());
       
        HandlerList handlers = new HandlerList();
        handlers.setHandlers(new Handler[] { rhandler, new DefaultHandler() });
        server.setHandler(handlers);
       
        this.httpServer = server;
        server.start();
    }
   
   
    protected void startHeritrix(String path) throws Exception {
        String authPassword =
            (new BigInteger(SecureRandom.getSeed(16))).abs().toString(16);
        String[] args = { "-j", path + "/jobs", "-a", authPassword };
        // TODO: add auth password?
        heritrix = new Heritrix();
        heritrix.instanceMain(args);
       
        configureHeritrix();

        heritrix.getEngine().requestLaunch("selftest-job");
    }
   
   
    protected void configureHeritrix() throws Exception {
        // by default do nothing
    }
   
   
    protected void stopHeritrix() throws Exception {
        heritrix.getEngine().shutdown();
        heritrix.getComponent().stop();
    }
   
    protected void waitForCrawlFinish() throws Exception {
        heritrix.getEngine().waitForNoRunningJobs(0);
    }
   
    protected File getSrcHtdocs() {
        return new File(getTestDataDir(), "htdocs");
    }

    protected File getTestDataDir() {
        File r = new File("testdata");
        if (!r.exists()) {
            r = new File("engine");
            r = new File(r, "testdata");
            if (!r.exists()) {
                throw new IllegalStateException(
                        "Can't find selfest testdata " +
                        "(tried testdata/selftest and " +
                        "heritrix/testdata/selftest)");
            }
        }
        r = new File(r, "selftest");
        r = new File(r, getSelfTestName());
        if (!r.exists()) {
            throw new IllegalStateException("No testdata directory: "
                    + r.getAbsolutePath());
        }
        return r;
    }
   
   
    protected File getCrawlDir() {
        File tmp = getTmpDir();
        File selftest = new File(tmp, "selftest");
        File crawl = new File(selftest, getSelfTestName());
        return crawl;
   
   
    protected File getJobDir() {
        File crawl = getCrawlDir();
        File jobs = new File(crawl, "jobs");
        File theJob = new File(jobs, "selftest-job");
        return theJob;
    }
   
   
    protected File getArcDir() {
        return new File(getJobDir(), "arcs");
    }
   
   
    protected File getLogsDir() {
        return new File(getJobDir(), "logs");
    }



    private String getSelfTestName() {
        String full = getClass().getName();
        int i = full.lastIndexOf('.');
        return full.substring(i + 1);
    }
   
    protected void verifyArcsClosed() {
        File arcsDir = getArcDir();
        if (!arcsDir.exists()) {
            throw new IllegalStateException("Missing arc dir " +
                    arcsDir.getAbsolutePath());
        }
        for (File f: arcsDir.listFiles()) {
            String fn = f.getName();
            if (fn.endsWith(".open")) {
                throw new IllegalStateException(
                        "Arc file not closed at end of crawl: " + f.getAbsolutePath());
            }
        }
    }
   
    protected void verifyLogFileEmpty(String logFileName) {
        File logsDir = getLogsDir();
        File log = new File(logsDir, logFileName);
        if (log.length() != 0) {
            throw new IllegalStateException("Log " + logFileName +
                    " isn't empty.");
        }
    }
   
   
    protected void verifyCommon() throws Exception {
        verifyLogFileEmpty("uri-errors.log");
        verifyLogFileEmpty("runtime-errors.log");
        verifyLogFileEmpty("local-errors.log");
        verifyProgressStatistics();
        verifyArcsClosed();
    }
   
   
    protected void verifyProgressStatistics() throws IOException {
        File logs = new File(getJobDir(), "logs");
        File statsFile = new File(logs, "progress-statistics.log");
        String stats = FileUtils.readFileToString(statsFile);
        if (!stats.contains("CRAWL RUNNING - Preparing")) {
            fail("progress-statistics.log has no Prepared line.");
        }
        if (!stats.contains("CRAWL RUNNING - Running")) {
            fail("progress-statistics.log has no Running line.");
        }
        if (!stats.contains("CRAWL ENDING - Finished")) {
            fail("progress-statistics.log has missing/wrong Finished line.");
        }
        if (!stats.contains("doc/s(avg)")) {
            fail("progress-statistics.log has no legend.");
        }
    }
   
   
    protected List<ArchiveRecordHeader> headersInArcs() throws IOException {
        List<ArchiveRecordHeader> result = new ArrayList<ArchiveRecordHeader>();
        File arcsDir = getArcDir();
        if (!arcsDir.exists()) {
            throw new IllegalStateException("Missing arc dir " +
                    arcsDir.getAbsolutePath());
        }
        File[] files = arcsDir.listFiles();
        if (files == null) {
            return Collections.emptyList();
        }
        for (File f: files) {
            result.addAll(ARCReaderFactory.get(f).validate());
        }
        return result;
    }
   
   
    protected Set<String> filesInArcs() throws IOException {
        List<ArchiveRecordHeader> headers = headersInArcs();
        HashSet<String> result = new HashSet<String>();
        for (ArchiveRecordHeader arh: headers) {
            // ignore 'filedesc:' record
            if(arh.getUrl().startsWith("filedesc:")) {
                continue;
            }
            UURI uuri = UURIFactory.getInstance(arh.getUrl());
            String path = uuri.getPath();
            if (path.startsWith("/")) {
                path = path.substring(1);
            }
            if (arh.getUrl().startsWith("http:")) {
                result.add(path);
            }
        }
        LOGGER.finest(result.toString());
        return result;
    }
}
TOP

Related Classes of org.archive.crawler.selftest.SelfTestBase

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.