/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.selftest;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.math.BigInteger;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.archive.crawler.Heritrix;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.TmpDirTestCase;
import org.mortbay.jetty.Handler;
import org.mortbay.jetty.Server;
import org.mortbay.jetty.bio.SocketConnector;
import org.mortbay.jetty.handler.DefaultHandler;
import org.mortbay.jetty.handler.HandlerList;
import org.mortbay.jetty.handler.ResourceHandler;
/**
* Base class for 'self tests', integrations tests formatted as unit
* tests, where the crawler launches an entire crawl exercising multiple
* features against a test harness website.
*
* @contributor pjack
* @contributor gojomo
*/
public abstract class SelfTestBase extends TmpDirTestCase {
final private Logger LOGGER =
Logger.getLogger(SelfTestBase.class.getName());
protected Heritrix heritrix;
protected Server httpServer;
protected void open() throws Exception {
// We expect to be run from the project directory.
// (Both eclipse and maven run junit tests from there).
String name = getSelfTestName();
// Make sure the project directory contains a selftest profile
// and content for the self test.
File src = getTestDataDir();
if (!src.exists()) {
throw new Exception("No selftest directory for " + name);
}
// Create temporary directories for Heritrix to run in.
File tmpDir = new File(getTmpDir(), "selftest");
File tmpTestDir = new File(tmpDir, name);
// If we have an old job lying around from a previous run, delete it.
File tmpJobs = new File(tmpTestDir, "jobs");
if (tmpJobs.exists()) {
FileUtils.deleteDirectory(tmpJobs);
}
// Copy the selftest's profile in the project directory to the
// default profile in the temporary Heritrix directory.
File tmpDefProfile = new File(tmpJobs, "selftest-job");
File profileTemplate = new File(src, "profile");
if(profileTemplate.exists()) {
org.apache.commons.io.FileUtils.copyDirectory(profileTemplate, tmpDefProfile);
} else {
org.archive.util.FileUtils.ensureWriteableDirectory(tmpDefProfile);
}
// Start up a Jetty that serves the selftest's content directory.
startHttpServer();
// Copy configuration for eg Logging over
File tmpConfDir = new File(tmpTestDir, "conf");
org.archive.util.FileUtils.ensureWriteableDirectory(tmpConfDir);
File srcConf = new File(src.getParentFile(), "conf");
FileUtils.copyDirectory(srcConf, tmpConfDir);
String crawlerBeansText = FileUtils.readFileToString(
new File(srcConf, "selftest-crawler-beans.cxml"));
crawlerBeansText = changeGlobalConfig(crawlerBeansText);
File crawlerBeans = new File(tmpDefProfile, "selftest-crawler-beans.cxml");
FileWriter fw = new FileWriter(crawlerBeans);
fw.write(crawlerBeansText);
fw.close();
startHeritrix(tmpTestDir.getAbsolutePath());
waitForCrawlFinish();
}
protected String changeGlobalConfig(String config) {
config = config.replace(
"@@URL_VALUE@@","http://crawler.archive.org/selftestcrawl");
// if not already changed, used default self-test start URL
config = config.replace(
"@@SEEDS_VALUE@@", getSeedsString());
// if not already replaced, remove other placeholder
config = config.replace("@@MORE_PROPERTIES@@","");
return config;
}
/**
* Get seeds for this test. Should be in form that can be
* spliced into a Java properties-format string (any internal
* lineends escaped with '\').
* @return String seeds to use
*/
protected String getSeedsString() {
// default barring overrides
return "http://127.0.0.1:7777/index.html";
}
protected void close() throws Exception {
stopHttpServer();
stopHeritrix();
}
public void testSomething() throws Exception {
try {
boolean fail = false;
try {
open();
verifyCommon();
verify();
} finally {
try {
close();
} catch (Exception e) {
e.printStackTrace();
fail = true;
}
}
assertFalse(fail);
} catch (Exception e) {
// I hate maven.
e.printStackTrace();
throw e;
}
}
protected abstract void verify() throws Exception;
protected void stopHttpServer() throws Exception {
try {
httpServer.stop();
} catch (Exception e) {
e.printStackTrace();
}
}
protected void startHttpServer() throws Exception {
Server server = new Server();
SocketConnector sc = new SocketConnector();
sc.setHost("127.0.0.1");
sc.setPort(7777);
server.addConnector(sc);
ResourceHandler rhandler = new ResourceHandler();
rhandler.setResourceBase(getSrcHtdocs().getAbsolutePath());
HandlerList handlers = new HandlerList();
handlers.setHandlers(new Handler[] { rhandler, new DefaultHandler() });
server.setHandler(handlers);
this.httpServer = server;
server.start();
}
protected void startHeritrix(String path) throws Exception {
String authPassword =
(new BigInteger(SecureRandom.getSeed(16))).abs().toString(16);
String[] args = { "-j", path + "/jobs", "-a", authPassword };
// TODO: add auth password?
heritrix = new Heritrix();
heritrix.instanceMain(args);
configureHeritrix();
heritrix.getEngine().requestLaunch("selftest-job");
}
protected void configureHeritrix() throws Exception {
// by default do nothing
}
protected void stopHeritrix() throws Exception {
heritrix.getEngine().shutdown();
heritrix.getComponent().stop();
}
protected void waitForCrawlFinish() throws Exception {
heritrix.getEngine().waitForNoRunningJobs(0);
}
protected File getSrcHtdocs() {
return new File(getTestDataDir(), "htdocs");
}
protected File getTestDataDir() {
File r = new File("testdata");
if (!r.exists()) {
r = new File("engine");
r = new File(r, "testdata");
if (!r.exists()) {
throw new IllegalStateException(
"Can't find selfest testdata " +
"(tried testdata/selftest and " +
"heritrix/testdata/selftest)");
}
}
r = new File(r, "selftest");
r = new File(r, getSelfTestName());
if (!r.exists()) {
throw new IllegalStateException("No testdata directory: "
+ r.getAbsolutePath());
}
return r;
}
protected File getCrawlDir() {
File tmp = getTmpDir();
File selftest = new File(tmp, "selftest");
File crawl = new File(selftest, getSelfTestName());
return crawl;
}
protected File getJobDir() {
File crawl = getCrawlDir();
File jobs = new File(crawl, "jobs");
File theJob = new File(jobs, "selftest-job");
return theJob;
}
protected File getArcDir() {
return new File(getJobDir(), "arcs");
}
protected File getLogsDir() {
return new File(getJobDir(), "logs");
}
private String getSelfTestName() {
String full = getClass().getName();
int i = full.lastIndexOf('.');
return full.substring(i + 1);
}
protected void verifyArcsClosed() {
File arcsDir = getArcDir();
if (!arcsDir.exists()) {
throw new IllegalStateException("Missing arc dir " +
arcsDir.getAbsolutePath());
}
for (File f: arcsDir.listFiles()) {
String fn = f.getName();
if (fn.endsWith(".open")) {
throw new IllegalStateException(
"Arc file not closed at end of crawl: " + f.getAbsolutePath());
}
}
}
protected void verifyLogFileEmpty(String logFileName) {
File logsDir = getLogsDir();
File log = new File(logsDir, logFileName);
if (log.length() != 0) {
throw new IllegalStateException("Log " + logFileName +
" isn't empty.");
}
}
protected void verifyCommon() throws Exception {
verifyLogFileEmpty("uri-errors.log");
verifyLogFileEmpty("runtime-errors.log");
verifyLogFileEmpty("local-errors.log");
verifyProgressStatistics();
verifyArcsClosed();
}
protected void verifyProgressStatistics() throws IOException {
File logs = new File(getJobDir(), "logs");
File statsFile = new File(logs, "progress-statistics.log");
String stats = FileUtils.readFileToString(statsFile);
if (!stats.contains("CRAWL RUNNING - Preparing")) {
fail("progress-statistics.log has no Prepared line.");
}
if (!stats.contains("CRAWL RUNNING - Running")) {
fail("progress-statistics.log has no Running line.");
}
if (!stats.contains("CRAWL ENDING - Finished")) {
fail("progress-statistics.log has missing/wrong Finished line.");
}
if (!stats.contains("doc/s(avg)")) {
fail("progress-statistics.log has no legend.");
}
}
protected List<ArchiveRecordHeader> headersInArcs() throws IOException {
List<ArchiveRecordHeader> result = new ArrayList<ArchiveRecordHeader>();
File arcsDir = getArcDir();
if (!arcsDir.exists()) {
throw new IllegalStateException("Missing arc dir " +
arcsDir.getAbsolutePath());
}
File[] files = arcsDir.listFiles();
if (files == null) {
return Collections.emptyList();
}
for (File f: files) {
result.addAll(ARCReaderFactory.get(f).validate());
}
return result;
}
protected Set<String> filesInArcs() throws IOException {
List<ArchiveRecordHeader> headers = headersInArcs();
HashSet<String> result = new HashSet<String>();
for (ArchiveRecordHeader arh: headers) {
// ignore 'filedesc:' record
if(arh.getUrl().startsWith("filedesc:")) {
continue;
}
UURI uuri = UURIFactory.getInstance(arh.getUrl());
String path = uuri.getPath();
if (path.startsWith("/")) {
path = path.substring(1);
}
if (arh.getUrl().startsWith("http:")) {
result.add(path);
}
}
LOGGER.finest(result.toString());
return result;
}
}