/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.fetcher;
import static org.archive.modules.fetcher.FetchHTTPTest.BASIC_AUTH_LOGIN;
import static org.archive.modules.fetcher.FetchHTTPTest.BASIC_AUTH_PASSWORD;
import static org.archive.modules.fetcher.FetchHTTPTest.BASIC_AUTH_REALM;
import static org.archive.modules.fetcher.FetchHTTPTest.DEFAULT_GZIPPED_PAYLOAD;
import static org.archive.modules.fetcher.FetchHTTPTest.DEFAULT_PAYLOAD_STRING;
import static org.archive.modules.fetcher.FetchHTTPTest.DIGEST_AUTH_LOGIN;
import static org.archive.modules.fetcher.FetchHTTPTest.DIGEST_AUTH_PASSWORD;
import static org.archive.modules.fetcher.FetchHTTPTest.DIGEST_AUTH_REALM;
import static org.archive.modules.fetcher.FetchHTTPTest.ETAG_TEST_VALUE;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.io.UnsupportedEncodingException;
import java.net.Inet4Address;
import java.net.InetAddress;
import java.net.NetworkInterface;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketTimeoutException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.net.ssl.SSLException;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.apache.http.NameValuePair;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.message.BasicNameValuePair;
import org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.CrawlURI.FetchType;
import org.archive.modules.ProcessorTestBase;
import org.archive.modules.credential.HttpAuthenticationCredential;
import org.archive.modules.deciderules.RejectDecideRule;
import org.archive.modules.recrawl.FetchHistoryProcessor;
import org.archive.modules.revisit.ServerNotModifiedRevisit;
import org.archive.net.UURI;
import org.jboss.netty.handler.codec.http.HttpRequest;
import org.littleshoot.proxy.DefaultHttpProxyServer;
import org.littleshoot.proxy.HttpFilter;
import org.littleshoot.proxy.HttpRequestFilter;
import org.littleshoot.proxy.ProxyAuthorizationHandler;
/**
* These are the tests that FetchHTTPTest runs. FetchHTTPTest sets up a
* TestSuite that starts up the test servers, and shuts them down after all the
* tests in this class has been run. This class should not be named to match
* *Test, Test*, or *TestCase, or surefire will try to run it outside of the
* FetchHTTPTest suite.
*/
public class FetchHTTPTests extends ProcessorTestBase {
// private static Logger logger = Logger.getLogger(FetchHTTPTests.class.getName());
// static {
// Logger.getLogger("").setLevel(Level.FINE);
// for (java.util.logging.Handler h : Logger.getLogger("").getHandlers()) {
// h.setLevel(Level.ALL);
// h.setFormatter(new OneLineSimpleLogger());
// }
// }
protected FetchHTTP fetcher;
protected FetchHTTP fetcher() throws IOException {
if (fetcher == null) {
fetcher = makeModule();
}
return fetcher;
}
protected String getUserAgentString() {
return getClass().getName();
}
protected void runDefaultChecks(CrawlURI curi, String... exclusionsArray)
throws IOException, UnsupportedEncodingException {
Set<String> exclusions = new HashSet<String>(Arrays.asList(exclusionsArray));
String requestString = httpRequestString(curi);
if (!exclusions.contains("requestLine")) {
assertTrue(requestString.startsWith("GET / HTTP/1.0\r\n"));
}
assertTrue(requestString.contains("User-Agent: " + getUserAgentString() + "\r\n"));
assertTrue(requestString.matches("(?s).*Connection: [Cc]lose\r\n.*"));
if (!exclusions.contains("acceptHeaders")) {
assertTrue(requestString.contains("Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"));
}
if (!exclusions.contains("hostHeader")) {
assertTrue(requestString.contains("Host: localhost:7777\r\n"));
}
if (!exclusions.contains("trailingCRLFCRLF")) {
assertTrue(requestString.endsWith("\r\n\r\n"));
}
// check sizes
assertEquals(DEFAULT_PAYLOAD_STRING.length(), curi.getContentLength());
assertEquals(curi.getContentSize(), curi.getRecordedSize());
// check various
assertEquals("sha1:TQ5R6YVOZLTQENRIIENVGXHOPX3YCRNJ", curi.getContentDigestSchemeString());
if (!exclusions.contains("contentType")) {
assertEquals("text/plain;charset=US-ASCII", curi.getContentType());
assertEquals(Charset.forName("US-ASCII"), curi.getRecorder().getCharset());
}
assertTrue(curi.getCredentials().isEmpty());
assertTrue(curi.getFetchDuration() >= 0);
if (!exclusions.contains("fetchStatus")) {
assertTrue(curi.getFetchStatus() == 200);
}
if (!exclusions.contains("fetchTypeGET")) {
assertTrue(curi.getFetchType() == FetchType.HTTP_GET);
}
// check message body, i.e. "raw, possibly chunked-transfer-encoded message contents not including the leading headers"
assertEquals(DEFAULT_PAYLOAD_STRING, messageBodyString(curi));
// check entity, i.e. "message-body after any (usually-unnecessary) transfer-decoding but before any content-encoding (eg gzip) decoding"
assertEquals(DEFAULT_PAYLOAD_STRING, entityString(curi));
// check content, i.e. message-body after possibly tranfer-decoding and after content-encoding (eg gzip) decoding
assertEquals(DEFAULT_PAYLOAD_STRING, contentString(curi));
assertEquals(DEFAULT_PAYLOAD_STRING.substring(0, 10), curi.getRecorder().getContentReplayPrefixString(10));
assertEquals(DEFAULT_PAYLOAD_STRING, curi.getRecorder().getContentReplayCharSequence().toString());
if (!exclusions.contains("httpBindAddress")) {
assertEquals("127.0.0.1", FetchHTTPTest.getLastRequest().getRemoteAddr());
}
assertTrue(curi.getNonFatalFailures().isEmpty());
}
// convenience methods to get strings from raw recorded i/o
/**
* Raw response including headers.
*/
static protected String rawResponseString(CrawlURI curi) throws IOException, UnsupportedEncodingException {
byte[] buf = IOUtils.toByteArray(curi.getRecorder().getReplayInputStream());
return new String(buf, "US-ASCII");
}
/**
* Raw message body, before any unchunking or content-decoding.
*/
static protected String messageBodyString(CrawlURI curi) throws IOException, UnsupportedEncodingException {
byte[] buf = IOUtils.toByteArray(curi.getRecorder().getMessageBodyReplayInputStream());
return new String(buf, "US-ASCII");
}
/**
* Message body after unchunking but before content-decoding.
*/
static protected String entityString(CrawlURI curi) throws IOException, UnsupportedEncodingException {
byte[] buf = IOUtils.toByteArray(curi.getRecorder().getEntityReplayInputStream());
return new String(buf, "US-ASCII");
}
/**
* Unchunked, content-decoded message body.
*/
static protected String contentString(CrawlURI curi) throws IOException, UnsupportedEncodingException {
byte[] buf = IOUtils.toByteArray(curi.getRecorder().getContentReplayInputStream());
return new String(buf, "US-ASCII");
}
static protected String httpRequestString(CrawlURI curi) throws IOException, UnsupportedEncodingException {
byte[] buf = IOUtils.toByteArray(curi.getRecorder().getRecordedOutput().getReplayInputStream());
return new String(buf, "US-ASCII");
}
public void testDefaults() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/");
fetcher().process(curi);
// logger.info('\n' + httpRequestString(curi) + rawResponseString(curi));
runDefaultChecks(curi);
}
public void testAcceptHeaders() throws Exception {
List<String> headers = Arrays.asList("header1: value1", "header2: value2");
fetcher().setAcceptHeaders(headers);
CrawlURI curi = makeCrawlURI("http://localhost:7777/");
fetcher().process(curi);
runDefaultChecks(curi, "acceptHeaders");
// special checks for this test
String requestString = httpRequestString(curi);
assertFalse(requestString.contains("Accept:"));
for (String h: headers) {
assertTrue(requestString.contains(h));
}
}
public void testCookies() throws Exception {
checkSetCookieURI();
// second request to see if cookie is sent
CrawlURI curi = makeCrawlURI("http://localhost:7777/");
fetcher().process(curi);
runDefaultChecks(curi);
String requestString = httpRequestString(curi);
assertTrue(requestString.contains("Cookie: test-cookie-name=test-cookie-value\r\n"));
}
public void testIgnoreCookies() throws Exception {
fetcher().setIgnoreCookies(true);
checkSetCookieURI();
// second request to see if cookie is NOT sent
CrawlURI curi = makeCrawlURI("http://localhost:7777/");
fetcher().process(curi);
runDefaultChecks(curi);
String requestString = httpRequestString(curi);
assertFalse(requestString.contains("Cookie:"));
}
public void testBasicAuth() throws Exception {
HttpAuthenticationCredential basicAuthCredential = new HttpAuthenticationCredential();
basicAuthCredential.setRealm(BASIC_AUTH_REALM);
basicAuthCredential.setDomain("localhost:7777");
basicAuthCredential.setLogin(BASIC_AUTH_LOGIN);
basicAuthCredential.setPassword(BASIC_AUTH_PASSWORD);
fetcher().getCredentialStore().getCredentials().put("basic-auth-credential",
basicAuthCredential);
CrawlURI curi = makeCrawlURI("http://localhost:7777/auth/1");
fetcher().process(curi);
// check that we got the expected response and the fetcher did its thing
assertEquals(401, curi.getFetchStatus());
assertEquals("Basic realm=\"basic-auth-realm\"", curi.getHttpResponseHeader("WWW-Authenticate"));
assertTrue(curi.getCredentials().contains(basicAuthCredential));
assertTrue(curi.getHttpAuthChallenges() != null && curi.getHttpAuthChallenges().containsKey("basic"));
// fetch again with the credentials
fetcher().process(curi);
String httpRequestString = httpRequestString(curi);
assertTrue(httpRequestString.contains("Authorization: Basic YmFzaWMtYXV0aC1sb2dpbjpiYXNpYy1hdXRoLXBhc3N3b3Jk\r\n"));
// otherwise should be a normal 200 response
runDefaultChecks(curi, "requestLine");
// fetch a fresh uri to make sure auth info was cached and we don't get another 401
curi = makeCrawlURI("http://localhost:7777/auth/2");
fetcher().process(curi);
httpRequestString = httpRequestString(curi);
assertTrue(httpRequestString.contains("Authorization: Basic YmFzaWMtYXV0aC1sb2dpbjpiYXNpYy1hdXRoLXBhc3N3b3Jk\r\n"));
// otherwise should be a normal 200 response
runDefaultChecks(curi, "requestLine");
}
// server for digest auth is at localhost:7778
public void testDigestAuth() throws Exception {
HttpAuthenticationCredential digestAuthCred = new HttpAuthenticationCredential();
digestAuthCred.setRealm(DIGEST_AUTH_REALM);
digestAuthCred.setDomain("localhost:7778");
digestAuthCred.setLogin(DIGEST_AUTH_LOGIN);
digestAuthCred.setPassword(DIGEST_AUTH_PASSWORD);
fetcher().getCredentialStore().getCredentials().put("digest-auth-credential",
digestAuthCred);
CrawlURI curi = makeCrawlURI("http://localhost:7778/auth/1");
fetcher().process(curi);
// check that we got the expected response and the fetcher did its thing
assertEquals(401, curi.getFetchStatus());
assertTrue(curi.getCredentials().contains(digestAuthCred));
assertTrue(curi.getHttpAuthChallenges() != null && curi.getHttpAuthChallenges().containsKey("digest"));
// stick a basic auth 401 in there to check it doesn't mess with the digest auth we're working on
CrawlURI interferingUri = makeCrawlURI("http://localhost:7777/auth/basic");
fetcher().process(interferingUri);
assertEquals(401, interferingUri.getFetchStatus());
// logger.info('\n' + httpRequestString(interferingUri) + "\n\n" + rawResponseString(interferingUri));
// fetch original again with the credentials
fetcher().process(curi);
String httpRequestString = httpRequestString(curi);
// logger.info('\n' + httpRequestString + "\n\n" + rawResponseString(interferingUri));
assertTrue(httpRequestString.contains("Authorization: Digest"));
// otherwise should be a normal 200 response
runDefaultChecks(curi, "requestLine", "hostHeader");
// fetch a fresh uri to make sure auth info was cached and we don't get another 401
curi = makeCrawlURI("http://localhost:7778/auth/2");
fetcher().process(curi);
httpRequestString = httpRequestString(curi);
assertTrue(httpRequestString.contains("Authorization: Digest"));
// otherwise should be a normal 200 response
runDefaultChecks(curi, "requestLine", "hostHeader");
}
public void test401NoChallenge() throws URIException, IOException, InterruptedException {
CrawlURI curi = makeCrawlURI("http://localhost:7777/401-no-challenge");
fetcher().process(curi);
assertEquals(401, curi.getFetchStatus());
runDefaultChecks(curi, "requestLine", "fetchStatus");
}
protected void checkSetCookieURI() throws URIException, IOException,
InterruptedException, UnsupportedEncodingException {
CrawlURI curi = makeCrawlURI("http://localhost:7777/set-cookie");
fetcher().process(curi);
runDefaultChecks(curi, "requestLine");
// check for set-cookie header
byte[] buf = IOUtils.toByteArray(curi.getRecorder().getReplayInputStream());
String rawResponseString = new String(buf, "US-ASCII");
assertTrue(rawResponseString.contains("Set-Cookie: test-cookie-name=test-cookie-value\r\n"));
}
public void testAcceptCompression() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/");
fetcher().setAcceptCompression(true);
fetcher().process(curi);
String httpRequestString = httpRequestString(curi);
// logger.info('\n' + httpRequestString + "\n\n" + rawResponseString(curi));
// logger.info("\n----- begin messageBodyString -----\n" + messageBodyString(curi));
// logger.info("\n----- begin entityString -----\n" + entityString(curi));
// logger.info("\n----- begin contentString -----\n" + contentString(curi));
assertTrue(httpRequestString.contains("Accept-Encoding: gzip,deflate\r\n"));
assertEquals(DEFAULT_GZIPPED_PAYLOAD.length, curi.getContentLength());
assertEquals(curi.getContentSize(), curi.getRecordedSize());
// check various
assertEquals("text/plain;charset=US-ASCII", curi.getContentType());
assertEquals(Charset.forName("US-ASCII"), curi.getRecorder().getCharset());
assertTrue(curi.getCredentials().isEmpty());
assertTrue(curi.getFetchDuration() >= 0);
assertTrue(curi.getFetchStatus() == 200);
assertTrue(curi.getFetchType() == FetchType.HTTP_GET);
// check message body, i.e. "raw, possibly chunked-transfer-encoded message contents not including the leading headers"
assertTrue(Arrays.equals(DEFAULT_GZIPPED_PAYLOAD, IOUtils.toByteArray(curi.getRecorder().getMessageBodyReplayInputStream())));
// check entity, i.e. "message-body after any (usually-unnecessary) transfer-decoding but before any content-encoding (eg gzip) decoding"
assertTrue(Arrays.equals(DEFAULT_GZIPPED_PAYLOAD, IOUtils.toByteArray(curi.getRecorder().getEntityReplayInputStream())));
// check content, i.e. message-body after possibly tranfer-decoding and after content-encoding (eg gzip) decoding
assertEquals(DEFAULT_PAYLOAD_STRING, contentString(curi));
assertEquals("sha1:6HXUWMO6VPBHU4SIPOVJ3OPMCSN6JJW4", curi.getContentDigestSchemeString());
}
// Test will succeed if there ae at least 2 local Inet4Addresses, and
// each can be bound to in turn. (Works better than trying to use 127.0.0.2
// which may not be available as local address by default on MacOS.)
// Usually, the minimum 2 addresses will be 127.0.0.1 and another
// routable (perhaps LAN only eg 192.168.x.x or 10.x.x.x) address.
public void testHttpBindAddress() throws Exception {
List<InetAddress> addrList = new ArrayList<InetAddress>();
for (NetworkInterface ifc: Collections.list(NetworkInterface.getNetworkInterfaces())) {
if (ifc.isUp()) {
for (InetAddress addr : Collections.list(ifc.getInetAddresses())) {
if (addr instanceof Inet4Address) {
addrList.add(addr);
}
}
}
}
if (addrList.size() < 2) {
fail("unable to test binding to different local addresses: only "
+ addrList.size() + " addresses available");
}
for (InetAddress addr : addrList) {
tryHttpBindAddress(addr.getHostAddress());
}
}
public void tryHttpBindAddress(String addr) throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/");
fetcher().setHttpBindAddress(addr);
fetcher().process(curi);
// the client bind address isn't recorded anywhere in heritrix as
// far as i can tell, so we get it this way...
assertEquals(addr, FetchHTTPTest.getLastRequest().getRemoteAddr());
runDefaultChecks(curi, "httpBindAddress");
}
protected static class ProxiedRequestRememberer implements HttpRequestFilter {
protected HttpRequest lastProxiedRequest = null;
public HttpRequest getLastProxiedRequest() {
return lastProxiedRequest;
}
@Override
public void filter(HttpRequest httpRequest) {
lastProxiedRequest = httpRequest;
}
public void clear() {
lastProxiedRequest = null;
}
}
public void testHttpProxy() throws Exception {
ProxiedRequestRememberer proxiedRequestRememberer = new ProxiedRequestRememberer();
DefaultHttpProxyServer httpProxyServer = new DefaultHttpProxyServer(7877, proxiedRequestRememberer, new HashMap<String, HttpFilter>());
httpProxyServer.start(true, false);
try {
fetcher().setHttpProxyHost("localhost");
fetcher().setHttpProxyPort(7877);
CrawlURI curi = makeCrawlURI("http://localhost:7777/");
fetcher().process(curi);
String requestString = httpRequestString(curi);
assertTrue(requestString.startsWith("GET http://localhost:7777/ HTTP/1.0\r\n"));
assertNotNull(curi.getHttpResponseHeader("Via"));
assertTrue(requestString.contains("Proxy-Connection: close\r\n"));
// check that our little proxy server really handled a request
assertNotNull(proxiedRequestRememberer.getLastProxiedRequest());
runDefaultChecks(curi, "requestLine");
} finally {
httpProxyServer.stop();
}
}
public void testHttpProxyAuth() throws Exception {
ProxiedRequestRememberer proxiedRequestRememberer = new ProxiedRequestRememberer();
DefaultHttpProxyServer httpProxyServer = new DefaultHttpProxyServer(7877, proxiedRequestRememberer, new HashMap<String, HttpFilter>());
httpProxyServer.addProxyAuthenticationHandler(new ProxyAuthorizationHandler() {
@Override
public boolean authenticate(String userName, String password) {
// logger.info("username=" + userName + " password=" + password);
return "http-proxy-user".equals(userName) && "http-proxy-password".equals(password);
}
});
httpProxyServer.start(true, false);
try {
fetcher().setHttpProxyHost("localhost");
fetcher().setHttpProxyPort(7877);
fetcher().setHttpProxyUser("http-proxy-user");
fetcher().setHttpProxyPassword("http-proxy-password");
fetcher().setUseHTTP11(true); // proxy auth is a http 1.1 feature
CrawlURI curi = makeCrawlURI("http://localhost:7777/");
fetcher().process(curi);
String requestString = httpRequestString(curi);
assertTrue(requestString.startsWith("GET http://localhost:7777/ HTTP/1.1\r\n"));
assertTrue(requestString.contains("Proxy-Connection: close\r\n"));
assertNull(proxiedRequestRememberer.getLastProxiedRequest()); // request didn't make it this far
assertNotNull(curi.getHttpResponseHeader("Proxy-Authenticate"));
assertEquals(407, curi.getFetchStatus());
// fetch original again now that credentials should be populated
proxiedRequestRememberer.clear();
curi = makeCrawlURI("http://localhost:7777/");
fetcher().process(curi);
requestString = httpRequestString(curi);
assertTrue(requestString.startsWith("GET http://localhost:7777/ HTTP/1.1\r\n"));
assertTrue(requestString.contains("Proxy-Connection: close\r\n"));
assertNotNull(curi.getHttpResponseHeader("Via"));
assertNotNull(proxiedRequestRememberer.getLastProxiedRequest());
runDefaultChecks(curi, "requestLine");
} finally {
httpProxyServer.stop();
}
}
public void testMaxFetchKBSec() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/200k");
fetcher().setMaxFetchKBSec(100);
// if the wire logger is enabled, it can slow things down enough to make
// this test failed, so disable it temporarily
Level savedWireLevel = Logger.getLogger("org.apache.http.wire").getLevel();
Logger.getLogger("org.apache.http.wire").setLevel(Level.INFO);
fetcher().process(curi);
Logger.getLogger("org.apache.http.wire").setLevel(savedWireLevel);
assertEquals(200000, curi.getContentLength());
assertTrue(curi.getFetchDuration() > 1800 && curi.getFetchDuration() < 2200);
}
public void testMaxLengthBytes() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/200k");
fetcher().setMaxLengthBytes(50000);
fetcher().process(curi);
assertEquals(50001, curi.getRecordedSize());
}
public void testSendRange() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/200k");
fetcher().setMaxLengthBytes(50000);
fetcher().setSendRange(true);
fetcher().process(curi);
// logger.info("\n" + httpRequestString(curi));
assertTrue(httpRequestString(curi).contains("Range: bytes=0-49999\r\n"));
// XXX make server honor range and inspect response?
// assertEquals(50000, curi.getRecordedSize());
}
public void testSendIfModifiedSince() throws Exception {
fetcher().setSendIfModifiedSince(true);
CrawlURI curi = makeCrawlURI("http://localhost:7777/if-modified-since");
fetcher().process(curi);
assertFalse(httpRequestString(curi).toLowerCase().contains("if-modified-since: "));
assertTrue(curi.getHttpResponseHeader("last-modified").equals("Thu, 01 Jan 1970 00:00:00 GMT"));
runDefaultChecks(curi, "requestLine");
// logger.info("before FetchHistoryProcessor fetchHistory=" + Arrays.toString(curi.getFetchHistory()));
FetchHistoryProcessor fetchHistoryProcessor = new FetchHistoryProcessor();
fetchHistoryProcessor.process(curi);
// logger.info("after FetchHistoryProcessor fetchHistory=" + Arrays.toString(curi.getFetchHistory()));
fetcher().process(curi);
// logger.info("\n" + httpRequestString(curi));
// logger.info("\n" + rawResponseString(curi));
assertTrue(httpRequestString(curi).contains("If-Modified-Since: Thu, 01 Jan 1970 00:00:00 GMT\r\n"));
assertTrue(curi.getFetchStatus() == 304);
assertNull(curi.getRevisitProfile());
fetchHistoryProcessor.process(curi);
assertNotNull(curi.getRevisitProfile());
assertTrue(curi.getRevisitProfile() instanceof ServerNotModifiedRevisit);
ServerNotModifiedRevisit revisit = (ServerNotModifiedRevisit) curi.getRevisitProfile();
assertEquals("Thu, 01 Jan 1970 00:00:00 GMT", revisit.getLastModified());
assertNull(revisit.getETag());
}
public void testSendIfNoneMatch() throws Exception {
fetcher().setSendIfNoneMatch(true);
CrawlURI curi = makeCrawlURI("http://localhost:7777/if-none-match");
fetcher().process(curi);
assertFalse(httpRequestString(curi).toLowerCase().contains("if-none-match: "));
assertTrue(curi.getHttpResponseHeader("etag").equals(ETAG_TEST_VALUE));
runDefaultChecks(curi, "requestLine");
FetchHistoryProcessor fetchHistoryProcessor = new FetchHistoryProcessor();
fetchHistoryProcessor.process(curi);
fetcher().process(curi);
// logger.info("\n" + httpRequestString(curi));
// logger.info("\n" + rawResponseString(curi));
assertTrue(httpRequestString(curi).contains("If-None-Match: " + ETAG_TEST_VALUE + "\r\n"));
assertNull(curi.getRevisitProfile());
fetchHistoryProcessor.process(curi);
assertNotNull(curi.getRevisitProfile());
assertTrue(curi.getRevisitProfile() instanceof ServerNotModifiedRevisit);
ServerNotModifiedRevisit revisit = (ServerNotModifiedRevisit) curi.getRevisitProfile();
assertEquals(ETAG_TEST_VALUE, revisit.getETag());
assertNull(revisit.getLastModified());
}
public void testShouldFetchBodyRule() throws Exception {
// CrawlURI curi = makeCrawlURI("http://localhost:7777/");
CrawlURI curi = makeCrawlURI("http://localhost:7777/200k");
fetcher().setShouldFetchBodyRule(new RejectDecideRule());
fetcher().process(curi);
assertTrue(httpRequestString(curi).startsWith("GET /200k HTTP/1.0\r\n"));
assertEquals("text/plain;charset=US-ASCII", curi.getContentType());
assertTrue(curi.getCredentials().isEmpty());
assertTrue(curi.getFetchDuration() >= 0);
assertTrue(curi.getFetchStatus() == 200);
assertTrue(curi.getFetchType() == FetchType.HTTP_GET);
assertEquals(1, curi.getAnnotations().size());
assertTrue(curi.getAnnotations().contains("midFetchAbort"));
// check for empty body
assertEquals(0, curi.getContentLength());
assertEquals(curi.getContentSize(), curi.getRecordedSize());
assertEquals("", messageBodyString(curi));
assertEquals("", entityString(curi));
}
public void testFetchTimeout() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/slow.txt");
fetcher().setTimeoutSeconds(2);
fetcher().process(curi);
// logger.info('\n' + httpRequestString(curi) + "\n\n" + rawResponseString(curi));
assertTrue(curi.getAnnotations().contains("timeTrunc"));
assertTrue(curi.getFetchDuration() >= 2000 && curi.getFetchDuration() < 2200);
}
/*
* See http://stackoverflow.com/questions/100841/artificially-create-a-connection-timeout-error
* This test seems to fail if not connected to internet, because
* connection fails immediately insted of timing out.
*/
public void testConnectionTimeout() throws Exception {
CrawlURI curi = makeCrawlURI("http://10.255.255.1/");
fetcher().setSoTimeoutMs(300);
long start = System.currentTimeMillis();
fetcher().process(curi);
long elapsed = System.currentTimeMillis() - start;
assertTrue(elapsed >= 300 && elapsed < 400);
// Httpcomponents throws org.apache.http.conn.ConnectTimeoutException,
// commons-httpclient throws java.net.SocketTimeoutException. Both are
// instances of InterruptedIOException
assertEquals(1, curi.getNonFatalFailures().size());
assertTrue(curi.getNonFatalFailures().toArray()[0] instanceof InterruptedIOException);
assertTrue(curi.getNonFatalFailures().toArray()[0].toString().matches("(?i).*connect.*timed out.*"));
assertEquals(FetchStatusCodes.S_CONNECT_FAILED, curi.getFetchStatus());
assertEquals(0, curi.getFetchCompletedTime());
}
// XXX testSocketTimeout() (the other kind) - how to simulate?
public void testSslTrustLevel() throws Exception {
// default "open" trust level
CrawlURI curi = makeCrawlURI("https://localhost:7443/");
fetcher().process(curi);
runDefaultChecks(curi, "hostHeader");
// "normal" trust level
curi = makeCrawlURI("https://localhost:7443/");
fetcher().setSslTrustLevel(TrustLevel.NORMAL);
fetcher().process(curi);
assertEquals(1, curi.getNonFatalFailures().size());
assertTrue(curi.getNonFatalFailures().toArray()[0] instanceof SSLException);
assertEquals(FetchStatusCodes.S_CONNECT_FAILED, curi.getFetchStatus());
assertEquals(0, curi.getFetchCompletedTime());
}
public void testHttp11() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/");
fetcher().setUseHTTP11(true);
fetcher().process(curi);
assertTrue(httpRequestString(curi).startsWith("GET / HTTP/1.1\r\n"));
// what else?
runDefaultChecks(curi, "requestLine");
}
public void testChunked() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/chunked.txt");
fetcher().setUseHTTP11(true);
fetcher().setSendConnectionClose(false);
/* XXX Server expects us to close the connection apparently. But we
* don't detect end of chunked transfer. With these small timeouts we
* can finish quickly. A couple of SocketTimeoutExceptions will happen
* within RecordingInputStream.readFullyOrUntil().
*/
fetcher().setSoTimeoutMs(500);
fetcher().setTimeoutSeconds(1);
fetcher().process(curi);
// logger.info('\n' + httpRequestString(curi) + "\n\n" + rawResponseString(curi));
// logger.info("\n----- rawResponseString -----\n" + rawResponseString(curi));
// logger.info("\n----- contentString -----\n" + contentString(curi));
// logger.info("\n----- entityString -----\n" + entityString(curi));
// logger.info("\n----- messageBodyString -----\n" + messageBodyString(curi));
assertEquals("chunked", curi.getHttpResponseHeader("transfer-encoding"));
assertEquals("25\r\n" + DEFAULT_PAYLOAD_STRING + "\r\n0\r\n\r\n", messageBodyString(curi));
assertEquals(DEFAULT_PAYLOAD_STRING, entityString(curi));
assertEquals(DEFAULT_PAYLOAD_STRING, contentString(curi));
}
protected static class NoResponseServer extends Thread {
protected String listenAddress;
protected int listenPort;
protected boolean isTimeToBeDone = false;
public NoResponseServer(String address, int port) {
this.listenAddress = address;
this.listenPort = port;
}
@Override
public void run() {
ServerSocket listeningSocket = null;
try {
listeningSocket = new ServerSocket(listenPort, 0, Inet4Address.getByName(listenAddress));
listeningSocket.setSoTimeout(600);
while (!isTimeToBeDone) {
try {
Socket connectionSocket = listeningSocket.accept();
// logger.info("accepted connection from " + connectionSocket + ", shutting it down immediately");
connectionSocket.shutdownOutput();
} catch (SocketTimeoutException e) {
}
}
} catch (Exception e) {
// logger.warning("caught exception: " + e);
} finally {
// logger.info("all done suckers");
if (listeningSocket != null) {
try {
listeningSocket.close();
} catch (IOException e) {
}
}
}
}
public void beDone() {
isTimeToBeDone = true;
}
}
// Implicitly tests the retry cycle within httpcomponents
public void testNoResponse() throws Exception {
NoResponseServer noResponseServer = new NoResponseServer("localhost", 7780);
noResponseServer.start();
// CrawlURI curi = makeCrawlURI("http://stats.bbc.co.uk/robots.txt");
CrawlURI curi = makeCrawlURI("http://localhost:7780");
fetcher().process(curi);
assertEquals(1, curi.getNonFatalFailures().size());
assertTrue(curi.getNonFatalFailures().toArray()[0] instanceof NoHttpResponseException);
assertEquals(FetchStatusCodes.S_CONNECT_FAILED, curi.getFetchStatus());
assertEquals(0, curi.getFetchCompletedTime());
noResponseServer.beDone();
noResponseServer.join();
}
/**
* Tests a URL not correctly url-encoded, but that heritrix lets pass
* through to mimic browser behavior. {@link java.net.URI} would reject this
* url. See class comment on {@link UURI}.
*
* @throws Exception
*/
public void testLaxUrlEncoding() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/99%");
fetcher().process(curi);
// logger.info('\n' + httpRequestString(curi) + "\n\n" + rawResponseString(curi));
assertTrue(httpRequestString(curi).startsWith("GET /99% HTTP/1.0\r\n"));
runDefaultChecks(curi, "requestLine");
}
public void testTwoQuestionMarks() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/??blahblah");
fetcher().process(curi);
// logger.info('\n' + httpRequestString(curi) + "\n\n" + rawResponseString(curi));
assertTrue(httpRequestString(curi).startsWith("GET /??blahblah HTTP/1.0\r\n"));
runDefaultChecks(curi, "requestLine");
}
public void testUrlWithSpaces() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/url with spaces?query%20with%20spaces");
fetcher().process(curi);
assertTrue(httpRequestString(curi).startsWith("GET /url%20with%20spaces?query%20with%20spaces HTTP/1.0\r\n"));
runDefaultChecks(curi, "requestLine");
curi = makeCrawlURI("http://localhost:7777/url%20with%20spaces?query with spaces");
fetcher().process(curi);
assertTrue(httpRequestString(curi).startsWith("GET /url%20with%20spaces?query%20with%20spaces HTTP/1.0\r\n"));
runDefaultChecks(curi, "requestLine");
}
public void testCharsets() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/cp1251");
fetcher().process(curi);
assertEquals("text/plain;charset=cp1251", curi.getHttpResponseHeader("content-type"));
assertEquals(Charset.forName("cp1251"), curi.getRecorder().getCharset());
assertTrue(Arrays.equals(FetchHTTPTest.CP1251_PAYLOAD, IOUtils.toByteArray(curi.getRecorder().getContentReplayInputStream())));
assertEquals("\u041A\u043E\u0447\u0430\u043D\u0438 \u041E\u0440\u043A"
+ "\u0435\u0441\u0442\u0430\u0440 \u0435 \u0435\u0434\u0435"
+ "\u043D \u043E\u0434 \u043D\u0430\u0458\u043F\u043E\u0437"
+ "\u043D\u0430\u0442\u0438\u0442\u0435 \u0438 \u043D\u0430"
+ "\u0458\u043F\u043E\u043F\u0443\u043B\u0430\u0440\u043D"
+ "\u0438\u0442\u0435 \u0431\u043B\u0435\u0445-\u043E\u0440"
+ "\u043A\u0435\u0441\u0442\u0440\u0438 \u0432\u043E \u0441"
+ "\u0432\u0435\u0442\u043E\u0442, \u043A\u043E\u0458 \u0433"
+ "\u043E \u0441\u043E\u0447\u0438\u043D\u0443\u0432\u0430"
+ "\u0430\u0442 \u0434\u0435\u0441\u0435\u0442\u043C\u0438"
+ "\u043D\u0430 \u0420\u043E\u043C\u0438-\u041C\u0430\u043A"
+ "\u0435\u0434\u043E\u043D\u0446\u0438 \u043F\u043E \u043F"
+ "\u043E\u0442\u0435\u043A\u043B\u043E \u043E\u0434 \u041A"
+ "\u043E\u0447\u0430\u043D\u0438, \u043F\u0440\u0435\u0434"
+ "\u0432\u043E\u0434\u0435\u043D\u0438 \u043E\u0434 \u0442"
+ "\u0440\u0443\u0431\u0430\u0447\u043E\u0442 \u041D\u0430"
+ "\u0430\u0442 (\u041D\u0435\u0430\u0442) \u0412\u0435\u043B"
+ "\u0438\u043E\u0432.\n",
curi.getRecorder().getContentReplayCharSequence().toString());
curi = makeCrawlURI("http://localhost:7777/unsupported-charset");
fetcher().process(curi);
assertEquals("text/plain;charset=UNSUPPORTED-CHARSET", curi.getHttpResponseHeader("content-type"));
assertTrue(curi.getAnnotations().contains("unsatisfiableCharsetInHeader:UNSUPPORTED-CHARSET"));
assertEquals(Charset.forName("latin1"), curi.getRecorder().getCharset()); // default fallback
runDefaultChecks(curi, "requestLine", "contentType");
curi = makeCrawlURI("http://localhost:7777/invalid-charset");
fetcher().process(curi);
assertEquals("text/plain;charset=%%INVALID-CHARSET%%", curi.getHttpResponseHeader("content-type"));
assertTrue(curi.getAnnotations().contains("unsatisfiableCharsetInHeader:%%INVALID-CHARSET%%"));
assertEquals(Charset.forName("latin1"), curi.getRecorder().getCharset()); // default fallback
runDefaultChecks(curi, "requestLine", "contentType");
}
// see https://webarchive.jira.com/browse/HER-2063
public void testHostHeaderDefaultPort() throws Exception {
CrawlURI curi = makeCrawlURI("http://example.com/");
fetcher().process(curi);
assertTrue(httpRequestString(curi).contains("Host: example.com\r\n"));
curi = makeCrawlURI("https://example.com/");
fetcher().process(curi);
assertTrue(httpRequestString(curi).contains("Host: example.com\r\n"));
}
public void testHttpPost() throws Exception {
CrawlURI curi = makeCrawlURI("http://localhost:7777/");
curi.setFetchType(FetchType.HTTP_POST);
List<NameValuePair> params = new LinkedList<NameValuePair>();
params.add(new BasicNameValuePair("name1", "value1"));
params.add(new BasicNameValuePair("name1", "value2"));
params.add(new BasicNameValuePair("funky name 2", "whoa crazy\t && 🍺 🍻 \n crazier \rooo"));
String submitData = URLEncodedUtils.format(params, "UTF-8");
assertEquals("name1=value1&name1=value2&funky+name+2=whoa+crazy%09+%26%26+%F0%9F%8D%BA+%F0%9F%8D%BB+%0A+crazier+%0Dooo", submitData);
curi.getData().put(CoreAttributeConstants.A_SUBMIT_DATA, submitData);
fetcher().process(curi);
assertTrue(httpRequestString(curi).startsWith("POST / HTTP/1.0\r\n"));
assertTrue(httpRequestString(curi).endsWith("\r\n\r\nname1=value1&name1=value2&funky+name+2=whoa+crazy%09+%26%26+%F0%9F%8D%BA+%F0%9F%8D%BB+%0A+crazier+%0Dooo"));
assertEquals(FetchType.HTTP_POST, curi.getFetchType());
runDefaultChecks(curi, "requestLine", "trailingCRLFCRLF", "fetchTypeGET");
}
@Override
protected FetchHTTP makeModule() throws IOException {
FetchHTTP fetchHttp = newTestFetchHttp(getUserAgentString());
fetchHttp.start();
return fetchHttp;
}
public static FetchHTTP newTestFetchHttp(String userAgentString) {
FetchHTTP fetchHttp = new FetchHTTP();
fetchHttp.setCookieStore(new SimpleCookieStore());
fetchHttp.setServerCache(new DefaultServerCache());
CrawlMetadata uap = new CrawlMetadata();
uap.setUserAgentTemplate(userAgentString);
fetchHttp.setUserAgentProvider(uap);
fetchHttp.start();
return fetchHttp;
}
@Override
protected void tearDown() throws Exception {
super.tearDown();
if (fetcher != null) {
fetcher.stop();
fetcher = null;
}
}
}