/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.io;
import java.util.Collection;
import java.util.Iterator;
import java.util.logging.Formatter;
import java.util.logging.LogRecord;
import org.archive.io.Preformatter;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.MimetypeUtils;
/**
* Formatter for 'crawl.log'. Expects completed CrawlURI as parameter.
*
* @author gojomo
*/
public class UriProcessingFormatter
extends Formatter implements Preformatter, CoreAttributeConstants {
private final static String NA = "-";
/**
* Guess at line length. Used to preallocated the buffer we accumulate the
* log line in. Hopefully we get it right most of the time and no need to
* enlarge except in the rare case.
*
* <p>
* In a sampling of actual Aug 2014 Archive-It crawl logs I found that a
* line length 1000 characters was around the 99th percentile (only 1 in 100
* is longer than that). We put more information in the crawl log now than
* was originally estimated. Exactly what goes in can depend on the
* configuration as well.
*/
private final static int GUESS_AT_LINE_LENGTH = 1000;
/**
* Reusable assembly buffer.
*/
protected final ThreadLocal<StringBuilder> bufLocal =
new ThreadLocal<StringBuilder>() {
@Override
protected StringBuilder initialValue() {
return new StringBuilder(GUESS_AT_LINE_LENGTH);
}
};
protected final ThreadLocal<String> cachedFormat = new ThreadLocal<String>();
protected boolean logExtraInfo;
public UriProcessingFormatter(boolean logExtraInfo) {
this.logExtraInfo = logExtraInfo;
}
public String format(LogRecord lr) {
if(cachedFormat.get()!=null) {
return cachedFormat.get();
}
CrawlURI curi = (CrawlURI)lr.getParameters()[0];
String length = NA;
String mime = null;
if (curi.isHttpTransaction()) {
if(curi.getContentLength() >= 0) {
length = Long.toString(curi.getContentLength());
} else if (curi.getContentSize() > 0) {
length = Long.toString(curi.getContentSize());
}
} else {
if (curi.getContentSize() > 0) {
length = Long.toString(curi.getContentSize());
}
}
mime = MimetypeUtils.truncate(curi.getContentType());
long time = System.currentTimeMillis();
String via = curi.flattenVia();
String digest = curi.getContentDigestSchemeString();
String sourceTag = curi.containsDataKey(A_SOURCE_TAG)
? curi.getSourceTag()
: null;
StringBuilder buffer = bufLocal.get();
buffer.setLength(0);
buffer.append(ArchiveUtils.getLog17Date(time))
.append(" ")
.append(ArchiveUtils.padTo(curi.getFetchStatus(), 5))
.append(" ")
.append(ArchiveUtils.padTo(length, 10))
.append(" ")
.append(curi.getUURI().toString())
.append(" ")
.append(checkForNull(curi.getPathFromSeed()))
.append(" ")
.append(checkForNull(via))
.append(" ")
.append(mime)
.append(" ")
.append("#")
// Pad threads to be 3 digits. For Igor.
.append(ArchiveUtils.padTo(
Integer.toString(curi.getThreadNumber()), 3, '0'))
.append(" ");
// arcTimeAndDuration
if(curi.containsDataKey(A_FETCH_COMPLETED_TIME)) {
long completedTime = curi.getFetchCompletedTime();
long beganTime = curi.getFetchBeginTime();
buffer.append(ArchiveUtils.get17DigitDate(beganTime))
.append("+")
.append(Long.toString(completedTime - beganTime));
} else {
buffer.append(NA);
}
buffer.append(" ")
.append(checkForNull(digest))
.append(" ")
.append(checkForNull(sourceTag))
.append(" ");
Collection<String> anno = curi.getAnnotations();
if ((anno != null) && (anno.size() > 0)) {
Iterator<String> iter = anno.iterator();
buffer.append(iter.next());
while (iter.hasNext()) {
buffer.append(',');
buffer.append(iter.next());
}
} else {
buffer.append(NA);
}
if (logExtraInfo) {
// XXX would we rather have "-" if info's empty?
buffer.append(" ").append(curi.getExtraInfo());
}
buffer.append("\n");
return buffer.toString();
}
/**
* @param str String to check.
* @return Return passed string or <code>NA</code> if null.
*/
protected String checkForNull(String str) {
return (str == null || str.length() <= 0)? NA: str;
}
@Override
public void clear() {
cachedFormat.set(null);
}
@Override
public void preformat(LogRecord record) {
cachedFormat.set(format(record));
}
}