/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.tools;
import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.io.*;
import org.apache.nutch.fs.*;
import org.apache.nutch.util.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.plugin.*;
import org.apache.nutch.fetcher.FetcherOutput;
import java.io.EOFException;
import java.io.File;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Properties;
import java.util.logging.*;
/**
* Parse contents in one segment.
*
* <p>
* It assumes, under given segment, existence of ./fetcher_output/,
* which is typically generated after a non-parsing fetcher run
* (i.e., fetcher is started with option -noParsing).
*
* <p> Contents in one segment are parsed and saved in these steps:
* <li> (1) ./fetcher_output/ and ./content/ are looped together
* (possibly by multiple ParserThreads), and content is parsed for each entry.
* The entry number and resultant ParserOutput are saved in ./parser.unsorted.
* <li> (2) ./parser.unsorted is sorted by entry number, result saved as
* ./parser.sorted.
* <li> (3) ./parser.sorted and ./fetcher_output/ are looped together.
* At each entry, ParserOutput is split into ParseData and ParseText,
* which are saved in ./parse_data/ and ./parse_text/ respectively. Also
* updated is FetcherOutput with parsing status, which is saved in ./fetcher/.
*
* <p> In the end, ./fetcher/ should be identical to the one that results
* from a fetcher run WITHOUT option -noParsing.
*
* <p> By default, intermediates ./parser.unsorted and ./parser.sorted
* are removed at the end, unless option -noClean is used. However
* ./fetcher_output/ is kept intact.
*
* <p> Check Fetcher.java and FetcherOutput.java for further discussion.
*
* @author John Xing
*/
public class ParseSegment {
/** Logger used by this tool and by its parser threads. */
public static final Logger LOG =
LogFormatter.getLogger(ParseSegment.class.getName());
// Number of ParserThreads spawned by parse(). Read from the
// "parser.threads.parse" configuration entry (10 is passed as the fallback
// value); can be overridden with setThreadCount().
private int threadCount = // max number of threads
NutchConf.get().getInt("parser.threads.parse", 10);
// file system through which all segment files are checked, read,
// written and deleted
private NutchFileSystem nfs;
// segment dir
private String directory;
// readers for FetcherOutput (no-parsing) and Content; opened and closed
// by parse(), advanced together by the parser threads
private ArrayFile.Reader fetcherNPReader;
private ArrayFile.Reader contentReader;
// SequenceFile (unsorted) for ParserOutput; both stay null in a dry run
// because the constructor returns before assigning them
private File unsortedFile;
private SequenceFile.Writer parserOutputWriter;
// SequenceFile (sorted) for ParserOutput; stays null in a dry run
private File sortedFile;
// whether dryRun only (i.e., no real parsing is done)
private boolean dryRun = false;
// whether clean intermediate files (unsorted/sorted ParserOutput)
private boolean clean = true;
// entry (record number) in fetcherNPReader (same in contentReader);
// incremented by the parser threads while holding the ParseSegment lock
private long entry = -1;
// for stats; bytes, pages and errors are updated by the parser threads
// inside synchronized (ParseSegment.this) blocks
private long start; // start time of the current phase (parse/sort/save)
private long bytes; // total bytes parsed
private int pages; // total pages parsed
private int errors; // total pages errored
// thread group of all ParserThreads; parse() polls its activeCount()
private ThreadGroup group = new ThreadGroup("parser"); // our thread group
/**
 * Inner class ParserThread: a worker that repeatedly takes the next
 * (FetcherOutput, Content) pair from the shared readers, parses it and
 * appends the result, keyed by entry number, to the unsorted ParserOutput
 * file.
 */
private class ParserThread extends Thread {
  // current entry that this thread is parsing
  private long myEntry = -1;
  // timestamps for detailed stats, reported at FINE level by outputPage()
  private long t0,t1,t2,t3,t4,t5;

  public ParserThread() { super(group, "myThread"); }

  /**
   * This thread participates in looping through
   * entries of FetcherOutput and Content.
   *
   * The loop ends when either reader is exhausted, when only partial data
   * is available (EOFException), or when a SEVERE message has been logged.
   */
  public void run() {
    FetcherOutput fetcherOutput = new FetcherOutput();
    Content content = new Content();

    while (true) {
      if (LogFormatter.hasLoggedSevere())       // something bad happened
        break;                                  // exit

      // Per-entry state is reset on every pass. Otherwise a failure while
      // reading the next entry would be attributed to the previous entry's
      // URL and recorded a second time under the previous entry number.
      FetchListEntry fle = null;
      String url = null;

      t0 = System.currentTimeMillis();

      try {

        // must be read in order! thus synchronize threads.
        synchronized (ParseSegment.this) {
          t1 = System.currentTimeMillis();

          try {
            if (fetcherNPReader.next(fetcherOutput) == null ||
                contentReader.next(content) == null)
              return;
          } catch (EOFException eof) {
            // only partial data available, stop this thread,
            // other threads will be stopped also.
            return;
          }

          entry++;
          myEntry = entry;
          if (LOG.isLoggable(Level.FINE))
            LOG.fine("Read in entry "+entry);
        }

        t2 = System.currentTimeMillis();

        fle = fetcherOutput.getFetchListEntry();
        url = fle.getPage().getURL().toString();

        LOG.fine("parsing " + url);             // parse the page

        // safe guard against mismatched files
        if (!url.equals(content.getUrl())) {
          LOG.severe("Mismatched entries under "
            + FetcherOutput.DIR_NAME_NP + " (" + url +
            ") and " + Content.DIR_NAME + " (" + content.getUrl() + ")");
          continue;                             // loop head sees the SEVERE
        }

        handleContent(fetcherOutput, content);

        synchronized (ParseSegment.this) {
          pages++;                              // record successful parse
          bytes += content.getContent().length;
          if ((pages % 100) == 0)
            status();
        }

      } catch (ParseException e) {
        logError(url, e);
        handleError(new ParseStatus(e));

      } catch (Throwable t) {                   // an unchecked exception
        if (fle != null) {
          // the entry was read, so record it as a failed parse
          logError(url, t);
          handleError(new ParseStatus(t));
        } else {
          // failure before the entry was identified; report the cause
          LOG.severe("Unexpected exception: " + t);
        }
      }
    }
  }

  /** Logs a failed parse and counts it in the shared error total. */
  private void logError(String url, Throwable t) {
    LOG.info("parse of " + url + " failed with: " + t);
    if (LOG.isLoggable(Level.FINE))
      LOG.log(Level.FINE, "stack", t);          // stack trace
    synchronized (ParseSegment.this) {          // record failure
      errors++;
    }
  }

  /**
   * Parses the content with the parser chosen for its Content-Type and
   * URL, then writes the result. In a dry run it only logs what would be
   * done.
   *
   * @throws ParseException propagated from the parser lookup or the parse
   */
  private void handleContent(FetcherOutput fo, Content content)
    throws ParseException {

    String url = fo.getUrl().toString();

    if (content != null) {
      String contentType = content.getMetadata().getProperty("Content-Type");
      if (ParseSegment.this.dryRun) {
        LOG.info("To be handled as Content-Type: "+contentType);
        return;
      }
      Parser parser = ParserFactory.getParser(contentType, url);
      Parse parse = parser.getParse(content);
      outputPage(new ParseText(parse.getText()), parse.getData());
    } else {
      if (ParseSegment.this.dryRun) {
        LOG.info("To be handled as no content");
        return;
      }
      outputPage(new ParseText(""),
        new ParseData(new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_MISSING_CONTENT),
                      "", new Outlink[0], new Properties()));
    }
  }

  /**
   * Writes an empty ParseText plus a ParseData carrying the given failure
   * status for the current entry. In a dry run it only logs.
   */
  private void handleError(ParseStatus status) {
    if (ParseSegment.this.dryRun) {
      LOG.info("To be handled as error");
      return;
    }
    outputPage(new ParseText(""),
      new ParseData(status, "", new Outlink[0], new Properties()));
  }

  /**
   * Appends (myEntry, ParserOutput) to the shared writer while holding the
   * writer's lock. Any failure is logged as SEVERE, not rethrown.
   */
  private void outputPage
    (ParseText parseText, ParseData parseData) {
    try {
      t3 = System.currentTimeMillis();
      synchronized (parserOutputWriter) {
        t4 = System.currentTimeMillis();
        parserOutputWriter.append(new LongWritable(myEntry),
          new ParserOutput(parseData, parseText));
        t5 = System.currentTimeMillis();
        if (LOG.isLoggable(Level.FINE))
          LOG.fine("Entry: "+myEntry
            +" "+parseData.getMetadata().getProperty("Content-Length")
            +" wait="+(t1-t0) +" read="+(t2-t1) +" parse="+(t3-t2)
            +" wait="+(t4-t3) +" write="+(t5-t4) +"ms");
      }
    } catch (Throwable t) {
      t.printStackTrace();
      LOG.severe("error writing output:" + t.toString());
    }
  }

}
/**
* Inner class ParserOutput: ParseData + ParseText
*/
private class ParserOutput extends VersionedWritable {
public static final String DIR_NAME = "parser";
private final static byte VERSION = 2;
private ParseData parseData = new ParseData();
private ParseText parseText = new ParseText();
public ParserOutput() {}
public ParserOutput(ParseData parseData, ParseText parseText) {
this.parseData = parseData;
this.parseText = parseText;
}
public byte getVersion() { return VERSION; }
public ParseData getParseData() {
return this.parseData;
}
public ParseText getParseText() {
return this.parseText;
}
public final void readFields(DataInput in) throws IOException {
super.readFields(in); // check version
parseData.readFields(in);
parseText.readFields(in);
return;
}
public final void write(DataOutput out) throws IOException {
super.write(out); // write version
parseData.write(out);
parseText.write(out);
return;
}
}
/**
* ParseSegment constructor
*/
public ParseSegment(NutchFileSystem nfs, String directory, boolean dryRun)
throws IOException {
File file;
this.nfs = nfs;
this.directory = directory;
this.dryRun = dryRun;
// FetcherOutput.DIR_NAME_NP must exist
file = new File(directory, FetcherOutput.DIR_NAME_NP);
if (!nfs.exists(file))
throw new IOException("Directory missing: "+FetcherOutput.DIR_NAME_NP);
if (dryRun)
return;
// clean old FetcherOutput.DIR_NAME
file = new File(directory, FetcherOutput.DIR_NAME);
if (nfs.exists(file)) {
LOG.info("Deleting old "+file.getName());
nfs.delete(file);
}
// clean old unsortedFile
this.unsortedFile = new File(directory, ParserOutput.DIR_NAME+".unsorted");
if (nfs.exists(this.unsortedFile)) {
LOG.info("Deleting old "+this.unsortedFile.getName());
nfs.delete(this.unsortedFile);
}
// clean old sortedFile
this.sortedFile = new File(directory, ParserOutput.DIR_NAME+".sorted");
if (nfs.exists(this.sortedFile)) {
LOG.info("Deleting old "+this.sortedFile.getName());
nfs.delete(this.sortedFile);
}
// clean old ParseData.DIR_NAME
file = new File(directory, ParseData.DIR_NAME);
if (nfs.exists(file)) {
LOG.info("Deleting old "+file.getName());
nfs.delete(file);
}
// clean old ParseText.DIR_NAME
file = new File(directory, ParseText.DIR_NAME);
if (nfs.exists(file)) {
LOG.info("Deleting old "+file.getName());
nfs.delete(file);
}
}
/**
 * Sets how many parser threads parse() will spawn.
 *
 * @param threadCount number of ParserThreads to start
 */
public void setThreadCount(int threadCount) {
  this.threadCount = threadCount;
}
/**
 * Applies one logging level to this tool's logger and to the plugin
 * repository and parser factory loggers, then reports the level.
 *
 * @param level the level to log at
 */
public static void setLogLevel(Level level) {
  LOG.setLevel(level);
  PluginRepository.LOG.setLevel(level);
  ParserFactory.LOG.setLevel(level);

  LOG.info("logging at " + level);
}
/**
 * Chooses whether sort() and save() delete the intermediate unsorted and
 * sorted ParserOutput files once they are done with them.
 *
 * @param clean true to delete intermediates, false to keep them
 */
public void setClean(boolean clean) {
  this.clean = clean;
}
/**
 * Logs the running totals (pages, errors, bytes, elapsed ms) followed by
 * the derived rates (pages/s, kb/s, bytes/page) since the start time.
 */
public void status() {
  long elapsedMs = System.currentTimeMillis() - start;

  String totals = "status: "
    + pages + " pages, "
    + errors + " errors, "
    + bytes + " bytes, "
    + elapsedMs + " ms";
  LOG.info(totals);

  // float arithmetic throughout, so a zero divisor cannot throw
  float elapsedSecs = elapsedMs/1000.0f;
  float pagesPerSec = ((float)pages)/elapsedSecs;
  float kbitsPerSec = ((float)bytes*8/1024)/elapsedSecs;
  float bytesPerPage = ((float)bytes)/pages;

  String rates = "status: "
    + pagesPerSec+" pages/s, "
    + kbitsPerSec+" kb/s, "
    + bytesPerPage + " bytes/page";
  LOG.info(rates);
}
/**
 * Parse contents by multiple threads and save as unsorted ParserOutput.
 *
 * Opens the fetcher-output and content readers (and, unless this is a dry
 * run, the unsorted writer), starts threadCount parser threads, and polls
 * once a second until the thread group has no active threads left.
 *
 * Note: when a SEVERE message has been logged this method throws before
 * reaching the close() calls, so the readers and writer stay open.
 *
 * @throws RuntimeException if a SEVERE message was logged while waiting
 */
public void parse() throws IOException, InterruptedException {

  String fetcherPath = new File(directory, FetcherOutput.DIR_NAME_NP).getPath();
  fetcherNPReader = new ArrayFile.Reader(nfs, fetcherPath);

  String contentPath = new File(directory, Content.DIR_NAME).getPath();
  contentReader = new ArrayFile.Reader(nfs, contentPath);

  if (!this.dryRun) {
    parserOutputWriter = new SequenceFile.Writer
      (nfs, unsortedFile.getPath(), LongWritable.class, ParserOutput.class);
  }

  start = System.currentTimeMillis();

  // spawn threads
  int spawned = 0;
  while (spawned < threadCount) {
    new ParserThread().start();
    spawned++;
  }

  // wait for threads to finish; always sleeps at least once
  boolean running = true;
  while (running) {
    Thread.sleep(1000);

    if (LogFormatter.hasLoggedSevere())
      throw new RuntimeException("SEVERE error logged.  Exiting parser.");

    running = group.activeCount() > 0;
  }

  fetcherNPReader.close();
  contentReader.close();
  if (!this.dryRun)
    parserOutputWriter.close();

  status();                                     // print final status
}
/**
 * Sorts the unsorted ParserOutput file by entry number into the sorted
 * file, logs the throughput, and deletes the unsorted file when cleaning
 * is enabled. Does nothing in a dry run.
 */
public void sort() throws IOException {

  if (!this.dryRun) {
    LOG.info("Sorting ParserOutput");

    start = System.currentTimeMillis();

    SequenceFile.Sorter sorter = new SequenceFile.Sorter
      (nfs, new LongWritable.Comparator(), ParserOutput.class);
    sorter.sort(unsortedFile.getPath(), sortedFile.getPath());

    // every entry counted by parse() is either a page or an error
    int total = pages+errors;
    double localSecs = (System.currentTimeMillis() - start) / 1000.0;
    LOG.info("Sorted: " + total + " entries in " + localSecs + "s, "
      + (total/localSecs) + " entries/s");

    if (this.clean) {
      LOG.info("Deleting intermediate "+unsortedFile.getName());
      nfs.delete(unsortedFile);
    }
  }
}
/**
 * Splits the sorted ParserOutput into ParseData and ParseText, and copies
 * the matching FetcherOutput entries from the no-parsing fetcher output
 * into the regular fetcher output directory. Each FetcherOutput entry is
 * appended exactly as it was read; this method does not modify it.
 *
 * Relies on the page and error totals accumulated by parse() on this same
 * instance to verify that no entry went missing. Deletes the sorted file
 * afterwards when cleaning is enabled. Does nothing in a dry run.
 *
 * @throws IOException if the fetcher output runs out of entries or its
 *         entry numbers do not line up with the sorted ParserOutput, or
 *         if the number of entries saved differs from pages + errors
 */
public void save() throws IOException {

  if (this.dryRun)
    return;

  LOG.info("Saving ParseData and ParseText separately");

  start = System.currentTimeMillis();

  SequenceFile.Reader parserOutputReader
    = new SequenceFile.Reader(nfs, sortedFile.getPath());

  // a separate reader from the fetcherNPReader field used by parse()
  ArrayFile.Reader fetcherReader = new ArrayFile.Reader(nfs,
    (new File(directory, FetcherOutput.DIR_NAME_NP)).getPath());

  ArrayFile.Writer fetcherWriter = new ArrayFile.Writer(nfs,
    (new File(directory, FetcherOutput.DIR_NAME)).getPath(),
    FetcherOutput.class);

  ArrayFile.Writer parseDataWriter = new ArrayFile.Writer(nfs,
    (new File(directory, ParseData.DIR_NAME)).getPath(), ParseData.class);
  ArrayFile.Writer parseTextWriter = new ArrayFile.Writer(nfs,
    (new File(directory, ParseText.DIR_NAME)).getPath(), ParseText.class);

  try {
    LongWritable key = new LongWritable();
    ParserOutput val = new ParserOutput();
    FetcherOutput fo = new FetcherOutput();
    int count = 0;

    while (parserOutputReader.next(key,val)) {
      // safe guarding: the fetcher output must still have an entry, and
      // it must carry the same entry number as the parser output
      if (fetcherReader.next(fo) == null
          || fetcherReader.key() != key.get())
        throw new IOException("Mismatch between entries under "
          + FetcherOutput.DIR_NAME_NP + " and in " + sortedFile.getName());

      fetcherWriter.append(fo);
      parseDataWriter.append(val.getParseData());
      parseTextWriter.append(val.getParseText());

      count++;
    }

    // safe guard! make sure there are identical entries
    // in (fetcher, content) and in (parseData, parseText)
    if (count != (pages+errors))
      throw new IOException("Missing entries: expect "+(pages+errors)
        +", but have "+count+" entries instead.");

  } finally {
    fetcherReader.close();
    fetcherWriter.close();
    parseDataWriter.close();
    parseTextWriter.close();
    parserOutputReader.close();
  }

  double localSecs = (System.currentTimeMillis() - start) / 1000.0;
  LOG.info("Saved: " + (pages+errors) + " entries in " + localSecs + "s, "
    + ((pages+errors)/localSecs) + " entries/s");

  if (this.clean) {
    LOG.info("Deleting intermediate "+sortedFile.getName());
    nfs.delete(sortedFile);
  }
}
/**
 * Command-line entry point: parses one segment directory in three phases
 * (parse, sort, save).
 *
 * Prints the usage text and exits with status -1 when no arguments are
 * given, when -threads or -logLevel has no value, or when no segment
 * directory is named. The file system is closed before exiting in every
 * case except the no-arguments one, where it is never opened.
 *
 * @throws Exception any failure from option parsing (for example a
 *         non-numeric thread count or an unknown log level) or from the
 *         parse, sort and save phases
 */
public static void main(String[] args) throws Exception {
  int threadCount = -1;
  boolean showThreadID = false;
  boolean dryRun = false;
  String logLevel = "info";
  boolean clean = true;
  String directory = null;

  String usage = "Usage: ParseSegment (-local | -ndfs <namenode:port>) [-threads n] [-showThreadID] [-dryRun] [-logLevel level] [-noClean] dir";

  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }

  // non-null once the command line turns out to be unusable
  String problem = null;

  // parse command line; file-system arguments consumed here are left as
  // null slots in args, hence the null checks below
  NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
  try {
    for (int i = 0; i < args.length && problem == null; i++) {
      String arg = args[i];
      if (arg == null) {
        continue;
      } else if (arg.equals("-threads") || arg.equals("-logLevel")) {
        // both options need a value; do not run past the end of args
        String value = (i + 1 < args.length) ? args[i + 1] : null;
        if (value == null) {
          problem = "Missing value for " + arg;
        } else {
          i++;
          if (arg.equals("-threads"))
            threadCount = Integer.parseInt(value);
          else
            logLevel = value;
        }
      } else if (arg.equals("-showThreadID")) {
        showThreadID = true;
      } else if (arg.equals("-dryRun")) {
        dryRun = true;
      } else if (arg.equals("-noClean")) {
        clean = false;
      } else {
        directory = arg;
      }
    }

    if (problem == null && directory == null)
      problem = "Missing segment directory";

    if (problem == null) {
      ParseSegment parseSegment = new ParseSegment(nfs, directory, dryRun);

      // Level names are upper-case ASCII; use a fixed locale so the result
      // does not depend on the platform default (e.g. Turkish dotted I)
      ParseSegment.setLogLevel
        (Level.parse(logLevel.toUpperCase(java.util.Locale.ENGLISH)));

      if (threadCount != -1)
        parseSegment.setThreadCount(threadCount);
      if (showThreadID)
        LogFormatter.setShowThreadIDs(true);

      parseSegment.setClean(clean);

      parseSegment.parse();
      parseSegment.sort();
      parseSegment.save();
    }
  } finally {
    nfs.close();
  }

  if (problem != null) {
    System.err.println(problem);
    System.err.println(usage);
    System.exit(-1);
  }
}
}