Package org.apache.nutch.tools

Source Code of org.apache.nutch.tools.ParseSegment$ParserThread

/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.tools;

import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.io.*;
import org.apache.nutch.fs.*;
import org.apache.nutch.util.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.plugin.*;

import org.apache.nutch.fetcher.FetcherOutput;

import java.io.EOFException;
import java.io.File;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import java.util.Properties;
import java.util.logging.*;

/**
* Parse contents in one segment.
*
* <p>
* It assumes, under given segment, existence of ./fetcher_output/,
* which is typically generated after a non-parsing fetcher run
* (i.e., fetcher is started with option -noParsing).
*
* <p> Contents in one segment are parsed and saved in these steps:
* <li> (1) ./fetcher_output/ and ./content/ are looped together
* (possibly by multiple ParserThreads), and content is parsed for each entry.
* The entry number and resultant ParserOutput are saved in ./parser.unsorted.
* <li> (2) ./parser.unsorted is sorted by entry number, result saved as
* ./parser.sorted.
* <li> (3) ./parser.sorted and ./fetcher_output/ are looped together.
* At each entry, ParserOutput is split into ParseData and ParseText,
* which are saved in ./parse_data/ and ./parse_text/ respectively. Also
* updated is FetcherOutput with parsing status, which is saved in ./fetcher/.
*
* <p> In the end, ./fetcher/ should be identical to one resulted from
* fetcher run WITHOUT option -noParsing.
*
* <p> By default, intermediates ./parser.unsorted and ./parser.sorted
* are removed at the end, unless option -noClean is used. However
* ./fetcher_output/ is kept intact.
*
* <p> Check Fetcher.java and FetcherOutput.java for further discussion.
*
* @author John Xing
*/

public class ParseSegment {

  // Logger for this tool; setLogLevel() adjusts its level.
  public static final Logger LOG =
    LogFormatter.getLogger(ParseSegment.class.getName());

  // Number of ParserThreads spawned by parse(). Read from the
  // "parser.threads.parse" configuration key (fallback 10) and
  // overridable through setThreadCount().
  private int threadCount =                       // max number of threads
    NutchConf.get().getInt("parser.threads.parse", 10);

  // file system used for every read, write and delete done by this class
  private NutchFileSystem nfs;

  // segment dir
  private String directory;

  // readers for FetcherOutput (no-parsing) and Content;
  // opened in parse() and read by ParserThreads while holding this
  // object's monitor
  private ArrayFile.Reader fetcherNPReader;
  private ArrayFile.Reader contentReader;

  // SequenceFile (unsorted) for ParserOutput;
  // both stay null in dry-run mode (the constructor returns before
  // assigning unsortedFile and parse() does not open the writer)
  private File unsortedFile;
  private SequenceFile.Writer parserOutputWriter;

  // SequenceFile (sorted) for ParserOutput; also left null in dry-run mode
  private File sortedFile;

  // whether dryRun only (i.e., no real parsing is done)
  private boolean dryRun = false;

  // whether clean intermediate files
  private boolean clean = true;

  // entry (record number) in fetcherNPReader (same in contentReader);
  // incremented by ParserThreads while holding this object's monitor
  private long entry = -1;

  // for stats; start is reset at the beginning of parse(), sort() and save(),
  // the counters are updated by ParserThreads under this object's monitor
  private long start;                             // start time
  private long bytes;                             // total bytes parsed
  private int pages;                              // total pages parsed
  private int errors;                             // total pages errored

  // every ParserThread joins this group; parse() polls its activeCount()
  private ThreadGroup group = new ThreadGroup("parser"); // our thread group

  /**
   * Inner class ParserThread
   */
  private class ParserThread extends Thread {

    // current entry that this thread is parsing
    private long myEntry = -1;

    // for detailed stats
    private long t0,t1,t2,t3,t4,t5;

    public ParserThread() { super(group, "myThread"); }

    /**
     * This thread participates in looping through
     * entries of FetcherOutput and Content
     */
    public void run() {

      FetcherOutput fetcherOutput = new FetcherOutput();
      Content content = new Content();

      FetchListEntry fle = null;
      String url = null;

      while (true) {
        if (LogFormatter.hasLoggedSevere())       // something bad happened
          break;                                  // exit

        t0 = System.currentTimeMillis();

        try {

          // must be read in order! thus synchronize threads.
          synchronized (ParseSegment.this) {
            t1 = System.currentTimeMillis();

            try {
              if (fetcherNPReader.next(fetcherOutput) == null ||
                contentReader.next(content) == null)
              return;
            } catch (EOFException eof) {
              // only partial data available, stop this thread,
              // other threads will be stopped also.
              return;
            }

            entry++;
            myEntry = entry;
            if (LOG.isLoggable(Level.FINE))
              LOG.fine("Read in entry "+entry);

            // safe guard against mismatched files
            //if (entry != fetcherNPReader.key() ||
            //    entry != contentReader.key()) {
            //  LOG.severe("Mismatched entries under "
            //    + FetcherOutput.DIR_NAME_NP + " and " + Content.DIR_NAME);
            //  continue;
            //}
          }

          t2 = System.currentTimeMillis();

          fle = fetcherOutput.getFetchListEntry();
          url = fle.getPage().getURL().toString();

          LOG.fine("parsing " + url);            // parse the page

          // safe guard against mismatched files
          if (!url.equals(content.getUrl())) {
            LOG.severe("Mismatched entries under "
              + FetcherOutput.DIR_NAME_NP + " (" + url +
              ") and " + Content.DIR_NAME + " (" + content.getUrl() + ")");
            continue;
          }

          handleContent(fetcherOutput, content);
          synchronized (ParseSegment.this) {
            pages++;                    // record successful parse
            bytes += content.getContent().length;
            if ((pages % 100) == 0)
              status();
          }

        } catch (ParseException e) {
          logError(url, e);
          handleError(new ParseStatus(e));

        } catch (Throwable t) {                   // an unchecked exception
          if (fle != null) {
            logError(url, t);
            handleError(new ParseStatus(t));
          } else {
            LOG.severe("Unexpected exception");
          }
        }
      }
    }

    private void logError(String url, Throwable t) {
      LOG.info("parse of " + url + " failed with: " + t);
      if (LOG.isLoggable(Level.FINE))
        LOG.log(Level.FINE, "stack", t);               // stack trace
      synchronized (ParseSegment.this) {               // record failure
        errors++;
      }
    }

    private void handleContent(FetcherOutput fo, Content content)
      throws ParseException {

      String url = fo.getUrl().toString();
      if (content != null) {
        String contentType = content.getMetadata().getProperty("Content-Type");
        if (ParseSegment.this.dryRun) {
          LOG.info("To be handled as Content-Type: "+contentType);
          return;
        }

        Parser parser = ParserFactory.getParser(contentType, url);
        Parse parse = parser.getParse(content);
        outputPage(new ParseText(parse.getText()), parse.getData());
       
      } else {
        if (ParseSegment.this.dryRun) {
          LOG.info("To be handled as no content");
          return;
        }
        outputPage(new ParseText(""),
                new ParseData(new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_MISSING_CONTENT),
                        "", new Outlink[0], new Properties()));
      }
    }

    private void handleError(ParseStatus status) {
      if (ParseSegment.this.dryRun) {
        LOG.info("To be handled as error");
        return;
      }
      outputPage(new ParseText(""),
                 new ParseData(status, "", new Outlink[0], new Properties()));
    }
     
    private void outputPage
      (ParseText parseText, ParseData parseData) {
      try {
        t3 = System.currentTimeMillis();
        synchronized (parserOutputWriter) {
          t4 = System.currentTimeMillis();
          parserOutputWriter.append(new LongWritable(myEntry),
            new ParserOutput(parseData, parseText));
          t5 = System.currentTimeMillis();
          if (LOG.isLoggable(Level.FINE))
            LOG.fine("Entry: "+myEntry
              +" "+parseData.getMetadata().getProperty("Content-Length")
              +" wait="+(t1-t0) +" read="+(t2-t1) +" parse="+(t3-t2)
              +" wait="+(t4-t3) +" write="+(t5-t4) +"ms");
        }
      } catch (Throwable t) {
        t.printStackTrace();
        LOG.severe("error writing output:" + t.toString());
      }
    }

  }

  /**
   * Inner class ParserOutput: ParseData + ParseText
   */
  private class ParserOutput extends VersionedWritable {
    public static final String DIR_NAME = "parser";

    private final static byte VERSION = 2;

    private ParseData parseData = new ParseData();
    private ParseText parseText = new ParseText();

    public ParserOutput() {}
   
    public ParserOutput(ParseData parseData, ParseText parseText) {
      this.parseData = parseData;
      this.parseText = parseText;
    }

    public byte getVersion() { return VERSION; }

    public ParseData getParseData() {
      return this.parseData;
    }

    public ParseText getParseText() {
      return this.parseText;
    }

    public final void readFields(DataInput in) throws IOException {
      super.readFields(in);                         // check version
      parseData.readFields(in);
      parseText.readFields(in);
      return;
    }

    public final void write(DataOutput out) throws IOException {
      super.write(out);                             // write version
      parseData.write(out);
      parseText.write(out);
      return;
    }
  }
     
  /**
   * ParseSegment constructor
   */
  public ParseSegment(NutchFileSystem nfs, String directory, boolean dryRun)
    throws IOException {

    File file;

    this.nfs = nfs;
    this.directory = directory;
    this.dryRun = dryRun;

    // FetcherOutput.DIR_NAME_NP must exist
    file = new File(directory, FetcherOutput.DIR_NAME_NP);
    if (!nfs.exists(file))
      throw new IOException("Directory missing: "+FetcherOutput.DIR_NAME_NP);

    if (dryRun)
      return;

    // clean old FetcherOutput.DIR_NAME
    file = new File(directory, FetcherOutput.DIR_NAME);
    if (nfs.exists(file)) {
      LOG.info("Deleting old "+file.getName());
      nfs.delete(file);
    }

    // clean old unsortedFile
    this.unsortedFile = new File(directory, ParserOutput.DIR_NAME+".unsorted");
    if (nfs.exists(this.unsortedFile)) {
      LOG.info("Deleting old "+this.unsortedFile.getName());
      nfs.delete(this.unsortedFile);
    }

    // clean old sortedFile
    this.sortedFile = new File(directory, ParserOutput.DIR_NAME+".sorted");
    if (nfs.exists(this.sortedFile)) {
      LOG.info("Deleting old "+this.sortedFile.getName());
      nfs.delete(this.sortedFile);
    }

    // clean old ParseData.DIR_NAME
    file = new File(directory, ParseData.DIR_NAME);
    if (nfs.exists(file)) {
      LOG.info("Deleting old "+file.getName());
      nfs.delete(file);
    }

    // clean old ParseText.DIR_NAME
    file = new File(directory, ParseText.DIR_NAME);
    if (nfs.exists(file)) {
      LOG.info("Deleting old "+file.getName());
      nfs.delete(file);
    }

  }

  /** Set thread count */
  public void setThreadCount(int threadCount) {
    this.threadCount=threadCount;
  }

  /** Set the logging level. */
  public static void setLogLevel(Level level) {
    LOG.setLevel(level);
    PluginRepository.LOG.setLevel(level);
    ParserFactory.LOG.setLevel(level);
    LOG.info("logging at " + level);
  }

  /** Set if clean intermediates. */
  public void setClean(boolean clean) {
    this.clean = clean;
  }

  /** Display the status of the parser run. */
  public void status() {
    long ms = System.currentTimeMillis() - start;
    LOG.info("status: "
             + pages + " pages, "
             + errors + " errors, "
             + bytes + " bytes, "
             + ms + " ms");
    LOG.info("status: "
             + (((float)pages)/(ms/1000.0f))+" pages/s, "
             + (((float)bytes*8/1024)/(ms/1000.0f))+" kb/s, "
             + (((float)bytes)/pages) + " bytes/page");
  }

  /** Parse contents by multiple threads and save as unsorted ParserOutput */
  public void parse() throws IOException, InterruptedException {

    fetcherNPReader = new ArrayFile.Reader
      (nfs, (new File(directory, FetcherOutput.DIR_NAME_NP)).getPath());
    contentReader = new ArrayFile.Reader
      (nfs, (new File(directory, Content.DIR_NAME)).getPath());

    if (!this.dryRun) {
      parserOutputWriter = new SequenceFile.Writer
        (nfs, unsortedFile.getPath(), LongWritable.class, ParserOutput.class);
    }

    start = System.currentTimeMillis();

    for (int i = 0; i < threadCount; i++) {       // spawn threads
      ParserThread thread = new ParserThread();
      thread.start();
    }

    do {
      Thread.sleep(1000);

      if (LogFormatter.hasLoggedSevere())
        throw new RuntimeException("SEVERE error logged.  Exiting parser.");

    } while (group.activeCount() > 0);            // wait for threads to finish

    fetcherNPReader.close();
    contentReader.close();
    if (!this.dryRun)
      parserOutputWriter.close();

    status();                                     // print final status
  }

  /** Sort ParserOutput */
  public void sort() throws IOException {

    if (this.dryRun)
      return;

    LOG.info("Sorting ParserOutput");

    start = System.currentTimeMillis();

    SequenceFile.Sorter sorter = new SequenceFile.Sorter
      (nfs, new LongWritable.Comparator(), ParserOutput.class);

    sorter.sort(unsortedFile.getPath(), sortedFile.getPath());

    double localSecs = (System.currentTimeMillis() - start) / 1000.0;
    LOG.info("Sorted: " + (pages+errors) + " entries in " + localSecs + "s, "
      + ((pages+errors)/localSecs) + " entries/s");

    if (this.clean) {
      LOG.info("Deleting intermediate "+unsortedFile.getName());
      nfs.delete(unsortedFile);
    }

    return;
  }

  /**
   * Split sorted ParserOutput into ParseData and ParseText,
   * and generate new FetcherOutput with updated status
   */
  public void save() throws IOException {

    if (this.dryRun)
      return;

    LOG.info("Saving ParseData and ParseText separately");

    start = System.currentTimeMillis();

    SequenceFile.Reader parserOutputReader
      = new SequenceFile.Reader(nfs, sortedFile.getPath());

    ArrayFile.Reader fetcherNPReader = new ArrayFile.Reader(nfs,
      (new File(directory, FetcherOutput.DIR_NAME_NP)).getPath());

    ArrayFile.Writer fetcherWriter = new ArrayFile.Writer(nfs,
      (new File(directory, FetcherOutput.DIR_NAME)).getPath(),
      FetcherOutput.class);

    ArrayFile.Writer parseDataWriter = new ArrayFile.Writer(nfs,
      (new File(directory, ParseData.DIR_NAME)).getPath(), ParseData.class);
    ArrayFile.Writer parseTextWriter = new ArrayFile.Writer(nfs,
      (new File(directory, ParseText.DIR_NAME)).getPath(), ParseText.class);

    try {
      LongWritable key = new LongWritable();
      ParserOutput val = new ParserOutput();
      FetcherOutput fo = new FetcherOutput();
      int count = 0;
      int status;
      while (parserOutputReader.next(key,val)) {
        fetcherNPReader.next(fo);
        // safe guarding
        if (fetcherNPReader.key() != key.get())
          throw new IOException("Mismatch between entries under "
            + FetcherOutput.DIR_NAME_NP + " and in " + sortedFile.getName());
        fetcherWriter.append(fo);
        parseDataWriter.append(val.getParseData());
        parseTextWriter.append(val.getParseText());
        count++;
      }
      // safe guard! make sure there are identical entries
      // in (fetcher, content) and in (parseData, parseText)
      if (count != (pages+errors))
        throw new IOException("Missing entries: expect "+(pages+errors)
          +", but have "+count+" entries instead.");
    } finally {
      fetcherNPReader.close();
      fetcherWriter.close();
      parseDataWriter.close();
      parseTextWriter.close();
      parserOutputReader.close();
    }

    double localSecs = (System.currentTimeMillis() - start) / 1000.0;
    LOG.info("Saved: " + (pages+errors) + " entries in " + localSecs + "s, "
      + ((pages+errors)/localSecs) + " entries/s");

    if (this.clean) {
      LOG.info("Deleting intermediate "+sortedFile.getName());
      nfs.delete(sortedFile);
    }

    return;
  }

  /** main method */
  public static void main(String[] args) throws Exception {
    int threadCount = -1;
    boolean showThreadID = false;
    boolean dryRun = false;
    String logLevel = "info";
    boolean clean = true;
    String directory = null;

    String usage = "Usage: ParseSegment (-local | -ndfs <namenode:port>) [-threads n] [-showThreadID] [-dryRun] [-logLevel level] [-noClean] dir";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }
     
    // parse command line
    NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);

    for (int i = 0; i < args.length; i++) {
      if (args[i] == null) {
          continue;
      } else if (args[i].equals("-threads")) {
        threadCount =  Integer.parseInt(args[++i]);
      } else if (args[i].equals("-showThreadID")) {
        showThreadID = true;
      } else if (args[i].equals("-dryRun")) {
        dryRun = true;
      } else if (args[i].equals("-logLevel")) {
        logLevel = args[++i];
      } else if (args[i].equals("-noClean")) {
        clean = false;
      } else {
        directory = args[i];
      }
    }

    try {

      ParseSegment parseSegment = new ParseSegment(nfs, directory, dryRun);

      parseSegment.setLogLevel
        (Level.parse((new String(logLevel)).toUpperCase()));

      if (threadCount != -1)
        parseSegment.setThreadCount(threadCount);
      if (showThreadID)
        LogFormatter.setShowThreadIDs(showThreadID);

      parseSegment.setClean(clean);

      parseSegment.parse();
      parseSegment.sort();
      parseSegment.save();

    } finally {
      nfs.close();
    }

  }
}
TOP

Related Classes of org.apache.nutch.tools.ParseSegment$ParserThread

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.