/*
* $Id: ImportArcs.java 1494 2007-02-15 17:47:58Z stack-sf $
*
* Copyright (C) 2007 Internet Archive.
*
* This file is part of the archive-access tools project
* (http://sourceforge.net/projects/archive-access).
*
* The archive-access tools are free software; you can redistribute them and/or
* modify them under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or any
* later version.
*
* The archive-access tools are distributed in the hope that they will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
* Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License along with
* the archive-access tools; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.archive.mapred;
import java.io.IOException;
import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.arc.ARCConstants;
import org.archive.io.arc.ARCRecord;
/**
 * MapRunner that passes each ARCRecord to the configured mapper.
 * The configured mapper must be an implementation of
 * {@link ARCRecordMapper}.
 * @author stack
 */
public class ARCMapRunner implements MapRunnable {
public final Log LOG = LogFactory.getLog(this.getClass().getName());
protected ARCRecordMapper mapper;
    private enum Counter {
        ARCS_COUNT, ARCRECORDS_COUNT, BAD_ARC_PARSE_COUNT,
        ARC_FAILED_DOWNLOAD, LONG_ARCRECORDS_COUNT
    }
    /**
     * Maximum time to spend indexing a single ARC, in milliseconds
     * (set from the wax.index.timeout job property, which is in minutes).
     */
protected long maxtime;
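    /**
     * Instantiate the configured mapper and read the indexing timeout.
     * A minimal job setup might look like the following sketch (the
     * MyARCRecordMapper class is hypothetical):
     * <pre>
     * JobConf job = new JobConf();
     * job.setMapRunnerClass(ARCMapRunner.class);
     * job.setMapperClass(MyARCRecordMapper.class); // an ARCRecordMapper
     * job.setLong("wax.index.timeout", 60);        // minutes
     * </pre>
     */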
public void configure(JobConf job) {
this.mapper = (ARCRecordMapper)ReflectionUtils.
newInstance(job.getMapperClass(), job);
        // wax.index.timeout is in minutes; convert to milliseconds.
        this.maxtime = job.getLong("wax.index.timeout", 60) * 60 * 1000;
}
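    /**
     * Pull ARC locations off the input, indexing each in turn; the mapper
     * is closed once the input is exhausted.
     */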
public void run(RecordReader input, OutputCollector output,
Reporter reporter)
throws IOException {
try {
WritableComparable key = input.createKey();
Writable value = input.createValue();
while (input.next(key, value)) {
doArchive(value.toString(), output, new ARCReporter(reporter));
}
} finally {
this.mapper.close();
}
}
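    /**
     * Index the ARC at the passed URL unless it is null or looks like an
     * ARC still being written (name ends in 'work').
     */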
protected void doArchive(final String arcurl, final OutputCollector output,
final ARCReporter reporter)
throws IOException {
if ((arcurl == null) || arcurl.endsWith("work")) {
reporter.setStatus("skipping " + arcurl, true);
return;
}
// Run indexing in a thread so I can cover it with a timer.
final Thread thread = new IndexingThread(arcurl, output, reporter);
startIndexingThread(thread, reporter);
}
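    /**
     * Start the passed indexing thread and wait up to {@link #maxtime}
     * milliseconds for it to finish; cleanup always runs afterwards.
     */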
protected void startIndexingThread(Thread thread, ARCReporter reporter)
throws IOException {
thread.setDaemon(true);
thread.start();
final long start = System.currentTimeMillis();
try {
for (long period = this.maxtime; thread.isAlive() && (period > 0);
period = this.maxtime - (System.currentTimeMillis() - start)) {
try {
thread.join(period);
} catch (final InterruptedException e) {
e.printStackTrace();
}
}
} finally {
cleanup(thread, reporter);
}
}
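    /**
     * Interrupt the indexing thread if it is still alive and give it a
     * second to die.
     */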
protected void cleanup(final Thread thread, final ARCReporter reporter)
throws IOException {
if (!thread.isAlive()) {
return;
}
reporter.setStatus("Killing indexing thread " + thread.getName(), true);
thread.interrupt();
try {
// Give it some time to die.
thread.join(1000);
} catch (final InterruptedException e) {
e.printStackTrace();
}
if (thread.isAlive()) {
LOG.info(thread.getName() + " will not die");
}
}
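    /**
     * Daemon thread that opens the ARC at a given location and passes each
     * record to the configured mapper.
     */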
protected class IndexingThread extends Thread {
protected final String location;
protected final OutputCollector output;
protected final ARCReporter reporter;
public IndexingThread(final String loc, final OutputCollector o,
final ARCReporter r) {
// Name this thread same as ARC location.
super(loc);
this.location = loc;
this.output = o;
this.reporter = r;
}
        /**
         * @return An open ArchiveReader on {@link #location}, or null if
         * the download fails.
         */
protected ArchiveReader getArchiveReader() {
ArchiveReader arc = null;
            // Need a thread that keeps updating the TaskTracker during long
            // downloads; otherwise the TaskTracker will kill this task.
Thread reportingDuringDownload = null;
try {
this.reporter.setStatus("opening " + this.location, true);
reportingDuringDownload = new Thread("reportDuringDownload") {
public void run() {
while (!this.isInterrupted()) {
try {
synchronized (this) {
sleep(1000 * 60); // Sleep a minute.
}
reporter.setStatus("downloading " +
location);
                            /* TODO MC - to be compatible with hadoop 0.14
} catch (final IOException e) {
e.printStackTrace();
// No point hanging around if we're failing
// status.
break;
*/
} catch (final InterruptedException e) {
// Interrupt flag is cleared. Just fall out.
break;
}
}
}
};
reportingDuringDownload.setDaemon(true);
reportingDuringDownload.start();
arc = ArchiveReaderFactory.get(this.location);
} catch (final Throwable e) {
//try {
final String msg = "Error opening " + this.location
+ ": " + e.toString();
this.reporter.setStatus(msg, true);
this.reporter.incrCounter(Counter.ARC_FAILED_DOWNLOAD, 1);
LOG.info(msg);
                /* TODO MC - to be compatible with hadoop 0.14
} catch (final IOException ioe) {
LOG.warn(this.location, ioe);
}
*/
} finally {
if ((reportingDuringDownload != null)
&& reportingDuringDownload.isAlive()) {
reportingDuringDownload.interrupt();
}
}
return arc;
}
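        /**
         * Open the ARC and feed each record to the mapper, stopping early
         * if interrupted or on an over-long record.
         */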
public void run() {
if (this.location == null || this.location.length() <= 0) {
return;
}
ArchiveReader arc = getArchiveReader();
if (arc == null) {
return;
}
try {
ARCMapRunner.this.mapper.onARCOpen();
this.reporter.incrCounter(Counter.ARCS_COUNT, 1);
// Iterate over each ARCRecord.
for (final Iterator i = arc.iterator();
i.hasNext() && !currentThread().isInterrupted();) {
final ARCRecord rec = (ARCRecord)i.next();
this.reporter.incrCounter(Counter.ARCRECORDS_COUNT, 1);
try {
ARCMapRunner.this.mapper.map(
new Text(rec.getMetaData().getUrl()),
new ObjectWritable(rec), this.output,
this.reporter);
final long b = rec.getMetaData().getContentBegin();
final long l = rec.getMetaData().getLength();
                    final long recordLength = (l > b) ? (l - b) : l;
if (recordLength >
ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE) {
                        // Now, if the content length is larger than a
                        // standard ARC, then it is most likely the last
                        // record in the ARC, because the ARC is closed once
                        // we exceed 100MB (DEFAULT_MAX_ARC...). Calling
                        // hasNext above will make us read through the
                        // whole record, even if it's a 1.7G video. On a
                        // loaded machine, this might cause us to time out
                        // with the tasktracker -- so just skip out here.
this.reporter.setStatus("skipping " +
this.location + " -- very long record " +
rec.getMetaData());
this.reporter.
incrCounter(Counter.LONG_ARCRECORDS_COUNT, 1);
break;
}
} catch (final Throwable e) {
// Failed parse of record. Keep going.
LOG.warn("Error processing " + rec.getMetaData(), e);
}
}
if (currentThread().isInterrupted()) {
LOG.info(currentThread().getName() + " interrupted");
}
this.reporter.setStatus("closing " + this.location, true);
} catch (final Throwable e) {
// Problem parsing arc file.
this.reporter.incrCounter(Counter.BAD_ARC_PARSE_COUNT, 1);
final String msg = "Error parsing " + this.location;
//try {
this.reporter.setStatus(msg, true);
                /* TODO MC - to be compatible with hadoop 0.14
} catch (final IOException ioe) {
ioe.printStackTrace();
}
*/
LOG.warn("ARCMapRunner - Throwable:"+ msg, e);
            } finally {
try {
arc.close();
ARCMapRunner.this.mapper.onARCClose();
} catch (final IOException e) {
e.printStackTrace();
}
}
}
}
}