Package org.archive.modules.writer

Source Code of org.archive.modules.writer.ARCWriterProcessor

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package org.archive.modules.writer;

import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WRITE_TAG;

import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.archive.io.ArchiveFileConstants;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.io.arc.ARCWriter;
import org.archive.io.arc.ARCWriterPool;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;

/**
* Processor module for writing the results of successful fetches (and
* perhaps someday, certain kinds of network failures) to the Internet Archive
* ARC file format.
*
* Assumption is that there is only one of these ARCWriterProcessors per
* Heritrix instance.
*
* @author Parker Thompson
*/
public class ARCWriterProcessor extends WriterPoolProcessor {

    final static private String METADATA_TEMPLATE = readMetadataTemplate();
   
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    private static final Logger logger =
        Logger.getLogger(ARCWriterProcessor.class.getName());

    public long getDefaultMaxFileSize() {
        return 100000000L; // 100 SI mega-bytes (10^8 bytes)
    }
    public List<ConfigPath> getDefaultStorePaths() {
        List<ConfigPath> paths = new ArrayList<ConfigPath>();
        paths.add(new ConfigPath("arcs default store path", "arcs"));
        return paths;
    }

    private transient List<String> cachedMetadata;

    public ARCWriterProcessor() {
    }

    @Override
    protected void setupPool(AtomicInteger serialNo) {
        setPool(new ARCWriterPool(serialNo, this, getPoolMaxActive(), getMaxWaitForIdleMs()));
    }

    /**
     * Writes a CrawlURI and its associated data to store file.
     *
     * Currently this method understands the following uri types: dns, http,
     * and https.
     *
     * @param curi CrawlURI to process.
     */
    protected ProcessResult innerProcessResult(CrawlURI puri) {
        CrawlURI curi = (CrawlURI)puri;
       
        long recordLength = getRecordedSize(curi);
       
        ReplayInputStream ris = null;
        try {
            if (shouldWrite(curi)) {
                ris = curi.getRecorder().getRecordedInput()
                        .getReplayInputStream();
                return write(curi, recordLength, ris, getHostAddress(curi));
            } else {
                logger.info("does not write " + curi.toString());
                copyForwardWriteTagIfDupe(curi);
            }
         } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
            logger.log(Level.SEVERE, "Failed write of Record: " +
                curi.toString(), e);
        } finally {
            IOUtils.closeQuietly(ris);
        }
        return ProcessResult.PROCEED;
    }
   
    protected ProcessResult write(CrawlURI curi, long recordLength,
            InputStream in, String ip)
    throws IOException {
        WriterPoolMember writer = getPool().borrowFile();
        long position = writer.getPosition();
        // See if we need to open a new file because we've exceeded maxBytes.
        // Call to checkFileSize will open new file if we're at maximum for
        // current file.
        writer.checkSize();
        if (writer.getPosition() != position) {
            // We just closed the file because it was larger than maxBytes.
            // Add to the totalBytesWritten the size of the first record
            // in the file, if any.
            setTotalBytesWritten(getTotalBytesWritten() +
              (writer.getPosition() - position));
            position = writer.getPosition();
        }
       
        ARCWriter w = (ARCWriter)writer;
        try {
            if (in instanceof ReplayInputStream) {
                w.write(curi.toString(), curi.getContentType(),
                    ip, curi.getFetchBeginTime(),
                    recordLength, (ReplayInputStream)in);
            } else {
                w.write(curi.toString(), curi.getContentType(),
                    ip, curi.getFetchBeginTime(),
                    recordLength, in);
            }
        } catch (IOException e) {
            // Invalidate this file (It gets a '.invalid' suffix).
            getPool().invalidateFile(writer);
            // Set the writer to null otherwise the pool accounting
            // of how many active writers gets skewed if we subsequently
            // do a returnWriter call on this object in the finally block.
            writer = null;
            throw e;
        } finally {
            if (writer != null) {
              setTotalBytesWritten(getTotalBytesWritten() +
                   (writer.getPosition() - position));
                getPool().returnFile(writer);
               
                String filename = writer.getFile().getName();
                if (filename.endsWith(ArchiveFileConstants.OCCUPIED_SUFFIX)) {
                    filename = filename.substring(0, filename.length() - ArchiveFileConstants.OCCUPIED_SUFFIX.length());
                }
                curi.addExtraInfo("arcFilename", filename);
               
                Map<String,Object>[] history = curi.getFetchHistory();
                if (history != null && history[0] != null) {
                    history[0].put(A_WRITE_TAG, filename);
                }
            }
        }
        return checkBytesWritten();
    }

    public List<String> getMetadata() {
        if (METADATA_TEMPLATE == null) {
            return null;
        }
       
        if (cachedMetadata != null) {
            return cachedMetadata;
        }
               
        String meta = METADATA_TEMPLATE;
        meta = replace(meta, "${VERSION}", ArchiveUtils.VERSION);
        meta = replace(meta, "${HOST}", getHostName());
        meta = replace(meta, "${IP}", getHostAddress());
       
        if (meta != null) {
            meta = replace(meta, "${JOB_NAME}", getMetadataProvider().getJobName());
            meta = replace(meta, "${DESCRIPTION}", getMetadataProvider().getDescription());
            meta = replace(meta, "${OPERATOR}", getMetadataProvider().getOperator());
            // TODO: fix this to match job-start-date (from UI or operator setting)
            // in the meantime, don't include a slightly-off date
            // meta = replace(meta, "${DATE}", GMT());
            meta = replace(meta, "${USER_AGENT}", getMetadataProvider().getUserAgent());
            meta = replace(meta, "${FROM}", getMetadataProvider().getOperatorFrom());
            meta = replace(meta, "${ROBOTS}", getMetadataProvider().getRobotsPolicyName());
        }

        this.cachedMetadata = Collections.singletonList(meta);
        return this.cachedMetadata;
        // ${VERSION}
        // ${HOST}
        // ${IP}
        // ${JOB_NAME}
        // ${DESCRIPTION}
        // ${OPERATOR}
        // ${DATE}
        // ${USER_AGENT}
        // ${FROM}
        // ${ROBOTS}

    }

    private static String replace(String meta, String find, String replace) {
        replace = StringUtils.defaultString(replace);
        replace = StringEscapeUtils.escapeXml(replace);
        return meta.replace(find, replace);
    }
   
    private static String getHostName() {
        try {
            return InetAddress.getLocalHost().getCanonicalHostName();
        } catch (UnknownHostException e) {
            logger.log(Level.SEVERE, "Could not get local host name.", e);
            return "localhost";
        }
    }
   
    private static String getHostAddress() {
        try {
            return InetAddress.getLocalHost().getHostAddress();
        } catch (UnknownHostException e) {
            logger.log(Level.SEVERE, "Could not get local host address.", e);
            return "localhost";
        }       
    }

    private static String readMetadataTemplate() {
        InputStream input = ARCWriterProcessor.class.getResourceAsStream(
                "arc_metadata_template.xml");
        if (input == null) {
            logger.severe("No metadata template.");
            return null;
        }
        try {
            return IOUtils.toString(input);
        } catch (IOException e) {
            throw new IllegalStateException(e);
        } finally {
            IOUtils.closeQuietly(input);
        }
    }
}
TOP

Related Classes of org.archive.modules.writer.ARCWriterProcessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.