Package org.archive.io.warc

Examples of org.archive.io.warc.WARCWriter


    }

    protected ProcessResult write(final String lowerCaseScheme,
            final CrawlURI curi)
    throws IOException {
        WARCWriter writer = (WARCWriter) getPool().borrowFile();
     
        long position = writer.getPosition();
        try {
            // See if we need to open a new file because we've exceeded maxBytes.
            // Call to checkFileSize will open new file if we're at maximum for
            // current file.
            writer.checkSize();
            if (writer.getPosition() != position) {
                // We just closed the file because it was larger than maxBytes.
                // Add to the totalBytesWritten the size of the first record
                // in the file, if any.
                setTotalBytesWritten(getTotalBytesWritten() +
                    (writer.getPosition() - position));
                position = writer.getPosition();
            }
                      
            // Reset writer temp stats so they reflect only this set of records.
            // They'll be added to totals below, in finally block, after records
            // have been written.
            writer.resetTmpStats();
            writer.resetTmpRecordLog();
           
            // Write a request, response, and metadata all in the one
            // 'transaction'.
            final URI baseid = getRecordID();
            final String timestamp =
View Full Code Here


       transform(reader, warc);
   }
  
   protected void transform(final ARCReader reader, final File warc)
   throws IOException {
     WARCWriter writer = null;
     // No point digesting. Digest is available after reading of ARC which
     // is too late for inclusion in WARC.
     reader.setDigest(false);
     try {
       BufferedOutputStream bos =
         new BufferedOutputStream(new FileOutputStream(warc));
       // Get the body of the first ARC record as a String so can dump it
       // into first record of WARC.
       final Iterator<ArchiveRecord> i = reader.iterator();
       ARCRecord firstRecord = (ARCRecord)i.next();
       ByteArrayOutputStream baos =
         new ByteArrayOutputStream((int)firstRecord.getHeader().
             getLength());
       firstRecord.dump(baos);
         // Add ARC first record content as an ANVLRecord.
         ANVLRecord ar = new ANVLRecord();
         ar.addLabelValue("Filedesc", baos.toString());
         List<String> metadata = new ArrayList<String>(1);
         metadata.add(ar.toString());
         // Now create the writer.  If reader was compressed, lets write
         // a compressed WARC.
       writer = new WARCWriter(
                   new AtomicInteger(),
                   bos,
                   warc,
                   new WARCWriterPoolSettingsData(
                           "", "", -1, reader.isCompressed(), null, metadata, generator));
       // Write a warcinfo record with description about how this WARC
       // was made.
       writer.writeWarcinfoRecord(warc.getName(),
           "Made from " + reader.getReaderIdentifier() + " by " +
                 this.getClass().getName() + "/" + getRevision());
       for (; i.hasNext();) {
         write(writer, (ARCRecord)i.next());
       }
     } finally {
       if (reader != null) {
         reader.close();
       }
       if (writer != null) {
         // I don't want the close being logged -- least, not w/o log of
         // an opening (and that'd be a little silly for simple script
         // like this). Currently, it logs at level INFO so that close
         // of files gets written to log files.  Up the log level just
         // for the close.
         Logger l = Logger.getLogger(writer.getClass().getName());
         Level oldLevel = l.getLevel();
         l.setLevel(Level.WARNING);
         try {
           writer.close();
         } finally {
           l.setLevel(oldLevel);
         }
       }
     }
View Full Code Here

public class WARCHeader {
  private void writeHeaderRecord(File target, File fieldsSrc, String id)
  throws IOException {

    WARCWriter writer = null;

    BufferedOutputStream bos =
      new BufferedOutputStream(new FileOutputStream(target));

    FileInputStream is = new FileInputStream(fieldsSrc);
    ANVLRecord ar = ANVLRecord.load(is);

    List<String> metadata = new ArrayList<String>(1);
    metadata.add(ar.toString());

    writer = new WARCWriter(new AtomicInteger(),bos,target,getSettings(true, null, null, metadata));
    // Write a warcinfo record with description about how this WARC
    // was made.
    writer.writeWarcinfoRecord(target.getName(), "Made from "
        + id + " by "
        + this.getClass().getName());

  }
View Full Code Here

TOP

Related Classes of org.archive.io.warc.WARCWriter

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.