Package org.archive.wayback.resourceindex.ziplines

Source Code of org.archive.wayback.resourceindex.ziplines.ZiplinesSearchResultSource

/*
*  This file is part of the Wayback archival access software
*   (http://archive-access.sourceforge.net/projects/wayback/).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.wayback.resourceindex.ziplines;

import java.io.IOException;
import java.io.PrintWriter;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.util.iterator.CloseableIterator;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.exception.ResourceIndexNotAvailableException;
import org.archive.wayback.resourceindex.SearchResultSource;
import org.archive.wayback.resourceindex.cdx.CDXFormatToSearchResultAdapter;
import org.archive.wayback.resourceindex.cdx.format.CDXFlexFormat;
import org.archive.wayback.resourceindex.cdx.format.CDXFormat;
import org.archive.wayback.resourceindex.cdx.format.CDXFormatException;
import org.archive.wayback.util.AdaptedIterator;
import org.archive.wayback.util.flatfile.FlatFile;

/**
* A set of Ziplines files, which are CDX files specially compressed into a
* series of GZipMembers such that:
*
* 1) each member is exactly 128K, padded using a GZip comment header
* 2) each member contains complete lines: no line spans two GZip members
*
* If the data put into these files is sorted, then the data within the files
* can be uncompressed when needed, minimizing the total data to be uncompressed
*
* This SearchResultSource assumes a set of alphabetically partitioned Ziplined
* CDX files, so that each file is sorted, and no regions overlap.
*
* This class takes 2 files as input:
* 1) a specially constructed map of the first N bytes of data from each GZip
*      member, and the filename and offset of that GZip member.
* 2) a mapping of filenames to URLs
*
* Data from #1 is actually stored in a serialized
*
*
*
* @author brad
*
*
* @deprecated Note, this implementation is now superseded by the one in archive-commons
* @see org.archive.format.gzip.zipnum.ZipNumCluster
* That implementation provides support for stream loading of blocks, as well as summary files
* from http and hdfs as well as local filesystem
*/

public class ZiplinesSearchResultSource implements SearchResultSource {
  // Class-wide java.util.logging logger, named after this class.
  private static final Logger LOGGER = Logger.getLogger(
      ZiplinesSearchResultSource.class.getName());

  /**
   * Local path containing map of URL,TIMESTAMP,CHUNK,OFFSET for each 128K chunk
   */
  private String chunkIndexPath = null;
  // Flat-file view over chunkIndexPath; created by init() and queried with
  // getRecordIteratorLT() when locating the blocks for a prefix.
  private FlatFile chunkIndex = null;
  /**
   * Local path containing URL for each CHUNK
   */
  private String chunkMapPath = null;
  // Chunk name -> locations, filled by init() from the lines of chunkMapPath.
  private HashMap<String,BlockLocation> chunkMap = null;
  // CDX format handed to the CDXFormatToSearchResultAdapter in adaptIterator().
  private CDXFormat format = null;
  // Cap on the number of blocks collected for a single lookup.
  private int maxBlocks = 1000;
  // Loader attached to every ZiplinedBlock this source creates.
  private BlockLoader blockLoader = null;
 
  // Number of characters after the first space of a chunk-index line that are
  // included in the dedup key; values <= 0 disable dedup (see getTimestamp()).
  protected int timestampDedupLength = 0;
 
  /**
   * @return the number of characters following the first space of a chunk-index
   *         line that take part in the dedup key; values &lt;= 0 mean dedup is off
   */
  public int getTimestampDedupLength() {
    return timestampDedupLength;
  }
  /**
   * @param timestampDedupLength number of characters following the first space
   *        of a chunk-index line to include in the dedup key; a value &lt;= 0
   *        disables dedup
   */
  public void setTimestampDedupLength(int timestampDedupLength) {
    this.timestampDedupLength = timestampDedupLength;
  }
 
  /**
   * Creates a source with no CDX format set; one must be supplied through
   * {@link #setFormat(CDXFormat)} before results are adapted.
   */
  public ZiplinesSearchResultSource() {
  }
  /**
   * Creates a source that parses index lines with the given CDX format.
   *
   * @param format the CDX format to use when adapting lines to capture results
   */
  public ZiplinesSearchResultSource(CDXFormat format) {
    this.format = format;
  }
  public void init() throws IOException {
    chunkMap = new HashMap<String, BlockLocation>();
    FlatFile ff = new FlatFile(chunkMapPath);
    CloseableIterator<String> lines = ff.getSequentialIterator();
    while(lines.hasNext()) {
      String line = lines.next();
      String[] parts = line.split("\\s");
      if(parts.length < 2) {
        LOGGER.severe("Bad line(" + line +") in (" +
            chunkMapPath + ")");
        throw new IOException("Bad line(" + line +") in (" +
            chunkMapPath + ")");
      }
     
      String locations[] = new String[parts.length - 1];
      for(int i = 1; i < parts.length; i++) {
        locations[i-1] = parts[i];
      }
      BlockLocation bl = new BlockLocation(parts[0], locations);
      chunkMap.put(parts[0],bl);
    }
    lines.close();
    chunkIndex = new FlatFile(chunkIndexPath);
  }
  protected CloseableIterator<CaptureSearchResult> adaptIterator(Iterator<String> itr)
  throws IOException {
    return new AdaptedIterator<String,CaptureSearchResult>(itr,
        new CDXFormatToSearchResultAdapter(format));
  }

  /**
   * Releases an iterator handed out by this source by closing it.
   *
   * @param c the iterator to close
   * @throws IOException propagated from {@code c.close()}
   * @see org.archive.wayback.resourceindex.SearchResultSource#cleanup(org.archive.util.iterator.CloseableIterator)
   */
        @Override
  public void cleanup(CloseableIterator<CaptureSearchResult> c)
      throws IOException {
    c.close();
  }

  /* (non-Javadoc)
   * @see org.archive.wayback.resourceindex.SearchResultSource#getPrefixIterator(java.lang.String)
   */
        @Override
  public CloseableIterator<CaptureSearchResult> getPrefixIterator(
      String prefix) throws ResourceIndexNotAvailableException {
    try {
      return adaptIterator(getStringPrefixIterator(prefix));
    } catch (IOException e) {
      throw new ResourceIndexNotAvailableException(e.getMessage());
    }
  }
 
  private String getTimestamp(String line)
  {
    if (timestampDedupLength <= 0) {
      return null;
    }
   
    int space = line.indexOf(' ');
    if (space >= 0) {
      return line.substring(0, space + 1 + timestampDedupLength);
    } else {
      return null;
    }
  }

  private ArrayList<ZiplinedBlock> getBlockListForPrefix(String prefix, String urlkey)
  throws IOException, ResourceIndexNotAvailableException {
    ArrayList<ZiplinedBlock> blocks = new ArrayList<ZiplinedBlock>();
    boolean first = true;
    int numBlocks = 0;
    boolean truncated = false;
    CloseableIterator<String> itr = null;
   
   
    try {
      itr = chunkIndex.getRecordIteratorLT(prefix);
     
      String currLine = null;
      String nextLine = null;
     
      if (itr.hasNext()) {
        nextLine = itr.next();
      }
      String timestamp = getTimestamp(nextLine);
      String lastTimestamp = null;
     
      while(nextLine != null) {
       
        currLine = nextLine;
       
        if (itr.hasNext()) {
          nextLine = itr.next();
        } else {
          nextLine = null;
        }
               
        if (nextLine != null && timestamp != null) {
          lastTimestamp = timestamp;
          timestamp = getTimestamp(nextLine);
          if ((timestamp != null) && timestamp.equals(lastTimestamp)) {
            continue;
          }
        }
       
        if(numBlocks >= maxBlocks || (!prefix.equals(urlkey) && numBlocks >= 1)) {
          if (LOGGER.isLoggable(Level.WARNING)) {
            LOGGER.warning("Truncated by blocks for " + prefix);
          }
          truncated = true;
          break;
        }
       
        numBlocks++;
        String blockDescriptor = currLine;
        String parts[] = blockDescriptor.split("\t");
        if((parts.length < 3)) {
          LOGGER.severe("Bad line(" + blockDescriptor +") in (" +
              chunkMapPath + ")");
          throw new ResourceIndexNotAvailableException("Bad line(" +
              blockDescriptor + ")");
        }
        // only compare the correct length:
        String prefCmp = urlkey;
        String blockCmp = parts[0];
        if(first) {
          // always add first:
          first = false;
        } else if(!blockCmp.startsWith(prefCmp)) {
          // all done;
          break;
        }
        // add this and keep lookin...
        BlockLocation bl = chunkMap.get(parts[1]);
        if(bl == null) {
          LOGGER.severe("No locations for block(" + parts[1] +")");
          throw new ResourceIndexNotAvailableException(
              "No locations for block(" + parts[1] + ")");
        }
        long offset = Long.parseLong(parts[2]);
        ZiplinedBlock block;
        if(parts.length == 3) {
          if (LOGGER.isLoggable(Level.INFO)) {
            LOGGER.info("Adding block source(" + parts[1] + "):" + offset);
          }
          block = new ZiplinedBlock(bl.getLocations(), offset);
        } else {
          int length = Integer.parseInt(parts[3]);
          if (LOGGER.isLoggable(Level.INFO)) {
            LOGGER.info("Adding block source(" + parts[1] + "):" + offset + " - " + length);
          }
          block = new ZiplinedBlock(bl.getLocations(), offset, length);
        }
        block.setLoader(blockLoader);
        blocks.add(block);
      }
    } finally {
      if(itr != null) {
        itr.close();
      }
    }
    return blocks;
  }
 
  public Iterator<String> getZiplinesChunkIterator(String prefix, String urlkey) throws ResourceIndexNotAvailableException, IOException
  {
    ArrayList<ZiplinedBlock> blocks = getBlockListForPrefix(prefix, urlkey);
    ZiplinesChunkIterator zci = new ZiplinesChunkIterator(blocks);
    zci.setTruncated(false);
    return zci;
  }
 
//  public Iterator<String> getMergedZiplinesChunkIterator(String prefix) throws ResourceIndexNotAvailableException, IOException
//  {
//    ArrayList<ZiplinedBlock> blocks = getMergedBlockListForPrefix(prefix);
//    ZiplinesChunkIterator zci = new ZiplinesChunkIterator(blocks);
//    zci.setTruncated(false);
//    return zci;
//  }
// 
//  public Iterator<String> getStringBoundedRangeIterator(String start, String end, boolean endInclusivePrefix)
//  throws ResourceIndexNotAvailableException, IOException {
//    return new StringBoundedRangeIterator(getZiplinesChunkIterator(start), start, end, endInclusivePrefix);
//  }

  public Iterator<String> getStringPrefixIterator(String prefix)
    throws ResourceIndexNotAvailableException, IOException {
   
    String urlkey = prefix;
    int space = prefix.indexOf(' ');
    if (space >= 0) {
      urlkey = prefix.substring(0, space);
    }
   
    return new StringPrefixIterator(getZiplinesChunkIterator(prefix, urlkey), urlkey);
  }

  /**
   * Reverse iteration is not supported by this source; the call always fails.
   *
   * @param prefix ignored
   * @return never returns normally
   * @throws ResourceIndexNotAvailableException always, with the message
   *         "unsupported op"
   * @see org.archive.wayback.resourceindex.SearchResultSource#getPrefixReverseIterator(java.lang.String)
   */
        @Override
  public CloseableIterator<CaptureSearchResult> getPrefixReverseIterator(
      String prefix) throws ResourceIndexNotAvailableException {
    throw new ResourceIndexNotAvailableException("unsupported op");
  }

  /**
   * Does nothing: this method releases no resources.
   *
   * @throws IOException declared by the interface; never thrown here
   * @see org.archive.wayback.resourceindex.SearchResultSource#shutdown()
   */
        @Override
  public void shutdown() throws IOException {
    // no-op..
  }
  /**
   * @return the CDX format used to adapt index lines into capture results;
   *         {@code null} if none has been set
   */
  public CDXFormat getFormat() {
    return format;
  }
  /**
   * @param format the CDX format to use when adapting index lines into
   *        capture results
   */
  public void setFormat(CDXFormat format) {
    this.format = format;
  }
  /**
   * @return the path of the chunk index file that {@link #init()} opens
   */
  public String getChunkIndexPath() {
    return chunkIndexPath;
  }
  /**
   * @param chunkIndexPath the path of the chunk index file; read when
   *        {@link #init()} is called
   */
  public void setChunkIndexPath(String chunkIndexPath) {
    this.chunkIndexPath = chunkIndexPath;
  }
  /**
   * @return the path of the file mapping chunk names to their locations,
   *         loaded by {@link #init()}
   */
  public String getChunkMapPath() {
    return chunkMapPath;
  }
  /**
   * @param chunkMapPath the path of the file mapping chunk names to their
   *        locations; read when {@link #init()} is called
   */
  public void setChunkMapPath(String chunkMapPath) {
    this.chunkMapPath = chunkMapPath;
  }
  /**
   * @return the maximum number of blocks collected for a single lookup
   *         (1000 unless changed)
   */
  public int getMaxBlocks() {
    return maxBlocks;
  }
  /**
   * @param maxBlocks the maximum number of blocks to collect for a single
   *        lookup
   */
  public void setMaxBlocks(int maxBlocks) {
    this.maxBlocks = maxBlocks;
  }

  /**
   * @return the loader attached to every block this source creates;
   *         {@code null} if none has been set
   */
  public BlockLoader getBlockLoader() {
    return blockLoader;
  }

  /**
   * @param blockLoader the loader to attach to every block this source
   *        creates
   */
  public void setBlockLoader(BlockLoader blockLoader) {
    this.blockLoader = blockLoader;
  }

  private static void USAGE() {
    System.err.println("USAGE:");
    System.err.println("");
    System.err.println("zl-bin-search [-format FORMAT] [-max MAX_BLOCKS] SUMMARY LOCATION KEY");
    System.err.println("");
    System.err.println("Search a ziplined compressed CDX format index for key");
    System.err.println("KEY to STDOUT. SUMMARY and LOCATION are paths to the");
    System.err.println("block summary and file location files.");
    System.err.println("With -format, output CDX in format FORMAT.");
    System.err.println("With -max, limit search at most MAX_BLOCKS blocks.");
    System.exit(1);
  }
 
  /**
   * @param args
   */
  public static void main(String[] args) {
//    String cdxSpec = CDXFormatIndex.CDX_HEADER_MAGIC;
    String cdxSpec = " CDX N b a m s k r V g";
    CDXFormat format = null;
    BlockLoader blockLoader = new GenericBlockLoader();
    try {
      format = new CDXFormat(cdxSpec);
    } catch (CDXFormatException e1) {
      e1.printStackTrace();
      System.exit(1);
    }
    ZiplinesSearchResultSource zl = new ZiplinesSearchResultSource(format);
    PrintWriter pw = new PrintWriter(System.out);
    int idx;
    boolean blockDump = false;
    for(idx = 0; idx < args.length; idx++) {
      if(args[idx].equals("-format")) {
        idx++;
        if(idx >= args.length) {
          USAGE();
        }
        try {
          zl.setFormat(new CDXFormat(args[idx]));
        } catch (CDXFormatException e1) {
          e1.printStackTrace();
          System.exit(1);
        }
      } else if(args[idx].equals("-flexFormat")) {
        try {
          zl.setFormat(new CDXFlexFormat(" CDX A"));
        } catch (CDXFormatException e1) {
          e1.printStackTrace();
          System.exit(1);
        }
      } else if(args[idx].equals("-blockDump")) {
        blockDump = true;
      } else if(args[idx].equals("-hdfs")) {
        idx++;
        if(idx >= args.length) {
          USAGE();
        }
        blockLoader = new HDFSBlockLoader(args[idx]);
        try {
          ((HDFSBlockLoader)blockLoader).init();
        } catch (IOException e) {
          e.printStackTrace();
          USAGE();
          System.exit(1);
        } catch (URISyntaxException e) {
          e.printStackTrace();
          USAGE();
          System.exit(1);
        }
      } else if(args[idx].equals("-max")) {
        idx++;
        if(idx >= args.length) {
          USAGE();
        }
        try {
          zl.setMaxBlocks(Integer.parseInt(args[idx]));
        } catch(NumberFormatException e) {
          USAGE();
          System.exit(1);
        }

      } else if(args[idx].equals("-debug")) {
        Logger.getLogger(
            ZiplinesSearchResultSource.class.getName()).setLevel(Level.ALL);
        Logger.getLogger(
            ZiplinesChunkIterator.class.getName()).setLevel(Level.ALL);
        Logger.getLogger(
            ZiplinedBlock.class.getName()).setLevel(Level.ALL);
       
      } else {
        break;
      }
    }
    if(args.length < idx + 3) {
      USAGE();
    }
    // first is summary path, then location path, then search key:
    zl.setBlockLoader(blockLoader);
    zl.setChunkIndexPath(args[idx++]);
    zl.setChunkMapPath(args[idx++]);
    String key = args[idx++];
   
    try {
      zl.init();
      if(blockDump) {
       
        ArrayList<ZiplinedBlock> blocks = zl.getBlockListForPrefix(key, key);
        for(ZiplinedBlock block : blocks) {
          pw.format("%s\t%s\n", block.urlOrPaths[0], block.offset);
        }
        pw.close();

      } else {
        Iterator<String> itr = zl.getStringPrefixIterator(key);
        boolean truncated = ((StringPrefixIterator)itr).isTruncated();
        while(itr.hasNext()) {
          pw.println(itr.next());
        }
        pw.close();
        if(truncated) {
          System.err.println("Note that results are truncated...");
        }
      }
    } catch (ResourceIndexNotAvailableException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
      System.exit(1);
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
      System.exit(1);
    }
  }
}
TOP

Related Classes of org.archive.wayback.resourceindex.ziplines.ZiplinesSearchResultSource

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.