Package org.broadinstitute.gatk.engine.datasources.reads

Source Code of org.broadinstitute.gatk.engine.datasources.reads.BAMSchedule

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.engine.datasources.reads;

import htsjdk.samtools.util.PeekableIterator;
import htsjdk.samtools.Bin;
import htsjdk.samtools.GATKBAMFileSpan;
import htsjdk.samtools.GATKChunk;
import htsjdk.samtools.util.CloseableIterator;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.util.*;

/**
* Writes schedules for a single BAM file to a target output file.
*/
public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
    /**
     * File in which to store schedule data.
     */
    private File scheduleFile;

    /**
     * File channel for the schedule file.
     */
    private FileChannel scheduleFileChannel;

    /**
     * The definitive, sorted list of reader IDs.  Order is important here: the order
     * in which the reader IDs are presented here maps to the order in which they appear in the file.
     */
    private final List<SAMReaderID> readerIDs = new ArrayList<SAMReaderID>();

    /**
     * Iterators over the schedule.  Stored in the same order as readerIDs, above.
     */
    private final List<PeekableIterator<BAMScheduleEntry>> scheduleIterators = new ArrayList<PeekableIterator<BAMScheduleEntry>>();

    /**
     * Next schedule entry to be returned.  Null if no additional entries are present.
     */
    private BAMScheduleEntry nextScheduleEntry;

    /**
     * Reference sequence for which to write the schedule.
     */
    private final int referenceSequence;

    /**
     * Sizes of ints and longs in bytes.
     */
    private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8;
    private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8;   

    /**
     * Create a new BAM schedule based on the given index.
     * @param dataSource The SAM data source to use.
     * @param intervals List of
     */
    public BAMSchedule(final SAMDataSource dataSource, final List<GenomeLoc> intervals) {
        if(intervals.isEmpty())
            throw new ReviewedGATKException("Tried to write schedule for empty interval list.");

        referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex();

        createScheduleFile();

        readerIDs.addAll(dataSource.getReaderIDs());

        for(final SAMReaderID reader: readerIDs) {
            final GATKBAMIndex index = dataSource.getIndex(reader);
            final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence);

            int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
            Iterator<GenomeLoc> locusIterator = intervals.iterator();
            GenomeLoc currentLocus = locusIterator.next();

            final long readerStartOffset = position();

            int maxChunkCount = 0;

            while(currentBinInLowestLevel < GATKBAMIndex.MAX_BINS && currentLocus != null) {
                final Bin bin = new Bin(referenceSequence,currentBinInLowestLevel);
                final int binStart = index.getFirstLocusInBin(bin);
                final int binStop = index.getLastLocusInBin(bin);

                // In required, pull bin iterator ahead to the point of the next GenomeLoc.
                if(binStop < currentLocus.getStart()) {
                    currentBinInLowestLevel++;
                    continue;
                }

                // At this point, the bin stop is guaranteed to be >= the start of the locus.
                // If the bins have gone past the current locus, update the current locus if at all possible.
                if(binStart > currentLocus.getStop()) {
                    currentLocus = locusIterator.hasNext() ? locusIterator.next() : null;
                    continue;
                }

                // Code at this point knows that the current bin is neither before nor after the current locus,
                // so it must overlap.  Add this region to the filesystem.
                final GATKBAMFileSpan fileSpan = indexData.getSpanOverlapping(bin);

                if(!fileSpan.isEmpty()) {
                    // File format is binary in little endian; start of region, end of region, num chunks, then the chunks themselves.
                    ByteBuffer buffer = allocateByteBuffer(2*INT_SIZE_IN_BYTES + INT_SIZE_IN_BYTES + fileSpan.getGATKChunks().size()*LONG_SIZE_IN_BYTES*2);
                    buffer.putInt(binStart);
                    buffer.putInt(binStop);
                    buffer.putInt(fileSpan.getGATKChunks().size());
                    for(GATKChunk chunk: fileSpan.getGATKChunks()) {
                        buffer.putLong(chunk.getChunkStart());
                        buffer.putLong(chunk.getChunkEnd());
                    }
                    maxChunkCount = Math.max(maxChunkCount,fileSpan.getGATKChunks().size());

                    // Prepare buffer for writing
                    buffer.flip();

                    // And write.
                    write(buffer);
                }

                currentBinInLowestLevel++;
            }

            final long readerStopOffset = position();

            scheduleIterators.add(new PeekableIterator<BAMScheduleEntry>(new BAMScheduleIterator(reader,readerStartOffset,readerStopOffset,maxChunkCount)));

            // Iterator initialization might move the file pointer.  Make sure it gets reset back to where it was before iterator initialization.
            position(readerStopOffset);
        }

        advance();
    }

    /**
     * Determine whether more ScheduleEntries are present in the iterator.
     * @return Next schedule entry to parse.
     */
    @Override
    public boolean hasNext() {
        return nextScheduleEntry != null;   
    }

    /**
     * Retrieve the next schedule entry in the list.
     * @return next schedule entry in the queue.
     */
    @Override
    public BAMScheduleEntry next() {
        BAMScheduleEntry currentScheduleEntry = nextScheduleEntry;
        advance();
        return currentScheduleEntry;
    }

    /**
     * Close down and delete the file.
     */
    @Override
    public void close() {
        try {
            scheduleFileChannel.close();
        }
        catch(IOException ex) {
            throw makeIOFailureException(true, "Unable to close schedule file.", ex);
        }
    }

    /**
     * Convenience routine for creating UserExceptions
     * @param wasWriting
     * @param message
     * @param e
     * @return
     */
    private final GATKException makeIOFailureException(final boolean wasWriting, final String message, final Exception e) {
        if ( wasWriting ) {
            if ( e == null )
                return new UserException.CouldNotCreateOutputFile(scheduleFile, message);
            else
                return new UserException.CouldNotCreateOutputFile(scheduleFile, message, e);
        } else {
            if ( e == null )
                return new UserException.CouldNotReadInputFile(scheduleFile, message);
            else
                return new UserException.CouldNotReadInputFile(scheduleFile, message, e);
        }
    }

    /**
     * Advance to the next schedule entry.
     */
    private void advance() {
        nextScheduleEntry = null;

        BitSet selectedIterators = new BitSet(readerIDs.size());
        int currentStart = Integer.MAX_VALUE;
        int currentStop = Integer.MAX_VALUE;

        // Select every iterator whose next element is the lowest element in the list.
        for(int reader = 0; reader < scheduleIterators.size(); reader++) {
            PeekableIterator<BAMScheduleEntry> scheduleIterator = scheduleIterators.get(reader);
            if(!scheduleIterator.hasNext())
                continue;

            // If the iterator starts after this one, skip over it.
            if(scheduleIterator.peek().start > currentStart)
                continue;

            // If the iterator starts at the same point as this one, add it to the list.
            if(scheduleIterator.peek().start == currentStart) {
                selectedIterators.set(reader);
                currentStop = Math.min(scheduleIterator.peek().stop,currentStop);
                continue;
            }

            // If the iterator is less than anything seen before it, purge the selections and make this one current.
            if(scheduleIterator.peek().start < currentStart) {
                selectedIterators.clear();
                selectedIterators.set(reader);
                currentStart = scheduleIterator.peek().start;
                currentStop = scheduleIterator.peek().stop;
            }
        }

        // Out of iterators?  Abort early.
        if(selectedIterators.isEmpty())
            return;

        // Create the target schedule entry
        BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop);

        // For each schedule entry with data, load the data into the merged schedule.
        for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) {
            PeekableIterator<BAMScheduleEntry> scheduleIterator = scheduleIterators.get(reader);
            BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek();
            mergedScheduleEntry.mergeInto(individualScheduleEntry);

            // If the schedule iterator ends after this entry, consume it.
            if(individualScheduleEntry.stop <= currentStop)
                scheduleIterator.next();
        }

        // For each schedule entry without data, add a blank entry.
        for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) {
            mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan());
        }

        nextScheduleEntry = mergedScheduleEntry;
    }

    @Override
    public void remove() { throw new UnsupportedOperationException("Unable to remove from a schedule iterator."); }

    /**
     * Create a new schedule file, containing schedule information for all BAM files being dynamically merged.
     */
    private void createScheduleFile() {
        try {
            scheduleFile = File.createTempFile("bamschedule."+referenceSequence,null);
            scheduleFileChannel = new RandomAccessFile(scheduleFile,"rw").getChannel();
        }
        catch(IOException ex) {
            throw new UserException("Unable to create a temporary BAM schedule file.  Please make sure Java can write to the default temp directory or use -Djava.io.tmpdir= to instruct it to use a different temp directory instead.",ex);
        }
        scheduleFile.deleteOnExit();

    }

    /**
     * Creates a new byte buffer of the given size.
     * @param size the size of buffer to allocate.
     * @return Newly allocated byte buffer.
     */
    private ByteBuffer allocateByteBuffer(final int size) {
        ByteBuffer buffer = ByteBuffer.allocate(size);
        buffer.order(ByteOrder.LITTLE_ENDIAN);
        return buffer;
    }

    /**
     * Reads the contents at the current position on disk into the given buffer.
     * @param buffer buffer to fill.
     */
    private int read(final ByteBuffer buffer) {
        try {
            return scheduleFileChannel.read(buffer);
        }
        catch(IOException ex) {
            throw makeIOFailureException(false, "Unable to read data from BAM schedule file.", ex);
        }
    }

    private void write(final ByteBuffer buffer) {
        try {
            scheduleFileChannel.write(buffer);
            if(buffer.remaining() > 0)
                throw makeIOFailureException(true, "Unable to write entire buffer to file.", null);
        }
        catch(IOException ex) {
            throw makeIOFailureException(true, "Unable to write data to BAM schedule file.", ex);
        }
    }

    /**
     * Reads the current position from the file channel.
     * @return Current position within file channel.
     */
    private long position() {
        try {
            return scheduleFileChannel.position();
        }
        catch(IOException ex) {
            throw makeIOFailureException(false, "Unable to retrieve position of BAM schedule file.", ex);
        }
    }

    /**
     * Reposition the file channel to the specified offset wrt the start of the file.
     * @param position The position.
     */
    private void position(final long position) {
        try {
            scheduleFileChannel.position(position);
        }
        catch(IOException ex) {
            throw makeIOFailureException(false, "Unable to position BAM schedule file.",ex);
        }
    }

    /**
     * An iterator over the schedule for a single BAM file.
     */
    private class BAMScheduleIterator implements Iterator<BAMScheduleEntry> {
        /**
         * ID of the reader associated with the given schedule.
         */
        private final SAMReaderID reader;

        /**
         * Current position in the file.
         */
        private long currentPosition;

        /**
         * Stopping file position of last bin in file for this reader, exclusive.
         */
        private final long stopPosition;

        /**
         * Byte buffer used to store BAM header info.
         */
        private final ByteBuffer binHeader;

        /**
         * Byte buffer used to store chunk data.
         */
        private final ByteBuffer chunkData;

        public BAMScheduleIterator(final SAMReaderID reader, final long startPosition, final long stopPosition, final int maxChunkCount) {
            this.reader = reader;
            this.currentPosition = startPosition;
            this.stopPosition = stopPosition;
            binHeader = allocateByteBuffer(INT_SIZE_IN_BYTES*3);
            chunkData = allocateByteBuffer(maxChunkCount*LONG_SIZE_IN_BYTES*2);
        }

        @Override
        public boolean hasNext() {
            return currentPosition < stopPosition;
        }

        @Override
        public BAMScheduleEntry next() {
            position(currentPosition);

            // Read data.
            int binHeaderBytesRead = read(binHeader);

            // Make sure we read in a complete bin header:
            if ( binHeaderBytesRead < INT_SIZE_IN_BYTES * 3 ) {
                throw new ReviewedGATKException(String.format("Unable to read a complete bin header from BAM schedule file %s for BAM file %s. " +
                                                               "The BAM schedule file is likely incomplete/corrupt.",
                                                               scheduleFile.getAbsolutePath(), reader.getSamFilePath()));
            }

            // Decode contents.
            binHeader.flip();
            final int start = binHeader.getInt();
            final int stop = binHeader.getInt();
            final int numChunks = binHeader.getInt();

            // Prepare bin buffer for next read.
            binHeader.flip();

            // Prepare a target buffer for chunks.
            GATKChunk[] chunks = new GATKChunk[numChunks];

            // Read all chunk data.
            chunkData.limit(numChunks*LONG_SIZE_IN_BYTES*2);
            long bytesRead = read(chunkData);
            if(bytesRead != numChunks*LONG_SIZE_IN_BYTES*2)
                throw new ReviewedGATKException("Unable to read all chunks from file");

            // Prepare for reading.
            chunkData.flip();

            for(int i = 0; i < numChunks; i++)
                chunks[i] = new GATKChunk(chunkData.getLong(),chunkData.getLong());

            // Prepare chunk buffer for next read.
            chunkData.flip();

            BAMScheduleEntry nextScheduleEntry = new BAMScheduleEntry(start,stop);
            nextScheduleEntry.addFileSpan(reader,new GATKBAMFileSpan(chunks));

            // Reset the position of the iterator at the next contig.
            currentPosition = position();

            return nextScheduleEntry;
        }

        /**
         * Not supported.
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException("Unable to remove from a BAMScheduleIterator");
        }

    }
}

/**
* A single proto-shard to be processed.
*/
class BAMScheduleEntry {
    /**
     * Starting position for the genomic entry.
     */
    public final int start;

    /**
     * Ending position for the genomic entry.
     */
    public final int stop;

    /**
     * The spans representing the given region.
     */
    public final Map<SAMReaderID,GATKBAMFileSpan> fileSpans = new HashMap<SAMReaderID,GATKBAMFileSpan>();

    BAMScheduleEntry(final int start, final int stop) {
        this.start = start;
        this.stop = stop;
    }

    /**
     * Add a new file span to this schedule.
     * @param reader Reader associated with the span.
     * @param fileSpan Blocks to read in the given reader.
     */
    public void addFileSpan(final SAMReaderID reader, final GATKBAMFileSpan fileSpan) {
        fileSpans.put(reader,fileSpan);
    }

    /**
     * A naive merge operation.  Merge the fileSpans in other into this, blowing up if conflicts are
     * detected. Completely ignores merging start and stop.
     * @param other Other schedule entry to merging into this one.
     */
    public void mergeInto(final BAMScheduleEntry other) {
        final int thisSize = fileSpans.size();
        final int otherSize = other.fileSpans.size();
        fileSpans.putAll(other.fileSpans);
        if(fileSpans.size() != thisSize+otherSize)
            throw new ReviewedGATKException("Unable to handle overlaps when merging BAM schedule entries.");
    }

    /**
     * Returns true if the location of this bin tree is before the given position.
     * @param locus Locus to test.
     * @return True if this bin sits completely before the given locus; false otherwise.
     */
    public boolean isBefore(final GenomeLoc locus) {
        return stop < locus.getStart();
    }

    /**
     * Checks overlap between this bin tree and other bin trees.
     * @param position the position over which to detect overlap.
     * @return True if the segment overlaps.  False otherwise.
     */
    public boolean overlaps(final GenomeLoc position) {
        return !(position.getStop() < start || position.getStart() > stop);
    }
}
TOP

Related Classes of org.broadinstitute.gatk.engine.datasources.reads.BAMSchedule

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.