/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.engine.datasources.reads;
import htsjdk.samtools.*;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.RuntimeIOException;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.ReadMetrics;
import org.broadinstitute.gatk.engine.ReadProperties;
import org.broadinstitute.gatk.engine.arguments.ValidationExclusion;
import org.broadinstitute.gatk.engine.downsampling.*;
import org.broadinstitute.gatk.engine.filters.CountingFilteringIterator;
import org.broadinstitute.gatk.engine.filters.ReadFilter;
import org.broadinstitute.gatk.engine.iterators.*;
import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.GenomeLocSortedSet;
import org.broadinstitute.gatk.utils.SimpleTimer;
import org.broadinstitute.gatk.utils.baq.ReadTransformingIterator;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.interval.IntervalMergingRule;
import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.gatk.utils.sam.GATKSamRecordFactory;
import java.io.File;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.*;
import java.util.concurrent.Callable;
/**
* User: aaron
* Date: Mar 26, 2009
* Time: 2:36:16 PM
* <p/>
* Converts shards to SAM iterators over the specified region
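* <p>
* A minimal lifecycle sketch ({@code samFiles}, {@code threadAllocation}, {@code genomeLocParser}
* and {@code shardBalancer} are assumed to be supplied by the caller; the engine performs
* considerably more configuration than shown here):
* <pre>{@code
* SAMDataSource dataSource = new SAMDataSource(samFiles, threadAllocation, null, genomeLocParser);
* for (Shard shard : dataSource.createShardIteratorOverAllReads(shardBalancer)) {
*     GATKSAMIterator reads = dataSource.seek(shard);
*     try {
*         while (reads.hasNext()) {
*             SAMRecord read = reads.next();
*             // ... process the read ...
*         }
*     } finally {
*         reads.close();
*     }
* }
* dataSource.close();
* }</pre>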
*/
public class SAMDataSource {
private static final GATKSamRecordFactory factory = new GATKSamRecordFactory();
/** Backing support for reads. */
protected final ReadProperties readProperties;
/**
* Runtime metrics of reads filtered, etc.
*/
private final ReadMetrics readMetrics;
/**
* Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering.
*/
protected final GenomeLocParser genomeLocParser;
/**
* Identifiers for the readers driving this data source.
*/
private final Collection<SAMReaderID> readerIDs;
/**
* The validation stringency applied by the readers driving this data source.
*/
private final ValidationStringency validationStringency;
/**
* Do we want to remove the program records from this data source?
*/
private final boolean removeProgramRecords;
/**
* Store BAM indices for each reader present.
*/
private final Map<SAMReaderID,GATKBAMIndex> bamIndices = new HashMap<SAMReaderID,GATKBAMIndex>();
/**
* The merged header.
*/
private final SAMFileHeader mergedHeader;
/**
* The constituent headers of the unmerged files.
*/
private final Map<SAMReaderID,SAMFileHeader> headers = new HashMap<SAMReaderID,SAMFileHeader>();
/**
* The sort order of the BAM files. Files without a sort order tag are assumed to be
* in coordinate order.
*/
private SAMFileHeader.SortOrder sortOrder = null;
/**
* Whether the read groups in overlapping files collide.
*/
private final boolean hasReadGroupCollisions;
/**
* Maps the SAM readers' merged read group ids to their original ids. Since merged read group ids
* are always unique, we can simply use a map here, no need to stratify by reader.
*/
private final ReadGroupMapping mergedToOriginalReadGroupMappings = new ReadGroupMapping();
/**
* Maps the SAM readers' original read group ids to their revised ids. This mapping must be stratified
* by readers, since there can be readgroup id collision: different bam files (readers) can list the
* same read group id, which will be disambiguated when these input streams are merged.
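* <p>
* For example (hypothetical ids): if {@code a.bam} and {@code b.bam} both declare read group
* {@code rg1}, the header merger assigns at least one of them a new unique id in the merged
* header, and this map records {@code rg1 -> <merged id>} separately for each reader.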
*/
private final Map<SAMReaderID,ReadGroupMapping> originalToMergedReadGroupMappings = new HashMap<SAMReaderID,ReadGroupMapping>();
/**
* Mapping from input file path to new sample name. Used only when doing on-the-fly sample renaming.
*/
private Map<String, String> sampleRenameMap = null;
/** our log, which we want to capture anything from this class */
private static Logger logger = Logger.getLogger(SAMDataSource.class);
/**
* A collection of readers driving the merging process.
*/
private final SAMResourcePool resourcePool;
/**
* Asynchronously loads BGZF blocks.
*/
private final BGZFBlockLoadingDispatcher dispatcher;
/**
* How threads are allocated.
*/
private final ThreadAllocation threadAllocation;
/**
* How are adjacent intervals merged by the sharder?
*/
private final IntervalMergingRule intervalMergingRule;
/**
* Static set of unsupported programs that create bam files.
* The key is the PG record ID and the value is the name of the tool that created it
*/
private static Map<String, String> unsupportedPGs = new HashMap<>();
static {
unsupportedPGs.put("GATK ReduceReads", "ReduceReads");
}
/**
* Create a new SAM data source given the supplied read metadata.
*
* For testing purposes
*
* @param samFiles list of reads files.
*/
public SAMDataSource(Collection<SAMReaderID> samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) {
this(
samFiles,
threadAllocation,
numFileHandles,
genomeLocParser,
false,
ValidationStringency.STRICT,
null,
null,
new ValidationExclusion(),
new ArrayList<ReadFilter>(),
false);
}
/**
* See complete constructor. Does not enable BAQ by default.
*
* For testing purposes
*/
public SAMDataSource(
Collection<SAMReaderID> samFiles,
ThreadAllocation threadAllocation,
Integer numFileHandles,
GenomeLocParser genomeLocParser,
boolean useOriginalBaseQualities,
ValidationStringency strictness,
Integer readBufferSize,
DownsamplingMethod downsamplingMethod,
ValidationExclusion exclusionList,
Collection<ReadFilter> supplementalFilters,
boolean includeReadsWithDeletionAtLoci) {
this( samFiles,
threadAllocation,
numFileHandles,
genomeLocParser,
useOriginalBaseQualities,
strictness,
readBufferSize,
downsamplingMethod,
exclusionList,
supplementalFilters,
Collections.<ReadTransformer>emptyList(),
includeReadsWithDeletionAtLoci,
(byte) -1,
false,
false,
null,
IntervalMergingRule.ALL);
}
/**
* Create a new SAM data source given the supplied read metadata.
* @param samFiles list of reads files.
* @param threadAllocation how threads are allocated between computation and I/O.
* @param numFileHandles number of file handles to make available to the asynchronous block loader; null selects the default of 1.
* @param genomeLocParser parser for creating and validating GenomeLocs.
* @param useOriginalBaseQualities True if original base qualities should be used.
* @param strictness Stringency of reads file parsing.
* @param readBufferSize Number of reads to hold in memory per BAM.
* @param downsamplingMethod Method for downsampling reads at a given locus.
* @param exclusionList what safety checks we're willing to let slide.
* @param supplementalFilters additional filters to dynamically apply.
* @param readTransformers read transformers to apply to incoming reads.
* @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method
* will explicitly list reads with deletions over the current reference base; otherwise, only observed
* bases will be seen in the pileups, and the deletions will be skipped silently.
* @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQualities.
* @param removeProgramRecords whether to strip program records from the reader headers.
* @param keepReadsInLIBS should we keep a unique list of reads in LIBS?
* @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming.
* Will be null if we're not doing sample renaming.
* @param intervalMergingRule how adjacent intervals are merged by the sharder.
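* <p>
* A construction sketch (all inputs hypothetical; the argument values mirror the defaults used
* by the testing constructors above):
* <pre>{@code
* SAMDataSource dataSource = new SAMDataSource(
*     samFiles, threadAllocation, null, genomeLocParser,
*     false,                                  // useOriginalBaseQualities
*     ValidationStringency.STRICT,
*     null,                                   // readBufferSize: use the default
*     null,                                   // downsamplingMethod
*     new ValidationExclusion(),
*     new ArrayList<ReadFilter>(),
*     Collections.<ReadTransformer>emptyList(),
*     false,                                  // includeReadsWithDeletionAtLoci
*     (byte) -1,                              // defaultBaseQualities: disabled
*     false,                                  // removeProgramRecords
*     false,                                  // keepReadsInLIBS
*     null,                                   // sampleRenameMap: no renaming
*     IntervalMergingRule.ALL);
* }</pre>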
*/
public SAMDataSource(
Collection<SAMReaderID> samFiles,
ThreadAllocation threadAllocation,
Integer numFileHandles,
GenomeLocParser genomeLocParser,
boolean useOriginalBaseQualities,
ValidationStringency strictness,
Integer readBufferSize,
DownsamplingMethod downsamplingMethod,
ValidationExclusion exclusionList,
Collection<ReadFilter> supplementalFilters,
List<ReadTransformer> readTransformers,
boolean includeReadsWithDeletionAtLoci,
byte defaultBaseQualities,
boolean removeProgramRecords,
final boolean keepReadsInLIBS,
final Map<String, String> sampleRenameMap,
final IntervalMergingRule intervalMergingRule) {
this.readMetrics = new ReadMetrics();
this.genomeLocParser = genomeLocParser;
this.intervalMergingRule = intervalMergingRule;
readerIDs = samFiles;
this.threadAllocation = threadAllocation;
// TODO: Consider a borrowed-thread dispatcher implementation.
if(this.threadAllocation.getNumIOThreads() > 0) {
logger.info("Running in asynchronous I/O mode; number of threads = " + this.threadAllocation.getNumIOThreads());
dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1);
}
else
dispatcher = null;
validationStringency = strictness;
this.removeProgramRecords = removeProgramRecords;
if(readBufferSize != null)
ReadShard.setReadBufferSize(readBufferSize); // TODO: use of non-final static variable here is just awful, especially for parallel tests
else {
// Choose a sensible default for the read buffer size.
// Previously we picked 100000 reads per BAM per shard, with a cap of 250K reads in memory at once.
// Now we simply set it to 100K reads.
ReadShard.setReadBufferSize(100000);
}
this.sampleRenameMap = sampleRenameMap;
resourcePool = new SAMResourcePool(Integer.MAX_VALUE);
SAMReaders readers = resourcePool.getAvailableReaders();
// Determine the sort order.
for(SAMReaderID readerID: readerIDs) {
if (! readerID.samFile.canRead() )
throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " +
"Please check that the file is present and readable and try again.");
// Get the sort order, forcing it to coordinate if unsorted.
SAMFileReader reader = readers.getReader(readerID);
SAMFileHeader header = reader.getFileHeader();
headers.put(readerID,header);
if ( header.getReadGroups().isEmpty() ) {
throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile,
"SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups");
}
SAMFileHeader.SortOrder sortOrder = header.getSortOrder() != SAMFileHeader.SortOrder.unsorted ? header.getSortOrder() : SAMFileHeader.SortOrder.coordinate;
// Validate that all input files are sorted in the same order.
if(this.sortOrder != null && this.sortOrder != sortOrder)
throw new UserException.MissortedBAM(String.format("Attempted to process a mix of files sorted as %s and %s.",this.sortOrder,sortOrder));
// Update the sort order.
this.sortOrder = sortOrder;
}
mergedHeader = readers.getMergedHeader();
hasReadGroupCollisions = readers.hasReadGroupCollisions();
readProperties = new ReadProperties(
samFiles,
mergedHeader,
sortOrder,
useOriginalBaseQualities,
strictness,
downsamplingMethod,
exclusionList,
supplementalFilters,
readTransformers,
includeReadsWithDeletionAtLoci,
defaultBaseQualities,
keepReadsInLIBS);
// cache the read group id (original) -> read group id (merged)
// and read group id (merged) -> read group id (original) mappings.
for(SAMReaderID id: readerIDs) {
SAMFileReader reader = readers.getReader(id);
ReadGroupMapping mappingToMerged = new ReadGroupMapping();
List<SAMReadGroupRecord> readGroups = reader.getFileHeader().getReadGroups();
for(SAMReadGroupRecord readGroup: readGroups) {
if(hasReadGroupCollisions) {
mappingToMerged.put(readGroup.getReadGroupId(),readers.getReadGroupId(id,readGroup.getReadGroupId()));
mergedToOriginalReadGroupMappings.put(readers.getReadGroupId(id,readGroup.getReadGroupId()),readGroup.getReadGroupId());
} else {
mappingToMerged.put(readGroup.getReadGroupId(),readGroup.getReadGroupId());
mergedToOriginalReadGroupMappings.put(readGroup.getReadGroupId(),readGroup.getReadGroupId());
}
}
originalToMergedReadGroupMappings.put(id,mappingToMerged);
}
for(SAMReaderID id: readerIDs) {
File indexFile = findIndexFile(id.samFile);
if(indexFile != null)
bamIndices.put(id,new GATKBAMIndex(indexFile));
}
resourcePool.releaseReaders(readers);
}
/**
* Checks whether the provided SAM header is from a reduced bam file.
* @param header the SAM header for a given file
* @throws UserException if the header is from a reduced bam
*/
private void checkForUnsupportedBamFile(final SAMFileHeader header) {
for ( final SAMProgramRecord PGrecord : header.getProgramRecords() ) {
if ( unsupportedPGs.containsKey(PGrecord.getId()) )
throw new UserException("The GATK no longer supports running off of BAMs produced by " + unsupportedPGs.get(PGrecord.getId()));
}
}
public void close() {
SAMReaders readers = resourcePool.getAvailableReaders();
for(SAMReaderID readerID: readerIDs) {
SAMFileReader reader = readers.getReader(readerID);
reader.close();
}
}
/**
* Returns the ReadProperties structure containing information about the reads data sources placed in this pool,
* as well as how they are downsampled, sorted, and filtered.
* @return The read properties backing this data source.
*/
public ReadProperties getReadsInfo() { return readProperties; }
/**
* Checks to see whether any reads files are supplying data.
* @return True if no reads files are supplying data to the traversal; false otherwise.
*/
public boolean isEmpty() {
return readProperties.getSAMReaderIDs().size() == 0;
}
/**
* Gets the SAM file associated with a given reader ID.
* @param id The reader for which to retrieve the source file.
* @return the file actually associated with the id.
*/
public File getSAMFile(SAMReaderID id) {
return id.samFile;
}
/**
* Returns readers used by this data source.
* @return A list of SAM reader IDs.
*/
public Collection<SAMReaderID> getReaderIDs() {
return readerIDs;
}
/**
* Retrieves the id of the reader which built the given read.
* @param read The read to test.
* @return ID of the reader.
*/
public SAMReaderID getReaderID(SAMRecord read) {
return resourcePool.getReaderID(read.getFileSource().getReader());
}
/**
* Gets the merged header from the SAM file.
* @return The merged header.
*/
public SAMFileHeader getHeader() {
return mergedHeader;
}
public SAMFileHeader getHeader(SAMReaderID id) {
return headers.get(id);
}
/**
* Gets the revised read group id mapped to this 'original' read group id.
* @param reader for which to grab a read group.
* @param originalReadGroupId ID of the original read group.
* @return Merged read group ID.
*/
public String getReadGroupId(final SAMReaderID reader, final String originalReadGroupId) {
return originalToMergedReadGroupMappings.get(reader).get(originalReadGroupId);
}
/**
* Gets the original read group id (as it was specified in the original input bam file) that maps onto
* this 'merged' read group id.
* @param mergedReadGroupId 'merged' ID of the read group (as it is presented by the read received from merged input stream).
* @return Original read group ID.
*/
public String getOriginalReadGroupId(final String mergedReadGroupId) {
return mergedToOriginalReadGroupMappings.get(mergedReadGroupId);
}
/**
* True if all readers have an index.
* @return True if all readers have an index.
*/
public boolean hasIndex() {
return readerIDs.size() == bamIndices.size();
}
/**
* Gets the index for a particular reader. Index files are located when this data source is constructed.
* @param id Id of the reader.
* @return The index, or null if no index file was found for this reader.
*/
public GATKBAMIndex getIndex(final SAMReaderID id) {
return bamIndices.get(id);
}
/**
* Retrieves the sort order of the readers.
* @return Sort order. Can be unsorted, coordinate order, or query name order.
*/
public SAMFileHeader.SortOrder getSortOrder() {
return sortOrder;
}
/**
* Gets the cumulative read metrics for shards already processed.
* @return Cumulative read metrics.
*/
public ReadMetrics getCumulativeReadMetrics() {
// don't return a clone here because the engine uses a pointer to this object
return readMetrics;
}
/**
* Incorporate the given read metrics into the cumulative read metrics.
* @param readMetrics The 'incremental' read metrics, to be incorporated into the cumulative metrics.
*/
public void incorporateReadMetrics(final ReadMetrics readMetrics) {
this.readMetrics.incrementMetrics(readMetrics);
}
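/**
* Gets an iterator over the reads in the given shard. Shards that buffer their own reads
* supply their iterator directly; for all other shards, a fresh iterator over the shard's
* file spans is created.
* @param shard the shard whose reads should be iterated.
* @return an iterator over the reads in the shard.
*/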
public GATKSAMIterator seek(Shard shard) {
if(shard.buffersReads()) {
return shard.iterator();
}
else {
return getIterator(shard);
}
}
/**
* Gets the reader associated with the given read.
* @param readers Available readers.
* @param read The read whose originating reader should be located.
* @return The ID of the reader that produced the read.
*/
private SAMReaderID getReaderID(SAMReaders readers, SAMRecord read) {
for(SAMReaderID id: getReaderIDs()) {
if(readers.getReader(id) == read.getFileSource().getReader())
return id;
}
throw new ReviewedGATKException("Unable to find id for reader associated with read " + read.getReadName());
}
/**
* Get the initial reader positions across all BAM files
*
* @return the start positions of the first chunk of reads for all BAM files
*/
protected Map<SAMReaderID, GATKBAMFileSpan> getInitialReaderPositions() {
Map<SAMReaderID, GATKBAMFileSpan> initialPositions = new HashMap<SAMReaderID, GATKBAMFileSpan>();
SAMReaders readers = resourcePool.getAvailableReaders();
for ( SAMReaderID id: getReaderIDs() ) {
initialPositions.put(id, new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
}
resourcePool.releaseReaders(readers);
return initialPositions;
}
/**
* Get an iterator over the data types specified in the shard.
*
* @param shard The shard specifying the data limits.
* @return An iterator over the selected data.
*/
protected GATKSAMIterator getIterator( Shard shard ) {
return getIterator(resourcePool.getAvailableReaders(), shard, shard instanceof ReadShard);
}
/**
* Get an iterator over the data types specified in the shard.
* @param readers Readers from which to load data.
* @param shard The shard specifying the data limits.
* @param enableVerification True to verify the sort order of the read stream; retained for compatibility with the old sharding strategy.
* @return An iterator over the selected data.
*/
private GATKSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) {
// Set up merging to dynamically merge together multiple BAMs.
Map<SAMFileReader,CloseableIterator<SAMRecord>> iteratorMap = new HashMap<SAMFileReader,CloseableIterator<SAMRecord>>();
for(SAMReaderID id: getReaderIDs()) {
CloseableIterator<SAMRecord> iterator = null;
// TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin.
// TODO: Kill this check once we've proven that the design elements are gone.
if(shard.getFileSpans().get(id) == null)
throw new ReviewedGATKException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported.");
try {
if(threadAllocation.getNumIOThreads() > 0) {
BlockInputStream inputStream = readers.getInputStream(id);
inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id)));
BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory);
codec.setInputStream(inputStream);
iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec);
}
else {
iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id));
}
} catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes
throw new UserException.MalformedBAM(id.samFile, e.getMessage());
}
iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator);
if(shard.getGenomeLocs().size() > 0)
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
iteratorMap.put(readers.getReader(id), iterator);
}
MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap);
// The readMetrics object being passed in should be that of this dataSource and NOT the shard: the dataSource's
// metrics is intended to keep track of the reads seen (and hence passed to the CountingFilteringIterator when
// we apply the decorators), whereas the shard's metrics is used to keep track of the "records" seen.
return applyDecoratingIterators(readMetrics,
enableVerification,
readProperties.useOriginalBaseQualities(),
new ReleasingIterator(readers,GATKSAMIteratorAdapter.adapt(mergingIterator)),
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
readProperties.getSupplementalFilters(),
readProperties.getReadTransformers(),
readProperties.defaultBaseQualities(),
shard instanceof LocusShard);
}
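/**
* Iterator that decodes SAMRecords directly from a BlockInputStream via a BAMRecordCodec,
* using the stream's file pointers to attach a file source (reader plus file span) to each
* decoded record.
*/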
private class BAMCodecIterator implements CloseableIterator<SAMRecord> {
private final BlockInputStream inputStream;
private final SAMFileReader reader;
private final BAMRecordCodec codec;
private SAMRecord nextRead;
private BAMCodecIterator(final BlockInputStream inputStream, final SAMFileReader reader, final BAMRecordCodec codec) {
this.inputStream = inputStream;
this.reader = reader;
this.codec = codec;
advance();
}
public boolean hasNext() {
return nextRead != null;
}
public SAMRecord next() {
if(!hasNext())
throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty");
SAMRecord currentRead = nextRead;
advance();
return currentRead;
}
public void close() {
// NO-OP.
}
public void remove() {
throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator");
}
private void advance() {
final long startCoordinate = inputStream.getFilePointer();
nextRead = codec.decode();
final long stopCoordinate = inputStream.getFilePointer();
if(reader != null && nextRead != null)
PicardNamespaceUtils.setFileSource(nextRead, new SAMFileSource(reader, new GATKBAMFileSpan(new GATKChunk(startCoordinate, stopCoordinate))));
}
}
/**
* Applies the decorating iterators (formatting, filtering, downsampling, verification and
* transformation) to the raw read stream, based on user-specified criteria.
*
* @param readMetrics metrics to track when using this iterator.
* @param enableVerification Verify the order of reads.
* @param useOriginalBaseQualities True if original base qualities should be used.
* @param wrappedIterator the raw data source.
* @param noValidationOfReadOrder if true, skip read-order verification even when enableVerification is set.
* @param supplementalFilters additional filters to apply to the reads.
* @param readTransformers read transformers to apply to reads on input.
* @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
* @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard
* @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null.
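*
* <p>The decorators are stacked in the following order (outermost applied last), mirroring the
* method body:</p>
* <pre>
*   ReadFormattingIterator        cigar canonicalization, base quality fixes
*   CountingFilteringIterator     read filters, applied before downsampling
*   downsampling iterator         skipped when deferred to LocusIteratorByState
*   VerifyingSamIterator          optional read-order verification
*   ReadTransformingIterator(s)   ON_INPUT read transformers, applied last
* </pre>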
*/
protected GATKSAMIterator applyDecoratingIterators(ReadMetrics readMetrics,
boolean enableVerification,
boolean useOriginalBaseQualities,
GATKSAMIterator wrappedIterator,
Boolean noValidationOfReadOrder,
Collection<ReadFilter> supplementalFilters,
List<ReadTransformer> readTransformers,
byte defaultBaseQualities,
boolean isLocusBasedTraversal ) {
// Always apply the ReadFormattingIterator before both ReadFilters and ReadTransformers. At a minimum,
// this will consolidate the cigar strings into canonical form. This has to be done before the read
// filtering, because not all read filters will behave correctly with things like zero-length cigar
// elements. If useOriginalBaseQualities is true or defaultBaseQualities >= 0, this iterator will also
// modify the base qualities.
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
// Read Filters: these are applied BEFORE downsampling, so that we downsample within the set of reads
// that actually survive filtering. Otherwise we could get much less coverage than requested.
wrappedIterator = GATKSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
// Downsampling:
// For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers
// will be invoked downstream from us in LocusIteratorByState. This improves performance by avoiding
// splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling
// of individual reads.
boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal &&
readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
readProperties.getDownsamplingMethod().toCoverage != null;
// Apply downsampling iterators here only in cases where we know that LocusIteratorByState won't be
// doing any downsampling downstream of us
if ( ! assumeDownstreamLIBSDownsampling ) {
wrappedIterator = applyDownsamplingIterator(wrappedIterator);
}
// unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification,
// verify the read ordering by applying a sort order iterator
if (!noValidationOfReadOrder && enableVerification)
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
// Read transformers: these are applied last, so that we don't bother transforming reads that get discarded
// by the read filters or downsampler.
for ( final ReadTransformer readTransformer : readTransformers ) {
if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT )
wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer);
}
return wrappedIterator;
}
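/**
* Wraps the given iterator with a downsampler matching the configured downsampling method:
* a FractionalDownsampler when downsampling to a fraction (always across the whole stream),
* or a positional downsampler (per-sample or across all reads) when downsampling to coverage.
* Returns the iterator unchanged when no downsampling was requested.
* @param wrappedIterator the iterator to wrap.
* @return the (possibly) wrapped iterator.
*/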
protected GATKSAMIterator applyDownsamplingIterator( GATKSAMIterator wrappedIterator ) {
if ( readProperties.getDownsamplingMethod() == null ||
readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) {
return wrappedIterator;
}
if ( readProperties.getDownsamplingMethod().toFraction != null ) {
// If we're downsampling to a fraction of reads, there's no point in paying the cost of
// splitting/re-assembling the read stream by sample to run the FractionalDownsampler on
// reads from each sample separately, since the result would be the same as running the
// FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator
// rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling
// was requested.
return new DownsamplingReadsIterator(wrappedIterator,
new FractionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toFraction));
}
else if ( readProperties.getDownsamplingMethod().toCoverage != null ) {
// If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling
// the read stream to run the downsampler on the reads for each individual sample separately if
// BY_SAMPLE downsampling was requested.
if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) {
return new PerSampleDownsamplingReadsIterator(wrappedIterator,
new SimplePositionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage));
}
else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) {
return new DownsamplingReadsIterator(wrappedIterator,
new SimplePositionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage));
}
}
return wrappedIterator;
}
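/**
* A pool of SAMReaders sets. Callers borrow a set of readers for the duration of a query and
* return it to the pool when finished. A minimal sketch of the borrow/release pattern (the
* try/finally is this sketch's addition; callers in this class release explicitly):
* <pre>{@code
* SAMReaders readers = resourcePool.getAvailableReaders();
* try {
*     // ... query the readers ...
* } finally {
*     resourcePool.releaseReaders(readers);
* }
* }</pre>
*/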
private class SAMResourcePool {
/**
* How many entries can be cached in this resource pool?
*/
private final int maxEntries;
/**
* All reader sets created by this pool.
*/
private List<SAMReaders> allResources = new ArrayList<SAMReaders>();
/**
* Reader sets that are not currently in service.
*/
private List<SAMReaders> availableResources = new ArrayList<SAMReaders>();
public SAMResourcePool(final int maxEntries) {
this.maxEntries = maxEntries;
}
/**
* Choose a set of readers from the pool to use for this query. When complete, return the
* readers to the pool via {@link #releaseReaders}.
* @return An available set of readers; a new set is created if none are free.
*/
public synchronized SAMReaders getAvailableReaders() {
if(availableResources.size() == 0)
createNewResource();
SAMReaders readers = availableResources.get(0);
availableResources.remove(readers);
return readers;
}
public synchronized void releaseReaders(SAMReaders readers) {
if(!allResources.contains(readers))
throw new ReviewedGATKException("Tried to return readers from the pool that didn't originate in the pool.");
availableResources.add(readers);
}
/**
* Gets the reader id for the given reader.
* @param reader Reader for which to determine the id.
* @return id of the given reader.
*/
protected synchronized SAMReaderID getReaderID(SamReader reader) {
for(SAMReaders readers: allResources) {
SAMReaderID id = readers.getReaderID(reader);
if(id != null)
return id;
}
throw new ReviewedGATKException("No such reader id is available");
}
private synchronized void createNewResource() {
if(allResources.size() > maxEntries)
throw new ReviewedGATKException("Cannot create a new resource pool. All resources are in use.");
SAMReaders readers = new SAMReaders(readerIDs, validationStringency, removeProgramRecords);
allResources.add(readers);
availableResources.add(readers);
}
}
/**
* A collection of readers derived from a reads metadata structure.
*/
private class SAMReaders implements Iterable<SAMFileReader> {
/**
* Cached representation of the merged header used to generate a merging iterator.
*/
private final SamFileHeaderMerger headerMerger;
/**
* Internal storage for a map of id -> reader.
*/
private final Map<SAMReaderID,SAMFileReader> readers = new LinkedHashMap<SAMReaderID,SAMFileReader>();
/**
* The input streams backing the readers; populated only when asynchronous I/O is enabled.
*/
private final Map<SAMReaderID,BlockInputStream> inputStreams = new LinkedHashMap<SAMReaderID,BlockInputStream>();
/**
* Derive a new set of readers from the Reads metadata.
* @param readerIDs reads to load.
* TODO: validationStringency is not used here
* @param validationStringency validation stringency.
* @param removeProgramRecords indicate whether to clear program records from the readers
*/
public SAMReaders(Collection<SAMReaderID> readerIDs, ValidationStringency validationStringency, boolean removeProgramRecords) {
final int totalNumberOfFiles = readerIDs.size();
int readerNumber = 1;
final SimpleTimer timer = new SimpleTimer().start();
if ( totalNumberOfFiles > 0 ) logger.info("Initializing SAMRecords in serial");
final int tickSize = 50;
int nExecutedTotal = 0;
long lastTick = timer.currentTime();
for(final SAMReaderID readerID: readerIDs) {
final ReaderInitializer init = new ReaderInitializer(readerID).call();
checkForUnsupportedBamFile(init.reader.getFileHeader());
if (removeProgramRecords) {
init.reader.getFileHeader().setProgramRecords(new ArrayList<SAMProgramRecord>());
}
if (threadAllocation.getNumIOThreads() > 0) {
inputStreams.put(init.readerID, init.blockInputStream); // get from initializer
}
logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile));
readers.put(init.readerID,init.reader);
if ( ++nExecutedTotal % tickSize == 0) {
double tickInSec = (timer.currentTime() - lastTick) / 1000.0;
printReaderPerformance(nExecutedTotal, tickSize, totalNumberOfFiles, timer, tickInSec);
lastTick = timer.currentTime();
}
}
if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime()));
Collection<SAMFileHeader> headers = new LinkedList<SAMFileHeader>();
// Examine the bam headers, perform any requested sample renaming on them, and add
// them to the list of headers to pass to the Picard SamFileHeaderMerger:
for ( final Map.Entry<SAMReaderID, SAMFileReader> readerEntry : readers.entrySet() ) {
final SAMReaderID readerID = readerEntry.getKey();
final SAMFileReader reader = readerEntry.getValue();
final SAMFileHeader header = reader.getFileHeader();
// The remappedSampleName will be null if either no on-the-fly sample renaming was requested,
// or the user's sample rename map file didn't contain an entry for this bam file:
final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(readerID.getSamFilePath()) : null;
// If we've been asked to rename the sample for this bam file, do so now. We'll check to
// make sure this bam only contains reads from one sample before proceeding.
//
// IMPORTANT: relies on the fact that the Picard SamFileHeaderMerger makes a copy of
// the existing read group attributes (including sample name) when merging
// headers, regardless of whether there are read group collisions or not.
if ( remappedSampleName != null ) {
remapSampleName(readerID, header, remappedSampleName);
}
headers.add(header);
}
headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true);
// update all read groups to GATKSAMRecordReadGroups
final List<SAMReadGroupRecord> gatkReadGroups = new LinkedList<SAMReadGroupRecord>();
for ( final SAMReadGroupRecord rg : headerMerger.getMergedHeader().getReadGroups() ) {
gatkReadGroups.add(new GATKSAMReadGroupRecord(rg));
}
headerMerger.getMergedHeader().setReadGroups(gatkReadGroups);
}
/**
* Changes the sample name in the read groups for the provided bam file header to match the
* remappedSampleName. Blows up with a UserException if the header contains more than one
* sample name.
*
* @param readerID ID for the bam file from which the provided header came from
* @param header The bam file header. Will be modified by this call.
* @param remappedSampleName New sample name to replace the existing sample attribute in the
* read groups for the header.
*/
private void remapSampleName( final SAMReaderID readerID, final SAMFileHeader header, final String remappedSampleName ) {
String firstEncounteredSample = null;
for ( final SAMReadGroupRecord readGroup : header.getReadGroups() ) {
final String thisReadGroupSample = readGroup.getSample();
if ( thisReadGroupSample == null ) {
throw new UserException(String.format("On-the fly sample renaming was requested for bam file %s, however this " +
"bam file contains a read group (id: %s) with a null sample attribute",
readerID.getSamFilePath(), readGroup.getId()));
}
else if ( firstEncounteredSample == null ) {
firstEncounteredSample = thisReadGroupSample;
}
else if ( ! firstEncounteredSample.equals(thisReadGroupSample) ) {
throw new UserException(String.format("On-the-fly sample renaming was requested for bam file %s, " +
"however this bam file contains reads from more than one sample " +
"(encountered samples %s and %s in the bam header). The GATK requires that " +
"all bams for which on-the-fly sample renaming is requested " +
"contain reads from only a single sample per bam.",
readerID.getSamFilePath(), firstEncounteredSample, thisReadGroupSample));
}
readGroup.setSample(remappedSampleName);
}
}
private void printReaderPerformance(final int nExecutedTotal,
final int nExecutedInTick,
final int totalNumberOfFiles,
final SimpleTimer timer,
final double tickDurationInSec) {
final int pendingSize = totalNumberOfFiles - nExecutedTotal;
final double totalTimeInSeconds = timer.getElapsedTime();
final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds);
final int nRemaining = pendingSize;
final double estTimeToComplete = pendingSize / nTasksPerSecond;
logger.info(String.format("Init %d BAMs in last %.2f s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. completion in %.2f s / %.2f m",
nExecutedInTick, tickDurationInSec,
nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond,
nRemaining, estTimeToComplete, estTimeToComplete / 60));
}
/**
* Return the header derived from the merging of these BAM files.
* @return the merged header.
*/
public SAMFileHeader getMergedHeader() {
return headerMerger.getMergedHeader();
}
/**
* Do multiple read groups collide in this dataset?
* @return True if multiple read groups collide; false otherwise.
*/
public boolean hasReadGroupCollisions() {
return headerMerger.hasReadGroupCollisions();
}
/**
* Get the newly mapped read group ID for the given read group.
* @param readerID Reader for which to discern the transformed ID.
* @param originalReadGroupID Original read group.
* @return Remapped read group.
*/
public String getReadGroupId(final SAMReaderID readerID, final String originalReadGroupID) {
SAMFileHeader header = readers.get(readerID).getFileHeader();
return headerMerger.getReadGroupId(header,originalReadGroupID);
}
/**
* Creates a new merging iterator from the given map, using the cached header merger.
* @param iteratorMap A map of readers to iterators.
* @return An iterator which will merge those individual iterators.
*/
public MergingSamRecordIterator createMergingIterator(final Map<SAMFileReader,CloseableIterator<SAMRecord>> iteratorMap) {
return new MergingSamRecordIterator(headerMerger,iteratorMap,true);
}
/**
* Retrieve the reader from the data structure.
* @param id The ID of the reader to retrieve.
* @return the reader associated with the given id.
*/
public SAMFileReader getReader(SAMReaderID id) {
if(!readers.containsKey(id))
throw new NoSuchElementException("No reader is associated with id " + id);
return readers.get(id);
}
/**
* Retrieve the input stream backing a reader.
* @param id The ID of the reader to retrieve.
* @return the input stream backing the reader associated with the given id.
*/
public BlockInputStream getInputStream(final SAMReaderID id) {
return inputStreams.get(id);
}
/**
* Searches for the reader id of this reader.
* @param reader Reader for which to search.
* @return The id associated with the given reader, or null if the reader is not present in this collection.
*/
protected SAMReaderID getReaderID(SamReader reader) {
for(Map.Entry<SAMReaderID,SAMFileReader> entry: readers.entrySet()) {
if(reader == entry.getValue())
return entry.getKey();
}
// Not found? return null.
return null;
}
/**
* Returns an iterator over all readers in this structure.
* @return An iterator over readers.
*/
public Iterator<SAMFileReader> iterator() {
return readers.values().iterator();
}
/**
* Checks whether this structure contains any readers.
* @return True if no readers are present; false otherwise.
*/
public boolean isEmpty() {
return readers.isEmpty();
}
}
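/**
* Initializes a single SAM reader (and, in asynchronous I/O mode, its backing BlockInputStream).
* Note that although this class implements Callable, call() is currently invoked synchronously
* from the SAMReaders constructor; the Callable form presumably leaves room for parallel
* initialization in the future.
*/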
class ReaderInitializer implements Callable<ReaderInitializer> {
final SAMReaderID readerID;
BlockInputStream blockInputStream = null;
SAMFileReader reader;
public ReaderInitializer(final SAMReaderID readerID) {
this.readerID = readerID;
}
public ReaderInitializer call() {
final File indexFile = findIndexFile(readerID.samFile);
try {
if (threadAllocation.getNumIOThreads() > 0)
blockInputStream = new BlockInputStream(dispatcher,readerID,false);
reader = new SAMFileReader(readerID.samFile,indexFile,false);
} catch ( RuntimeIOException e ) {
throw new UserException.CouldNotReadInputFile(readerID.samFile, e);
} catch ( SAMFormatException e ) {
throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
}
// Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files).
// Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case,
// just in case we want to change this behavior later.
catch ( RuntimeException e ) {
throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
}
reader.setSAMRecordFactory(factory);
reader.enableFileSource(true);
reader.setValidationStringency(validationStringency);
return this;
}
}
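/**
* An iterator wrapper that releases its backing SAMReaders set to the resource pool when closed.
*/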
private class ReleasingIterator implements GATKSAMIterator {
/**
* The resource acting as the source of the data.
*/
private final SAMReaders resource;
/**
* The iterator to wrap.
*/
private final GATKSAMIterator wrappedIterator;
public ReleasingIterator(SAMReaders resource, GATKSAMIterator wrapped) {
this.resource = resource;
this.wrappedIterator = wrapped;
}
public ReleasingIterator iterator() {
return this;
}
public void remove() {
throw new UnsupportedOperationException("Can't remove from a GATKSAMIterator");
}
public void close() {
wrappedIterator.close();
resourcePool.releaseReaders(resource);
}
public boolean hasNext() {
return wrappedIterator.hasNext();
}
public SAMRecord next() {
return wrappedIterator.next();
}
}
/**
* Maps read group ids in the original SAMFileReaders to read group ids in the merged header.
*/
private class ReadGroupMapping extends HashMap<String,String> {}
/**
* Locates the index file alongside the given BAM, if present.
* @param bamFile The data file to use.
* @return A File object if the index file is present; null otherwise.
*/
private File findIndexFile(File bamFile) {
return SamFiles.findIndex(bamFile);
}
/**
* Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream
* will be as granular as possible given our current knowledge of the best ways to split up BAM files.
* @return An iterator that spans all reads in all BAM files.
*/
public Iterable<Shard> createShardIteratorOverAllReads(final ShardBalancer shardBalancer) {
shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser);
return shardBalancer;
}
/**
* Creates a BAM schedule over all mapped reads in the BAM file, where a 'mapped' read is defined as any
* read that has been assigned a position on a reference contig.
*
* @param shardBalancer shard balancer object
* @return non-null initialized version of the shard balancer
*/
public Iterable<Shard> createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) {
shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser);
return shardBalancer;
}
/**
* Create a schedule for processing the initialized BAM file using the given interval list.
* The returned schedule should be as granular as possible.
* @param intervals The list of intervals for which to create the schedule.
* @return A granular iterator over file pointers.
*/
public Iterable<Shard> createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) {
if(intervals == null)
throw new ReviewedGATKException("Unable to create schedule from intervals; no intervals were provided.");
shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals,intervalMergingRule),genomeLocParser);
return shardBalancer;
}
}