Package org.broadinstitute.gatk.engine.datasources.reads

Source Code of org.broadinstitute.gatk.engine.datasources.reads.FilePointer

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.engine.datasources.reads;

import htsjdk.samtools.util.PeekableIterator;
import htsjdk.samtools.GATKBAMFileSpan;
import htsjdk.samtools.GATKChunk;
import htsjdk.samtools.SAMFileSpan;
import htsjdk.samtools.SAMRecord;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.interval.IntervalMergingRule;
import org.broadinstitute.gatk.utils.interval.IntervalUtils;

import java.util.*;

/**
* Represents a small section of a BAM file, and every associated interval.
*/
public class FilePointer {
    protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>();
    protected final List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
    protected final IntervalMergingRule intervalMergingRule;

    /**
     * Does this file pointer point into an unmapped region?
     */
    protected final boolean isRegionUnmapped;

    /**
     * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
     * ever visit during this GATK run? If this is set to true, the engine will expect to see only this
     * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
     * from more than one contig.
     */
    private boolean isMonolithic = false;

    /**
     * Index of the contig covered by this FilePointer. Only meaningful for non-monolithic, mapped FilePointers
     */
    private Integer contigIndex = null;


    public FilePointer( final IntervalMergingRule mergeRule, final List<GenomeLoc> locations ) {
        this.intervalMergingRule = mergeRule;
        this.locations.addAll(locations);
        this.isRegionUnmapped = checkUnmappedStatus();

        validateAllLocations();
        if ( locations.size() > 0 ) {
            contigIndex = locations.get(0).getContigIndex();
        }
    }

    public FilePointer( final IntervalMergingRule mergeRule, final GenomeLoc... locations ) {
        this(mergeRule, Arrays.asList(locations));
    }

    public FilePointer( final Map<SAMReaderID,SAMFileSpan> fileSpans, final IntervalMergingRule mergeRule, final List<GenomeLoc> locations ) {
        this(mergeRule, locations);
        this.fileSpans.putAll(fileSpans);
    }

    private boolean checkUnmappedStatus() {
        boolean foundMapped = false, foundUnmapped = false;

        for( GenomeLoc location: locations ) {
            if ( GenomeLoc.isUnmapped(location) )
                foundUnmapped = true;
            else
                foundMapped = true;
        }
        if ( foundMapped && foundUnmapped )
            throw new ReviewedGATKException("BUG: File pointers cannot be mixed mapped/unmapped.");

        return foundUnmapped;
    }

    private void validateAllLocations() {
        // Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction
        if ( isRegionUnmapped || isMonolithic ) {
            return;
        }

        Integer previousContigIndex = null;

        for ( GenomeLoc location : locations ) {
            if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) {
                throw new ReviewedGATKException("Non-monolithic file pointers must contain intervals from at most one contig");
            }

            previousContigIndex = location.getContigIndex();
        }
    }

    private void validateLocation( GenomeLoc location ) {
        if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) {
            throw new ReviewedGATKException("BUG: File pointers cannot be mixed mapped/unmapped.");
        }
        if ( ! isRegionUnmapped && ! isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) {
            throw new ReviewedGATKException("Non-monolithic file pointers must contain intervals from at most one contig");
        }
    }

    /**
     * Returns an immutable view of this FilePointer's file spans
     *
     * @return an immutable view of this FilePointer's file spans
     */
    public Map<SAMReaderID, SAMFileSpan> getFileSpans() {
        return Collections.unmodifiableMap(fileSpans);
    }

    /**
     * Returns an immutable variant of the list of locations.
     * @return
     */
    public List<GenomeLoc> getLocations() {
        return Collections.unmodifiableList(locations);
    }

    /**
     * Returns the index of the contig into which this FilePointer points (a FilePointer can represent
     * regions in at most one contig).
     *
     * @return the index of the contig into which this FilePointer points
     */
    public int getContigIndex() {
        return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
    }

    /**
     * Returns the IntervalMergingRule used by this FilePointer to merge adjacent locations
     *
     * @return the IntervalMergingRule used by this FilePointer (never null)
     */
    public IntervalMergingRule getIntervalMergingRule() {
        return intervalMergingRule;
    }

    /**
     * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
     * ever visit during this GATK run? If this is set to true, the engine will expect to see only this
     * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
     * from more than one contig.
     *
     * @return true if this FP is a monolithic FP representing all regions in all files, otherwise false
     */
    public boolean isMonolithic() {
        return isMonolithic;
    }

    /**
     * Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all
     * regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic
     * FP may contain intervals from more than one contig.
     *
     * @param isMonolithic set this FP's monolithic status to this value
     */
    public void setIsMonolithic( boolean isMonolithic ) {
        this.isMonolithic = isMonolithic;
    }

    @Override
    public boolean equals(final Object other) {
        if(!(other instanceof FilePointer))
            return false;
        FilePointer otherFilePointer = (FilePointer)other;

        // intervals
        if(this.locations.size() != otherFilePointer.locations.size())
            return false;
        for(int i = 0; i < locations.size(); i++) {
            if(!this.locations.get(i).equals(otherFilePointer.locations.get(i)))
                return false;
        }

        // fileSpans
        if(this.fileSpans.size() != otherFilePointer.fileSpans.size())
            return false;
        Iterator<Map.Entry<SAMReaderID,SAMFileSpan>> thisEntries = this.fileSpans.entrySet().iterator();
        Iterator<Map.Entry<SAMReaderID,SAMFileSpan>> otherEntries = otherFilePointer.fileSpans.entrySet().iterator();
        while(thisEntries.hasNext() || otherEntries.hasNext()) {
            if(!thisEntries.next().equals(otherEntries.next()))
                return false;
        }
       
        return true;
    }

    public void addLocation(final GenomeLoc location) {
        validateLocation(location);

        this.locations.add(location);
        if ( contigIndex == null ) {
            contigIndex = location.getContigIndex();
        }
    }

    public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) {
        this.fileSpans.put(id,fileSpan);
    }

    public void addFileSpans(final Map<SAMReaderID, GATKBAMFileSpan> fileSpans) {
        this.fileSpans.putAll(fileSpans);
    }


    /**
     * Computes the size of this file span, in uncompressed bytes.
     * @return Size of the file span.
     */
    public long size() {
        long size = 0L;
        for(SAMFileSpan fileSpan: fileSpans.values())
            size += ((GATKBAMFileSpan)fileSpan).size();
        return size;
    }

    /**
     * Returns the difference in size between two filespans.
     * @param other Other filespan against which to measure.
     * @return The difference in size between the two file pointers.
     */
    public long minus(final FilePointer other) {
        long difference = 0;
        PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>> thisIterator = new PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>>(this.fileSpans.entrySet().iterator());
        PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>> otherIterator = new PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>>(other.fileSpans.entrySet().iterator());

        while(thisIterator.hasNext()) {
            // If there are no elements left in the 'other' iterator, spin out this iterator.
            if(!otherIterator.hasNext()) {
                GATKBAMFileSpan nextSpan = (GATKBAMFileSpan)thisIterator.next().getValue();
                difference += nextSpan.size();
                continue;
            }

            // Otherwise, compare the latest value.
            int compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey());

            if(compareValue < 0) {
                // This before other.
                difference += ((GATKBAMFileSpan)thisIterator.next().getValue()).size();
            }
            else if(compareValue > 0) {
                // Other before this.
                difference += ((GATKBAMFileSpan)otherIterator.next().getValue()).size();
            }
            else {
                // equality; difference the values.
                GATKBAMFileSpan thisRegion = (GATKBAMFileSpan)thisIterator.next().getValue();
                GATKBAMFileSpan otherRegion = (GATKBAMFileSpan)otherIterator.next().getValue();
                difference += Math.abs(thisRegion.minus(otherRegion).size());
            }
        }
        return difference;
    }

    /**
     * Combines two file pointers into one.
     * @param parser The genomelocparser to use when manipulating intervals.
     * @param other File pointer to combine into this one.
     * @return A completely new file pointer that is the combination of the two.
     */
    public FilePointer combine(final GenomeLocParser parser, final FilePointer other) {
        FilePointer combined = new FilePointer(intervalMergingRule);

        List<GenomeLoc> intervals = new ArrayList<GenomeLoc>();
        intervals.addAll(locations);
        intervals.addAll(other.locations);
        for(GenomeLoc interval: IntervalUtils.sortAndMergeIntervals(parser,intervals,intervalMergingRule))
            combined.addLocation(interval);

        PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>> thisIterator = new PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>>(this.fileSpans.entrySet().iterator());
        PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>> otherIterator = new PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>>(other.fileSpans.entrySet().iterator());

        while(thisIterator.hasNext() || otherIterator.hasNext()) {
            int compareValue;
            if(!otherIterator.hasNext()) {
                compareValue = -1;
            }
            else if(!thisIterator.hasNext())
                compareValue = 1;
            else
                compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey());

            // This before other.
            if(compareValue < 0)
                mergeElementsInto(combined,thisIterator);
            // Other before this.
            else if(compareValue > 0)
                mergeElementsInto(combined,otherIterator);
            // equality; union the values.
            else
                mergeElementsInto(combined,thisIterator,otherIterator);
        }
        return combined;
    }

    /**
     * Roll the next element in the iterator into the combined entry.
     * @param combined Entry into which to roll the next element.
     * @param iterators Sources of next elements.
     */
    private void mergeElementsInto(final FilePointer combined, Iterator<Map.Entry<SAMReaderID,SAMFileSpan>>... iterators) {
        if(iterators.length == 0)
            throw new ReviewedGATKException("Tried to add zero elements to an existing file pointer.");
        Map.Entry<SAMReaderID,SAMFileSpan> initialElement = iterators[0].next();
        GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)initialElement.getValue();
        for(int i = 1; i < iterators.length; i++)
            fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue());
        combined.addFileSpans(initialElement.getKey(),fileSpan);
    }

    /**
     * Efficiently generate the union of the n FilePointers passed in. Much more efficient than
     * combining two FilePointers at a time using the combine() method above.
     *
     * IMPORTANT: the FilePointers to be unioned must either all represent regions on the
     * same contig, or all be unmapped, since we cannot create FilePointers with a mix of
     * contigs or with mixed mapped/unmapped regions.
     *
     * @param filePointers the FilePointers to union
     * @param parser our GenomeLocParser
     * @return the union of the FilePointers passed in
     */
    public static FilePointer union( List<FilePointer> filePointers, GenomeLocParser parser ) {
        if ( filePointers == null || filePointers.isEmpty() ) {
            return new FilePointer(IntervalMergingRule.ALL);
        }

        Map<SAMReaderID, List<GATKChunk>> fileChunks = new HashMap<SAMReaderID, List<GATKChunk>>();
        List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
        IntervalMergingRule mergeRule = filePointers.get(0).getIntervalMergingRule();

        // First extract all intervals and file chunks from the FilePointers into unsorted, unmerged collections
        for ( FilePointer filePointer : filePointers ) {
            locations.addAll(filePointer.getLocations());
            if (mergeRule != filePointer.getIntervalMergingRule())
                throw new ReviewedGATKException("All FilePointers in FilePointer.union() must have use the same IntervalMergeRule");

            for ( Map.Entry<SAMReaderID, SAMFileSpan> fileSpanEntry : filePointer.getFileSpans().entrySet() ) {
                GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)fileSpanEntry.getValue();

                if ( fileChunks.containsKey(fileSpanEntry.getKey()) ) {
                    fileChunks.get(fileSpanEntry.getKey()).addAll(fileSpan.getGATKChunks());
                }
                else {
                    fileChunks.put(fileSpanEntry.getKey(), fileSpan.getGATKChunks());
                }
            }
        }

        // Now sort and merge the intervals
        List<GenomeLoc> sortedMergedLocations = new ArrayList<GenomeLoc>();
        sortedMergedLocations.addAll(IntervalUtils.sortAndMergeIntervals(parser, locations, mergeRule));

        // For each BAM file, convert from an unsorted, unmerged list of chunks to a GATKBAMFileSpan containing
        // the sorted, merged union of the chunks for that file
        Map<SAMReaderID, SAMFileSpan> mergedFileSpans = new HashMap<SAMReaderID, SAMFileSpan>(fileChunks.size());
        for ( Map.Entry<SAMReaderID, List<GATKChunk>> fileChunksEntry : fileChunks.entrySet() ) {
            List<GATKChunk> unmergedChunks = fileChunksEntry.getValue();
            mergedFileSpans.put(fileChunksEntry.getKey(),
                                (new GATKBAMFileSpan(unmergedChunks.toArray(new GATKChunk[unmergedChunks.size()]))).union(new GATKBAMFileSpan()));
        }

        return new FilePointer(mergedFileSpans, mergeRule, sortedMergedLocations);
    }

    /**
     * Returns true if any of the file spans in this FilePointer overlap their counterparts in
     * the other FilePointer. "Overlap" is defined as having an overlapping extent (the region
     * from the start of the first chunk to the end of the last chunk).
     *
     * @param other the FilePointer against which to check overlap with this FilePointer
     * @return true if any file spans overlap their counterparts in other, otherwise false
     */
    public boolean hasFileSpansOverlappingWith( FilePointer other ) {
        for ( Map.Entry<SAMReaderID, SAMFileSpan> thisFilePointerEntry : fileSpans.entrySet() ) {
            GATKBAMFileSpan thisFileSpan = new GATKBAMFileSpan(thisFilePointerEntry.getValue());

            SAMFileSpan otherEntry = other.fileSpans.get(thisFilePointerEntry.getKey());
            if ( otherEntry == null ) {
                continue// no counterpart for this file span in other
            }
            GATKBAMFileSpan otherFileSpan = new GATKBAMFileSpan(otherEntry);

            if ( thisFileSpan.getExtent().overlaps(otherFileSpan.getExtent()) ) {
                return true;
            }
        }

        return false;
    }

    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder();
        builder.append("FilePointer:\n");
        builder.append("\tlocations = {");
        builder.append(Utils.join(";",locations));
        builder.append("}\n\tregions = \n");
        for(Map.Entry<SAMReaderID,SAMFileSpan> entry: fileSpans.entrySet()) {
            builder.append(entry.getKey());
            builder.append("= {");
            builder.append(entry.getValue());
            builder.append("}");
        }
        return builder.toString();
    }
}
TOP

Related Classes of org.broadinstitute.gatk.engine.datasources.reads.FilePointer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.