Package org.broadinstitute.gatk.utils.activeregion

Source Code of org.broadinstitute.gatk.utils.activeregion.ActiveRegion

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.utils.activeregion;

import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.GenomeLocSortedSet;
import org.broadinstitute.gatk.utils.HasGenomeLocation;
import org.broadinstitute.gatk.utils.clipping.ReadClipper;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.sam.ReadUtils;

import java.util.*;

/**
* Represents a single active region created by the Active Region Traversal for processing
*
* An active region is a single contiguous span of bases on the genome that should be operated
* on as a single unit for the active region traversal.  The action may contains a list of
* reads that overlap the region (may because there may be no reads in the region).  The region
* is tagged as being either active or inactive, depending on the probabilities provided by
* the isActiveProb results from the ART walker.  Each region carries with it the
* exact span of the region (bases which are the core of the isActiveProbs from the walker) as
* well as an extended size, that includes the ART walker's extension size.  Reads in the region
* provided by ART include all reads overlapping the extended span, not the raw span.
*
* User: rpoplin
* Date: 1/4/12
*/
@Invariant({
        "extension >= 0",
        "activeRegionLoc != null",
        "genomeLocParser != null",
        "spanIncludingReads != null",
        "extendedLoc != null"
})
public class ActiveRegion implements HasGenomeLocation {
    /**
     * The reads included in this active region.  May be empty upon creation, and expand / contract
     * as reads are added or removed from this region.
     */
    private final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();

    /**
     * An ordered list (by genomic coordinate) of the ActivityProfileStates that went
     * into this active region.  May be empty, which says that no supporting states were
     * provided when this region was created.
     */
    private final List<ActivityProfileState> supportingStates;

    /**
     * The raw span of this active region, not including the active region extension
     */
    private final GenomeLoc activeRegionLoc;

    /**
     * The span of this active region on the genome, including the active region extension
     */
    private final GenomeLoc extendedLoc;

    /**
     * The extension, in bp, of this active region.
     */
    private final int extension;

    /**
     * A genomeLocParser so we can create genomeLocs
     */
    private final GenomeLocParser genomeLocParser;

    /**
     * Does this region represent an active region (all isActiveProbs above threshold) or
     * an inactive region (all isActiveProbs below threshold)?
     */
    private final boolean isActive;

    /**
     * The span of this active region, including the bp covered by all reads in this
     * region.  This union of extensionLoc and the loc of all reads in this region.
     *
     * Must be at least as large as extendedLoc, but may be larger when reads
     * partially overlap this region.
     */
    private GenomeLoc spanIncludingReads;


    /**
     * Indicates whether the active region has been finalized
     */
    private boolean hasBeenFinalized;

    /**
     * Create a new ActiveRegion containing no reads
     *
     * @param activeRegionLoc the span of this active region
     * @param supportingStates the states that went into creating this region, or null / empty if none are available.
     *                         If not empty, must have exactly one state for each bp in activeRegionLoc
     * @param isActive indicates whether this is an active region, or an inactve one
     * @param genomeLocParser a non-null parser to let us create new genome locs
     * @param extension the active region extension to use for this active region
     */
    public ActiveRegion( final GenomeLoc activeRegionLoc, final List<ActivityProfileState> supportingStates, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) {
        if ( activeRegionLoc == null ) throw new IllegalArgumentException("activeRegionLoc cannot be null");
        if ( activeRegionLoc.size() == 0 ) throw new IllegalArgumentException("Active region cannot be of zero size, but got " + activeRegionLoc);
        if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null");
        if ( extension < 0 ) throw new IllegalArgumentException("extension cannot be < 0 but got " + extension);

        this.activeRegionLoc = activeRegionLoc;
        this.supportingStates = supportingStates == null ? Collections.<ActivityProfileState>emptyList() : new ArrayList<ActivityProfileState>(supportingStates);
        this.isActive = isActive;
        this.genomeLocParser = genomeLocParser;
        this.extension = extension;
        this.extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension);
        this.spanIncludingReads = extendedLoc;

        if ( ! this.supportingStates.isEmpty() ) {
            if ( this.supportingStates.size() != activeRegionLoc.size() )
                throw new IllegalArgumentException("Supporting states wasn't empty but it doesn't have exactly one state per bp in the active region: states " + this.supportingStates.size() + " vs. bp in region = " + activeRegionLoc.size());
            GenomeLoc lastStateLoc = null;
            for ( final ActivityProfileState state : this.supportingStates ) {
                if ( lastStateLoc != null ) {
                    if ( state.getLoc().getStart() != lastStateLoc.getStart() + 1 || state.getLoc().getContigIndex() != lastStateLoc.getContigIndex())
                        throw new IllegalArgumentException("Supporting state has an invalid sequence: last state was " + lastStateLoc + " but next state was " + state);
                }
                lastStateLoc = state.getLoc();
            }
        }
    }

    /**
     * Simple interface to create an active region that isActive without any profile state
     */
    public ActiveRegion( final GenomeLoc activeRegionLoc, final GenomeLocParser genomeLocParser, final int extension ) {
        this(activeRegionLoc, Collections.<ActivityProfileState>emptyList(), true, genomeLocParser, extension);
    }

    @Override
    public String toString() {
        return "ActiveRegion "  + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size();
    }

    /**
     * See #getActiveRegionReference but with padding == 0
     */
    public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) {
        return getActiveRegionReference(referenceReader, 0);
    }

    /**
     * Get the reference bases from referenceReader spanned by the extended location of this active region,
     * including additional padding bp on either side.  If this expanded region would exceed the boundaries
     * of the active region's contig, the returned result will be truncated to only include on-genome reference
     * bases
     * @param referenceReader the source of the reference genome bases
     * @param padding the padding, in BP, we want to add to either side of this active region extended region
     * @return a non-null array of bytes holding the reference bases in referenceReader
     */
    @Ensures("result != null")
    public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
        return getReference(referenceReader, padding, extendedLoc);
    }

    /**
     * See #getActiveRegionReference but using the span including regions not the extended span
     */
    public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) {
        return getFullReference(referenceReader, 0);
    }

    /**
     * See #getActiveRegionReference but using the span including regions not the extended span
     */
    public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
        return getReference(referenceReader, padding, spanIncludingReads);
    }

    /**
     * Get the reference bases from referenceReader spanned by the extended location of this active region,
     * including additional padding bp on either side.  If this expanded region would exceed the boundaries
     * of the active region's contig, the returned result will be truncated to only include on-genome reference
     * bases
     * @param referenceReader the source of the reference genome bases
     * @param padding the padding, in BP, we want to add to either side of this active region extended region
     * @param genomeLoc a non-null genome loc indicating the base span of the bp we'd like to get the reference for
     * @return a non-null array of bytes holding the reference bases in referenceReader
     */
    @Ensures("result != null")
    public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
        if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null");
        if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding);
        if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null");
        if ( genomeLoc.size() == 0 ) throw new IllegalArgumentException("GenomeLoc must have size > 0 but got " + genomeLoc);

        final byte[] reference =  referenceReader.getSubsequenceAt( genomeLoc.getContig(),
                Math.max(1, genomeLoc.getStart() - padding),
                Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases();

        return reference;
    }

    /**
     * Get the raw span of this active region (excluding the extension)
     * @return a non-null genome loc
     */
    @Override
    @Ensures("result != null")
    public GenomeLoc getLocation() { return activeRegionLoc; }

    /**
     * Get the span of this active region including the extension value
     * @return a non-null GenomeLoc
     */
    @Ensures("result != null")
    public GenomeLoc getExtendedLoc() { return extendedLoc; }

    /**
     * Get the span of this active region including the extension and the projects on the
     * genome of all reads in this active region.  That is, returns the bp covered by this
     * region and all reads in the region.
     * @return a non-null genome loc
     */
    @Ensures("result != null")
    public GenomeLoc getReadSpanLoc() { return spanIncludingReads; }

    /**
     * Get the active profile states that went into creating this region, if possible
     * @return an unmodifiable list of states that led to the creation of this region, or an empty
     *         list if none were provided
     */
    @Ensures("result != null")
    public List<ActivityProfileState> getSupportingStates() {
        return Collections.unmodifiableList(supportingStates);
    }

    /**
     * Get the active region extension applied to this region
     *
     * The extension is >= 0 bp in size, and indicates how much padding this art walker wanted for its regions
     *
     * @return the size in bp of the region extension
     */
    @Ensures("result >= 0")
    public int getExtension() { return extension; }

    /**
     * Get an unmodifiable list of reads currently in this active region.
     *
     * The reads are sorted by their coordinate position
     *
     * @return an unmodifiable list of reads in this active region
     */
    @Ensures("result != null")
    public List<GATKSAMRecord> getReads() {
        return Collections.unmodifiableList(reads);
    }

    /**
     * Get the number of reads currently in this active region
     * @return an integer >= 0
     */
    @Ensures("result >= 0")
    public int size() { return reads.size(); }

    /**
     * Add read to this active region
     *
     * Read must have alignment start >= than the last read currently in this active region.
     *
     * @throws IllegalArgumentException if read doesn't overlap the extended region of this active region
     *
     * @param read a non-null GATKSAMRecord
     */
    @Ensures("reads.size() == old(reads.size()) + 1")
    public void add( final GATKSAMRecord read ) {
        if ( read == null ) throw new IllegalArgumentException("Read cannot be null");

        final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read );
        if ( ! readOverlapsRegion(read) )
            throw new IllegalArgumentException("Read location " + readLoc + " doesn't overlap with active region extended span " + extendedLoc);

        spanIncludingReads = spanIncludingReads.union( readLoc );

        if ( ! reads.isEmpty() ) {
            final GATKSAMRecord lastRead = reads.get(size() - 1);
            if ( ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) )
                throw new IllegalArgumentException("Attempting to add a read to ActiveRegion not on the same contig as other reads: lastRead " + lastRead + " attempting to add " + read);

            if ( read.getAlignmentStart() < lastRead.getAlignmentStart() )
                throw new IllegalArgumentException("Attempting to add a read to ActiveRegion out of order w.r.t. other reads: lastRead " + lastRead + " at " + lastRead.getAlignmentStart() + " attempting to add " + read + " at " + read.getAlignmentStart());
        }

        reads.add( read );
    }

    /**
     * Returns true if read would overlap the extended extent of this region
     * @param read the read we want to test
     * @return true if read can be added to this region, false otherwise
     */
    public boolean readOverlapsRegion(final GATKSAMRecord read) {
        final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read );
        return readLoc.overlapsP(extendedLoc);
    }

    /**
     * Add all reads to this active region
     * @param reads a collection of reads to add to this active region
     */
    public void addAll(final Collection<GATKSAMRecord> reads) {
        if ( reads == null ) throw new IllegalArgumentException("reads cannot be null");
        for ( final GATKSAMRecord read : reads )
            add(read);
    }

    /**
     * Clear all of the reads currently in this active region
     */
    @Ensures("size() == 0")
    public void clearReads() {
        spanIncludingReads = extendedLoc;
        reads.clear();
    }

    /**
     * Remove all of the reads in readsToRemove from this active region
     * @param readsToRemove the set of reads we want to remove
     */
    public void removeAll( final Set<GATKSAMRecord> readsToRemove ) {
        final Iterator<GATKSAMRecord> it = reads.iterator();
        spanIncludingReads = extendedLoc;
        while ( it.hasNext() ) {
            final GATKSAMRecord read = it.next();
            if ( readsToRemove.contains(read) )
                it.remove();
            else
                spanIncludingReads = spanIncludingReads.union( genomeLocParser.createGenomeLoc(read) );
        }
    }

    /**
     * Is this region equal to other, excluding any reads in either region in the comparison
     * @param other the other active region we want to test
     * @return true if this region is equal, excluding any reads and derived values, to other
     */
    protected boolean equalExceptReads(final ActiveRegion other) {
        if ( activeRegionLoc.compareTo(other.activeRegionLoc) != 0 ) return false;
        if ( isActive() != other.isActive()) return false;
        if ( genomeLocParser != other.genomeLocParser ) return false;
        if ( extension != other.extension ) return false;
        if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false;
        return true;
    }

    /**
     * Does this region represent an active region (all isActiveProbs above threshold) or
     * an inactive region (all isActiveProbs below threshold)?
     */
    public boolean isActive() {
        return isActive;
    }

    /**
     * Intersect this active region with the allowed intervals, returning a list of active regions
     * that only contain locations present in intervals
     *
     * Note that the returned list may be empty, if this active region doesn't overlap the set at all
     *
     * Note that the resulting regions are all empty, regardless of whether the current active region has reads
     *
     * @param intervals a non-null set of intervals that are allowed
     * @return an ordered list of active region where each interval is contained within intervals
     */
    @Ensures("result != null")
    protected List<ActiveRegion> splitAndTrimToIntervals(final GenomeLocSortedSet intervals) {
        final List<GenomeLoc> allOverlapping = intervals.getOverlapping(getLocation());
        final List<ActiveRegion> clippedRegions = new LinkedList<ActiveRegion>();

        for ( final GenomeLoc overlapping : allOverlapping ) {
            clippedRegions.add(trim(overlapping, extension));
        }

        return clippedRegions;
    }

    /**
     * Trim this active to just the span, producing a new active region without any reads that has only
     * the extent of newExtend intersected with the current extent
     * @param span the new extend of the active region we want
     * @param extension the extension size we want for the newly trimmed active region
     * @return a non-null, empty active region
     */
    public ActiveRegion trim(final GenomeLoc span, final int extension) {
        if ( span == null ) throw new IllegalArgumentException("Active region extent cannot be null");
        if ( extension < 0) throw new IllegalArgumentException("the extension size must be 0 or greater");
        final int extendStart = Math.max(1,span.getStart() - extension);
        final int maxStop = genomeLocParser.getContigs().getSequence(span.getContigIndex()).getSequenceLength();
        final int extendStop = Math.min(span.getStop() + extension, maxStop);
        final GenomeLoc extendedSpan = genomeLocParser.createGenomeLoc(span.getContig(), extendStart, extendStop);
        return trim(span, extendedSpan);

//TODO - Inconsiste support of substates trimming. Check lack of consistency!!!!
//        final GenomeLoc subLoc = getLocation().intersect(span);
//        final int subStart = subLoc.getStart() - getLocation().getStart();
//        final int subEnd = subStart + subLoc.size();
//        final List<ActivityProfileState> subStates = supportingStates.isEmpty() ? supportingStates : supportingStates.subList(subStart, subEnd);
//        return new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, extension );

    }

    public ActiveRegion trim(final GenomeLoc span) {
        return trim(span,span);
    }

    /**
     * Trim this active to no more than the span, producing a new active region with properly trimmed reads that
     * attempts to provide the best possible representation of this active region covering the span.
     *
     * The challenge here is that span may (1) be larger than can be represented by this active region
     * + its original extension and (2) the extension must be symmetric on both sides.  This algorithm
     * therefore determines how best to represent span as a subset of the span of this
     * region with a padding value that captures as much of the span as possible.
     *
     * For example, suppose this active region is
     *
     * Active:    100-200 with extension of 50, so that the true span is 50-250
     * NewExtent: 150-225 saying that we'd ideally like to just have bases 150-225
     *
     * Here we represent the active region as a active region from 150-200 with 25 bp of padding.
     *
     * The overall constraint is that the active region can never exceed the original active region, and
     * the extension is chosen to maximize overlap with the desired region
     *
     * @param span the new extend of the active region we want
     * @return a non-null, empty active region
     */
    public ActiveRegion trim(final GenomeLoc span, final GenomeLoc extendedSpan) {
        if ( span == null ) throw new IllegalArgumentException("Active region extent cannot be null");
        if ( extendedSpan == null ) throw new IllegalArgumentException("Active region extended span cannot be null");
        if ( ! extendedSpan.containsP(span))
            throw new IllegalArgumentException("The requested extended must fully contain the requested span");

        final GenomeLoc subActive = getLocation().intersect(span);
        final int requiredOnRight = Math.max(extendedSpan.getStop() - subActive.getStop(), 0);
        final int requiredOnLeft = Math.max(subActive.getStart() - extendedSpan.getStart(), 0);
        final int requiredExtension = Math.min(Math.max(requiredOnLeft, requiredOnRight), getExtension());

        final ActiveRegion result = new ActiveRegion( subActive, Collections.<ActivityProfileState>emptyList(), isActive, genomeLocParser, requiredExtension );

        final List<GATKSAMRecord> myReads = getReads();
        final GenomeLoc resultExtendedLoc = result.getExtendedLoc();
        final int resultExtendedLocStart = resultExtendedLoc.getStart();
        final int resultExtendedLocStop = resultExtendedLoc.getStop();

        final List<GATKSAMRecord> trimmedReads = new ArrayList<>(myReads.size());
        for( final GATKSAMRecord read : myReads ) {
            final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion(read,
                    resultExtendedLocStart, resultExtendedLocStop);
            if( result.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 )
                trimmedReads.add(clippedRead);
        }
        result.clearReads();
        result.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads));
        return result;
    }

    public void setFinalized(final boolean value) {
        hasBeenFinalized = value;
    }

    public boolean isFinalized() {
        return hasBeenFinalized;
    }

}
TOP

Related Classes of org.broadinstitute.gatk.utils.activeregion.ActiveRegion

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.