Package org.broadinstitute.gatk.engine.refdata

Source Code of org.broadinstitute.gatk.engine.refdata.SeekableRODIterator

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.engine.refdata;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.CloseableIterator;
import org.broadinstitute.gatk.engine.iterators.PushbackIterator;
import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature;
import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
* Wrapper class for iterators over ROD objects. It is assumed that the underlying iterator can only
* perform standard next() operation, which advances it to the next ROD in the stream (i.e. reads the data file
* line by line). This iterator 1) shifts the focus from record-based traversal to position-based traversal,
* and 2) adds querying seekForward() method.
*
* Namely, this iterator's next() method advances not to the next ROD in the underlying stream, but to the next
* genomic position covered by (at least one) ROD, and returns all RODs overlapping with that position as a RODRecordList
* collection-like object. Similarly, when seekForward(interval) is called, this iterator skips all the RODs from the
* underlying stream, until it reaches specified genomic interval, and returns the list of all RODs overlapping with that interval.
*
* NOTE: this iterator has a STATE: next() operation is not allowed after a seekForward() to a non-point (extended) interval
* of length > 1. Such a call would leave the iterator in an inconsistent state. seekForward() can always be called after
* either seekForward() or next() (as long as usual ordering criteria are satisfied: the query interval location can neither
* start before the current position, nor end before the previous query end). seekForward to an interval of length 1
* reenables next() operation.
*
* Created by IntelliJ IDEA.
* User: asivache
* Date: Sep 10, 2009
* Time: 6:20:46 PM
* To change this template use File | Settings | File Templates.
*/
public class SeekableRODIterator implements LocationAwareSeekableRODIterator {
    /**
     * Header for the datasource backing this iterator.
     */
    private final Object header;

    /**
     * The parser, used to construct new genome locs.
     */
    private final GenomeLocParser parser;

    private final SAMSequenceDictionary sequenceDictionary;

    private PushbackIterator<GATKFeature> it;
    List<GATKFeature> records = null// here we will keep a pile of records overlaping with current position; when we iterate
                               // and step out of record's scope, we purge it from the list
    String name = null; // name of the ROD track wrapped by this iterator. Will be pulled from underlying iterator.

    int curr_position = 0; // where the iterator is currently positioned on the genome
    int max_position = 0// the rightmost stop position of currently loaded records
    String curr_contig = null;   // what contig the iterator is currently on
    boolean next_is_allowed = true; // see discussion below. next() is illegal after seek-forward queries of length > 1

    // the stop position of the last query. We can query only in forward direction ("seek forward");
    // it is not only the start position of every successive query that can not be before the start
    // of the previous one (curr_start), but it is also illegal for a query interval to *end* before
    // the end of previous query, otherwise we can end up in an inconsistent state
    int curr_query_end = -1;

    // EXAMPLE of inconsistency curr_query_end guards against:
    //              record 1      record 2
    //             ----------     -----------
    // -------------------------------------------------- REF
    //         ------------------------- query 1 (interval 1)
    //               ----------  query 2 (interval 2)
    //                     --------------- query 3
    //
    // If we query first for interval 1, both record 1 and record 2 will be loaded.
    // Query for interval 2, on the other hand, should return only record 1, but after
    // query 1 was performed, record 2 is already loaded from the file. If, on the other hand,
    // we try to un-load it from memory, we won't be able to read it again. Hence query 2 is not
    // allowed after query 1. Note also, that curr_query_end is not equivalent to max_position:
    // the latter only tracks where currently loaded records end (and hence helps to re-load records);
    // after query 1 is performed, max_position will be the end of record 2, but query 3 is still
    // perfectly legal after query 1.
    //
    // IMPORTANT NOTE: it follows from the above discussion and example that next() is illegal after ANY
    // seek-forward query EXCEPT those that are performed with length-1 intervals (queryInterval.start=queryinteval.stop).
    // Indeed, in the example above, after, e.g., query 1 is performed, the iterator is "located" at the start
    // of interval 1, but record1 and record 2 are already loaded. On the other hand, a subsequent call to next() would
    // need to shift iterator's position by 1 base and return only record 1.
    //
    // This implementation tracks the query history and makes next() illegal after a seekforward query of length > 1,
    // but re-enables next() again after a length-1 query.

    public SeekableRODIterator(Object header,SAMSequenceDictionary rodDictionary,SAMSequenceDictionary referenceDictionary,GenomeLocParser parser,CloseableIterator<GATKFeature> it) {
        this.header = header;
        this.parser = parser;
        this.sequenceDictionary = rodDictionary;
        this.it = new PushbackIterator<GATKFeature>(it);
        records = new LinkedList<GATKFeature>();
        // the following is a trick: we would like the iterator to know the actual name assigned to
        // the ROD implementing object we are working with. But the only way to do that is to
        // get an instance of that ROD and query it for its name. Now, the only generic way we have at this point to instantiate
        // the ROD is to make the underlying stream iterator to do it for us. So we are reading (or rather peeking into)
        // the first line of the track data file just to get the ROD object created.
        GATKFeature r = null;
        if (this.it.hasNext()) r = this.it.element();
        name = (r==null?null:r.getName());

        curr_contig = referenceDictionary.getSequence(0).getSequenceName();
    }

    /**
     * Gets the header associated with the backing input stream.
     * @return the ROD header.
     */
    @Override
    public Object getHeader() {
        return header;
    }

    /**
     * Gets the sequence dictionary associated with the backing input stream.
     * @return sequence dictionary from the ROD header.
     */
    @Override
    public SAMSequenceDictionary getSequenceDictionary() {
        return sequenceDictionary;
    }


    /**
     * Returns true if the data we iterate over has records associated with (any, not necessarily adjacent)
     * genomic position farther along the reference.
     * @return
     */
    public boolean hasNext() {

        // if we did not walk to the very end of the interval(s) covered by currently loaded
        // annotations (records), then we definitely have data for next genomic location
        if ( curr_position < max_position ) return true;

        // we are past currently loaded stuff; we have next if there are more lines to load:
        return it.hasNext();
    }

    // Returns point location (i.e. genome loc of length 1) on the reference, to which this iterator will advance
    // upon next call to next().
    public GenomeLoc peekNextLocation() {
        if ( curr_position + 1 <= max_position ) return parser.createGenomeLoc(curr_contig,curr_position+1);

        // sorry, next reference position is not covered by the RODs we are currently holding. In this case,
        // the location we will jump to upon next call to next() is the start of the next ROD record that we did
        // not read yet:
        if ( it.hasNext() ) {
            GATKFeature r = it.element(); // peek, do not load!
            return parser.createGenomeLoc(r.getLocation().getContig(),r.getLocation().getStart());
        }
        return null; // underlying iterator has no more records, there is no next location!
    }

    /** Advances iterator to the next genomic position that has ROD record(s) associated with it,
     * and returns all the records overlapping with that position as a RODList. The location of the whole
     * RODList object will be set to the smallest interval subsuming genomic intervals of all returned records.
     * Note that next() is disabled (will throw an exception) after seekForward() operation with query length > 1.
     * @return list of all RODs overlapping with the next "covered" genomic position
     */
     public RODRecordList next() {
         if ( ! next_is_allowed )
             throw new ReviewedGATKException("Illegal use of iterator: Can not advance iterator with next() after seek-forward query of length > 1");

         curr_position++;
//        curr_query_end = -1;

         if ( curr_position <= max_position ) {

             // we still have bases covered by at least one currently loaded record;
             // we have to purge only subset of records, on which we moved past the end
             purgeOutOfScopeRecords();
         } else {
             // ooops, we are past the end of all loaded records - kill them all at once,
             // load next record and reinitialize by fastforwarding current position to the start of next record
             records.clear();
             GATKFeature r = it.next(); // if hasNext() previously returned true, we are guaranteed that this call to reader.next() is safe
             records.add( r );
             curr_contig = r.getLocation().getContig();
             curr_position = r.getLocation().getStart();
             max_position = r.getLocation().getStop();
         }

         // current position is ste and at this point 'records' only keeps those annotations, on which we did not reach the end yet
         // (we might have reloaded records completely if it was necessary); but we are not guaranteed yet that we
         // hold ALL the records overlapping with the current position. Time to check if we just walked into the interval(s)
         // covered by new records, so we need to load them too:

         while ( it.hasNext() ) {
             GATKFeature r = it.element();
             if ( r == null ) {
                 it.next();
                 continue;
             }

             GenomeLoc currentContig = parser.createOverEntireContig(curr_contig);
             GenomeLoc thatContig = r.getLocation();

             if ( currentContig.isPast(thatContig) )
                 throw new UserException("LocationAwareSeekableRODIterator: contig " +r.getLocation().getContig() +
                         " occurs out of order in track " + r.getName() );
             if ( currentContig.isBefore(thatContig) ) break; // next record is on a higher contig, we do not need it yet...

             if ( r.getLocation().getStart() < curr_position )
                 throw new UserException("LocationAwareSeekableRODIterator: track "+r.getName() +
                         " is out of coordinate order on contig "+r.getLocation() + " compared to " + curr_contig + ":" + curr_position);

             if ( r.getLocation().getStart() > curr_position ) break; // next record starts after the current position; we do not need it yet

             r = it.next(); // we got here only if we do need next record, time to load it for real

             int stop = r.getLocation().getStop();
             if ( stop < curr_position ) throw new ReviewedGATKException("DEBUG: encountered contig that should have been loaded earlier"); // this should never happen
             if ( stop > max_position ) max_position = stop; // max_position keeps the rightmost stop position across all loaded records
             records.add(r);
         }

         // 'records' and current position are fully updated. Last, we need to set the location of the whole track
        // (collection of ROD records) to the genomic site we are currently looking at, and return the list

         return new RODRecordListImpl(name,records, parser.createGenomeLoc(curr_contig,curr_position));
     }

    /**
     * Removes from the underlying collection the last element returned by the
     * iterator (optional operation).  This method can be called only once per
     * call to <tt>next</tt>.  The behavior of an iterator is unspecified if
     * the underlying collection is modified while the iteration is in
     * progress in any way other than by calling this method.
     *
     * @throws UnsupportedOperationException if the <tt>remove</tt>
     *                                       operation is not supported by this Iterator.
     * @throws IllegalStateException         if the <tt>next</tt> method has not
     *                                       yet been called, or the <tt>remove</tt> method has already
     *                                       been called after the last call to the <tt>next</tt>
     *                                       method.
     */
    public void remove() {
        throw new UnsupportedOperationException("LocationAwareSeekableRODIterator does not implement remove() operation");
    }


    /**
     * Returns the current "position" (not location!! ;) ) of this iterator. This method is used by the sharding
     * system when it searches for available iterators in the pool that can be reused to resume traversal.
     * When iterator is advanced using next(), current position
     * is the same as 'location'. However, after a seekForward() query with extended interval, returned position
     * will be set to the last position of the query interval, to disable (illegal) attempts to roll the iterator
     * back and re-start traversal from current location.
     * @return Current ending position of the iterator, or null if no position exists.
     */
    public GenomeLoc position() {
        if ( curr_contig == null ) return null;
        if ( curr_query_end > curr_position )  {
            // do not attempt to reuse this iterator if the position we need it for lies before the end of last query performed
            return parser.createGenomeLoc(curr_contig,curr_query_end,curr_query_end);
        }
        else {
            return parser.createGenomeLoc(curr_contig,curr_position);
        }
    }

    /**
     * Seeks forward through the file until the specified interval is reached.
     * The location object <code>interval</code> can be either a single point or an extended interval. All
     * ROD records overlapping with the whole interval will be returned, or null if no such records exist.
     *
     * Query interval must start at or after the iterator's current location, or exception will be thrown.
     *
     * Query interval must end at or after the stop position of the previous query, if any, or an exception will
     * be thrown: subsequent queries that end before the stop of previous ones are illegal.
     *
     * If seekForward() is performed to an extended (length > 1 i.e. start != stop) interval, next() operation becomes
     * illegal (the iterator changes state). Only seekForward() calls are allowed thereafter, until a seekForward() call
     * to a length-1 interval is performed, which re-enables next(). seekForward() queries with length-1 intervals can
     * always be safely intermixed with next() (as long as ordering is respected and query intervals are at or after the
     * current position).
     *
     * Note that in contrast to
     * next() (which always advances current position of the iterator on the reference), this method scrolls
     * forward ONLY if the specified interval is ahead of the current location of
     * the iterator. However, if called again with the same 'interval' argument as before, seekForward will NOT
     * advance, but will simply return the same ROD list as before.
     *
     *
     * @param interval point-like genomic location to fastforward to.
     * @return ROD object at (or overlapping with) the specified position, or null if no such ROD exists.
     */
    public RODRecordList seekForward(GenomeLoc interval) {

        if ( interval.isBefore(parser.createOverEntireContig(curr_contig)) &&
             !(interval.getStart() == 0 && interval.getStop() == 0 && interval.getContig().equals(curr_contig)) ) // This criteria is syntactic sugar for 'seek to right before curr_contig'
            throw new ReviewedGATKException("Out of order query: query contig "+interval.getContig()+" is located before "+
                                     "the iterator's current contig");
        if ( interval.getContig().equals(curr_contig) ) {
            if ( interval.getStart() < curr_position )
                throw new ReviewedGATKException("Out of order query: query position "+interval +" is located before "+
                        "the iterator's current position "+curr_contig + ":" + curr_position);
            if ( interval.getStop() < curr_query_end )
                throw new ReviewedGATKException("Unsupported querying sequence: current query interval " +
                        interval+" ends before the end of previous query interval ("+curr_query_end+")");
        }

        curr_position = interval.getStart();
        curr_query_end = interval.getStop();

        next_is_allowed = ( curr_position == curr_query_end ); // we can call next() later only if interval length is 1

        if interval.getContig().equals(curr_contig) &&  curr_position <= max_position ) {
            // some of the intervals we are currently keeping do overlap with the query interval

            purgeOutOfScopeRecords();
        } else {
            // clean up and get ready for fast-forwarding towards the requested position
            records.clear();
            max_position = -1;
            curr_contig = interval.getContig();
        }

        // curr_contig and curr_position are set to where we asked to scroll to

        while ( it.hasNext() ) {
            GATKFeature r = it.next();
            if ( r == null ) continue;

            GenomeLoc currentContig = parser.createOverEntireContig(curr_contig);
            GenomeLoc thatContig = r.getLocation();

            if ( currentContig.isPast(thatContig) ) continue; // did not reach requested contig yet
            if ( currentContig.isBefore(thatContig) ) {
                it.pushback(r); // next record is on the higher contig, we do not need it yet...
                break;
            }

            // we get here if we are on the requested contig:

            if ( r.getLocation().getStop() < curr_position ) continue; // did not reach the requested interval yet

            if ( r.getLocation().getStart() > curr_query_end ) {
                // past the query interval
                it.pushback(r);
                break;
            }

            // we get here only if interval of the record r overlaps with query interval, so the record should be loaded
            if ( r.getLocation().getStop() > max_position ) max_position = r.getLocation().getStop();
            records.add(r);
        }

        if ( records.size() > 0 ) {
            return new RODRecordListImpl(name,records,interval);
        } else {
            return null;
        }

    }

    /**
     * Removes records that end before the curr_position from the list of currently kept records. This is a
     * convenience (private) shortcut that does not perform extensive checking. In particular, it assumes that
     * curr_position <= max_position, as well as that we are still on the same contig.
     */
    private void purgeOutOfScopeRecords() {
        Iterator<GATKFeature> i = records.iterator();
        while ( i.hasNext() ) {
            GATKFeature r = i.next();
            if ( r.getLocation().getStop() < curr_position ) {
                i.remove(); // we moved past the end of interval the record r is associated with, purge the record forever
            }
        }

    }

    @Override
    public void close() {
        if (this.it != null) ((CloseableIterator)this.it.getUnderlyingIterator()).close();
    }

}
TOP

Related Classes of org.broadinstitute.gatk.engine.refdata.SeekableRODIterator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.