Source Code of org.broadinstitute.gatk.utils.codecs.refseq.RefSeqCodec

/*
* Copyright (c) 2012 The Broad Institute
* 
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
* 
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* 
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/


package org.broadinstitute.gatk.utils.codecs.refseq;


import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.Feature;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.readers.LineIterator;
import org.broadinstitute.gatk.engine.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.exceptions.UserException;


import java.util.ArrayList;


/**
 * Allows for reading in RefSeq information
 *
 * <p>
 * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop,
 * strandedness of transcription. 
 * </p>
 *
 * <p>
 * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the documentation guide here
 * <a href="http://www.broadinstitute.org/gatk/guide/article?id=1329">http://www.broadinstitute.org/gatk/guide/article?id=1329</a>
 * </p>
 * <h2> Usage </h2>
 * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example
 * <pre>
 * -refSeqBinding:REFSEQ /path/to/refSeq.txt
 * </pre>
 *
 * You will need to consult individual walkers for the binding name ("refSeqBinding", above)
 *
 * <h2>File format example</h2>
 * If you want to define your own file for use, the format is (tab delimited):
 * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete)
 * and exon frames, for example:
 * <pre>
 * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
 * </pre>
 * for more information see <a href="http://skip.ucsc.edu/cgi-bin/hgTables?hgsid=5651&hgta_doSchemaDb=mm8&hgta_doSchemaTable=refGene">here</a>
 * <p>
 *     
 * </p>
 *
 * @author Mark DePristo
 * @since 2010
 */
public class RefSeqCodec extends AsciiFeatureCodec<RefSeqFeature> implements ReferenceDependentFeatureCodec {


    /**
     * The parser to use when resolving genome-wide locations.
     */
    private GenomeLocParser genomeLocParser;
    private boolean zero_coding_length_user_warned = false;


    public RefSeqCodec() {
        super(RefSeqFeature.class);
    }


    /**
     * Set the parser to use when resolving genetic data.
     * @param genomeLocParser The supplied parser.
     */
    @Override
    public void setGenomeLocParser(GenomeLocParser genomeLocParser) {
        this.genomeLocParser =  genomeLocParser;
    }


    @Override
    public Feature decodeLoc(final LineIterator lineIterator) {
        final String line = lineIterator.next();
        if (line.startsWith("#")) return null;
        String fields[] = line.split("\t");
        if (fields.length < 3) throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 3 columns, we saw " + fields.length);
        String contig_name = fields[2];
        try {
            return new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])));
        } catch ( UserException.MalformedGenomeLoc e ) {
            Utils.warnUser("RefSeq file is potentially incorrect, as some transcripts or exons have a negative length ("+fields[2]+")");
            return null;
        } catch ( NumberFormatException e ) {
            throw new UserException.MalformedFile("Could not parse location from line: " + line);
        }
    }


    /** Fills this object from a text line in RefSeq (UCSC) text dump file */
    @Override
    public RefSeqFeature decode(String line) {
        if (line.startsWith("#")) return null;
        String fields[] = line.split("\t");


        // we reference postion 15 in the split array below, make sure we have at least that many columns
        if (fields.length < 16) throw new TribbleException("RefSeq (decode) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length);
        String contig_name = fields[2];
        RefSeqFeature feature = new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])));


        feature.setTranscript_id(fields[1]);
        if ( fields[3].length()==1 && fields[3].charAt(0)=='+') feature.setStrand(1);
        else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1);
        else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line);


        int coding_start = Integer.parseInt(fields[6])+1;
        int coding_stop = Integer.parseInt(fields[7]);


        if ( coding_start > coding_stop ) {
            if ( ! zero_coding_length_user_warned ) {
                Utils.warnUser("RefSeq file contains transcripts with zero coding length. "+
                        "Such transcripts will be ignored (this warning is printed only once)");
                zero_coding_length_user_warned = true;
            }
            return null;
        }


        feature.setTranscript_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])));
        feature.setTranscript_coding_interval(genomeLocParser.createGenomeLoc(contig_name, coding_start, coding_stop));
        feature.setGene_name(fields[12]);
        String[] exon_starts = fields[9].split(",");
        String[] exon_stops = fields[10].split(",");
        String[] eframes = fields[15].split(",");


        if ( exon_starts.length != exon_stops.length )
            throw new UserException.MalformedFile("Data format error: numbers of exon start and stop positions differ for line=" + line);
        if ( exon_starts.length != eframes.length )
            throw new UserException.MalformedFile("Data format error: numbers of exons and exon frameshifts differ for line=" + line);


        ArrayList<GenomeLoc> exons = new ArrayList<GenomeLoc>(exon_starts.length);
        ArrayList<Integer> exon_frames = new ArrayList<Integer>(eframes.length);


        for ( int i = 0 ; i < exon_starts.length  ; i++ ) {
            exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
            exon_frames.add(Integer.decode(eframes[i]));
        }


        feature.setExons(exons);
        feature.setExon_frames(exon_frames);
        return feature;
    }


    @Override
    public Object readActualHeader(LineIterator lineIterator) {
        // No header for this format
        return null;
    }
}
Source Code of org.broadinstitute.gatk.utils.codecs.refseq.RefSeqCodec

Related Classes of org.broadinstitute.gatk.utils.codecs.refseq.RefSeqCodec