Package picard.analysis.directed

Source Code of picard.analysis.directed.RnaSeqMetricsCollector

package picard.analysis.directed;

import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CoordMath;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.OverlapDetector;
import htsjdk.samtools.util.SequenceUtil;
import picard.PicardException;
import picard.analysis.MetricAccumulationLevel;
import picard.analysis.RnaSeqMetrics;
import picard.annotation.Gene;
import picard.annotation.LocusFunction;
import picard.metrics.PerUnitMetricCollector;
import picard.metrics.SAMRecordMultiLevelCollector;
import picard.util.MathUtil;

import java.io.File;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class RnaSeqMetricsCollector extends SAMRecordMultiLevelCollector<RnaSeqMetrics, Integer> {
    public enum StrandSpecificity {NONE, FIRST_READ_TRANSCRIPTION_STRAND, SECOND_READ_TRANSCRIPTION_STRAND}

    private final int minimumLength;
    private final StrandSpecificity strandSpecificity;
    private final double rrnaFragmentPercentage;
    private final Long ribosomalInitialValue;

    final private Set<Integer> ignoredSequenceIndices;

    private final OverlapDetector<Gene> geneOverlapDetector;
    private final OverlapDetector<Interval> ribosomalSequenceOverlapDetector;
    private final boolean collectCoverageStatistics;
   
    public RnaSeqMetricsCollector(final Set<MetricAccumulationLevel> accumulationLevels, final List<SAMReadGroupRecord> samRgRecords,
                                  final Long ribosomalBasesInitialValue, OverlapDetector<Gene> geneOverlapDetector, OverlapDetector<Interval> ribosomalSequenceOverlapDetector,
                                  final HashSet<Integer> ignoredSequenceIndices, final int minimumLength, final StrandSpecificity strandSpecificity,
                                  final double rrnaFragmentPercentage, boolean collectCoverageStatistics) {
        this.ribosomalInitialValue  = ribosomalBasesInitialValue;
        this.ignoredSequenceIndices = ignoredSequenceIndices;
        this.geneOverlapDetector    = geneOverlapDetector;
        this.ribosomalSequenceOverlapDetector = ribosomalSequenceOverlapDetector;
        this.minimumLength          = minimumLength;
        this.strandSpecificity      = strandSpecificity;
        this.rrnaFragmentPercentage = rrnaFragmentPercentage;
        this.collectCoverageStatistics = collectCoverageStatistics;
        setup(accumulationLevels, samRgRecords);
    }

    @Override
    protected PerUnitMetricCollector<RnaSeqMetrics, Integer, SAMRecord> makeChildCollector(final String sample, final String library, final String readGroup) {
        return new PerUnitRnaSeqMetricsCollector(sample, library, readGroup, ribosomalInitialValue);
    }

    public static OverlapDetector<Interval> makeOverlapDetector(final File samFile, final SAMFileHeader header, final File ribosomalIntervalsFile) {

        OverlapDetector<Interval> ribosomalSequenceOverlapDetector = new OverlapDetector<Interval>(0, 0);
        if (ribosomalIntervalsFile != null) {

            final IntervalList ribosomalIntervals = IntervalList.fromFile(ribosomalIntervalsFile);
            try {
                SequenceUtil.assertSequenceDictionariesEqual(header.getSequenceDictionary(), ribosomalIntervals.getHeader().getSequenceDictionary());
            } catch (SequenceUtil.SequenceListsDifferException e) {
                throw new PicardException("Sequence dictionaries differ in " + samFile.getAbsolutePath() + " and " + ribosomalIntervalsFile.getAbsolutePath(),
                        e);
            }
            ribosomalIntervals.unique();
            final List<Interval> intervals = ribosomalIntervals.getIntervals();
            ribosomalSequenceOverlapDetector.addAll(intervals, intervals);
        }
        return ribosomalSequenceOverlapDetector;
    }

    public static HashSet<Integer> makeIgnoredSequenceIndicesSet(final SAMFileHeader header, final Set<String> ignoredSequence) {
        final HashSet<Integer> ignoredSequenceIndices = new HashSet<Integer>();
        for (final String sequenceName: ignoredSequence) {
            final SAMSequenceRecord sequenceRecord = header.getSequence(sequenceName);
            if (sequenceRecord == null) {
                throw new PicardException("Unrecognized sequence " + sequenceName + " passed as argument to IGNORE_SEQUENCE");
            }
            ignoredSequenceIndices.add(sequenceRecord.getSequenceIndex());
        }
        return ignoredSequenceIndices;
    }

    private class PerUnitRnaSeqMetricsCollector implements PerUnitMetricCollector<RnaSeqMetrics, Integer, SAMRecord> {

        final RnaSeqMetrics metrics = new RnaSeqMetrics();
       
        private final Map<Gene.Transcript, int[]> coverageByTranscript = new HashMap<Gene.Transcript, int[]>();

        public PerUnitRnaSeqMetricsCollector(final String sample,
                                             final String library,
                                             final String readGroup,
                                             final Long ribosomalBasesInitialValue) {
            this.metrics.SAMPLE = sample;
            this.metrics.LIBRARY = library;
            this.metrics.READ_GROUP = readGroup;
            this.metrics.RIBOSOMAL_BASES = ribosomalBasesInitialValue;
           
        }

        public void acceptRecord(SAMRecord rec) {
            // Filter out some reads, and collect the total number of PF bases
            if (rec.getReadFailsVendorQualityCheckFlag() || rec.isSecondaryOrSupplementary()) return;

            this.metrics.PF_BASES += rec.getReadLength();
            if (rec.getReadUnmappedFlag()) return;

            if (ignoredSequenceIndices.contains(rec.getReferenceIndex())) {
                ++this.metrics.IGNORED_READS;
                return;
            }

            // Grab information about the alignment and overlapping genes etc.
            final Interval readInterval = new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd());

            // Attempt to get an interval for the entire fragment (if paired read) else just use the read itself.
            // If paired read is chimeric or has one end unmapped, don't create an interval.
            final Interval fragmentInterval;
            if (!rec.getReadPairedFlag()) {
                fragmentInterval = readInterval;
            } else if (rec.getMateUnmappedFlag() || rec.getReferenceIndex() != rec.getMateReferenceIndex()) {
                fragmentInterval = null;
            } else {
                final int fragmentStart = Math.min(rec.getAlignmentStart(), rec.getMateAlignmentStart());
                final int fragmentEnd = CoordMath.getEnd(fragmentStart, Math.abs(rec.getInferredInsertSize()));
                fragmentInterval = new Interval(rec.getReferenceName(), fragmentStart, fragmentEnd);
            }
            if (fragmentInterval != null) {
                final Collection<Interval> overlappingRibosomalIntervals = ribosomalSequenceOverlapDetector.getOverlaps(fragmentInterval);
                int intersectionLength = 0;
                for (final Interval overlappingInterval : overlappingRibosomalIntervals) {
                    final int thisIntersectionLength = overlappingInterval.getIntersectionLength(fragmentInterval);
                    intersectionLength = Math.max(intersectionLength, thisIntersectionLength);
                }
                if (intersectionLength/(double)fragmentInterval.length() >= rrnaFragmentPercentage) {
                    // Assume entire read is ribosomal.
                    // TODO: Should count reads, not bases?
                    metrics.RIBOSOMAL_BASES += rec.getReadLength();
                    int numAlignedBases = 0;
                    for (final AlignmentBlock alignmentBlock : rec.getAlignmentBlocks()) {
                        numAlignedBases += alignmentBlock.getLength();
                    }
                    metrics.PF_ALIGNED_BASES += numAlignedBases;
                    return;
                }
            }

            final Collection<Gene> overlappingGenes                  = geneOverlapDetector.getOverlaps(readInterval);
            final List<AlignmentBlock> alignmentBlocks               = rec.getAlignmentBlocks();
            boolean overlapsExon = false;

            for (final AlignmentBlock alignmentBlock : alignmentBlocks) {
                // Get functional class for each position in the alignment block.
                final LocusFunction[] locusFunctions = new LocusFunction[alignmentBlock.getLength()];

                // By default, if base does not overlap with rRNA or gene, it is intergenic.
                Arrays.fill(locusFunctions, 0, locusFunctions.length, LocusFunction.INTERGENIC);

                for (final Gene gene : overlappingGenes) {
                    for (final Gene.Transcript transcript : gene) {
                        transcript.assignLocusFunctionForRange(alignmentBlock.getReferenceStart(), locusFunctions);
                        // if you want to gather coverage statistics, this variable should be true.
                        // added for cases with many units [samples/read groups] which overwhelm memory.               
                        // Add coverage to our coverage counter for this transcript
                        if (collectCoverageStatistics) {
                            int[] coverage = this.coverageByTranscript.get(transcript);
                            if (coverage == null) {
                                coverage = new int[transcript.length()];
                                this.coverageByTranscript.put(transcript, coverage);
                            }
                            transcript.addCoverageCounts(alignmentBlock.getReferenceStart(),
                                    CoordMath.getEnd(alignmentBlock.getReferenceStart(), alignmentBlock.getLength()),
                                    coverage);
                        }

                    }
                }

                // Tally the function of each base in the alignment block.
                for (final LocusFunction locusFunction : locusFunctions) {
                    ++metrics.PF_ALIGNED_BASES;
                    switch (locusFunction) {
                        case INTERGENIC:
                            ++metrics.INTERGENIC_BASES;
                            break;
                        case INTRONIC:
                            ++metrics.INTRONIC_BASES;
                            break;
                        case UTR:
                            ++metrics.UTR_BASES;
                            overlapsExon = true;
                            break;
                        case CODING:
                            ++metrics.CODING_BASES;
                            overlapsExon = true;
                            break;
                        case RIBOSOMAL:
                            ++metrics.RIBOSOMAL_BASES;
                            break;
                    }
                }
            }

            // Strand-specificity is tallied on read basis rather than base at a time.  A read that aligns to more than one
            // gene is not counted.
            if (overlapsExon && strandSpecificity != StrandSpecificity.NONE && overlappingGenes.size() == 1) {
                final boolean negativeTranscriptionStrand = overlappingGenes.iterator().next().isNegativeStrand();
                final boolean negativeReadStrand = rec.getReadNegativeStrandFlag();
                final boolean readAndTranscriptStrandsAgree = negativeReadStrand == negativeTranscriptionStrand;
                final boolean readOneOrUnpaired = !rec.getReadPairedFlag() || rec.getFirstOfPairFlag();
                final boolean firstReadExpectedToAgree = strandSpecificity == StrandSpecificity.FIRST_READ_TRANSCRIPTION_STRAND;
                final boolean thisReadExpectedToAgree = readOneOrUnpaired == firstReadExpectedToAgree;
                // If the read strand is the same as the strand of the transcript, and the end is the one that is supposed to agree,
                // then the strand specificity for this read is correct.
                // -- OR --
                // If the read strand is not the same as the strand of the transcript, and the end is not the one that is supposed
                // to agree, then the strand specificity for this read is correct.
                if (readAndTranscriptStrandsAgree == thisReadExpectedToAgree) {
                    ++metrics.CORRECT_STRAND_READS;
                } else {
                    ++metrics.INCORRECT_STRAND_READS;
                }
            }

        }

        public void finish() {
            if (metrics.PF_ALIGNED_BASES > 0) {
                if (metrics.RIBOSOMAL_BASES != null) {
                    metrics.PCT_RIBOSOMAL_BASES =  metrics.RIBOSOMAL_BASES  / (double) metrics.PF_ALIGNED_BASES;
                }
                metrics.PCT_CODING_BASES =     metrics.CODING_BASES     / (double) metrics.PF_ALIGNED_BASES;
                metrics.PCT_UTR_BASES =        metrics.UTR_BASES        / (double) metrics.PF_ALIGNED_BASES;
                metrics.PCT_INTRONIC_BASES =   metrics.INTRONIC_BASES   / (double) metrics.PF_ALIGNED_BASES;
                metrics.PCT_INTERGENIC_BASES = metrics.INTERGENIC_BASES / (double) metrics.PF_ALIGNED_BASES;
                metrics.PCT_MRNA_BASES =       metrics.PCT_CODING_BASES + metrics.PCT_UTR_BASES;
                metrics.PCT_USABLE_BASES =     (metrics.CODING_BASES + metrics.UTR_BASES) / (double) metrics.PF_BASES;
            }

            if (metrics.CORRECT_STRAND_READS > 0 || metrics.INCORRECT_STRAND_READS > 0) {
                metrics.PCT_CORRECT_STRAND_READS = metrics.CORRECT_STRAND_READS/(double)(metrics.CORRECT_STRAND_READS + metrics.INCORRECT_STRAND_READS);
            }
        }

        @Override
        public void addMetricsToFile(final MetricsFile<RnaSeqMetrics, Integer> file) {
            // Compute metrics based on coverage of top 1000 genes
            final Histogram<Integer> normalizedCovByPos = computeCoverageMetrics();
            file.addMetric(metrics);
            file.addHistogram(normalizedCovByPos);
        }

        /**
         * Computes a set of coverage based metrics on the mostly highly expressed genes' most highly
         * expressed transcripts.
         */
        private Histogram<Integer> computeCoverageMetrics() {
            final Histogram<Double> cvs = new Histogram<Double>();
            final Histogram<Double> fivePrimeSkews = new Histogram<Double>();
            final Histogram<Double> threePrimeSkews = new Histogram<Double>();
            final Histogram<Double> gapBasesPerKb = new Histogram<Double>();
            final Histogram<Double> fiveToThreeSkews = new Histogram<Double>();
            String prefix = null;
            if (this.metrics.READ_GROUP != null) {
                prefix = this.metrics.READ_GROUP + ".";
            }
            else if (this.metrics.LIBRARY != null) {
                prefix = this.metrics.LIBRARY + ".";
            }
            else if (this.metrics.SAMPLE != null) {
                prefix = this.metrics.SAMPLE + ".";
            }
            else {
                prefix = "All_Reads.";
            }

            final Histogram<Integer> normalizedCoverageByNormalizedPosition = new Histogram<Integer>("normalized_position", prefix + "normalized_coverage");

            final Map<Gene.Transcript,int[]> transcripts = pickTranscripts(coverageByTranscript);
            final double transcriptCount = transcripts.size();

            for (final Map.Entry<Gene.Transcript,int[]> entry : transcripts.entrySet()) {
                final Gene.Transcript tx = entry.getKey();
                final double[] coverage;
                {
                    final double[] tmp = MathUtil.promote(entry.getValue());
                    if (tx.getGene().isPositiveStrand())  coverage = tmp;
                    else coverage = copyAndReverse(tmp);
                }
                final double mean = MathUtil.mean(coverage, 0, coverage.length);

                // Calculate the CV of coverage for this tx
                final double stdev = MathUtil.stddev(coverage, 0, coverage.length, mean);
                final double cv    = stdev / mean;
                cvs.increment(cv);

                // Calculate the 5' and 3' biases
                {
                    final int PRIME_BASES = 100;
                    final double fivePrimeCoverage = MathUtil.mean(coverage, 0, PRIME_BASES);
                    final double threePrimeCoverage = MathUtil.mean(coverage, coverage.length - PRIME_BASES, coverage.length);

                    fivePrimeSkews.increment(fivePrimeCoverage / mean);
                    threePrimeSkews.increment(threePrimeCoverage / mean);
                    fiveToThreeSkews.increment(fivePrimeCoverage / threePrimeCoverage);
                }

                // Calculate normalized coverage vs. normalized position
                {
                    final int lastIndex = coverage.length - 1;

                    for (int percent=0; percent<=100; ++percent) {
                        final double p = percent / 100d;
                        final int start  = (int) Math.max(0,         lastIndex * (p-0.005));
                        final int end    = (int) Math.min(lastIndex, lastIndex * (p+0.005));
                        final int length = end - start + 1;

                        double sum = 0;
                        for (int i=start; i<=end; ++i) sum += coverage[i];
                        final double normalized = (sum / length) / mean;
                        normalizedCoverageByNormalizedPosition.increment(percent, normalized / transcriptCount);
                    }
                }

                // Calculate gap bases per kilobase
                //            {
                //                int gapBases = 0;
                //                final double minCoverage = mean * 0.1;
                //                for (int i=0; i<coverage.length; ++i) {
                //                    if (coverage[i] < minCoverage) ++gapBases;
                //                }
                //                gapBasesPerKb.increment(gapBases / (coverage.length / 1000d));
                //            }
            }

            this.metrics.MEDIAN_CV_COVERAGE = cvs.getMedian();
            this.metrics.MEDIAN_5PRIME_BIAS = fivePrimeSkews.getMedian();
            this.metrics.MEDIAN_3PRIME_BIAS = threePrimeSkews.getMedian();
            this.metrics.MEDIAN_5PRIME_TO_3PRIME_BIAS = fiveToThreeSkews.getMedian();

            return normalizedCoverageByNormalizedPosition;
        }

        /** Little method to copy an array and reverse it at the same time. */
        private double[] copyAndReverse(final double[] in) {
            final double[] out = new double[in.length];
            for (int i=0, j=in.length-1; i<in.length; ++i, --j) out[j] = in[i];
            return out;
        }

        /** Picks the set of transcripts on which the coverage metrics are to be calculated. */
        public Map<Gene.Transcript, int[]> pickTranscripts(final Map<Gene.Transcript, int[]> transcriptCoverage) {
            final Map<Gene.Transcript, Double> bestPerGene = new HashMap<Gene.Transcript, Double>();

            // Make a map of the best transcript per gene to it's mean coverage
            for (final Gene gene : geneOverlapDetector.getAll()) {
                Gene.Transcript best = null;
                double bestMean = 0;

                for (final Gene.Transcript tx : gene) {
                    final int[] cov = transcriptCoverage.get(tx);

                    if (tx.length() < Math.max(minimumLength, 100)) continue;
                    if (cov == null) continue;

                    final double mean = MathUtil.mean(MathUtil.promote(cov), 0, cov.length);
                    if (mean < 1d) continue;
                    if (best == null || mean > bestMean) {
                        best = tx;
                        bestMean = mean;
                    }
                }

                if (best != null) bestPerGene.put(best, bestMean);
            }

            // Find the 1000th best coverage value
            final double[] coverages = new double[bestPerGene.size()];
            int i=0;
            for (final double d : bestPerGene.values()) coverages[i++] = d;
            Arrays.sort(coverages);
            final double min = coverages.length == 0 ? 0 : coverages[Math.max(0, coverages.length - 1001)];

            // And finally build the output map
            final Map<Gene.Transcript, int[]> retval = new HashMap<Gene.Transcript, int[]>();
            for (final Map.Entry<Gene.Transcript,Double> entry : bestPerGene.entrySet()) {
                final Gene.Transcript tx = entry.getKey();
                final double coverage = entry.getValue();

                if (coverage >= min) {
                    retval.put(tx, transcriptCoverage.get(tx));
                }
            }

            return retval;
        }

    }
}
TOP

Related Classes of picard.analysis.directed.RnaSeqMetricsCollector

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.