Source Code of picard.analysis.CollectOxoGMetrics$Calculator

package picard.analysis;


import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.programgroups.Metrics;
import picard.util.DbSnpBitSetUtil;
import htsjdk.samtools.util.IntervalList;import htsjdk.samtools.util.ListMap;
import htsjdk.samtools.util.Log;import htsjdk.samtools.util.SamLocusIterator;
import htsjdk.samtools.filter.DuplicateReadFilter;
import htsjdk.samtools.filter.NotPrimaryAlignmentFilter;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.metrics.MetricBase;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;


import java.io.File;
import java.lang.IllegalStateException;
import java.lang.Integer;
import java.lang.Math;
import java.lang.Override;
import java.lang.String;
import java.lang.System;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;


import static java.lang.Math.log10;


/**
 * Class for trying to quantify the CpCG->CpCA error rate.
 */
@CommandLineProgramProperties(
        usage = CollectOxoGMetrics.USAGE,
        usageShort = CollectOxoGMetrics.USAGE,
        programGroup = Metrics.class
)
public class CollectOxoGMetrics extends CommandLineProgram {
    static final String USAGE = "Collects metrics quantifying the CpCG -> CpCA error rate from the provided SAM/BAM";
        
    @Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME,
      doc="Input BAM file for analysis.")
    public File INPUT;


    @Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME,
      doc="Location of output metrics file to write.")
    public File OUTPUT;


    @Option(shortName=StandardOptionDefinitions.REFERENCE_SHORT_NAME,
      doc="Reference sequence to which BAM is aligned.")
    public File REFERENCE_SEQUENCE;


    @Option(doc="An optional list of intervals to restrict analysis to.",
      optional=true)
    public File INTERVALS;
    
    @Option(doc="VCF format dbSNP file, used to exclude regions around known polymorphisms from analysis.",
      optional=true)
    public File DB_SNP;


    @Option(shortName="Q",
      doc="The minimum base quality score for a base to be included in analysis.")
    public int MINIMUM_QUALITY_SCORE = 20;


    @Option(shortName="MQ",
      doc="The minimum mapping quality score for a base to be included in analysis.")
    public int MINIMUM_MAPPING_QUALITY = 30;


    @Option(shortName="MIN_INS",
      doc="The minimum insert size for a read to be included in analysis. Set of 0 to allow unpaired reads.")
    public int MINIMUM_INSERT_SIZE = 60;


    @Option(shortName="MAX_INS",
      doc="The maximum insert size for a read to be included in analysis. Set of 0 to allow unpaired reads.")
    public int MAXIMUM_INSERT_SIZE = 600;


    @Option(doc="When available, use original quality scores for filtering.")
    public boolean USE_OQ = true;


    @Option(doc="The number of context bases to include on each side of the assayed G/C base.")
    public int CONTEXT_SIZE = 1;


    @Option(doc="The optional set of sequence contexts to restrict analysis to. If not supplied all contexts are analyzed.")
    public Set<String> CONTEXTS = new HashSet<String>();


    @Option(doc="For debugging purposes: stop after visiting this many sites with at least 1X coverage.")
    public int STOP_AFTER = Integer.MAX_VALUE;


    private final Log log = Log.getInstance(CollectOxoGMetrics.class);
    private static final String UNKNOWN_LIBRARY = "UnknownLibrary";
    private static final String UNKNOWN_SAMPLE = "UnknownSample";


    /** Metrics class for outputs. */
    public static final class CpcgMetrics extends MetricBase {
        /** The name of the sample being assayed. */
        public String SAMPLE_ALIAS;
        /** The name of the library being assayed. */
        public String LIBRARY;
        /** The sequence context being reported on. */
        public String CONTEXT;
        /** The total number of sites that had at least one base covering them. */
        public int TOTAL_SITES;
        /** The total number of basecalls observed at all sites. */
        public long TOTAL_BASES;
        /** The number of reference alleles observed as C in read 1 and G in read 2. */
        public long REF_NONOXO_BASES;
        /** The number of reference alleles observed as G in read 1 and C in read 2. */
        public long REF_OXO_BASES;
    /** The total number of reference alleles observed */
        public long REF_TOTAL_BASES;
    /**
     * The count of observed A basecalls at C reference positions and T basecalls
     * at G reference bases that are correlated to instrument read number in a way
     * that rules out oxidation as the cause
     */
        public long ALT_NONOXO_BASES;
    /**
     * The count of observed A basecalls at C reference positions and T basecalls
     * at G reference bases that are correlated to instrument read number in a way
     * that is consistent with oxidative damage.
     */
        public long ALT_OXO_BASES;
    /** The oxo error rate, calculated as max(ALT_OXO_BASES - ALT_NONOXO_BASES, 1) / TOTAL_BASES */
        public double OXIDATION_ERROR_RATE;
    /** -10 * log10(OXIDATION_ERROR_RATE) */
        public double OXIDATION_Q;


        // Fields below this point are metrics that are calculated to see if there is oxidative damage that is
        // biased toward the reference base - i.e. occurs more where the reference base is a C vs. a G or vice
        // versa.


        /** The number of ref basecalls observed at sites where the genome reference == C. */
        public long C_REF_REF_BASES;
        /** The number of ref basecalls observed at sites where the genome reference == G. */
        public long G_REF_REF_BASES;
        /** The number of alt (A/T) basecalls observed at sites where the genome reference == C. */
        public long C_REF_ALT_BASES;
        /** The number of alt (A/T) basecalls observed at sites where the genome reference == G. */
        public long G_REF_ALT_BASES;


        /**
         * The rate at which C>A and G>T substitutions are observed at C reference sites above the expected rate if there
         * were no bias between sites with a C reference base vs. a G reference base.
         */
        public double C_REF_OXO_ERROR_RATE;
        /**  C_REF_OXO_ERROR_RATE expressed as a phred-scaled quality score. */
        public double C_REF_OXO_Q;
        /**
         * The rate at which C>A and G>T substitutions are observed at G reference sites above the expected rate if there
         * were no bias between sites with a C reference base vs. a G reference base.
         */
        public double G_REF_OXO_ERROR_RATE;
        /**  G_REF_OXO_ERROR_RATE expressed as a phred-scaled quality score. */
        public double G_REF_OXO_Q;
    }


    /**
     * SAM filter for insert size range.
     */
    static class InsertSizeFilter implements SamRecordFilter {
        final int minInsertSize;
        final int maxInsertSize;


        InsertSizeFilter(final int minInsertSize, final int maxInsertSize) {
            this.minInsertSize = minInsertSize;
            this.maxInsertSize = maxInsertSize;
        }


        @Override public boolean filterOut(final SAMRecord rec) {
            // Treat both parameters == 0 as not filtering
            if (minInsertSize == 0 && maxInsertSize == 0) return false;


            if (rec.getReadPairedFlag()) {
                final int ins = Math.abs(rec.getInferredInsertSize());
                return ins < minInsertSize || ins > maxInsertSize;
            }


            // If the read isn't paired and either min or max is specified filter it out
            return minInsertSize != 0 || maxInsertSize != 0;
        }


        @Override public boolean filterOut(final SAMRecord r1, final SAMRecord r2) {
            return filterOut(r1) || filterOut(r2);
        }
    }


    // Stock main method
    public static void main(final String[] args) {
        new CollectOxoGMetrics().instanceMainWithExit(args);
    }


    @Override
    protected String[] customCommandLineValidation() {
        final int size = 1 + 2*CONTEXT_SIZE;
        final List<String> messages = new ArrayList<String>();
        
        for (final String ctx : CONTEXTS) {
            if (ctx.length() != size) {
                messages.add("Context " + ctx + " is not " + size + " long as implied by CONTEXT_SIZE=" + CONTEXT_SIZE);
            }
            else if (ctx.charAt(ctx.length() / 2) != 'C') {
                messages.add("Middle base of context sequence " + ctx + " must be C");
            }
        }


        return messages.isEmpty() ? null : messages.toArray(new String[messages.size()]);
    }


    /** Mimic of Oracle's nvl() - returns the first value if not null, otherwise the second value. */
    private final <T> T nvl(final T value1, final T value2) {
        if (value1 != null) return value1;
        else return value2;
    }


    @Override
    protected int doWork() {
        IOUtil.assertFileIsReadable(INPUT);
        IOUtil.assertFileIsWritable(OUTPUT);
        if (INTERVALS != null) IOUtil.assertFileIsReadable(INTERVALS);
        IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
        
        final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
        final SAMFileReader in = new SAMFileReader(INPUT);
        
        final Set<String> samples   = new HashSet<String>();
        final Set<String> libraries = new HashSet<String>();
        for (final SAMReadGroupRecord rec : in.getFileHeader().getReadGroups()) {
            samples.add(nvl(rec.getSample(), UNKNOWN_SAMPLE));
            libraries.add(nvl(rec.getLibrary(), UNKNOWN_LIBRARY));
        }
        
        // Setup the calculators
        final Set<String> contexts = CONTEXTS.isEmpty() ? makeContextStrings(CONTEXT_SIZE) : CONTEXTS;
        final ListMap<String, Calculator> calculators = new ListMap<String, Calculator>();
        for (final String context : contexts) {
            for (final String library : libraries) {
                calculators.add(context, new Calculator(library, context));
            }
        }
        
        // Load up dbSNP if available
        log.info("Loading dbSNP File: " + DB_SNP);
        final DbSnpBitSetUtil dbSnp;
        if (DB_SNP != null) dbSnp = new DbSnpBitSetUtil(DB_SNP, in.getFileHeader().getSequenceDictionary());
        else dbSnp = null;


        // Make an iterator that will filter out funny looking things
        final SamLocusIterator iterator;
        if (INTERVALS != null) {
            final IntervalList intervals = IntervalList.fromFile(INTERVALS);
            intervals.unique();
            iterator = new SamLocusIterator(in, intervals, false);
        }
        else {
            iterator = new SamLocusIterator(in);
        }
        iterator.setEmitUncoveredLoci(false);
        iterator.setMappingQualityScoreCutoff(MINIMUM_MAPPING_QUALITY);
        iterator.setSamFilters(Arrays.asList(
                new NotPrimaryAlignmentFilter(),
                new DuplicateReadFilter(),
                new InsertSizeFilter(MINIMUM_INSERT_SIZE, MAXIMUM_INSERT_SIZE)
        ));


        log.info("Starting iteration.");
        long nextLogTime = 0;
        int sites = 0;
        
        for (final SamLocusIterator.LocusInfo info : iterator) {
            // Skip dbSNP sites
            final String chrom = info.getSequenceName();
            final int pos      = info.getPosition();
            final int index    = pos-1;
            if (dbSnp != null && dbSnp.isDbSnpSite(chrom, pos)) continue;
            
            // Skip sites at the end of chromosomes 
            final byte[] bases = refWalker.get(info.getSequenceIndex()).getBases();
            if (pos < 3 || pos > bases.length-3) continue;


            // Skip non C-G bases
            final byte base = StringUtil.toUpperCase(bases[index]);
            if (base != 'C' && base != 'G') continue;
            
            // Get the context string
            final String context;
            {
                final String tmp = StringUtil.bytesToString(bases, index-CONTEXT_SIZE, 1 + (2*CONTEXT_SIZE)).toUpperCase();
                if (base == 'C') context = tmp;
                else /* if G */  context = SequenceUtil.reverseComplement(tmp);
            }


            final List<Calculator> calculatorsForContext = calculators.get(context);
            if (calculatorsForContext == null) continue; // happens if we get ambiguous bases in the reference
            for (final Calculator calc : calculatorsForContext) calc.accept(info, base);


            // See if we need to stop
            if (++sites % 100 == 0) {
                final long now = System.currentTimeMillis();
                if (now > nextLogTime) {
                    log.info("Visited " + sites + " sites of interest. Last site: " + chrom + ":" + pos);
                    nextLogTime = now + 60000;
                }
            }
            if (sites >= STOP_AFTER) break;
        }


        final MetricsFile<CpcgMetrics, Integer> file = getMetricsFile();
        for (final List<Calculator> calcs : calculators.values()) {
            for (final Calculator calc : calcs) {
                final CpcgMetrics m = calc.finish();
                m.SAMPLE_ALIAS = StringUtil.join(",", new ArrayList<String>(samples));
                file.addMetric(m);
            }
        }


        file.write(OUTPUT);
        return 0;
    }


    private Set<String> makeContextStrings(final int contextSize) {
        final Set<String> contexts = new HashSet<String>();


        for (final byte[] kmer : generateAllKmers(2*contextSize + 1)) {
            if (kmer[contextSize] == 'C') {
                contexts.add(StringUtil.bytesToString(kmer));
            }
        }
        
        log.info("Generated " + contexts.size() + " context strings.");
        return contexts;
    }


    /** Generates all possible kmers of length and returns them as byte[]s. */
    private List<byte[]> generateAllKmers(final int length) {
        final List<byte[]> sofar = new LinkedList<byte[]>();
        final byte[] bases = {'A', 'C', 'G', 'T'};


        if (sofar.size() == 0) {
            sofar.add(new byte[length]);
        }


        while (true) {
            final byte[] bs = sofar.remove(0);
            final int indexOfNextBase = findIndexOfNextBase(bs);


            if (indexOfNextBase == -1) {
                sofar.add(bs);
                break;
            }
            else {
                for (final byte b : bases) {
                    final byte[] next = Arrays.copyOf(bs, bs.length);
                    next[indexOfNextBase] = b;
                    sofar.add(next);
                }
            }
        }


        return sofar;
    }


    /** Finds the first non-zero character in the array, or returns -1 if all are non-zero. */
    private int findIndexOfNextBase(final byte[] bs) {
        for (int i=0; i<bs.length; ++i) {
            if (bs[i] == 0) return i;
        }


        return -1;
    }


    /** A little class for counting alleles. */
    private static class Counts {
        int controlA;
        int oxidatedA;
        int controlC;
        int oxidatedC;
        int total() { return controlC + oxidatedC + controlA + oxidatedA; }
    }


    /**
     * Class that calculated CpCG metrics for a specific library.
     */
    private class Calculator {
        private final String library;
        private final String context;


        // Things to be accumulated
        int sites = 0;
        long refCcontrolA  = 0;
        long refCoxidatedA = 0;
        long refCcontrolC  = 0;
        long refCoxidatedC = 0;
        long refGcontrolA  = 0;
        long refGoxidatedA = 0;
        long refGcontrolC  = 0;
        long refGoxidatedC = 0;
        
        Calculator(final String library, final String context) {
            this.library = library;
            this.context = context;
        }
        
        void accept(final SamLocusIterator.LocusInfo info, final byte refBase) {
            final Counts counts = computeAlleleFraction(info, refBase);


            if (counts.total() > 0) {
                // Things calculated on all sites with coverage
                this.sites++;
                if (refBase == 'C') {
                    this.refCcontrolA  += counts.controlA;
                    this.refCoxidatedA += counts.oxidatedA;
                    this.refCcontrolC  += counts.controlC;
                    this.refCoxidatedC += counts.oxidatedC;
                }
                else if (refBase == 'G') {
                    this.refGcontrolA  += counts.controlA;
                    this.refGoxidatedA += counts.oxidatedA;
                    this.refGcontrolC  += counts.controlC;
                    this.refGoxidatedC += counts.oxidatedC;
                }
                else {
                    throw new IllegalStateException("Reference bases other than G and C not supported.");
                }
            }
        }
        
        CpcgMetrics finish() {
            final CpcgMetrics m = new CpcgMetrics();
            m.LIBRARY              = this.library;
            m.CONTEXT              = this.context;
            m.TOTAL_SITES          = this.sites;
            m.TOTAL_BASES          = this.refCcontrolC + this.refCoxidatedC + this.refCcontrolA + this.refCoxidatedA +
                                     this.refGcontrolC + this.refGoxidatedC + this.refGcontrolA + this.refGoxidatedA;
            m.REF_OXO_BASES        = this.refCoxidatedC + refGoxidatedC;
            m.REF_NONOXO_BASES     = this.refCcontrolC + this.refGcontrolC;
            m.REF_TOTAL_BASES      = m.REF_OXO_BASES + m.REF_NONOXO_BASES;
            m.ALT_NONOXO_BASES     = this.refCcontrolA + this.refGcontrolA;
            m.ALT_OXO_BASES        = this.refCoxidatedA + this.refGoxidatedA;


            /**
             * Why do we calculate the oxo error rate using oxidatedA - controlA you ask?  We know that all the
             * bases counted in oxidatedA are consistent with 8-oxo-G damage during shearing, but not all of them
             * will have been caused by this. If we assume that C>A errors caused by other factors will occur randomly
             * with respect to read1/read2, then we should see as many in the 8-oxo-G consistent state as not.  So we
             * assume that controlA is half the story, and remove the other half from oxidatedA.
             */
            m.OXIDATION_ERROR_RATE = Math.max(m.ALT_OXO_BASES - m.ALT_NONOXO_BASES, 1) / (double) m.TOTAL_BASES;
            m.OXIDATION_Q          = -10 * log10(m.OXIDATION_ERROR_RATE);


            /** Now look for things that have a reference base bias! */
            m.C_REF_REF_BASES = this.refCcontrolC + this.refCoxidatedC;
            m.G_REF_REF_BASES = this.refGcontrolC + this.refGoxidatedC;
            m.C_REF_ALT_BASES = this.refCcontrolA + this.refCoxidatedA;
            m.G_REF_ALT_BASES = this.refGcontrolA + this.refGoxidatedA;


            final double cRefErrorRate = m.C_REF_ALT_BASES / (double) (m.C_REF_ALT_BASES + m.C_REF_REF_BASES);
            final double gRefErrorRate = m.G_REF_ALT_BASES / (double) (m.G_REF_ALT_BASES + m.G_REF_REF_BASES);


            m.C_REF_OXO_ERROR_RATE = Math.max(cRefErrorRate - gRefErrorRate, 1e-10);
            m.G_REF_OXO_ERROR_RATE = Math.max(gRefErrorRate - cRefErrorRate, 1e-10);
            m.C_REF_OXO_Q = -10 * log10(m.C_REF_OXO_ERROR_RATE);
            m.G_REF_OXO_Q = -10 * log10(m.G_REF_OXO_ERROR_RATE);


            return m;
        }


        /**
         *
         */
        private Counts computeAlleleFraction(final SamLocusIterator.LocusInfo info, final byte refBase) {
            final Counts counts = new Counts();
            final byte altBase = (refBase == 'C') ? (byte) 'A' : (byte) 'T';
            
            for (final SamLocusIterator.RecordAndOffset rec : info.getRecordAndPositions()) {
                final byte qual;
                final SAMRecord samrec = rec.getRecord();
                
                if (USE_OQ) {
                    final byte[] oqs = samrec.getOriginalBaseQualities();
                    if (oqs != null) qual = oqs[rec.getOffset()];
                    else qual = rec.getBaseQuality();
                }
                else {
                    qual = rec.getBaseQuality();
                }


                // Skip if below qual, or if library isn't a match
                if (qual < MINIMUM_QUALITY_SCORE) continue;
                if (!this.library.equals(nvl(samrec.getReadGroup().getLibrary(), UNKNOWN_LIBRARY))) continue;
                
                // Get the read base, and get it in "as read" orientation
                final byte base = rec.getReadBase();
                final byte baseAsRead = samrec.getReadNegativeStrandFlag() ? SequenceUtil.complement(base) : base;
                final int read        = samrec.getReadPairedFlag() && samrec.getSecondOfPairFlag() ? 2 : 1;


                // Figure out how to count the alternative allele. If the damage is caused by oxidation of G
                // during shearing (in non-rnaseq data), then we know that:
                //     G>T observation is always in read 1
                //     C>A observation is always in read 2
                // But if the substitution is from other causes the distribution of A/T across R1/R2 will be
                // random.
                if (base == refBase) {
                    if      (baseAsRead == 'G' && read == 1) ++counts.oxidatedC;
                    else if (baseAsRead == 'G' && read == 2) ++counts.controlC;
                    else if (baseAsRead == 'C' && read == 1) ++counts.controlC;
                    else if (baseAsRead == 'C' && read == 2) ++counts.oxidatedC;
                }
                else if (base == altBase) {
                    if      (baseAsRead == 'T' && read == 1) ++counts.oxidatedA;
                    else if (baseAsRead == 'T' && read == 2) ++counts.controlA;
                    else if (baseAsRead == 'A' && read == 1) ++counts.controlA;
                    else if (baseAsRead == 'A' && read == 2) ++counts.oxidatedA;
                }
            }


            return counts;
        }
    }
}
Source Code of picard.analysis.CollectOxoGMetrics$Calculator

Related Classes of picard.analysis.CollectOxoGMetrics$Calculator