package picard.analysis;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.programgroups.Metrics;
import picard.util.DbSnpBitSetUtil;
import htsjdk.samtools.util.IntervalList;import htsjdk.samtools.util.ListMap;
import htsjdk.samtools.util.Log;import htsjdk.samtools.util.SamLocusIterator;
import htsjdk.samtools.filter.DuplicateReadFilter;
import htsjdk.samtools.filter.NotPrimaryAlignmentFilter;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.metrics.MetricBase;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import java.io.File;
import java.lang.IllegalStateException;
import java.lang.Integer;
import java.lang.Math;
import java.lang.Override;
import java.lang.String;
import java.lang.System;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import static java.lang.Math.log10;
/**
* Class for trying to quantify the CpCG->CpCA error rate.
*/
@CommandLineProgramProperties(
usage = CollectOxoGMetrics.USAGE,
usageShort = CollectOxoGMetrics.USAGE,
programGroup = Metrics.class
)
public class CollectOxoGMetrics extends CommandLineProgram {
static final String USAGE = "Collects metrics quantifying the CpCG -> CpCA error rate from the provided SAM/BAM";
@Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME,
doc="Input BAM file for analysis.")
public File INPUT;
@Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME,
doc="Location of output metrics file to write.")
public File OUTPUT;
@Option(shortName=StandardOptionDefinitions.REFERENCE_SHORT_NAME,
doc="Reference sequence to which BAM is aligned.")
public File REFERENCE_SEQUENCE;
@Option(doc="An optional list of intervals to restrict analysis to.",
optional=true)
public File INTERVALS;
@Option(doc="VCF format dbSNP file, used to exclude regions around known polymorphisms from analysis.",
optional=true)
public File DB_SNP;
@Option(shortName="Q",
doc="The minimum base quality score for a base to be included in analysis.")
public int MINIMUM_QUALITY_SCORE = 20;
@Option(shortName="MQ",
doc="The minimum mapping quality score for a base to be included in analysis.")
public int MINIMUM_MAPPING_QUALITY = 30;
@Option(shortName="MIN_INS",
doc="The minimum insert size for a read to be included in analysis. Set of 0 to allow unpaired reads.")
public int MINIMUM_INSERT_SIZE = 60;
@Option(shortName="MAX_INS",
doc="The maximum insert size for a read to be included in analysis. Set of 0 to allow unpaired reads.")
public int MAXIMUM_INSERT_SIZE = 600;
@Option(doc="When available, use original quality scores for filtering.")
public boolean USE_OQ = true;
@Option(doc="The number of context bases to include on each side of the assayed G/C base.")
public int CONTEXT_SIZE = 1;
@Option(doc="The optional set of sequence contexts to restrict analysis to. If not supplied all contexts are analyzed.")
public Set<String> CONTEXTS = new HashSet<String>();
@Option(doc="For debugging purposes: stop after visiting this many sites with at least 1X coverage.")
public int STOP_AFTER = Integer.MAX_VALUE;
private final Log log = Log.getInstance(CollectOxoGMetrics.class);
private static final String UNKNOWN_LIBRARY = "UnknownLibrary";
private static final String UNKNOWN_SAMPLE = "UnknownSample";
/** Metrics class for outputs. */
public static final class CpcgMetrics extends MetricBase {
/** The name of the sample being assayed. */
public String SAMPLE_ALIAS;
/** The name of the library being assayed. */
public String LIBRARY;
/** The sequence context being reported on. */
public String CONTEXT;
/** The total number of sites that had at least one base covering them. */
public int TOTAL_SITES;
/** The total number of basecalls observed at all sites. */
public long TOTAL_BASES;
/** The number of reference alleles observed as C in read 1 and G in read 2. */
public long REF_NONOXO_BASES;
/** The number of reference alleles observed as G in read 1 and C in read 2. */
public long REF_OXO_BASES;
/** The total number of reference alleles observed */
public long REF_TOTAL_BASES;
/**
* The count of observed A basecalls at C reference positions and T basecalls
* at G reference bases that are correlated to instrument read number in a way
* that rules out oxidation as the cause
*/
public long ALT_NONOXO_BASES;
/**
* The count of observed A basecalls at C reference positions and T basecalls
* at G reference bases that are correlated to instrument read number in a way
* that is consistent with oxidative damage.
*/
public long ALT_OXO_BASES;
/** The oxo error rate, calculated as max(ALT_OXO_BASES - ALT_NONOXO_BASES, 1) / TOTAL_BASES */
public double OXIDATION_ERROR_RATE;
/** -10 * log10(OXIDATION_ERROR_RATE) */
public double OXIDATION_Q;
// Fields below this point are metrics that are calculated to see if there is oxidative damage that is
// biased toward the reference base - i.e. occurs more where the reference base is a C vs. a G or vice
// versa.
/** The number of ref basecalls observed at sites where the genome reference == C. */
public long C_REF_REF_BASES;
/** The number of ref basecalls observed at sites where the genome reference == G. */
public long G_REF_REF_BASES;
/** The number of alt (A/T) basecalls observed at sites where the genome reference == C. */
public long C_REF_ALT_BASES;
/** The number of alt (A/T) basecalls observed at sites where the genome reference == G. */
public long G_REF_ALT_BASES;
/**
* The rate at which C>A and G>T substitutions are observed at C reference sites above the expected rate if there
* were no bias between sites with a C reference base vs. a G reference base.
*/
public double C_REF_OXO_ERROR_RATE;
/** C_REF_OXO_ERROR_RATE expressed as a phred-scaled quality score. */
public double C_REF_OXO_Q;
/**
* The rate at which C>A and G>T substitutions are observed at G reference sites above the expected rate if there
* were no bias between sites with a C reference base vs. a G reference base.
*/
public double G_REF_OXO_ERROR_RATE;
/** G_REF_OXO_ERROR_RATE expressed as a phred-scaled quality score. */
public double G_REF_OXO_Q;
}
/**
* SAM filter for insert size range.
*/
static class InsertSizeFilter implements SamRecordFilter {
final int minInsertSize;
final int maxInsertSize;
InsertSizeFilter(final int minInsertSize, final int maxInsertSize) {
this.minInsertSize = minInsertSize;
this.maxInsertSize = maxInsertSize;
}
@Override public boolean filterOut(final SAMRecord rec) {
// Treat both parameters == 0 as not filtering
if (minInsertSize == 0 && maxInsertSize == 0) return false;
if (rec.getReadPairedFlag()) {
final int ins = Math.abs(rec.getInferredInsertSize());
return ins < minInsertSize || ins > maxInsertSize;
}
// If the read isn't paired and either min or max is specified filter it out
return minInsertSize != 0 || maxInsertSize != 0;
}
@Override public boolean filterOut(final SAMRecord r1, final SAMRecord r2) {
return filterOut(r1) || filterOut(r2);
}
}
// Stock main method
public static void main(final String[] args) {
new CollectOxoGMetrics().instanceMainWithExit(args);
}
@Override
protected String[] customCommandLineValidation() {
final int size = 1 + 2*CONTEXT_SIZE;
final List<String> messages = new ArrayList<String>();
for (final String ctx : CONTEXTS) {
if (ctx.length() != size) {
messages.add("Context " + ctx + " is not " + size + " long as implied by CONTEXT_SIZE=" + CONTEXT_SIZE);
}
else if (ctx.charAt(ctx.length() / 2) != 'C') {
messages.add("Middle base of context sequence " + ctx + " must be C");
}
}
return messages.isEmpty() ? null : messages.toArray(new String[messages.size()]);
}
/** Mimic of Oracle's nvl() - returns the first value if not null, otherwise the second value. */
private final <T> T nvl(final T value1, final T value2) {
if (value1 != null) return value1;
else return value2;
}
@Override
protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
if (INTERVALS != null) IOUtil.assertFileIsReadable(INTERVALS);
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
final SAMFileReader in = new SAMFileReader(INPUT);
final Set<String> samples = new HashSet<String>();
final Set<String> libraries = new HashSet<String>();
for (final SAMReadGroupRecord rec : in.getFileHeader().getReadGroups()) {
samples.add(nvl(rec.getSample(), UNKNOWN_SAMPLE));
libraries.add(nvl(rec.getLibrary(), UNKNOWN_LIBRARY));
}
// Setup the calculators
final Set<String> contexts = CONTEXTS.isEmpty() ? makeContextStrings(CONTEXT_SIZE) : CONTEXTS;
final ListMap<String, Calculator> calculators = new ListMap<String, Calculator>();
for (final String context : contexts) {
for (final String library : libraries) {
calculators.add(context, new Calculator(library, context));
}
}
// Load up dbSNP if available
log.info("Loading dbSNP File: " + DB_SNP);
final DbSnpBitSetUtil dbSnp;
if (DB_SNP != null) dbSnp = new DbSnpBitSetUtil(DB_SNP, in.getFileHeader().getSequenceDictionary());
else dbSnp = null;
// Make an iterator that will filter out funny looking things
final SamLocusIterator iterator;
if (INTERVALS != null) {
final IntervalList intervals = IntervalList.fromFile(INTERVALS);
intervals.unique();
iterator = new SamLocusIterator(in, intervals, false);
}
else {
iterator = new SamLocusIterator(in);
}
iterator.setEmitUncoveredLoci(false);
iterator.setMappingQualityScoreCutoff(MINIMUM_MAPPING_QUALITY);
iterator.setSamFilters(Arrays.asList(
new NotPrimaryAlignmentFilter(),
new DuplicateReadFilter(),
new InsertSizeFilter(MINIMUM_INSERT_SIZE, MAXIMUM_INSERT_SIZE)
));
log.info("Starting iteration.");
long nextLogTime = 0;
int sites = 0;
for (final SamLocusIterator.LocusInfo info : iterator) {
// Skip dbSNP sites
final String chrom = info.getSequenceName();
final int pos = info.getPosition();
final int index = pos-1;
if (dbSnp != null && dbSnp.isDbSnpSite(chrom, pos)) continue;
// Skip sites at the end of chromosomes
final byte[] bases = refWalker.get(info.getSequenceIndex()).getBases();
if (pos < 3 || pos > bases.length-3) continue;
// Skip non C-G bases
final byte base = StringUtil.toUpperCase(bases[index]);
if (base != 'C' && base != 'G') continue;
// Get the context string
final String context;
{
final String tmp = StringUtil.bytesToString(bases, index-CONTEXT_SIZE, 1 + (2*CONTEXT_SIZE)).toUpperCase();
if (base == 'C') context = tmp;
else /* if G */ context = SequenceUtil.reverseComplement(tmp);
}
final List<Calculator> calculatorsForContext = calculators.get(context);
if (calculatorsForContext == null) continue; // happens if we get ambiguous bases in the reference
for (final Calculator calc : calculatorsForContext) calc.accept(info, base);
// See if we need to stop
if (++sites % 100 == 0) {
final long now = System.currentTimeMillis();
if (now > nextLogTime) {
log.info("Visited " + sites + " sites of interest. Last site: " + chrom + ":" + pos);
nextLogTime = now + 60000;
}
}
if (sites >= STOP_AFTER) break;
}
final MetricsFile<CpcgMetrics, Integer> file = getMetricsFile();
for (final List<Calculator> calcs : calculators.values()) {
for (final Calculator calc : calcs) {
final CpcgMetrics m = calc.finish();
m.SAMPLE_ALIAS = StringUtil.join(",", new ArrayList<String>(samples));
file.addMetric(m);
}
}
file.write(OUTPUT);
return 0;
}
private Set<String> makeContextStrings(final int contextSize) {
final Set<String> contexts = new HashSet<String>();
for (final byte[] kmer : generateAllKmers(2*contextSize + 1)) {
if (kmer[contextSize] == 'C') {
contexts.add(StringUtil.bytesToString(kmer));
}
}
log.info("Generated " + contexts.size() + " context strings.");
return contexts;
}
/** Generates all possible kmers of length and returns them as byte[]s. */
private List<byte[]> generateAllKmers(final int length) {
final List<byte[]> sofar = new LinkedList<byte[]>();
final byte[] bases = {'A', 'C', 'G', 'T'};
if (sofar.size() == 0) {
sofar.add(new byte[length]);
}
while (true) {
final byte[] bs = sofar.remove(0);
final int indexOfNextBase = findIndexOfNextBase(bs);
if (indexOfNextBase == -1) {
sofar.add(bs);
break;
}
else {
for (final byte b : bases) {
final byte[] next = Arrays.copyOf(bs, bs.length);
next[indexOfNextBase] = b;
sofar.add(next);
}
}
}
return sofar;
}
/** Finds the first non-zero character in the array, or returns -1 if all are non-zero. */
private int findIndexOfNextBase(final byte[] bs) {
for (int i=0; i<bs.length; ++i) {
if (bs[i] == 0) return i;
}
return -1;
}
/** A little class for counting alleles. */
private static class Counts {
int controlA;
int oxidatedA;
int controlC;
int oxidatedC;
int total() { return controlC + oxidatedC + controlA + oxidatedA; }
}
/**
* Class that calculated CpCG metrics for a specific library.
*/
private class Calculator {
private final String library;
private final String context;
// Things to be accumulated
int sites = 0;
long refCcontrolA = 0;
long refCoxidatedA = 0;
long refCcontrolC = 0;
long refCoxidatedC = 0;
long refGcontrolA = 0;
long refGoxidatedA = 0;
long refGcontrolC = 0;
long refGoxidatedC = 0;
Calculator(final String library, final String context) {
this.library = library;
this.context = context;
}
void accept(final SamLocusIterator.LocusInfo info, final byte refBase) {
final Counts counts = computeAlleleFraction(info, refBase);
if (counts.total() > 0) {
// Things calculated on all sites with coverage
this.sites++;
if (refBase == 'C') {
this.refCcontrolA += counts.controlA;
this.refCoxidatedA += counts.oxidatedA;
this.refCcontrolC += counts.controlC;
this.refCoxidatedC += counts.oxidatedC;
}
else if (refBase == 'G') {
this.refGcontrolA += counts.controlA;
this.refGoxidatedA += counts.oxidatedA;
this.refGcontrolC += counts.controlC;
this.refGoxidatedC += counts.oxidatedC;
}
else {
throw new IllegalStateException("Reference bases other than G and C not supported.");
}
}
}
CpcgMetrics finish() {
final CpcgMetrics m = new CpcgMetrics();
m.LIBRARY = this.library;
m.CONTEXT = this.context;
m.TOTAL_SITES = this.sites;
m.TOTAL_BASES = this.refCcontrolC + this.refCoxidatedC + this.refCcontrolA + this.refCoxidatedA +
this.refGcontrolC + this.refGoxidatedC + this.refGcontrolA + this.refGoxidatedA;
m.REF_OXO_BASES = this.refCoxidatedC + refGoxidatedC;
m.REF_NONOXO_BASES = this.refCcontrolC + this.refGcontrolC;
m.REF_TOTAL_BASES = m.REF_OXO_BASES + m.REF_NONOXO_BASES;
m.ALT_NONOXO_BASES = this.refCcontrolA + this.refGcontrolA;
m.ALT_OXO_BASES = this.refCoxidatedA + this.refGoxidatedA;
/**
* Why do we calculate the oxo error rate using oxidatedA - controlA you ask? We know that all the
* bases counted in oxidatedA are consistent with 8-oxo-G damage during shearing, but not all of them
* will have been caused by this. If we assume that C>A errors caused by other factors will occur randomly
* with respect to read1/read2, then we should see as many in the 8-oxo-G consistent state as not. So we
* assume that controlA is half the story, and remove the other half from oxidatedA.
*/
m.OXIDATION_ERROR_RATE = Math.max(m.ALT_OXO_BASES - m.ALT_NONOXO_BASES, 1) / (double) m.TOTAL_BASES;
m.OXIDATION_Q = -10 * log10(m.OXIDATION_ERROR_RATE);
/** Now look for things that have a reference base bias! */
m.C_REF_REF_BASES = this.refCcontrolC + this.refCoxidatedC;
m.G_REF_REF_BASES = this.refGcontrolC + this.refGoxidatedC;
m.C_REF_ALT_BASES = this.refCcontrolA + this.refCoxidatedA;
m.G_REF_ALT_BASES = this.refGcontrolA + this.refGoxidatedA;
final double cRefErrorRate = m.C_REF_ALT_BASES / (double) (m.C_REF_ALT_BASES + m.C_REF_REF_BASES);
final double gRefErrorRate = m.G_REF_ALT_BASES / (double) (m.G_REF_ALT_BASES + m.G_REF_REF_BASES);
m.C_REF_OXO_ERROR_RATE = Math.max(cRefErrorRate - gRefErrorRate, 1e-10);
m.G_REF_OXO_ERROR_RATE = Math.max(gRefErrorRate - cRefErrorRate, 1e-10);
m.C_REF_OXO_Q = -10 * log10(m.C_REF_OXO_ERROR_RATE);
m.G_REF_OXO_Q = -10 * log10(m.G_REF_OXO_ERROR_RATE);
return m;
}
/**
*
*/
private Counts computeAlleleFraction(final SamLocusIterator.LocusInfo info, final byte refBase) {
final Counts counts = new Counts();
final byte altBase = (refBase == 'C') ? (byte) 'A' : (byte) 'T';
for (final SamLocusIterator.RecordAndOffset rec : info.getRecordAndPositions()) {
final byte qual;
final SAMRecord samrec = rec.getRecord();
if (USE_OQ) {
final byte[] oqs = samrec.getOriginalBaseQualities();
if (oqs != null) qual = oqs[rec.getOffset()];
else qual = rec.getBaseQuality();
}
else {
qual = rec.getBaseQuality();
}
// Skip if below qual, or if library isn't a match
if (qual < MINIMUM_QUALITY_SCORE) continue;
if (!this.library.equals(nvl(samrec.getReadGroup().getLibrary(), UNKNOWN_LIBRARY))) continue;
// Get the read base, and get it in "as read" orientation
final byte base = rec.getReadBase();
final byte baseAsRead = samrec.getReadNegativeStrandFlag() ? SequenceUtil.complement(base) : base;
final int read = samrec.getReadPairedFlag() && samrec.getSecondOfPairFlag() ? 2 : 1;
// Figure out how to count the alternative allele. If the damage is caused by oxidation of G
// during shearing (in non-rnaseq data), then we know that:
// G>T observation is always in read 1
// C>A observation is always in read 2
// But if the substitution is from other causes the distribution of A/T across R1/R2 will be
// random.
if (base == refBase) {
if (baseAsRead == 'G' && read == 1) ++counts.oxidatedC;
else if (baseAsRead == 'G' && read == 2) ++counts.controlC;
else if (baseAsRead == 'C' && read == 1) ++counts.controlC;
else if (baseAsRead == 'C' && read == 2) ++counts.oxidatedC;
}
else if (base == altBase) {
if (baseAsRead == 'T' && read == 1) ++counts.oxidatedA;
else if (baseAsRead == 'T' && read == 2) ++counts.controlA;
else if (baseAsRead == 'A' && read == 1) ++counts.controlA;
else if (baseAsRead == 'A' && read == 2) ++counts.oxidatedA;
}
}
return counts;
}
}
}