/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.tools.walkers.varianteval.evaluators;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.varianteval.util.Analysis;
import org.broadinstitute.gatk.tools.walkers.varianteval.util.DataPoint;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
@Analysis(description = "Evaluation summary for indels")
public class IndelSummary extends VariantEvaluator implements StandardEval {
final protected static Logger logger = Logger.getLogger(IndelSummary.class);
//
// counts of snps and indels
//
@DataPoint(description = "Number of SNPs", format = "%d")
public int n_SNPs = 0;
@DataPoint(description = "Number of singleton SNPs", format = "%d")
public int n_singleton_SNPs = 0;
@DataPoint(description = "Number of indels", format = "%d")
public int n_indels = 0;
@DataPoint(description = "Number of singleton indels", format = "%d")
public int n_singleton_indels = 0;
//
// gold standard
//
@DataPoint(description = "Number of Indels overlapping gold standard sites", format = "%d")
public int n_indels_matching_gold_standard = 0;
@DataPoint(description = "Percent of indels overlapping gold standard sites")
public String gold_standard_matching_rate;
//
// multi-allelics
//
// Number of Indels Sites (counts one for any number of alleles at site)
public int nIndelSites = 0;
@DataPoint(description = "Number of sites with where the number of alleles is greater than 2")
public int n_multiallelic_indel_sites = 0;
@DataPoint(description = "Percent of indel sites that are multi-allelic")
public String percent_of_sites_with_more_than_2_alleles;
//
// snp : indel ratios
//
@DataPoint(description = "SNP to indel ratio")
public String SNP_to_indel_ratio;
@DataPoint(description = "Singleton SNP to indel ratio")
public String SNP_to_indel_ratio_for_singletons;
//
// novelty
//
@DataPoint(description = "Number of novel indels", format = "%d")
public int n_novel_indels = 0;
@DataPoint(description = "Indel novelty rate")
public String indel_novelty_rate;
//
// insertions to deletions
//
@DataPoint(description = "Number of insertion indels")
public int n_insertions = 0;
@DataPoint(description = "Number of deletion indels")
public int n_deletions = 0;
@DataPoint(description = "Insertion to deletion ratio")
public String insertion_to_deletion_ratio;
@DataPoint(description = "Number of large (>10 bp) deletions")
public int n_large_deletions = 0;
@DataPoint(description = "Number of large (>10 bp) insertions")
public int n_large_insertions = 0;
@DataPoint(description = "Ratio of large (>10 bp) insertions to deletions")
public String insertion_to_deletion_ratio_for_large_indels;
//
// Frameshifts
//
@DataPoint(description = "Number of indels in protein-coding regions labeled as frameshift")
public int n_coding_indels_frameshifting = 0;
@DataPoint(description = "Number of indels in protein-coding regions not labeled as frameshift")
public int n_coding_indels_in_frame = 0;
@DataPoint(description = "Frameshift percent")
public String frameshift_rate_for_coding_indels;
//
// Het : hom ratios
//
@DataPoint(description = "Het to hom ratio for SNPs")
public String SNP_het_to_hom_ratio;
@DataPoint(description = "Het to hom ratio for indels")
public String indel_het_to_hom_ratio;
int nSNPHets = 0, nSNPHoms = 0, nIndelHets = 0, nIndelHoms = 0;
int[] insertionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
int[] deletionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
// - Since 1 & 2 bp insertions and 1 & 2 bp deletions are equally likely to cause a
// downstream frameshift, if we make the simplifying assumptions that 3 bp ins
// and 3bp del (adding/subtracting 1 AA in general) are roughly comparably
// selected against, we should see a consistent 1+2 : 3 bp ratio for insertions
// as for deletions, and certainly would expect consistency between in/dels that
// multiple methods find and in/dels that are unique to one method (since deletions
// are more common and the artifacts differ, it is probably worth looking at the totals,
// overlaps and ratios for insertions and deletions separately in the methods
// comparison and in this case don't even need to make the simplifying in = del functional assumption
@DataPoint(description = "ratio of 1 and 2 bp insertions to 3 bp insertions")
public String ratio_of_1_and_2_to_3_bp_insertions;
@DataPoint(description = "ratio of 1 and 2 bp deletions to 3 bp deletions")
public String ratio_of_1_and_2_to_3_bp_deletions;
public final static int LARGE_INDEL_SIZE_THRESHOLD = 10;
@Override public int getComparisonOrder() { return 2; }
public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( eval == null || (getWalker().ignoreAC0Sites() && eval.isMonomorphicInSamples()) )
return;
// update counts
switch ( eval.getType() ) {
case SNP:
n_SNPs += eval.getNAlleles() - 1; // -1 for ref
if ( variantWasSingleton(eval) ) n_singleton_SNPs++;
// collect information about het / hom ratio
for ( final Genotype g : eval.getGenotypes() ) {
if ( g.isHet() ) nSNPHets++;
if ( g.isHomVar() ) nSNPHoms++;
}
break;
case INDEL:
final VariantContext gold = getWalker().goldStandard == null ? null : tracker.getFirstValue(getWalker().goldStandard);
nIndelSites++;
if ( ! eval.isBiallelic() ) n_multiallelic_indel_sites++;
// collect information about het / hom ratio
for ( final Genotype g : eval.getGenotypes() ) {
if ( g.isHet() ) nIndelHets++;
if ( g.isHomVar() ) nIndelHoms++;
}
for ( Allele alt : eval.getAlternateAlleles() ) {
n_indels++; // +1 for each alt allele
if ( variantWasSingleton(eval) ) n_singleton_indels++;
if ( comp == null ) n_novel_indels++; // TODO -- make this test allele specific?
if ( gold != null ) n_indels_matching_gold_standard++;
// ins : del ratios
final int alleleSize = alt.length() - eval.getReference().length();
if ( alleleSize == 0 ) throw new ReviewedGATKException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference());
if ( alleleSize > 0 ) n_insertions++;
if ( alleleSize < 0 ) n_deletions++;
// requires snpEFF annotations
if ( eval.getAttributeAsString("SNPEFF_GENE_BIOTYPE", "missing").equals("protein_coding") ) {
final String effect = eval.getAttributeAsString("SNPEFF_EFFECT", "missing");
if ( effect.equals("missing") )
throw new ReviewedGATKException("Saw SNPEFF_GENE_BIOTYPE but unexpected no SNPEFF_EFFECT at " + eval);
if ( effect.equals("FRAME_SHIFT") )
n_coding_indels_frameshifting++;
else if ( effect.startsWith("CODON") )
n_coding_indels_in_frame++;
else
; // lots of protein coding effects that shouldn't be counted, such as INTRON
}
if ( alleleSize > LARGE_INDEL_SIZE_THRESHOLD )
n_large_insertions++;
else if ( alleleSize < -LARGE_INDEL_SIZE_THRESHOLD )
n_large_deletions++;
// update the baby histogram
final int[] countByLength = alleleSize < 0 ? deletionCountByLength : insertionCountByLength;
final int absSize = Math.abs(alleleSize);
if ( absSize < countByLength.length ) countByLength[absSize]++;
}
break;
default:
// TODO - MIXED, SYMBOLIC, and MNP records are skipped over
//throw new UserException.BadInput("Unexpected variant context type: " + eval);
break;
}
return;
}
public void finalizeEvaluation() {
percent_of_sites_with_more_than_2_alleles = Utils.formattedPercent(n_multiallelic_indel_sites, nIndelSites);
SNP_to_indel_ratio = Utils.formattedRatio(n_SNPs, n_indels);
SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels);
gold_standard_matching_rate = Utils.formattedPercent(n_indels_matching_gold_standard, n_indels);
indel_novelty_rate = Utils.formattedNoveltyRate(n_indels - n_novel_indels, n_indels);
frameshift_rate_for_coding_indels = Utils.formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting);
ratio_of_1_and_2_to_3_bp_deletions = Utils.formattedRatio(deletionCountByLength[1] + deletionCountByLength[2], deletionCountByLength[3]);
ratio_of_1_and_2_to_3_bp_insertions = Utils.formattedRatio(insertionCountByLength[1] + insertionCountByLength[2], insertionCountByLength[3]);
SNP_het_to_hom_ratio = Utils.formattedRatio(nSNPHets, nSNPHoms);
indel_het_to_hom_ratio = Utils.formattedRatio(nIndelHets, nIndelHoms);
insertion_to_deletion_ratio = Utils.formattedRatio(n_insertions, n_deletions);
insertion_to_deletion_ratio_for_large_indels = Utils.formattedRatio(n_large_insertions, n_large_deletions);
}
}