Source Code of org.broadinstitute.gatk.tools.walkers.variantutils.GenotypeConcordance

/*
* Copyright (c) 2012 The Broad Institute
* 
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
* 
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* 
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/


package org.broadinstitute.gatk.tools.walkers.variantutils;


import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.report.GATKReport;
import org.broadinstitute.gatk.engine.report.GATKReportTable;
import org.broadinstitute.gatk.engine.walkers.RodWalker;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFUtils;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFHeader;


import java.io.PrintStream;
import java.util.*;


/**
 * Genotype concordance (per-sample and aggregate counts and frequencies, NRD/NRS and site allele overlaps) between two callsets
 *
 * <p>
 *  GenotypeConcordance takes in two callsets (vcfs) and tabulates the number of sites which overlap and share alleles,
 *  and for each sample, the genotype-by-genotype counts (e.g. the number of sites at which a sample was
 *  called homozygous-reference in the EVAL callset, but homozygous-variant in the COMP callset). It outputs these
 *  counts as well as convenient proportions (such as the proportion of het calls in the EVAL which were called REF in
 *  the COMP) and metrics (such as NRD and NRS).
 *  </p>
 *
 *  <h3>Input</h3>
 *  <p>
 *  Genotype concordance requires two callsets (as it does a comparison): an EVAL and a COMP callset, specified via
 *  the -eval and -comp arguments. Typically, the EVAL callset is an experimental set you want to evaluate, while the
 *  COMP callset is a previously existing set used as a standard for comparison (taken to represent "truth").
 *  </p>
 *  <p>
 *  (Optional) Jexl expressions for genotype-level filtering of EVAL or COMP genotypes, specified via the -gfe and
 *  -gfc arguments, respectively.
 *  </p>
 *
 *  <h3>Output</h3>
 *  <p>
 *  Genotype Concordance writes a GATK report to the specified file (via -o), consisting of multiple tables of counts
 *  and proportions. These tables are constructed on a per-sample basis, and include counts of EVAL vs COMP genotype
 *  states.
 *  </p>
 *  <h4>Tables</h4>
 *  <p>
 *  Headers for the (non-moltenized -- see below) GenotypeConcordance counts and proportions tables give the genotype of
 *  the EVAL callset followed by the genotype of the COMP callset. For example, the value corresponding to HOM_REF_HET
 *  reflects variants called HOM_REF in the EVAL callset and HET in the COMP callset. Variants for which the alternate
 *  alleles of the EVAL and COMP samples did not match are excluded from genotype comparisons and are reported in the
 *  "Mismatching_Alleles" field.
 *  </p>
 *  <p>
 *  It may be informative to reshape rows of the GenotypeConcordance counts and proportions tables into separate row-major tables
 *  where the columns indicate the COMP genotype and the rows indicate the EVAL genotype for easy comparison between the
 *  two callsets. This can be done with a command similar to d <- matrix(sampleRow,nrow=6,byrow=T) in R where sampleRow is the 36-value row corresponding to the sample of interest, excluding "Mismatching_Alleles".
 *  In Excel this can be accomplished using the OFFSET function.
 *  </p>
 *  <ul>
 *      <li><i>GenotypeConcordance_CompProportions</i>: Gives the proportion of variants in each category normalized to the total number of called genotypes in the COMP callset</li>
 *      <li><i>GenotypeConcordance_Counts</i>: Gives the counts for number of genotypes in each category</li>
 *      <li><i>GenotypeConcordance_EvalProportions</i>: Gives the proportion of genotypes in each category normalized to the total number of called genotypes in the EVAL callset</li>
 *      <li><i>GenotypeConcordance_Summary</i>: Summary statistics for the sum of all samples and each sample individually. See below for definitions.</li>
 *      <li><i>SiteConcordance_Summary</i>: Gives comparison counts of called genotypes and their alleles between the two callsets. See below for definitions.</li>
 *  </ul>
 *  </p>
 *
 *  <h4>Term and metrics definitions</h4>
 * <p>
 * <ul>
 *          <li><i>GenotypeConcordance_CompProportions</i>, <i>GenotypeConcordance_Counts</i>, and <i>GenotypeConcordance_EvalProportions</i></li>
 *          <ul>
 *               <li>NO_CALL: reported genotype is ./., indicating not enough data to call</li>
 *               <li>HET: heterozygous</li>
 *               <li>HOM_REF: homozygous reference</li>
 *               <li>HOM_VAR: homozygous variant</li>
 *               <li>UNAVAILABLE: variant is not called in this callset</li>
 *               <li>MIXED: something like ./1</li>
 *          </ul>
 *          <li><i>GenotypeConcordance_Summary</i></li>
 *          <ul>
 *               <li>Non-Reference_Sensitivity (NRS): sensitivity of the EVAL calls to polymorphic calls in the COMP set, calculated by (# true positive)/(# true polymorphic)</li>
 *               <li>Non-Reference_Discrepancy (NRD): genotype discordance excluding concordant reference sites, calculated by (# discordant sites)/(total excluding # HOM_REF_HOM_REF) = 1.0-(# HOM_VAR_HOM_VAR + # HET_HET)/(total excluding # HOM_REF_HOM_REF)</li>
 *               <li>Overall_Genotype_Concordance: overall concordance calculated by (# concordant genotypes)/(# genotypes)</li>
 *          </ul>
 *          <li><i>SiteConcordance_Summary</i></li>
 *          <ul>
 *               <li>ALLELES_MATCH: counts of calls at the same site where the alleles match</li>
 *               <li>ALLELES_DO_NOT_MATCH: counts of calls at the same location with different alleles, such as the EVAL set calling a 'G' alternate allele, and the comp set calling a 'T' alternate allele</li>
 *               <li>EVAL_SUBSET_TRUTH: (multi-allelic sites only) ALT alleles for EVAL are a subset of ALT alleles for COMP. See also below.</li>
 *               <li>EVAL_SUPERSET_TRUTH: (multi-allelic sites only) ALT alleles for COMP are a subset of ALT alleles for EVAL. See also below.</li>
 *               <li>EVAL_ONLY: counts of sites present only in the EVAL set, not in the COMP set</li>
 *               <li>TRUTH_ONLY: counts of sites present only in the COMP set, not in the EVAL set</li>
 *          </ul>
 * </ul>
 * </p>
 *
 * <h4>Site-level allelic concordance</h4>
 *
 * <p>
 *  For strictly bi-allelic VCFs, only the ALLELES_MATCH, EVAL_ONLY, TRUTH_ONLY fields will be populated,
 *  but where multi-allelic sites are involved counts for EVAL_SUBSET_TRUTH and EVAL_SUPERSET_TRUTH will be generated.
 * </p>
 * <p>
 *  For example, in the following situation
 *  <pre>
 *    eval:  ref - A   alt - C
 *    comp:  ref - A   alt - C,T
 *  </pre>
 *  then the site is tabulated as EVAL_SUBSET_TRUTH. Were the situation reversed, it would be EVAL_SUPERSET_TRUTH.
 *  However, in the case where EVAL has both C and T alternate alleles, both must be observed in the genotypes
 *  (that is, there must be at least one of (0/1,1/1) and at least one of (0/2,1/2,2/2) in the genotype field). If
 *  one of the alleles has no observations in the genotype fields of the EVAL, the site-level concordance is
 *  tabulated as though that allele were not present in the record.
 * </p>
 *
 *  <h4>Monomorphic Records</h4>
 *  <p>
 *  A site which has an alternate allele, but which is monomorphic in samples, is treated as not having been
 *  discovered, and will be recorded in the TRUTH_ONLY column (if a record exists in the COMP set), or not at all
 *  (if no record exists in the COMP set).
 *  </p>
 *  <p>
 *  That is, in the situation
 *  <pre>
 *   eval:  ref - A   alt - C   genotypes - 0/0  0/0  0/0 ... 0/0
 *   comp:  ref - A   alt - C   ...         0/0  0/0  ...
 *  </pre>
 *  is equivalent to
 *  <pre>
 *   eval:  ref - A   alt - .   genotypes - 0/0  0/0  0/0 ... 0/0
 *   comp:  ref - A   alt - C   ...         0/0  0/0  ...
 *  </pre>
 *  </p>
 *  <p>
 *  When a record is present in the COMP set the *genotypes* for the monomorphic site will still be used to evaluate
 *  per-sample genotype concordance counts.
 * </p>
 *
 *  <h4>Filtered Records</h4>
 *  Filtered records are treated as though they were not present in the VCF, unless -ignoreFilters is provided,
 *  in which case all records are used. There is currently no way to assess concordance metrics on filtered sites
 *  exclusively. SelectVariants can be used to extract filtered sites, and VariantFiltration used to un-filter them.
 *
 * <h4>Moltenized tables</h4>
 *
 * <p>These tables may be optionally moltenized via the -moltenize argument. That is, the standard table
 *
 *  <pre>
 *  Sample   NO_CALL_HOM_REF  NO_CALL_HET  NO_CALL_HOM_VAR   (...)
 *  NA12878       0.003        0.001            0.000        (...)
 *  NA12891       0.005        0.000            0.000        (...)
 *  </pre>
 *
 *  would instead be displayed
 *
 * <pre>
 *  NA12878  NO_CALL_HOM_REF   0.003
 *  NA12878  NO_CALL_HET       0.001
 *  NA12878  NO_CALL_HOM_VAR   0.000
 *  NA12891  NO_CALL_HOM_REF   0.005
 *  NA12891  NO_CALL_HET       0.000
 *  NA12891  NO_CALL_HOM_VAR   0.000
 *  (...)
 *  </pre>


 */
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
public class GenotypeConcordance extends RodWalker<List<Pair<VariantContext,VariantContext>>,ConcordanceMetrics> {


    /**
     * The callset you want to evaluate; typically this is where you'd put 'unassessed' callsets.
     */
    @Input(fullName="eval",shortName="eval",doc="The variants and genotypes to evaluate",required=true)
    RodBinding<VariantContext> evalBinding;


    /**
     * The callset you want to treat as 'truth'. Can also be of unknown quality for the sake of callset comparisons.
     */
    @Input(fullName="comp",shortName="comp",doc="The variants and genotypes to compare against",required=true)
    RodBinding<VariantContext> compBinding;


    /**
     * The FILTER field of the eval and comp VCFs will be ignored. If this flag is not included, all filtered sites
     * will be treated as not being present in the VCF. (That is, the genotypes will be assigned UNAVAILABLE, as
     * distinct from NO_CALL).
     */
    @Argument(fullName="ignoreFilters",doc="Filters will be ignored",required=false)
    boolean ignoreFilters = false;


    /**
     * A genotype-level JEXL expression to apply to eval genotypes. Genotypes filtered in this way will be replaced by NO_CALL.
     * For instance: -gfe 'GQ<20' will set to no-call any genotype with genotype quality less than 20.
     */
    @Argument(shortName="gfe", fullName="genotypeFilterExpressionEval", doc="One or more criteria to use to set EVAL genotypes to no-call. "+
            "These genotype-level filters are only applied to the EVAL rod.", required=false)
    public ArrayList<String> genotypeFilterExpressionsEval = new ArrayList<String>();


    /**
     * Identical to -gfe except the filter is applied to genotypes in the comp rod.
     */
    @Argument(shortName="gfc", fullName="genotypeFilterExpressionComp", doc="One or more criteria to use to set COMP genotypes to no-call. "+
            "These genotype-level filters are only applied to the COMP rod.", required=false)
    public ArrayList<String> genotypeFilterExpressionsComp = new ArrayList<String>();


    /**
     * Moltenize the count and proportion tables. Rather than moltenizing per-sample data into a 2x2 table, it is fully
     * moltenized into elements. That is, WITHOUT this argument, each row of the table begins with the sample name and
     * proceeds directly with counts/proportions of eval/comp counts (for instance HOM_REF/HOM_REF, HOM_REF/NO_CALL).
     *
     * If the Moltenize argument is given, the output will begin with a sample name, followed by the contrastive genotype
     * type (such as HOM_REF/HOM_REF), followed by the count or proportion. This will significantly increase the number of
     * rows.
     */
    @Argument(shortName="moltenize",fullName="moltenize",doc="Molten rather than tabular output")
    public boolean moltenize = false;


    /**
     * Print sites where genotypes are mismatched between callsets, along with annotations giving the genotype of
     * each callset, to the given file.
     */


    @Argument(shortName = "sites",required = false,fullName = "printInterestingSites", doc="File to output the discordant sites and genotypes.")
    private PrintStream sitesFile = null;


    // Primary output stream of the tool.
    // NOTE(review): presumably where the GATK report is printed once traversal is done -- confirm in onTraversalDone.
    @Output
    PrintStream out;


    // sample names of the eval callset, taken from its VCF header in reduceInit()
    private List<String> evalSamples;
    // sample names of the comp callset, taken from its VCF header in reduceInit()
    private List<String> compSamples;
    // match expressions built in initialize() from genotypeFilterExpressionsEval; handed to filterGenotypes for eval records
    private List<VariantContextUtils.JexlVCMatchExp> evalJexls = null;
    // match expressions built in initialize() from genotypeFilterExpressionsComp; handed to filterGenotypes for comp records
    private List<VariantContextUtils.JexlVCMatchExp> compJexls = null;


    // todo -- table with "proportion of overlapping sites" (not just eval/comp margins) [e.g. drop no-calls]
    //  (this will break all the integration tests of course, due to new formatting)


    public void initialize() {
        evalJexls = initializeJexl(genotypeFilterExpressionsEval);
        compJexls = initializeJexl(genotypeFilterExpressionsComp);
    }


    private List<VariantContextUtils.JexlVCMatchExp> initializeJexl(ArrayList<String> genotypeFilterExpressions) {
        ArrayList<String> dummyNames = new ArrayList<String>(genotypeFilterExpressions.size());
        int expCount = 1;
        for ( String exp : genotypeFilterExpressions ) {
            dummyNames.add(String.format("gfe%d",expCount++));
        }
        return VariantContextUtils.initializeMatchExps(dummyNames, genotypeFilterExpressions);
    }


    public ConcordanceMetrics reduceInit() {
        Map<String,VCFHeader> headerMap = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(evalBinding,compBinding));
        VCFHeader evalHeader = headerMap.get(evalBinding.getName());
        evalSamples = evalHeader.getGenotypeSamples();
        VCFHeader compHeader = headerMap.get(compBinding.getName());
        compSamples = compHeader.getGenotypeSamples();
        return new ConcordanceMetrics(evalHeader,compHeader, sitesFile);
    }




    public List<Pair<VariantContext,VariantContext>> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        List<Pair<VariantContext,VariantContext>> evalCompPair = new ArrayList<Pair<VariantContext,VariantContext>>(3);
        if ( tracker != null && (
                tracker.getValues(evalBinding,ref.getLocus()).size() > 0 ||
                        tracker.getValues(compBinding,ref.getLocus()).size() > 0 ) ) {


            List<VariantContext> eval = tracker.getValues(evalBinding,ref.getLocus());
            List<VariantContext> comp = tracker.getValues(compBinding,ref.getLocus());
            if ( eval.size() > 1 || comp.size() > 1 ) {
                if ( noDuplicateTypes(eval) && noDuplicateTypes(comp) ) {
                    logger.info("Eval or Comp Rod at position " + ref.getLocus().toString() + " has multiple records. Resolving.");
                    evalCompPair = resolveMultipleRecords(eval,comp);
                } else {
                    logger.warn("Eval or Comp Rod at position "+ref.getLocus().toString()+" has multiple records of the same type. This locus will be skipped.");
                }
            } else {
                // if a rod is missing, explicitly create a variant context with 'missing' genotypes. Slow, but correct.
                // note that if there is no eval rod there must be a comp rod, and also the reverse
                VariantContext evalContext = eval.size() == 1 ? eval.get(0) : createEmptyContext(comp.get(0),evalSamples);
                VariantContext compContext = comp.size() == 1 ? comp.get(0) : createEmptyContext(eval.get(0),compSamples);
                evalContext = filterGenotypes(evalContext,ignoreFilters,evalJexls);
                compContext = filterGenotypes(compContext,ignoreFilters,compJexls);
                evalCompPair.add(new Pair<VariantContext, VariantContext>(evalContext,compContext));
            }
        }


        return evalCompPair;
    }


    private boolean noDuplicateTypes(List<VariantContext> vcList) {
        HashSet<VariantContext.Type> types = new HashSet<VariantContext.Type>(vcList.size());
        for ( VariantContext vc : vcList ) {
            VariantContext.Type type = vc.getType();
            if ( types.contains(type) )
                return false;
            types.add(type);
        }


        return true;
    }


    /**
     * The point of this method is to match up pairs of evals and comps by their type (or alternate alleles for mixed).
     * Basically multiple records could exist for a site such as:
     * Eval: 20   4000     A     C
     * Eval: 20   4000     A    AC
     * Comp: 20   4000     A     C
     * So for each eval, loop through the comps. If the types match, or for mixed types if eval alleles (non-emptily)
     * intersect the comp alleles, pair them up and remove that comp records.
     * Continue until we're out of evals or comps. This is n^2, but should rarely actually happen.
     *
     * The remaining unpaired records get paird with an empty contexts. So in the example above we'd get a list of:
     *  1 - (20,4000,A/C  |  20,4000,A/C)
     *  2 - (20,4000,A/AC |    Empty    )
     * @param evalList - list of eval variant contexts
     * @param compList - list of comp variant contexts
     * @return resolved pairs of the input lists
     */
    private List<Pair<VariantContext,VariantContext>> resolveMultipleRecords(List<VariantContext> evalList, List<VariantContext> compList) {
        List<Pair<VariantContext,VariantContext>> resolvedPairs = new ArrayList<Pair<VariantContext,VariantContext>>(evalList.size()+compList.size()); // oversized but w/e
        List<VariantContext> pairedEval = new ArrayList<VariantContext>(evalList.size());
        for ( VariantContext eval : evalList ) {
            VariantContext.Type evalType = eval.getType();
            Set<Allele> evalAlleles = new HashSet<Allele>(eval.getAlternateAlleles());
            VariantContext pairedComp = null;
            for ( VariantContext comp : compList ) {
                if ( evalType.equals(comp.getType()) ) {
                    pairedComp = comp;
                    break;
                } else if ( eval.isMixed() || comp.isMixed() ) {
                    for ( Allele compAllele : comp.getAlternateAlleles() ) {
                        if ( evalAlleles.contains(compAllele) ) {
                            pairedComp = comp;
                            break;
                        }
                    }
                }
            }
            if ( pairedComp != null ) {
                compList.remove(pairedComp);
                resolvedPairs.add(new Pair<VariantContext, VariantContext>(filterGenotypes(eval,ignoreFilters,evalJexls),filterGenotypes(pairedComp,ignoreFilters,compJexls)));
                pairedEval.add(eval);
                if ( compList.size() < 1 )
                    break;
            }
        }
        evalList.removeAll(pairedEval);
        for ( VariantContext unpairedEval : evalList ) {
            resolvedPairs.add(new Pair<VariantContext, VariantContext>(filterGenotypes(unpairedEval,ignoreFilters,evalJexls),createEmptyContext(unpairedEval,compSamples)));
        }


        for ( VariantContext unpairedComp : compList ) {
            resolvedPairs.add(new Pair<VariantContext, VariantContext>(createEmptyContext(unpairedComp,evalSamples),filterGenotypes(unpairedComp,ignoreFilters,compJexls)));
        }


        return resolvedPairs;
    }


    public ConcordanceMetrics reduce(List<Pair<VariantContext,VariantContext>> evalCompList, ConcordanceMetrics metrics) {
        for ( Pair<VariantContext,VariantContext> evalComp : evalCompList){
            metrics.update(evalComp.getFirst(),evalComp.getSecond());


        }
        return metrics;
    }


    private static double repairNaN(double d) {
     if ( Double.isNaN(d) ) {
      return 0.0;
     }
     return d;
    }


    public void onTraversalDone(ConcordanceMetrics metrics) {
        // todo -- this is over 200 lines of code just to format the output and could use some serious cleanup
        GATKReport report = new GATKReport();
        GATKReportTable concordanceCounts = new GATKReportTable("GenotypeConcordance_Counts","Per-sample concordance tables: comparison counts",2+GenotypeType.values().length*GenotypeType.values().length);
        GATKReportTable concordanceEvalProportions = new GATKReportTable("GenotypeConcordance_EvalProportions", "Per-sample concordance tables: proportions of genotypes called in eval",2+GenotypeType.values().length*GenotypeType.values().length);
        GATKReportTable concordanceCompProportions = new GATKReportTable("GenotypeConcordance_CompProportions", "Per-sample concordance tables: proportions of genotypes called in comp",2+GenotypeType.values().length*GenotypeType.values().length);
        GATKReportTable concordanceSummary = new GATKReportTable("GenotypeConcordance_Summary","Per-sample summary statistics: NRS, NRD, and OGC",2);
        GATKReportTable siteConcordance = new GATKReportTable("SiteConcordance_Summary","Site-level summary statistics",ConcordanceMetrics.SiteConcordanceType.values().length);
        if ( moltenize ) {
            concordanceCompProportions.addColumn("Sample","%s");
            concordanceCounts.addColumn("Sample","%s");
            concordanceEvalProportions.addColumn("Sample","%s");
            concordanceSummary.addColumn("Sample","%s");


            concordanceCompProportions.addColumn("Eval_Genotype","%s");
            concordanceCounts.addColumn("Eval_Genotype","%s");
            concordanceEvalProportions.addColumn("Eval_Genotype","%s");
            concordanceSummary.addColumn("Non-Reference_Discrepancy","%.3f");


            concordanceCompProportions.addColumn("Comp_Genotype","%s");
            concordanceCounts.addColumn("Comp_Genotype","%s");
            concordanceEvalProportions.addColumn("Comp_Genotype","%s");
            concordanceSummary.addColumn("Non-Reference_Sensitivity","%.3f");


            concordanceCompProportions.addColumn("Proportion","%.3f");
            concordanceCounts.addColumn("Count","%d");
            concordanceEvalProportions.addColumn("Proportion","%.3f");
            concordanceSummary.addColumn("Overall_Genotype_Concordance","%.3f");


            for ( Map.Entry<String,ConcordanceMetrics.GenotypeConcordanceTable> entry : metrics.getPerSampleGenotypeConcordance().entrySet() ) {
                ConcordanceMetrics.GenotypeConcordanceTable table = entry.getValue();
                for ( GenotypeType evalType : GenotypeType.values() ) {
                    for ( GenotypeType compType : GenotypeType.values() ) {
                        String rowKey = String.format("%s_%s_%s",entry.getKey(),evalType.toString(),compType.toString());
                        concordanceCounts.set(rowKey,"Sample",entry.getKey());
                        concordanceCounts.set(rowKey,"Eval_Genotype",evalType.toString());
                        concordanceCounts.set(rowKey,"Comp_Genotype",compType.toString());
                        int count = table.get(evalType, compType);
                        concordanceCounts.set(rowKey,"Count",count);
                        if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) {
                            concordanceEvalProportions.set(rowKey,"Sample",entry.getKey());
                            concordanceEvalProportions.set(rowKey,"Eval_Genotype",evalType.toString());
                            concordanceEvalProportions.set(rowKey,"Comp_Genotype",compType.toString());
                            concordanceEvalProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnEvalGenotypes(evalType)));
                        }
                        if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) {
                            concordanceCompProportions.set(rowKey,"Sample",entry.getKey());
                            concordanceCompProportions.set(rowKey,"Eval_Genotype",evalType.toString());
                            concordanceCompProportions.set(rowKey,"Comp_Genotype",compType.toString());
                            concordanceCompProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnCompGenotypes(compType)));
                        }
                    }
                }
                String mismatchKey = String.format("%s_%s",entry.getKey(),"Mismatching");
                concordanceCounts.set(mismatchKey,"Sample",entry.getKey());
                concordanceCounts.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles");
                concordanceCounts.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles");
                concordanceEvalProportions.set(mismatchKey,"Sample",entry.getKey());
                concordanceEvalProportions.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles");
                concordanceEvalProportions.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles");
                concordanceCompProportions.set(mismatchKey,"Sample",entry.getKey());
                concordanceCompProportions.set(mismatchKey,"Eval_Genotype","Mismatching_Alleles");
                concordanceCompProportions.set(mismatchKey,"Comp_Genotype","Mismatching_Alleles");
                concordanceEvalProportions.set(mismatchKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes()));
                concordanceCompProportions.set(mismatchKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes()));
                concordanceCounts.set(mismatchKey,"Count",table.getnMismatchingAlt());
            }


            String sampleKey = "ALL";
            ConcordanceMetrics.GenotypeConcordanceTable table = metrics.getOverallGenotypeConcordance();
            for ( GenotypeType evalType : GenotypeType.values() ) {
                for ( GenotypeType compType : GenotypeType.values() ) {
                    String rowKey = String.format("%s_%s_%s",sampleKey,evalType.toString(),compType.toString());
                    concordanceCounts.set(rowKey,"Sample",sampleKey);
                    concordanceCounts.set(rowKey,"Eval_Genotype",evalType.toString());
                    concordanceCounts.set(rowKey,"Comp_Genotype",compType.toString());
                    int count = table.get(evalType, compType);
                    concordanceCounts.set(rowKey,"Count",count);
                    if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR) {
                        concordanceEvalProportions.set(rowKey,"Sample",sampleKey);
                        concordanceEvalProportions.set(rowKey,"Eval_Genotype",evalType.toString());
                        concordanceEvalProportions.set(rowKey,"Comp_Genotype",compType.toString());
                        concordanceEvalProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnEvalGenotypes(evalType)));
                    }
                    if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF ) {
                        concordanceCompProportions.set(rowKey,"Sample",sampleKey);
                        concordanceCompProportions.set(rowKey,"Eval_Genotype",evalType.toString());
                        concordanceCompProportions.set(rowKey,"Comp_Genotype",compType.toString());
                        concordanceCompProportions.set(rowKey,"Proportion",repairNaN(( (double) count)/table.getnCompGenotypes(compType)));
                    }
                }
            }
            String rowKey = String.format("%s_%s",sampleKey,"Mismatching");
            concordanceCounts.set(rowKey,"Sample",sampleKey);
            concordanceCounts.set(rowKey,"Eval_Genotype","Mismatching_Alleles");
            concordanceCounts.set(rowKey,"Comp_Genotype","Mismatching_Alleles");
            concordanceEvalProportions.set(rowKey,"Sample",sampleKey);
            concordanceEvalProportions.set(rowKey,"Eval_Genotype","Mismatching_Alleles");
            concordanceEvalProportions.set(rowKey,"Comp_Genotype","Mismatching_Alleles");
            concordanceCompProportions.set(rowKey,"Sample",sampleKey);
            concordanceCompProportions.set(rowKey,"Eval_Genotype","Mismatching_Alleles");
            concordanceCompProportions.set(rowKey,"Comp_Genotype","Mismatching_Alleles");
            concordanceEvalProportions.set(rowKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes()));
            concordanceCompProportions.set(rowKey,"Proportion", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes()));
            concordanceCounts.set(rowKey,"Count",table.getnMismatchingAlt());


            for ( Map.Entry<String,Double> nrsEntry : metrics.getPerSampleNRS().entrySet() ) {
                concordanceSummary.set(nrsEntry.getKey(),"Sample",nrsEntry.getKey());
                concordanceSummary.set(nrsEntry.getKey(),"Non-Reference_Sensitivity",nrsEntry.getValue());
            }
            for ( Map.Entry<String,Double> nrdEntry : metrics.getPerSampleNRD().entrySet() ) {
                concordanceSummary.set(nrdEntry.getKey(),"Non-Reference_Discrepancy",nrdEntry.getValue());
            }
            for ( Map.Entry<String,Double> ogcEntry : metrics.getPerSampleOGC().entrySet() ) {
                concordanceSummary.set(ogcEntry.getKey(),"Overall_Genotype_Concordance",ogcEntry.getValue());
            }
            concordanceSummary.set("ALL_NRS_NRD","Sample","ALL");
            concordanceSummary.set("ALL_NRS_NRD","Non-Reference_Sensitivity",metrics.getOverallNRS());
            concordanceSummary.set("ALL_NRS_NRD","Non-Reference_Discrepancy",metrics.getOverallNRD());
            concordanceSummary.set("ALL_NRS_NRD","Overall_Genotype_Concordance",metrics.getOverallOGC());




            for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) {
                siteConcordance.addColumn(type.toString(),"%d");
            }


            for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) {
                siteConcordance.set("Comparison",type.toString(),metrics.getOverallSiteConcordance().get(type));
            }


        } else {
            concordanceCompProportions.addColumn("Sample","%s");
            concordanceCounts.addColumn("Sample","%s");
            concordanceEvalProportions.addColumn("Sample","%s");
            concordanceSummary.addColumn("Sample","%s");
            for ( GenotypeType evalType : GenotypeType.values() ) {
                for ( GenotypeType compType : GenotypeType.values() ) {
                    String colKey = String.format("%s_%s", evalType.toString(), compType.toString());
                    concordanceCounts.addColumn(colKey,"%d");
                    if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR)
                        concordanceEvalProportions.addColumn(colKey,"%.3f");
                    if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF )
                        concordanceCompProportions.addColumn(colKey,"%.3f");
                }
            }
            concordanceEvalProportions.addColumn("Mismatching_Alleles","%.3f");
            concordanceCompProportions.addColumn("Mismatching_Alleles","%.3f");
            concordanceCounts.addColumn("Mismatching_Alleles","%d");
            concordanceSummary.addColumn("Non-Reference Sensitivity","%.3f");
            concordanceSummary.addColumn("Non-Reference Discrepancy","%.3f");
            concordanceSummary.addColumn("Overall_Genotype_Concordance","%.3f");
            for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) {
                siteConcordance.addColumn(type.toString(),"%d");
            }


            for ( Map.Entry<String,ConcordanceMetrics.GenotypeConcordanceTable> entry : metrics.getPerSampleGenotypeConcordance().entrySet() ) {
                ConcordanceMetrics.GenotypeConcordanceTable table = entry.getValue();
                concordanceEvalProportions.set(entry.getKey(),"Sample",entry.getKey());
                concordanceCompProportions.set(entry.getKey(),"Sample",entry.getKey());
                concordanceCounts.set(entry.getKey(),"Sample",entry.getKey());
                for ( GenotypeType evalType : GenotypeType.values() ) {
                    for ( GenotypeType compType : GenotypeType.values() ) {
                        String colKey = String.format("%s_%s",evalType.toString(),compType.toString());
                        int count = table.get(evalType, compType);
                        concordanceCounts.set(entry.getKey(),colKey,count);
                        if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR)
                            concordanceEvalProportions.set(entry.getKey(),colKey,repairNaN(( (double) count)/table.getnEvalGenotypes(evalType)));
                        if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF )
                            concordanceCompProportions.set(entry.getKey(),colKey,repairNaN(( (double) count)/table.getnCompGenotypes(compType)));
                    }
                }
                concordanceEvalProportions.set(entry.getKey(),"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes()));
                concordanceCompProportions.set(entry.getKey(),"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes()));
                concordanceCounts.set(entry.getKey(),"Mismatching_Alleles",table.getnMismatchingAlt());
            }


            String rowKey = "ALL";
            concordanceCompProportions.set(rowKey,"Sample",rowKey);
            concordanceEvalProportions.set(rowKey,"Sample",rowKey);
            concordanceCounts.set(rowKey,"Sample",rowKey);
            ConcordanceMetrics.GenotypeConcordanceTable table = metrics.getOverallGenotypeConcordance();
            for ( GenotypeType evalType : GenotypeType.values() ) {
                for ( GenotypeType compType : GenotypeType.values() ) {
                    String colKey = String.format("%s_%s",evalType.toString(),compType.toString());
                    int count = table.get(evalType,compType);
                    concordanceCounts.set(rowKey,colKey,count);
                    if ( evalType == GenotypeType.HET || evalType == GenotypeType.HOM_REF || evalType == GenotypeType.HOM_VAR)
                        concordanceEvalProportions.set(rowKey,colKey,repairNaN(( (double) count)/table.getnEvalGenotypes(evalType)));
                    if ( compType == GenotypeType.HET || compType == GenotypeType.HOM_VAR || compType == GenotypeType.HOM_REF )
                        concordanceCompProportions.set(rowKey,colKey,repairNaN(( (double) count)/table.getnCompGenotypes(compType)));
                }
            }
            concordanceEvalProportions.set(rowKey,"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledEvalGenotypes()));
            concordanceCompProportions.set(rowKey,"Mismatching_Alleles", repairNaN(( (double) table.getnMismatchingAlt() )/table.getnCalledCompGenotypes()));
            concordanceCounts.set(rowKey,"Mismatching_Alleles",table.getnMismatchingAlt());


            for ( Map.Entry<String,Double> nrsEntry : metrics.getPerSampleNRS().entrySet() ) {
                concordanceSummary.set(nrsEntry.getKey(),"Sample",nrsEntry.getKey());
                concordanceSummary.set(nrsEntry.getKey(),"Non-Reference Sensitivity",nrsEntry.getValue());
            }
            for ( Map.Entry<String,Double> nrdEntry : metrics.getPerSampleNRD().entrySet() ) {
                concordanceSummary.set(nrdEntry.getKey(),"Non-Reference Discrepancy",nrdEntry.getValue());
            }
            for ( Map.Entry<String,Double> ogcEntry : metrics.getPerSampleOGC().entrySet() ) {
                concordanceSummary.set(ogcEntry.getKey(),"Overall_Genotype_Concordance",ogcEntry.getValue());
            }
            concordanceSummary.set("ALL","Sample","ALL");
            concordanceSummary.set("ALL","Non-Reference Sensitivity",metrics.getOverallNRS());
            concordanceSummary.set("ALL","Non-Reference Discrepancy",metrics.getOverallNRD());
            concordanceSummary.set("ALL","Overall_Genotype_Concordance",metrics.getOverallOGC());


            for (ConcordanceMetrics.SiteConcordanceType type : ConcordanceMetrics.SiteConcordanceType.values() ) {
                siteConcordance.set("Comparison",type.toString(),metrics.getOverallSiteConcordance().get(type));
            }
        }


        report.addTable(concordanceCompProportions);
        report.addTable(concordanceEvalProportions);
        report.addTable(concordanceCounts);
        report.addTable(concordanceSummary);
        report.addTable(siteConcordance);


        report.print(out);
    }


    public VariantContext createEmptyContext(VariantContext other, List<String> samples) {
        VariantContextBuilder builder = new VariantContextBuilder();
        // set the alleles to be the same
        builder.alleles(other.getAlleles());
        builder.loc(other.getChr(),other.getStart(),other.getEnd());
        // set all genotypes to empty
        List<Genotype> genotypes = new ArrayList<Genotype>(samples.size());
        for ( String sample : samples )
            genotypes.add(GenotypeBuilder.create(sample, new ArrayList<Allele>(0)));
        builder.genotypes(genotypes);
        return builder.make();
    }


    public VariantContext filterGenotypes(VariantContext context, boolean ignoreSiteFilter, List<VariantContextUtils.JexlVCMatchExp> exps) {
        if ( ! context.isFiltered() || ignoreSiteFilter ) {
            List<Genotype> filteredGenotypes = new ArrayList<Genotype>(context.getNSamples());
            for ( Genotype g : context.getGenotypes() ) {
                Map<VariantContextUtils.JexlVCMatchExp, Boolean> matchMap = VariantContextUtils.match(context, g, exps);
                boolean filtered = false;
                for ( Boolean b : matchMap.values() ) {
                    if ( b ) {
                        filtered = true;
                        break;
                    }
                }
                if ( filtered ) {
                    filteredGenotypes.add(GenotypeBuilder.create(g.getSampleName(),Arrays.asList(Allele.NO_CALL,Allele.NO_CALL),g.getExtendedAttributes()));
                } else {
                    filteredGenotypes.add(g);
                }
            }
            VariantContextBuilder builder = new VariantContextBuilder(context);
            builder.genotypes(filteredGenotypes);
            return builder.make();
        }


        VariantContextBuilder builder = new VariantContextBuilder();
        builder.alleles(Arrays.asList(context.getReference()));
        builder.loc(context.getChr(),context.getStart(),context.getEnd());
        List<Genotype> newGeno = new ArrayList<Genotype>(context.getNSamples());
        for ( Genotype g : context.getGenotypes().iterateInSampleNameOrder() ) {
            newGeno.add(GenotypeBuilder.create(g.getSampleName(),new ArrayList<Allele>()));
        }
        builder.genotypes(newGeno);
        return builder.make();
    }
}
Source Code of org.broadinstitute.gatk.tools.walkers.variantutils.GenotypeConcordance

Related Classes of org.broadinstitute.gatk.tools.walkers.variantutils.GenotypeConcordance