Source Code of org.broadinstitute.gatk.tools.walkers.varianteval.VariantEval

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.tools.walkers.varianteval;

import com.google.java.contract.Requires;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.util.IntervalTree;
import htsjdk.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.engine.walkers.*;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.varianteval.evaluators.VariantEvaluator;
import org.broadinstitute.gatk.tools.walkers.varianteval.stratifications.IntervalStratification;
import org.broadinstitute.gatk.tools.walkers.varianteval.stratifications.VariantStratifier;
import org.broadinstitute.gatk.tools.walkers.varianteval.stratifications.manager.StratificationManager;
import org.broadinstitute.gatk.tools.walkers.varianteval.util.EvaluationContext;
import org.broadinstitute.gatk.tools.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.gatk.tools.walkers.varianteval.util.VariantEvalUtils;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.SampleUtils;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFUtils;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.VariantContextUtils;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;

/**
* General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more)
*
* <p>
 * Given a variant callset, it is common to calculate various quality control metrics. These metrics include raw and
 * filtered SNP counts; the ratio of transition mutations to transversions; concordance of a particular sample's calls
 * to a genotyping chip; the number of singletons per sample; etc. Furthermore, it is often useful to stratify these metrics
 * by various criteria such as functional class (missense, nonsense, silent), whether the site is a CpG site, the amino acid
 * degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in
* evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation
* and stratification modules.
*
* <h3>Input</h3>
* <p>
* One or more variant sets to evaluate plus any number of comparison sets.
* </p>
*
* <h3>Output</h3>
* <p>
* Evaluation tables detailing the results of the eval modules which were applied.
* For example:
* <pre>
* output.eval.gatkreport:
* ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
* CountVariants  CompRod   CpG      EvalRod  JexlExpression  Novelty  nProcessedLoci  nCalledLoci  nRefLoci  nVariantLoci  variantRate ...
* CountVariants  dbsnp     CpG      eval     none            all      65900028        135770       0         135770        0.00206024  ...
* CountVariants  dbsnp     CpG      eval     none            known    65900028        47068        0         47068         0.00071423  ...
* CountVariants  dbsnp     CpG      eval     none            novel    65900028        88702        0         88702         0.00134601  ...
* CountVariants  dbsnp     all      eval     none            all      65900028        330818       0         330818        0.00502000  ...
* CountVariants  dbsnp     all      eval     none            known    65900028        120685       0         120685        0.00183133  ...
* CountVariants  dbsnp     all      eval     none            novel    65900028        210133       0         210133        0.00318866  ...
* CountVariants  dbsnp     non_CpG  eval     none            all      65900028        195048       0         195048        0.00295976  ...
* CountVariants  dbsnp     non_CpG  eval     none            known    65900028        73617        0         73617         0.00111710  ...
* CountVariants  dbsnp     non_CpG  eval     none            novel    65900028        121431       0         121431        0.00184265  ...
* ...
* </pre>
* </p>
*
* <h3>Examples</h3>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T VariantEval \
*   -o output.eval.gatkreport \
*   --eval:set1 set1.vcf \
*   --eval:set2 set2.vcf \
*   [--comp comp.vcf]
* </pre>
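 *
 * The modules mentioned in this documentation can also be requested explicitly.  As an illustrative sketch
 * (file names are placeholders, and the exact module selection is up to you), the following enables the
 * IntervalStratification stratifier with a -stratIntervals file for it to read, and restricts the evaluators
 * to the CountVariants module shown in the example output above:
 * <pre>
 * java -Xmx2g -jar GenomeAnalysisTK.jar \
 *   -R ref.fasta \
 *   -T VariantEval \
 *   -o output.eval.gatkreport \
 *   --eval my_callset.vcf \
 *   -ST IntervalStratification \
 *   -stratIntervals targets.bed \
 *   -noEV \
 *   -EV CountVariants
 * </pre>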
*
* <h3>Caveat</h3>
*
* <p>Some stratifications and evaluators are incompatible with each other due to their respective memory requirements, such as AlleleCount and VariantSummary, or Sample and VariantSummary.
* If you specify such a combination, the program will output an error message and ask you to disable one of these options.
 * We do not currently provide an exhaustive list of incompatible combinations, so we recommend trying out combinations that you are interested in on a dummy command line to rapidly ascertain whether they will work or not.</p>
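 *
 * <p>As a hypothetical sketch of such a dry run, requesting an incompatible pair named above (the Sample
 * stratification together with the VariantSummary evaluator) fails during initialization with a BadArgumentValue
 * error asking you to disable one of the two:</p>
 * <pre>
 * java -jar GenomeAnalysisTK.jar \
 *   -R ref.fasta \
 *   -T VariantEval \
 *   -o test.eval.gatkreport \
 *   --eval small_subset.vcf \
 *   -ST Sample \
 *   -EV VariantSummary
 * </pre>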
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=-50, stop=50))
@PartitionBy(PartitionType.NONE)
public class VariantEval extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
    public static final String IS_SINGLETON_KEY = "ISSINGLETON";

    @Output
    protected PrintStream out;

    /**
     * The variant file(s) to evaluate.
     */
    @Input(fullName="eval", shortName = "eval", doc="Input evaluation file(s)", required=true)
    public List<RodBinding<VariantContext>> evals;

    /**
     * The variant file(s) to compare against.
     */
    @Input(fullName="comp", shortName = "comp", doc="Input comparison file(s)", required=false)
    public List<RodBinding<VariantContext>> compsProvided = Collections.emptyList();
    private List<RodBinding<VariantContext>> comps = new ArrayList<RodBinding<VariantContext>>();

    /**
     * dbSNP comparison VCF.  By default, the dbSNP file is used to specify the set of "known" variants.
     * Other sets can be specified with the -knownName argument.
     */
    @ArgumentCollection
    protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

    /**
     * Some analyses want to count overlap not against dbSNP (which is, in general, very permissive) but
     * against a specific set of gold-standard sites such as HapMap, OMNI, or the gold-standard indels.
     * This argument provides a mechanism for specifying which file to use.
     */
    @Input(fullName="goldStandard", shortName = "gold", doc="Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison", required=false)
    public RodBinding<VariantContext> goldStandard = null;

    /**
     * Note that the --list argument requires a fully resolved and correct command-line to work.
     */
    @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false)
    protected Boolean LIST = false;

    // Partitioning the data arguments
    @Argument(shortName="select", doc="One or more stratifications to use when evaluating the data", required=false)
    protected ArrayList<String> SELECT_EXPS = new ArrayList<String>();

    @Argument(shortName="selectName", doc="Names to use for the list of stratifications (must be a 1-to-1 mapping)", required=false)
    protected ArrayList<String> SELECT_NAMES = new ArrayList<String>();

    @Argument(fullName="sample", shortName="sn", doc="Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required=false)
    protected Set<String> SAMPLE_EXPRESSIONS;

    /**
     * List of rod tracks to be used for specifying "known" variants other than dbSNP.
     */
    @Argument(shortName="knownName", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
    protected HashSet<String> KNOWN_NAMES = new HashSet<String>();
    List<RodBinding<VariantContext>> knowns = new ArrayList<RodBinding<VariantContext>>();

    // Stratification arguments
    @Argument(fullName="stratificationModule", shortName="ST", doc="One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noST is specified)", required=false)
    protected String[] STRATIFICATIONS_TO_USE = {};

    @Argument(fullName="doNotUseAllStandardStratifications", shortName="noST", doc="Do not use the standard stratification modules by default (instead, use only those that are specified with the -ST option)", required=false)
    protected Boolean NO_STANDARD_STRATIFICATIONS = false;

    /**
     * See the -list argument to view available modules.
     */
    @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noEV is specified)", required=false)
    protected String[] MODULES_TO_USE = {};

    @Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -EV option)", required=false)
    protected Boolean NO_STANDARD_MODULES = false;

    @Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false)
    protected double MIN_PHASE_QUALITY = 10.0;

    @Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation. Default is 50.", required=false)
    protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;

    @Argument(shortName="ploidy", fullName="samplePloidy", doc="Per-sample ploidy (number of chromosomes per sample)", required=false)
    protected int ploidy = GATKVariantContextUtils.DEFAULT_PLOIDY;

    @Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false)
    private File ancestralAlignmentsFile = null;

    @Argument(fullName="requireStrictAlleleMatch", shortName="strict", doc="If provided, only comp and eval tracks with exactly matching reference and alternate alleles will be counted as overlapping", required=false)
    private boolean requireStrictAlleleMatch = false;

    @Argument(fullName="keepAC0", shortName="keepAC0", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false)
    private boolean keepSitesWithAC0 = false;

    @Hidden
    @Argument(fullName="numSamples", shortName="numSamples", doc="Number of samples to use if none can be determined from the eval track(s)", required=false)
    private int numSamplesFromArgument = 0;

    /**
     * If true, VariantEval will treat multiple -eval arguments (e.g., -eval 1 -eval 2) as separate tracks drawn
     * from the same underlying variant set, and will evaluate their union.  Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc.
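     *
     * A minimal sketch (file names are placeholders):
     * <pre>
     *   -T VariantEval -R ref.fasta -o output.eval.gatkreport \
     *     -eval chr1.vcf -eval chr2.vcf -mergeEvals
     * </pre>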
     */
    @Argument(fullName="mergeEvals", shortName="mergeEvals", doc="If provided, all -eval tracks will be merged into a single eval track", required=false)
    public boolean mergeEvals = false;

    /**
     * File containing tribble-readable features for the IntervalStratification
     */
    @Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratification", required=false)
    public IntervalBinding<Feature> intervalsFile = null;

    /**
     * File containing tribble-readable features containing known CNVs.  For use with VariantSummary table.
     */
    @Input(fullName="knownCNVs", shortName="knownCNVs", doc="File containing tribble-readable features describing a known list of copy number variants", required=false)
    public IntervalBinding<Feature> knownCNVsFile = null;
    Map<String, IntervalTree<GenomeLoc>> knownCNVsByContig = Collections.emptyMap();

    // Variables
    private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();

    private boolean isSubsettingSamples;
    private Set<String> sampleNamesForEvaluation = new LinkedHashSet<String>();
    private Set<String> sampleNamesForStratification = new LinkedHashSet<String>();

    // important stratifications
    private boolean byFilterIsEnabled = false;
    private boolean perSampleIsEnabled = false;

    // Constants
    private static String ALL_SAMPLE_NAME = "all";

    // the number of processed bp for this walker
    long nProcessedLoci = 0;

    // Utility class
    private final VariantEvalUtils variantEvalUtils = new VariantEvalUtils(this);

    // Ancestral alignments
    private IndexedFastaSequenceFile ancestralAlignments = null;

    // The set of all possible evaluation contexts
    StratificationManager<VariantStratifier, EvaluationContext> stratManager;
    //Set<DynamicStratification> dynamicStratifications = Collections.emptySet();

    /**
     * Initialize the stratifications, evaluations, evaluation contexts, and reporting object
     */
    public void initialize() {
        // Just list the modules, and exit quickly.
        if (LIST) { variantEvalUtils.listModulesAndExit(); }

        // maintain the full list of comps
        comps.addAll(compsProvided);
        if ( dbsnp.dbsnp.isBound() ) {
            comps.add(dbsnp.dbsnp);
            knowns.add(dbsnp.dbsnp);
        }

        // Add a dummy comp track if none exists
        if ( comps.size() == 0 )
            comps.add(new RodBinding<VariantContext>(VariantContext.class, "none", "UNBOUND", "", new Tags()));

        // Set up set of additional knowns
        for ( RodBinding<VariantContext> compRod : comps ) {
            if ( KNOWN_NAMES.contains(compRod.getName()) )
                knowns.add(compRod);
        }

        // Now that we have all the rods categorized, determine the sample list from the eval rods.
        Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), evals);
        Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);

        // Load the sample list, using an intermediate tree set to sort the samples
        final Set<String> allSampleNames = SampleUtils.getSamplesFromCommandLineInput(vcfSamples);
        sampleNamesForEvaluation.addAll(new TreeSet<String>(SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS)));
        isSubsettingSamples = ! sampleNamesForEvaluation.containsAll(allSampleNames);

        if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) {
            sampleNamesForStratification.addAll(sampleNamesForEvaluation);
        }
        sampleNamesForStratification.add(ALL_SAMPLE_NAME);

        // Initialize select expressions
        for (VariantContextUtils.JexlVCMatchExp jexl : VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS)) {
            SortableJexlVCMatchExp sjexl = new SortableJexlVCMatchExp(jexl.name, jexl.exp);
            jexlExpressions.add(sjexl);
        }

        // Initialize the set of stratifications and evaluations to use
        // The list of stratifiers and evaluators to use
        final List<VariantStratifier> stratificationObjects = variantEvalUtils.initializeStratificationObjects(NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
        final Set<Class<? extends VariantEvaluator>> evaluationClasses = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);

        checkForIncompatibleEvaluatorsAndStratifiers(stratificationObjects, evaluationClasses);

        for ( VariantStratifier vs : stratificationObjects ) {
            if ( vs.getName().equals("Filter") )
                byFilterIsEnabled = true;
            else if ( vs.getName().equals("Sample") )
                perSampleIsEnabled = true;
        }

        if ( intervalsFile != null ) {
            boolean fail = true;
            for ( final VariantStratifier vs : stratificationObjects ) {
                if ( vs.getClass().equals(IntervalStratification.class) )
                    fail = false;
            }
            if ( fail )
                throw new UserException.BadArgumentValue("ST", "stratIntervals argument provided but -ST IntervalStratification not provided");
        }

        // Initialize the evaluation contexts
        createStratificationStates(stratificationObjects, evaluationClasses);

        // Load ancestral alignments
        if (ancestralAlignmentsFile != null) {
            try {
                ancestralAlignments = new IndexedFastaSequenceFile(ancestralAlignmentsFile);
            } catch (FileNotFoundException e) {
                throw new ReviewedGATKException(String.format("The ancestral alignments file, '%s', could not be found", ancestralAlignmentsFile.getAbsolutePath()));
            }
        }

        // initialize CNVs
        if ( knownCNVsFile != null ) {
            knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile);
        }
    }

    final void checkForIncompatibleEvaluatorsAndStratifiers( final List<VariantStratifier> stratificationObjects,
                                                             Set<Class<? extends VariantEvaluator>> evaluationClasses) {
        for ( final VariantStratifier vs : stratificationObjects ) {
            for ( Class<? extends VariantEvaluator> ec : evaluationClasses )
                if ( vs.getIncompatibleEvaluators().contains(ec) )
                    throw new UserException.BadArgumentValue("ST and ET",
                            "The selected stratification " + vs.getName() +
                                    " and evaluator " + ec.getSimpleName() +
                                    " are incompatible due to combinatorial memory requirements." +
                                    " Please disable one");
        }
    }
   
    final void createStratificationStates(final List<VariantStratifier> stratificationObjects, final Set<Class<? extends VariantEvaluator>> evaluationObjects) {
        final List<VariantStratifier> strats = new ArrayList<VariantStratifier>(stratificationObjects);
        stratManager = new StratificationManager<VariantStratifier, EvaluationContext>(strats);

        logger.info("Creating " + stratManager.size() + " combinatorial stratification states");
        for ( int i = 0; i < stratManager.size(); i++ ) {
            EvaluationContext ec = new EvaluationContext(this, evaluationObjects);
            stratManager.set(i, ec);
        }
    }   
   
    public final Map<String, IntervalTree<GenomeLoc>> createIntervalTreeByContig(final IntervalBinding<Feature> intervals) {
        final Map<String, IntervalTree<GenomeLoc>> byContig = new HashMap<String, IntervalTree<GenomeLoc>>();

        final List<GenomeLoc> locs = intervals.getIntervals(getToolkit());

        // set up the map from contig -> interval tree
        for ( final String contig : getContigNames() )
            byContig.put(contig, new IntervalTree<GenomeLoc>());

        for ( final GenomeLoc loc : locs ) {
            byContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), loc);
        }

        return byContig;
    }

    /**
     * Collect relevant information from each variant in the supplied VCFs
     */
    @Override
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        // we track the number of processed bases here and expose it to the evaluation modules,
        // rather than having every eval that needs it recompute the same value
        synchronized (this) {
            nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1);
        }

        if (tracker != null) {
            String aastr = (ancestralAlignments == null) ? null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases());

//            // update the dynamic stratifications
//            for (final VariantContext vc : tracker.getValues(evals, ref.getLocus())) {
//                // don't worry -- DynamicStratification only work with one eval object
//                for ( final DynamicStratification ds :  dynamicStratifications ) {
//                    ds.update(vc);
//                }
//            }

            //      --------- track ---------           sample  - VariantContexts -
            HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals);
            HashMap<RodBinding<VariantContext>, HashMap<String, Collection<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false);

            // for each eval track
            for ( final RodBinding<VariantContext> evalRod : evals ) {
                final Map<String, Collection<VariantContext>> emptyEvalMap = Collections.emptyMap();
                final Map<String, Collection<VariantContext>> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap;

                // for each sample stratifier
                for ( final String sampleName : sampleNamesForStratification ) {
                    Collection<VariantContext> evalSetBySample = evalSet.get(sampleName);
                    if ( evalSetBySample == null ) {
                        evalSetBySample = new HashSet<VariantContext>(1);
                        evalSetBySample.add(null);
                    }

                    // for each eval in the track
                    for ( VariantContext eval : evalSetBySample ) {
                        // deal with ancestral alleles if requested
                        if ( eval != null && aastr != null ) {
                            eval = new VariantContextBuilder(eval).attribute("ANCESTRALALLELE", aastr).make();
                        }

                        // for each comp track
                        for ( final RodBinding<VariantContext> compRod : comps ) {
                            // no sample stratification for comps
                            final HashMap<String, Collection<VariantContext>> compSetHash = compVCs.get(compRod);
                            final Collection<VariantContext> compSet = (compSetHash == null || compSetHash.size() == 0) ? Collections.<VariantContext>emptyList() : compVCs.get(compRod).values().iterator().next();

                            // find the comp
                            final VariantContext comp = findMatchingComp(eval, compSet);

                            for ( EvaluationContext nec : getEvaluationContexts(tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName) ) {

                                // eval against the comp
                                synchronized (nec) {
                                    nec.apply(tracker, ref, context, comp, eval);
                                }

                                // eval=null against all comps of different type that aren't bound to another eval
                                for ( VariantContext otherComp : compSet ) {
                                    if ( otherComp != comp && ! compHasMatchingEval(otherComp, evalSetBySample) ) {
                                        synchronized (nec) {
                                            nec.apply(tracker, ref, context, otherComp, null);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                if ( mergeEvals ) break; // stop processing the eval tracks
            }
        }

        return null;
    }

    /**
     * Given specific eval and comp VCs and the sample name, return an iterable
     * over all of the applicable state keys.
     *
     * This code isn't yet structured for efficiency.  Currently we use the following
     * (inefficient) algorithm:
     *
     * for each strat:
     *   get the list of states relevant to the eval and comp according to that strat
     *   add this list of states to a list of state lists
     *
     * then
     *
     * ask the strat manager to look up all of the keys associated with the combinations
     * of these states.  For example, suppose we have a single variant S and the active
     * strats EvalRod, CompRod, and Novelty.  We produce a list that looks like:
     *
     *   L = [[Eval], [Comp], [All, Novel]]
     *
     * We then go through the strat manager tree to produce the keys associated with these states:
     *
     *   K = [0, 1] where EVAL x COMP x ALL = 0 and EVAL x COMP x NOVEL = 1
     *
     * It's clear that a better, more direct approach is possible.
     *
     * TODO -- create an inline version that doesn't create the intermediate list of lists
     *
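     * As a minimal illustrative sketch (the state values below are hypothetical, echoing the example above),
     * the lookup this method performs amounts to:
     *
     * <pre>
     *   final List&lt;List&lt;Object&gt;&gt; states = new LinkedList&lt;List&lt;Object&gt;&gt;();
     *   states.add(Arrays.&lt;Object&gt;asList("eval"));          // states from the EvalRod stratifier
     *   states.add(Arrays.&lt;Object&gt;asList("dbsnp"));         // states from the CompRod stratifier
     *   states.add(Arrays.&lt;Object&gt;asList("all", "novel"));  // states from the Novelty stratifier
     *   final Collection&lt;EvaluationContext&gt; contexts = stratManager.values(states);  // contexts for keys 0 and 1
     * </pre>
     *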
     * @param tracker the meta-data tracker at the current locus
     * @param ref the reference context at the current locus
     * @param eval the eval variant context (may be null)
     * @param evalName the name of the eval track
     * @param comp the comp variant context (may be null)
     * @param compName the name of the comp track
     * @param sampleName the sample being stratified
     * @return the evaluation contexts applicable to this combination of states
     */
    protected Collection<EvaluationContext> getEvaluationContexts(final RefMetaDataTracker tracker,
                                                                  final ReferenceContext ref,
                                                                  final VariantContext eval,
                                                                  final String evalName,
                                                                  final VariantContext comp,
                                                                  final String compName,
                                                                  final String sampleName ) {
        final List<List<Object>> states = new LinkedList<List<Object>>();
        for ( final VariantStratifier vs : stratManager.getStratifiers() ) {
            states.add(vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName));
        }
        return stratManager.values(states);
    }


    @Requires({"comp != null", "evals != null"})
    private boolean compHasMatchingEval(final VariantContext comp, final Collection<VariantContext> evals) {
        // see if any eval matches this comp
        for ( final VariantContext eval : evals ) {
            if ( eval != null && doEvalAndCompMatch(comp, eval, requireStrictAlleleMatch) != EvalCompMatchType.NO_MATCH )
                return true;
        }

        // nothing matched
        return false;
    }

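    // Classification of how an eval record and a comp record at the same site relate:
    //   STRICT   -- the reference allele and the first alternate allele are identical
    //   LENIENT  -- one record is a NO_VARIATION (non-variant) record, or the alleles differ but strict matching is not required
    //   NO_MATCH -- the records have different types, or the alleles differ and strict matching is required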
    private enum EvalCompMatchType { NO_MATCH, STRICT, LENIENT }

    @Requires({"eval != null", "comp != null"})
    private EvalCompMatchType doEvalAndCompMatch(final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) {
        if ( comp.getType() == VariantContext.Type.NO_VARIATION || eval.getType() == VariantContext.Type.NO_VARIATION )
            // if either of these are NO_VARIATION they are LENIENT matches
            return EvalCompMatchType.LENIENT;

        if ( comp.getType() != eval.getType() )
            return EvalCompMatchType.NO_MATCH;

        // check whether eval and comp match on both the reference allele and the first alternate allele
        final Allele altEval = eval.getAlternateAlleles().size() == 0 ? null : eval.getAlternateAllele(0);
        final Allele altComp = comp.getAlternateAlleles().size() == 0 ? null : comp.getAlternateAllele(0);
        if ((altEval == null && altComp == null) || (altEval != null && altEval.equals(altComp) && eval.getReference().equals(comp.getReference())))
            return EvalCompMatchType.STRICT;
        else
            return requireStrictAlleleMatch ? EvalCompMatchType.NO_MATCH : EvalCompMatchType.LENIENT;
    }

    private VariantContext findMatchingComp(final VariantContext eval, final Collection<VariantContext> comps) {
        // if no comps, return null
        if ( comps == null || comps.isEmpty() )
            return null;

        // if no eval, return any comp
        if ( eval == null )
            return comps.iterator().next();

        // find all of the matching comps
        VariantContext lenientMatch = null;
        for ( final VariantContext comp : comps ) {
            switch ( doEvalAndCompMatch(comp, eval, requireStrictAlleleMatch) ) {
                case STRICT:
                    return comp;
                case LENIENT:
                    if ( lenientMatch == null ) lenientMatch = comp;
                    break;
                case NO_MATCH:
                    // do nothing
            }
        }

        // no strict match was found; return the lenient match, which may be null
        return lenientMatch;
    }

    public Integer treeReduce(Integer lhs, Integer rhs) { return null; }

    @Override
    public Integer reduceInit() { return null; }

    @Override
    public Integer reduce(Integer value, Integer sum) { return null; }

    /**
     * Output the finalized report
     *
     * @param result  an integer that doesn't get used for anything
     */
    public void onTraversalDone(Integer result) {
        logger.info("Finalizing variant report");
       
        // go through the evaluations and finalize them
        for ( final EvaluationContext nec : stratManager.values() )
            for ( final VariantEvaluator ve : nec.getVariantEvaluators() )
                ve.finalizeEvaluation();
       
        VariantEvalReportWriter.writeReport(out, stratManager, stratManager.getStratifiers(), stratManager.get(0).getVariantEvaluators());
    }

    // Accessors
    public Logger getLogger() { return logger; }

    public double getMinPhaseQuality() { return MIN_PHASE_QUALITY; }

    public int getSamplePloidy() { return ploidy; }
    public double getMendelianViolationQualThreshold() { return MENDELIAN_VIOLATION_QUAL_THRESHOLD; }

    public static String getAllSampleName() { return ALL_SAMPLE_NAME; }

    public List<RodBinding<VariantContext>> getKnowns() { return knowns; }

    public List<RodBinding<VariantContext>> getEvals() { return evals; }

    public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; }
    public Set<String> getSampleNamesForEvaluation() { return sampleNamesForEvaluation; }

    public int getNumberOfSamplesForEvaluation() {
        if (sampleNamesForEvaluation!= null &&  !sampleNamesForEvaluation.isEmpty())
            return sampleNamesForEvaluation.size();
        else {
            return numSamplesFromArgument;
        }

    }
    public Set<String> getSampleNamesForStratification() { return sampleNamesForStratification; }

    public List<RodBinding<VariantContext>> getComps() { return comps; }

    public Set<SortableJexlVCMatchExp> getJexlExpressions() { return jexlExpressions; }

    public long getnProcessedLoci() {
        return nProcessedLoci;
    }

    public Set<String> getContigNames() {
        final TreeSet<String> contigs = new TreeSet<String>();
        for( final SAMSequenceRecord r :  getToolkit().getReferenceDataSource().getReference().getSequenceDictionary().getSequences()) {
            contigs.add(r.getSequenceName());
        }
        return contigs;
    }

    /**
     * getToolkit is protected in the walker superclass, so we pseudo-overload it here so that evaluators
     * and stratifiers can access the toolkit
     * @return the GenomeAnalysisEngine running this walker
     */
    public GenomeAnalysisEngine getToolkit() {
        return super.getToolkit();
    }

    public boolean ignoreAC0Sites() {
        return ! keepSitesWithAC0;
    }
}