Package org.broadinstitute.gatk.tools.walkers.variantutils

Source Code of org.broadinstitute.gatk.tools.walkers.variantutils.SelectVariants

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.tools.walkers.variantutils;

import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.walkers.RodWalker;
import org.broadinstitute.gatk.engine.walkers.TreeReducible;
import org.broadinstitute.gatk.tools.walkers.annotator.ChromosomeCountConstants;
import org.broadinstitute.gatk.utils.MendelianViolation;
import org.broadinstitute.gatk.utils.SampleUtils;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFUtils;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.vcf.*;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import org.broadinstitute.gatk.utils.text.XReadLines;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;

/**
* Selects variants from a VCF source.
*
* <p>
* Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses
* (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain
* requirements, displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose.
* Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a
* pattern match).  Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of
* coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25).  These JEXL expressions are
* documented in the Using JEXL expressions section (http://www.broadinstitute.org/gatk/guide/article?id=1255).
* One can optionally include concordance or discordance tracks for use in selecting overlapping variants.
*
* <h3>Input</h3>
* <p>
* A variant set to select from.
* </p>
*
* <h3>Output</h3>
* <p>
* A selected VCF.
* </p>
*
* <h3>Examples</h3>
* <pre>
* Select two samples out of a VCF with many samples:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T SelectVariants \
*   --variant input.vcf \
*   -o output.vcf \
*   -sn SAMPLE_A_PARC \
*   -sn SAMPLE_B_ACTG
*
* Select two samples and any sample that matches a regular expression:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T SelectVariants \
*   --variant input.vcf \
*   -o output.vcf \
*   -sn SAMPLE_1_PARC \
*   -sn SAMPLE_1_ACTG \
*   -se 'SAMPLE.+PARC'
*
* Select any sample that matches a regular expression and sites where the QD annotation is more than 10:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T SelectVariants \
*   --variant input.vcf \
*   -o output.vcf \
*   -se 'SAMPLE.+PARC'
*   -select "QD > 10.0"
*
* Select a sample and exclude non-variant loci and filtered loci:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T SelectVariants \
*   --variant input.vcf \
*   -o output.vcf \
*   -sn SAMPLE_1_ACTG \
*   -env \
*   -ef
*
* Select a sample and restrict the output vcf to a set of intervals:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T SelectVariants \
*   --variant input.vcf \
*   -o output.vcf \
*   -L /path/to/my.interval_list \
*   -sn SAMPLE_1_ACTG
*
* Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called by this dataset):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T SelectVariants \
*   --variant hapmap.vcf \
*   --discordance myCalls.vcf
*   -o output.vcf \
*   -sn mySample
*
* Select all calls made by both myCalls and hisCalls (useful to take a look at what is consistent between the two callers):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T SelectVariants \
*   --variant myCalls.vcf \
*   --concordance hisCalls.vcf
*   -o output.vcf \
*   -sn mySample
*
* Generating a VCF of all the variants that are mendelian violations:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T SelectVariants \
*   --variant input.vcf \
*   -ped family.ped \
*   -mvq 50 \
*   -o violations.vcf
*
* Creating a set with 50% of the total number of variants in the variant VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T SelectVariants \
*   --variant input.vcf \
*   -o output.vcf \
*   -fraction 0.5
*
* Select only indels from a VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T SelectVariants \
*   --variant input.vcf \
*   -o output.vcf \
*   -selectType INDEL
*
* Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T SelectVariants \
*   --variant input.vcf \
*   -o output.vcf \
*   -selectType SNP -selectType MNP \
*   -restrictAllelesTo MULTIALLELIC
*
* </pre>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
public class SelectVariants extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
    @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();

    /**
     * A site is considered discordant if there exists some sample in the variant track that has a non-reference genotype
     * and either the site isn't present in this track, the sample isn't present in this track,
     * or the sample is called reference in this track.
     */
    @Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this comparison track", required=false)
    protected RodBinding<VariantContext> discordanceTrack;

    /**
     * A site is considered concordant if (1) we are not looking for specific samples and there is a variant called
     * in both the variant and concordance tracks or (2) every sample present in the variant track is present in the
     * concordance track and they have the sample genotype call.
     */
    @Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false)
    protected RodBinding<VariantContext> concordanceTrack;

    @Output(doc="File to which variants should be written")
    protected VariantContextWriter vcfWriter = null;

    @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false)
    public Set<String> sampleNames = new HashSet<String>(0);

    @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false)
    public Set<String> sampleExpressions ;

    @Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false)
    public Set<File> sampleFiles;

    /**
     * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded.
     */
    @Argument(fullName="exclude_sample_name", shortName="xl_sn", doc="Exclude genotypes from this sample. Can be specified multiple times", required=false)
    public Set<String> XLsampleNames = new HashSet<String>(0);

    /**
     * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded.
     */
    @Input(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. Can be specified multiple times", required=false)
    public Set<File> XLsampleFiles = new HashSet<File>(0);

    /**
     * Note that these expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated.
     */
    @Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false)
    public ArrayList<String> SELECT_EXPRESSIONS = new ArrayList<String>();

    @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false)
    protected boolean EXCLUDE_NON_VARIANTS = false;

    @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
    protected boolean EXCLUDE_FILTERED = false;

    /**
     * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
     * For example, a multiallelic record such as:
     * 1    100 .   A   AAA,AAAAA
     * will be excluded if "-restrictAllelesTo BIALLELIC" is included, because there are two alternate alleles, whereas a record such as:
     * 1    100 .   A  T
     * will be included in that case, but would be excluded if "-restrictAllelesTo MULTIALLELIC
     */
    @Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false)
    private  NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL;

    @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values in the INFO field after selecting (using keys AC_Orig, AF_Orig, and AN_Orig)", required=false)
    private boolean KEEP_ORIGINAL_CHR_COUNTS = false;

    /**
     * This activates the mendelian violation module that will select all variants that correspond to a mendelian violation following the rules given by the family structure.
     */
    @Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only", required=false)
    private Boolean MENDELIAN_VIOLATIONS = false;

    @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
    protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;

    /**
     * This routine is based on probability, so the final result is not guaranteed to carry the exact fraction.  Can be used for large fractions.
     */
    @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
    protected double fractionRandom = 0;

    @Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall", required=false)
    protected double fractionGenotypes = 0;

    /**
     * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
     * When specified one or more times, a particular type of variant is selected.
     *
     */
    @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false)
    private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<VariantContext.Type>();

    /**
     * If provided, we will only include variants whose ID field is present in this list of ids.  The matching
     * is exact string matching.  The file format is just one ID per line
     *
     */
    @Argument(fullName="keepIDs", shortName="IDs", doc="Only emit sites whose ID is found in this file (one ID per line)", required=false)
    private File rsIDFile = null;


    @Hidden
    @Argument(fullName="fullyDecode", doc="If true, the incoming VariantContext will be fully decoded", required=false)
    private boolean fullyDecode = false;

    @Hidden
    @Argument(fullName="forceGenotypesDecode", doc="If true, the incoming VariantContext will have its genotypes forcibly decoded by computing AC across all genotypes.  For efficiency testing only", required=false)
    private boolean forceGenotypesDecode = false;

    @Hidden
    @Argument(fullName="justRead", doc="If true, we won't actually write the output file.  For efficiency testing only", required=false)
    private boolean justRead = false;

    @Argument(doc="indel size select",required=false,fullName="maxIndelSize")
    private int maxIndelSize = Integer.MAX_VALUE;

    @Argument(doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.",required=false,fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES")
    private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;


    public enum NumberAlleleRestriction {
        ALL,
        BIALLELIC,
        MULTIALLELIC
    }

    private ArrayList<VariantContext.Type> selectedTypes = new ArrayList<VariantContext.Type>();
    private ArrayList<String> selectNames = new ArrayList<String>();
    private List<VariantContextUtils.JexlVCMatchExp> jexls = null;

    private TreeSet<String> samples = new TreeSet<String>();
    private boolean NO_SAMPLES_SPECIFIED = false;

    private boolean DISCORDANCE_ONLY = false;
    private boolean CONCORDANCE_ONLY = false;

    private MendelianViolation mv;


    /* variables used by the SELECT RANDOM modules */
    private boolean SELECT_RANDOM_FRACTION = false;

    //Random number generator for the genotypes to remove
    private Random randomGenotypes = new Random();

    private Set<String> IDsToKeep = null;
    private Map<String, VCFHeader> vcfRods;

    /**
     * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
     */
    public void initialize() {
        // Get list of samples to include in the output
        List<String> rodNames = Arrays.asList(variantCollection.variants.getName());

        vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
        TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));

        Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
        Collection<String> samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions);

        // first, check overlap between requested and present samples
        Set<String> commandLineUniqueSamples = new HashSet<String>(samplesFromFile.size()+samplesFromExpressions.size()+sampleNames.size());
        commandLineUniqueSamples.addAll(samplesFromFile);
        commandLineUniqueSamples.addAll(samplesFromExpressions);
        commandLineUniqueSamples.addAll(sampleNames);
        commandLineUniqueSamples.removeAll(vcfSamples);

        // second, add the requested samples
        samples.addAll(sampleNames);
        samples.addAll(samplesFromExpressions);
        samples.addAll(samplesFromFile);

        logger.debug(Utils.join(",",commandLineUniqueSamples));

        if ( commandLineUniqueSamples.size() > 0 && ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES ) {
            logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored.");
            samples.removeAll(commandLineUniqueSamples);
        } else if (commandLineUniqueSamples.size() > 0 ) {
            throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s",
                    "Samples entered on command line (through -sf or -sn) that are not present in the VCF.",
                    "A list of these samples:",
                    Utils.join(",",commandLineUniqueSamples),
                    "To ignore these samples, run with --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES"));
        }


        // if none were requested, we want all of them
        if ( samples.isEmpty() ) {
            samples.addAll(vcfSamples);
            NO_SAMPLES_SPECIFIED = true;
        }

        // now, exclude any requested samples
        final Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
        samples.removeAll(XLsamplesFromFile);
        samples.removeAll(XLsampleNames);
        NO_SAMPLES_SPECIFIED = NO_SAMPLES_SPECIFIED && XLsampleNames.isEmpty() && XLsamplesFromFile.isEmpty();

        if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED )
            throw new UserException("All samples requested to be included were also requested to be excluded.");

        if ( ! NO_SAMPLES_SPECIFIED )
            for ( String sample : samples )
            logger.info("Including sample '" + sample + "'");

        // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
        if (TYPES_TO_INCLUDE.isEmpty()) {

            for (VariantContext.Type t : VariantContext.Type.values())
                selectedTypes.add(t);

        }
        else {
            for (VariantContext.Type t : TYPES_TO_INCLUDE)
                selectedTypes.add(t);

        }
        // Initialize VCF header
        Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
        headerLines.add(new VCFHeaderLine("source", "SelectVariants"));

        if (KEEP_ORIGINAL_CHR_COUNTS) {
            headerLines.add(new VCFInfoHeaderLine("AC_Orig", VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Original AC"));
            headerLines.add(new VCFInfoHeaderLine("AF_Orig", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Original AF"));
            headerLines.add(new VCFInfoHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN"));
        }
        headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions));
        headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY));

        for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) {
            // It's not necessary that the user supply select names for the JEXL expressions, since those
            // expressions will only be needed for omitting records.  Make up the select names here.
            selectNames.add(String.format("select-%d", i));
        }

        jexls = VariantContextUtils.initializeMatchExps(selectNames, SELECT_EXPRESSIONS);

        // Look at the parameters to decide which analysis to perform
        DISCORDANCE_ONLY = discordanceTrack.isBound();
        if (DISCORDANCE_ONLY) logger.info("Selecting only variants discordant with the track: " + discordanceTrack.getName());

        CONCORDANCE_ONLY = concordanceTrack.isBound();
        if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName());

        if (MENDELIAN_VIOLATIONS) {
            mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true);
        }

        SELECT_RANDOM_FRACTION = fractionRandom > 0;
        if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");

        /** load in the IDs file to a hashset for matching */
        if ( rsIDFile != null ) {
            IDsToKeep = new HashSet<String>();
            try {
                for ( final String line : new XReadLines(rsIDFile).readLines() ) {
                    IDsToKeep.add(line.trim());
                }
                logger.info("Selecting only variants with one of " + IDsToKeep.size() + " IDs from " + rsIDFile);
            } catch ( FileNotFoundException e ) {
                throw new UserException.CouldNotReadInputFile(rsIDFile, e);
            }
        }

        vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
    }

    /**
     * Subset VC record if necessary and emit the modified record (provided it satisfies criteria for printing)
     *
     * @param  tracker   the ROD tracker
     * @param  ref       reference information
     * @param  context   alignment info
     * @return 1 if the record was printed to the output file, 0 if otherwise
     */
    @Override
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null )
            return 0;

        Collection<VariantContext> vcs = tracker.getValues(variantCollection.variants, context.getLocation());

        if ( vcs == null || vcs.size() == 0) {
            return 0;
        }

        for (VariantContext vc : vcs) {
            // an option for performance testing only
            if ( fullyDecode )
                vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing() );

            // an option for performance testing only
            if ( forceGenotypesDecode ) {
                final int x = vc.getCalledChrCount();
                //logger.info("forceGenotypesDecode with getCalledChrCount() = " + );
            }

            if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) )
                continue;

            if (MENDELIAN_VIOLATIONS && mv.countViolations(this.getSampleDB().getFamilies(samples),vc) < 1)
                break;

            if (DISCORDANCE_ONLY) {
                Collection<VariantContext> compVCs = tracker.getValues(discordanceTrack, context.getLocation());
                if (!isDiscordant(vc, compVCs))
                    continue;
            }
            if (CONCORDANCE_ONLY) {
                Collection<VariantContext> compVCs = tracker.getValues(concordanceTrack, context.getLocation());
                if (!isConcordant(vc, compVCs))
                    continue;
            }

            if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic())
                continue;

            if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic())
                continue;

            if (!selectedTypes.contains(vc.getType()))
                continue;

            if ( containsIndelLargerThan(vc, maxIndelSize) )
                continue;

            VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS);

            if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) {
                boolean failedJexlMatch = false;
                try {
                    for (VariantContextUtils.JexlVCMatchExp jexl : jexls) {
                        if (!VariantContextUtils.match(sub, jexl)) {
                            failedJexlMatch = true;
                            break;
                        }
                    }
                } catch (IllegalArgumentException e) {
                    /*The IAE thrown by htsjdk already includes an informative error message ("Invalid JEXL
                      expression detected...")*/
                    throw new UserException(e.getMessage());
                }
                if ( !failedJexlMatch &&
                        !justRead &&
                        ( !SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom ) ) {
                    vcfWriter.add(sub);
                }
            }
        }

        return 1;
    }

    /*
     * Determines if any of the alternate alleles are greater than the max indel size
     *
     * @param vc            the variant context to check
     * @param maxIndelSize  the maximum size of allowed indels
     * @return true if the VC contains an indel larger than maxIndelSize and false otherwise
     */
    protected static boolean containsIndelLargerThan(final VariantContext vc, final int maxIndelSize) {
        final List<Integer> lengths = vc.getIndelLengths();
        if ( lengths == null )
            return false;

        for ( Integer indelLength : lengths ) {
            if ( Math.abs(indelLength) > maxIndelSize )
                return true;
        }

        return false;
    }

    /**
     * Checks if vc has a variant call for (at least one of) the samples.
     * @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap)
     * @param compVCs the comparison VariantContext (discordance
     * @return true if is discordant
     */
    private boolean isDiscordant (VariantContext vc, Collection<VariantContext> compVCs) {
        if (vc == null)
            return false;

        // if we're not looking at specific samples then the absence of a compVC means discordance
        if (NO_SAMPLES_SPECIFIED)
            return (compVCs == null || compVCs.isEmpty());

        // check if we find it in the variant rod
        GenotypesContext genotypes = vc.getGenotypes(samples);
        for (final Genotype g : genotypes) {
            if (sampleHasVariant(g)) {
                // There is a variant called (or filtered with not exclude filtered option set) that is not HomRef for at least one of the samples.
                if (compVCs == null)
                    return true;
                // Look for this sample in the all vcs of the comp ROD track.
                boolean foundVariant = false;
                for (VariantContext compVC : compVCs) {
                    if (haveSameGenotypes(g, compVC.getGenotype(g.getSampleName()))) {
                        foundVariant = true;
                        break;
                    }
                }
                // if (at least one sample) was not found in all VCs of the comp ROD, we have discordance
                if (!foundVariant)
                    return true;
            }
        }
        return false; // we only get here if all samples have a variant in the comp rod.
    }

    private boolean isConcordant (VariantContext vc, Collection<VariantContext> compVCs) {
        if (vc == null || compVCs == null || compVCs.isEmpty())
            return false;

        // if we're not looking for specific samples then the fact that we have both VCs is enough to call it concordant.
        if (NO_SAMPLES_SPECIFIED)
            return true;

        // make a list of all samples contained in this variant VC that are being tracked by the user command line arguments.
        Set<String> variantSamples = vc.getSampleNames();
        variantSamples.retainAll(samples);

        // check if we can find all samples from the variant rod in the comp rod.
        for (String sample : variantSamples) {
            boolean foundSample = false;
            for (VariantContext compVC : compVCs) {
                Genotype varG = vc.getGenotype(sample);
                Genotype compG = compVC.getGenotype(sample);
                if (haveSameGenotypes(varG, compG)) {
                    foundSample = true;
                    break;
                }
            }
            // if at least one sample doesn't have the same genotype, we don't have concordance
            if (!foundSample) {
                return false;
            }
        }
        return true;
    }

    private boolean sampleHasVariant(Genotype g) {
        return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !EXCLUDE_FILTERED)));
    }

    private boolean haveSameGenotypes(final Genotype g1, final Genotype g2) {
        if ( g1 == null || g2 == null )
            return false;

        if ((g1.isCalled() && g2.isFiltered()) ||
                (g2.isCalled() && g1.isFiltered()) ||
                (g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED))
            return false;

        List<Allele> a1s = g1.getAlleles();
        List<Allele> a2s = g2.getAlleles();
        return (a1s.containsAll(a2s) && a2s.containsAll(a1s));
    }
    @Override
    public Integer reduceInit() { return 0; }

    @Override
    public Integer reduce(Integer value, Integer sum) { return value + sum; }

    @Override
    public Integer treeReduce(Integer lhs, Integer rhs) {
        return lhs + rhs;
    }

    public void onTraversalDone(Integer result) {
        logger.info(result + " records processed.");
    }



    /**
     * Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF).
     *
     * @param vc       the VariantContext record to subset
     * @param excludeNonVariants should we exclude sites that have AC=0 for any alternate alleles?
     * @return the subsetted VariantContext
     */
    private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants) {
        if ( NO_SAMPLES_SPECIFIED || samples.isEmpty() )
            return vc;

        final VariantContext sub = vc.subContextFromSamples(samples, excludeNonVariants); // strip out the alternate alleles that aren't being used

        final VariantContextBuilder builder = new VariantContextBuilder(sub);

        // if there are fewer alternate alleles now in the selected VC, we need to fix the PL and AD values
        GenotypesContext newGC = GATKVariantContextUtils.updatePLsAndAD(sub, vc);

        // if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags
        if ( vc.getNSamples() != sub.getNSamples() ) {
            builder.rmAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY);
            builder.rmAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY);
        }

        // Remove a fraction of the genotypes if needed
        if ( fractionGenotypes > 0 ){
            final ArrayList<Genotype> genotypes = new ArrayList<>();
            for ( Genotype genotype : newGC ) {
                //Set genotype to no call if it falls in the fraction.
                if(fractionGenotypes>0 && randomGenotypes.nextDouble()<fractionGenotypes){
                    final List<Allele> alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
                    genotypes.add(new GenotypeBuilder(genotype).alleles(alleles).noGQ().make());
                }
                else{
                    genotypes.add(genotype);
                }
            }
            newGC = GenotypesContext.create(genotypes);
        }

        builder.genotypes(newGC);

        addAnnotations(builder, vc, sub.getSampleNames());

        return builder.make();
    }

    /*
     * Add annotations to the new VC
     *
     * @param builder     the new VC to annotate
     * @param originalVC  the original VC
     * @param selectedSampleNames the post-selection list of sample names
     */
    private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC, final Set<String> selectedSampleNames) {
        if ( fullyDecode ) return; // TODO -- annotations are broken with fully decoded data

        if ( KEEP_ORIGINAL_CHR_COUNTS ) {
            final int[] indexOfOriginalAlleleForNewAllele;
            final List<Allele> newAlleles = builder.getAlleles();
            final int numOriginalAlleles = originalVC.getNAlleles();

            // if the alleles already match up, we can just copy the previous list of counts
            if ( numOriginalAlleles == newAlleles.size() ) {
                indexOfOriginalAlleleForNewAllele = null;
            }
            // otherwise we need to parse them and select out the correct ones
            else {
                indexOfOriginalAlleleForNewAllele = new int[newAlleles.size() - 1];
                Arrays.fill(indexOfOriginalAlleleForNewAllele, -1);

                // note that we don't care about the reference allele at position 0
                for ( int newI = 1; newI < newAlleles.size(); newI++ ) {
                    final Allele newAlt = newAlleles.get(newI);
                    for ( int oldI = 0; oldI < numOriginalAlleles - 1; oldI++ ) {
                        if ( newAlt.equals(originalVC.getAlternateAllele(oldI), false) ) {
                            indexOfOriginalAlleleForNewAllele[newI - 1] = oldI;
                            break;
                        }
                    }
                }
            }

            if ( originalVC.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) )
                builder.attribute("AC_Orig", getReorderedAttributes(originalVC.getAttribute(VCFConstants.ALLELE_COUNT_KEY), indexOfOriginalAlleleForNewAllele));
            if ( originalVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) )
                builder.attribute("AF_Orig", getReorderedAttributes(originalVC.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY), indexOfOriginalAlleleForNewAllele));
            if ( originalVC.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) )
                builder.attribute("AN_Orig", originalVC.getAttribute(VCFConstants.ALLELE_NUMBER_KEY));
        }

        VariantContextUtils.calculateChromosomeCounts(builder, false);

        boolean sawDP = false;
        int depth = 0;
        for ( final String sample : selectedSampleNames ) {
            Genotype g = originalVC.getGenotype(sample);

            if ( ! g.isFiltered() ) {
                if ( g.hasDP() ) {
                    depth += g.getDP();
                    sawDP = true;
                }
            }
        }

        if ( sawDP )
            builder.attribute("DP", depth);
    }

    /**
     * Pulls out the appropriate tokens from the old ordering of an attribute to the new ordering
     *
     * @param attribute               the non-null attribute (from the INFO field)
     * @param oldToNewIndexOrdering   the mapping from new to old ordering
     * @return non-null Object attribute
     */
    private Object getReorderedAttributes(final Object attribute, final int[] oldToNewIndexOrdering) {
        // if the ordering is the same, then just use the original attribute
        if ( oldToNewIndexOrdering == null )
            return attribute;

        // break the original attributes into separate tokens; unfortunately, this means being smart about class types
        final Object[] tokens;
        if ( attribute.getClass().isArray() )
            tokens = (Object[])attribute;
        else if ( List.class.isAssignableFrom(attribute.getClass()) )
            tokens = ((List)attribute).toArray();
        else
            tokens = attribute.toString().split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR);

        final List<Object> result = new ArrayList<>();
        for ( final int index : oldToNewIndexOrdering ) {
            if ( index >= tokens.length )
                throw new IllegalArgumentException("the old attribute has an incorrect number of elements: " + attribute);
            result.add(tokens[index]);
        }
        return result;
    }
}
TOP

Related Classes of org.broadinstitute.gatk.tools.walkers.variantutils.SelectVariants

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.