Source Code of org.broadinstitute.gatk.tools.walkers.variantutils.CombineVariants

/*
* Copyright (c) 2012 The Broad Institute
* 
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
* 
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* 
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/


package org.broadinstitute.gatk.tools.walkers.variantutils;


import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.walkers.Reference;
import org.broadinstitute.gatk.engine.walkers.RodWalker;
import org.broadinstitute.gatk.engine.walkers.TreeReducible;
import org.broadinstitute.gatk.engine.walkers.Window;
import org.broadinstitute.gatk.tools.walkers.annotator.ChromosomeCountConstants;
import org.broadinstitute.gatk.utils.SampleUtils;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFUtils;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.vcf.*;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.VariantContextUtils;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;


import java.util.*;


/**
 * Combines VCF records from different sources.
 *
 * <p>
 * CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data
 * and any number of sources can be input. This tool currently supports two different combination types for each of
 * variants (the first 8 fields of the VCF) and genotypes (the rest).
 * Merge: combines multiple records into a single one; if sample names overlap then they are uniquified.
 * Union: assumes each rod represents the same set of samples (although this is not enforced); using the
 * priority list (if provided), it emits a single record instance at every position represented in the rods.
 *
 * CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD
 * bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect,
 * CombineVariants always produces a union of the input VCFs.  However, any part of the Venn of the N merged VCFs
 * can be exacted using JEXL expressions on the set attribute using SelectVariants.  If you want to extract just
 * the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single
 * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
 * in the detailed example in the documentation guide.
 *
 * Note that CombineVariants supports multi-threaded parallelism (8/15/12).  This is particularly useful
 * when converting from VCF to BCF2, which can be expensive.  In this case each thread spends CPU time
 * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
 * efficiency.  However, since this merge runs in only one thread, you can quickly reach diminishing
 * returns with the number of parallel threads.  -nt 4 works well but -nt 8 may be too much.
 *
 * Some fine details about the merging algorithm:
 *   <ul>
 *   <li> As of GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of
 *      the first VCF record with a non-MISSING QUAL value.  The previous behavior was to take the
 *      max QUAL, which resulted in sometime strange downstream confusion</li>
 *   </ul>
 *
 * <h3>Input</h3>
 * <p>
 * One or more variant sets to combine.
 * </p>
 *
 * <h3>Output</h3>
 * <p>
 * A combined VCF.
 * </p>
 *
 * <h3>Examples</h3>
 * <pre>
 * java -Xmx2g -jar GenomeAnalysisTK.jar \
 *   -R ref.fasta \
 *   -T CombineVariants \
 *   --variant input1.vcf \
 *   --variant input2.vcf \
 *   -o output.vcf \
 *   -genotypeMergeOptions UNIQUIFY
 *
 * java -Xmx2g -jar GenomeAnalysisTK.jar \
 *   -R ref.fasta \
 *   -T CombineVariants \
 *   --variant:foo input1.vcf \
 *   --variant:bar input2.vcf \
 *   -o output.vcf \
 *   -genotypeMergeOptions PRIORITIZE
 *   -priority foo,bar
 * </pre>
 *
 */
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=-50,stop=50))
public class CombineVariants extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
    /**
     * The VCF files to merge together
     *
     * variants can take any number of arguments on the command line.  Each -V argument
     * will be included in the final merged output VCF.  If no explicit name is provided,
     * the -V arguments will be named using the default algorithm: variants, variants2, variants3, etc.
     * The user can override this by providing an explicit name -V:name,vcf for each -V argument,
     * and each named argument will be labeled as such in the output (i.e., set=name rather than
     * set=variants2).  The order of arguments does not matter unless except for the naming, so
     * if you provide an rod priority list and no explicit names than variants, variants2, etc
     * are technically order dependent.  It is strongly recommended to provide explicit names when
     * a rod priority list is provided.
     */
    @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
    public List<RodBindingCollection<VariantContext>> variantCollections;
    final private List<RodBinding<VariantContext>> variants = new ArrayList<>();


    @Output(doc="File to which variants should be written")
    protected VariantContextWriter vcfWriter = null;


    @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false)
    public GATKVariantContextUtils.GenotypeMergeType genotypeMergeOption = null;


    @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false)
    public GATKVariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED;


    @Hidden
    @Argument(shortName="multipleAllelesMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different allele types (for example, SNP vs. indel)", required=false)
    public GATKVariantContextUtils.MultipleAllelesMergeType multipleAllelesMergeType = GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE;


    /**
     * Used when taking the union of variants that contain genotypes.  A complete priority list MUST be provided.
     */
    @Argument(fullName="rod_priority_list", shortName="priority", doc="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted", required=false)
    public String PRIORITY_STRING = null;


    @Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false)
    public boolean printComplexMerges = false;


    @Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF", required=false)
    public boolean filteredAreUncalled = false;


    /**
     * Used to generate a sites-only file.
     */
    @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false)
    public boolean minimalVCF = false;


    @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the combining procedure", required=false)
    public boolean EXCLUDE_NON_VARIANTS = false;


    /**
     * Set to 'null' if you don't want the set field emitted.
     */
    @Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false)
    public String SET_KEY = "set";


    /**
     * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime.
     */
    @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false)
    public boolean ASSUME_IDENTICAL_SAMPLES = false;


    @Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false)
    public int minimumN = 1;


    /**
     * This option allows the suppression of the command line in the VCF header. This is most often usefully when combining variants for dozens or hundreds of smaller VCFs.
     */
    @Argument(fullName="suppressCommandLineHeader", shortName="suppressCommandLineHeader", doc="If true, do not output the header containing the command line used", required=false)
    public boolean SUPPRESS_COMMAND_LINE_HEADER = false;


    @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false)
    public boolean MERGE_INFO_WITH_MAX_AC = false;


    private List<String> priority = null;


    /** Optimization to strip out genotypes before merging if we are doing a sites_only output */
    private boolean sitesOnlyVCF = false;
    private Set<String> samples;


    public void initialize() {
        Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit());


        if ( vcfWriter instanceof VariantContextWriterStub) {
            sitesOnlyVCF = ((VariantContextWriterStub)vcfWriter).getWriterOptions().contains(Options.DO_NOT_WRITE_GENOTYPES);
            if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance");
        } else
            logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option");


        validateAnnotateUnionArguments();


        final boolean sampleNamesAreUnique = SampleUtils.verifyUniqueSamplesNames(vcfRods);


        if (genotypeMergeOption == null) {
            if (!sampleNamesAreUnique)
                throw new UserException("Duplicate sample names were discovered but no genotypemergeoption was supplied. " +
                    "To combine samples without merging specify --genotypemergeoption UNIQUIFY. Merging duplicate samples " +
                    "without specified priority is unsupported, but can be achieved by specifying --genotypemergeoption UNSORTED.");
            else
                genotypeMergeOption = GATKVariantContextUtils.GenotypeMergeType.UNSORTED;
        }


        if ( PRIORITY_STRING == null && genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE) {
            //PRIORITY_STRING = Utils.join(",", vcfRods.keySet());  Deleted by Ami (7/10/12)
            logger.info("Priority string is not provided, using arbitrary genotyping order: "+priority);
        }


        if (genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE &&
                !sampleNamesAreUnique)
            throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered.");


        samples = sitesOnlyVCF ? Collections.<String>emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption);


        if ( SET_KEY.toLowerCase().equals("null") )
            SET_KEY = null;


        Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
        if ( SET_KEY != null )
            headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants"));
        if ( !ASSUME_IDENTICAL_SAMPLES )
             headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions));
        VCFHeader vcfHeader = new VCFHeader(headerLines, samples);
        vcfHeader.setWriteCommandLine(!SUPPRESS_COMMAND_LINE_HEADER);
        vcfWriter.writeHeader(vcfHeader);


        // collect the actual rod bindings into a list for use later
        for ( final RodBindingCollection<VariantContext> variantCollection : variantCollections )
            variants.addAll(variantCollection.getRodBindings());
    }


    private void validateAnnotateUnionArguments() {
        Set<String> rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null);


        if ( genotypeMergeOption == GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null )
            throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes");


        if ( PRIORITY_STRING != null){
            priority = new ArrayList<>(Arrays.asList(PRIORITY_STRING.split(",")));
            if ( rodNames.size() != priority.size() )
                throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority);


            if ( ! rodNames.containsAll(priority) )
                throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING);
        }


    }


    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null ) // RodWalkers can make funky map calls
            return 0;


        final Set<String> rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null);
        // get all of the vcf rods at this locus
        // Need to provide reference bases to simpleMerge starting at current locus
        Collection<VariantContext> vcs = tracker.getValues(variants, context.getLocation());


        if ( sitesOnlyVCF ) {
            vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs);
        }


        if ( ASSUME_IDENTICAL_SAMPLES ) {
            for ( final VariantContext vc : vcs ) {
                vcfWriter.add(vc);
            }


            return vcs.isEmpty() ? 0 : 1;
        }


        int numFilteredRecords = 0;
        for (final VariantContext vc : vcs) {
            if (vc.filtersWereApplied() && vc.isFiltered())
                numFilteredRecords++;
        }


        if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN))
            return 0;


        final List<VariantContext> mergedVCs = new ArrayList<>();


        if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE) {
            final Map<VariantContext.Type, List<VariantContext>> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs);


            // TODO -- clean this up in a refactoring
            // merge NO_VARIATION into another type of variant (based on the ordering in VariantContext.Type)
            if ( VCsByType.containsKey(VariantContext.Type.NO_VARIATION) && VCsByType.size() > 1 ) {
                final List<VariantContext> refs = VCsByType.remove(VariantContext.Type.NO_VARIATION);
                for ( final VariantContext.Type type : VariantContext.Type.values() ) {
                    if ( VCsByType.containsKey(type) ) {
                        VCsByType.get(type).addAll(refs);
                        break;
                    }
                }
            }


            // iterate over the types so that it's deterministic
            for (final VariantContext.Type type : VariantContext.Type.values()) {
                // make sure that it is a variant or in case it is not, that we want to include the sites with no variants
                if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) {
                    if (VCsByType.containsKey(type)) {
                        mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), priority, rodNames.size(),
                                filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
                                SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
                    }
                }
            }
        }
        else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) {
            mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, priority, rodNames.size(), filteredRecordsMergeType,
                    genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
        }
        else {
            logger.warn("Ignoring all records at site " + ref.getLocus());
        }


        for ( final VariantContext mergedVC : mergedVCs ) {
            // only operate at the start of events
            if ( mergedVC == null )
                continue;


            if ( mergedVC.hasAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) )
                throw new UserException("CombineVariants should not be used to merge gVCFs produced by the HaplotypeCaller; use CombineGVCFs instead");


            final VariantContextBuilder builder = new VariantContextBuilder(mergedVC);
            // re-compute chromosome counts
            VariantContextUtils.calculateChromosomeCounts(builder, false);


            if ( minimalVCF )
                GATKVariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY));
            final VariantContext vc = builder.make();
            if( !EXCLUDE_NON_VARIANTS || vc.isPolymorphicInSamples() )
                vcfWriter.add(builder.make());
        }


        return vcs.isEmpty() ? 0 : 1;
    }


    public Integer reduceInit() {
        return 0;
    }


    public Integer reduce(Integer counter, Integer sum) {
        return counter + sum;
    }


    @Override
    public Integer treeReduce(Integer lhs, Integer rhs) {
        return reduce(lhs, rhs);
    }


    public void onTraversalDone(Integer sum) {}
}
Source Code of org.broadinstitute.gatk.tools.walkers.variantutils.CombineVariants

Related Classes of org.broadinstitute.gatk.tools.walkers.variantutils.CombineVariants