Package org.broadinstitute.gatk.tools.walkers.variantutils

Source Code of org.broadinstitute.gatk.tools.walkers.variantutils.VariantsToVCF

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.tools.walkers.variantutils;

import htsjdk.samtools.util.CloseableIterator;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.refdata.VariantContextAdaptors;
import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature;
import org.broadinstitute.gatk.engine.walkers.Reference;
import org.broadinstitute.gatk.engine.walkers.RodWalker;
import org.broadinstitute.gatk.engine.walkers.Window;
import org.broadinstitute.gatk.tools.walkers.annotator.VariantOverlapAnnotator;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.SampleUtils;
import org.broadinstitute.gatk.utils.codecs.hapmap.RawHapMapFeature;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFUtils;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.vcf.*;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory;

import java.io.File;
import java.util.*;

/**
* Converts variants from other file formats to VCF format.
*
* <p>
* Note that there must be a Tribble feature/codec for the file format as well as an adaptor.
*
* <h3>Input</h3>
* <p>
* A variant file to filter.
* </p>
*
* <h3>Output</h3>
* <p>
* A VCF file.
* </p>
*
* <h3>Examples</h3>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T VariantsToVCF \
*   -o output.vcf \
*   --variant:RawHapMap input.hapmap \
*   --dbsnp dbsnp.vcf
* </pre>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=-40,stop=40))
public class VariantsToVCF extends RodWalker<Integer, Integer> {

    @Output(doc="File to which variants should be written")
    protected VariantContextWriter baseWriter = null;
    private VariantContextWriter vcfwriter; // needed because hapmap/dbsnp indel records move

    /**
     * Variants from this input file are used by this tool as input.
     */
    @Input(fullName="variant", shortName = "V", doc="Input variant file", required=true)
    public RodBinding<Feature> variants;

    @ArgumentCollection
    protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

    /**
     * This argument is used for data (like GELI) with genotypes but no sample names encoded within.
     */
    @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod", required=false)
    protected String sampleName = null;

    private Set<String> allowedGenotypeFormatStrings = new HashSet<String>();
    private boolean wroteHeader = false;
    private Set<String> samples;

    // for dealing with indels in hapmap
    CloseableIterator<GATKFeature> dbsnpIterator = null;
    VariantOverlapAnnotator variantOverlapAnnotator = null;

    public void initialize() {
        vcfwriter = VariantContextWriterFactory.sortOnTheFly(baseWriter, 40, false);
        variantOverlapAnnotator = new VariantOverlapAnnotator(dbsnp.dbsnp, getToolkit().getGenomeLocParser());
    }

    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) )
            return 0;

        Collection<VariantContext> contexts = getVariantContexts(tracker, ref);

        for ( VariantContext vc : contexts ) {
            VariantContextBuilder builder = new VariantContextBuilder(vc);

            // set the appropriate sample name if necessary
            if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(variants.getName()) ) {
                Genotype g = new GenotypeBuilder(vc.getGenotype(variants.getName())).name(sampleName).make();
                builder.genotypes(g);
            }

            final VariantContext withID = variantOverlapAnnotator.annotateRsID(tracker, builder.make());
            writeRecord(withID, tracker, ref.getLocus());
        }

        return 1;
    }

    private Collection<VariantContext> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref) {

        List<Feature> features = tracker.getValues(variants, ref.getLocus());
        List<VariantContext> VCs = new ArrayList<VariantContext>(features.size());

        for ( Feature record : features ) {
            if ( VariantContextAdaptors.canBeConvertedToVariantContext(record) ) {
                // we need to special case the HapMap format because indels aren't handled correctly
                if ( record instanceof RawHapMapFeature) {

                    // is it an indel?
                    RawHapMapFeature hapmap = (RawHapMapFeature)record;
                    if ( hapmap.getAlleles()[0].equals(RawHapMapFeature.NULL_ALLELE_STRING) || hapmap.getAlleles()[1].equals(RawHapMapFeature.NULL_ALLELE_STRING) ) {
                        // get the dbsnp object corresponding to this record (needed to help us distinguish between insertions and deletions)
                        VariantContext dbsnpVC = getDbsnp(hapmap.getName());
                        if ( dbsnpVC == null || dbsnpVC.isMixed() )
                            continue;

                        Map<String, Allele> alleleMap = new HashMap<String, Allele>(2);
                        alleleMap.put(RawHapMapFeature.DELETION, Allele.create(ref.getBase(), dbsnpVC.isSimpleInsertion()));
                        alleleMap.put(RawHapMapFeature.INSERTION, Allele.create((char)ref.getBase() + ((RawHapMapFeature)record).getAlleles()[1], !dbsnpVC.isSimpleInsertion()));
                        hapmap.setActualAlleles(alleleMap);

                        // also, use the correct positioning for insertions
                        hapmap.updatePosition(dbsnpVC.getStart());

                        if ( hapmap.getStart() < ref.getWindow().getStart() ) {
                            logger.warn("Hapmap record at " + ref.getLocus() + " represents an indel too large to be converted; skipping...");
                            continue;
                        }
                    }
                }

                // ok, we might actually be able to turn this record in a variant context
                VariantContext vc = VariantContextAdaptors.toVariantContext(variants.getName(), record, ref);

                if ( vc != null ) // sometimes the track has odd stuff in it that can't be converted
                    VCs.add(vc);
            }
        }

        return VCs;
    }

    private VariantContext getDbsnp(String rsID) {
        if ( dbsnpIterator == null ) {

            if ( dbsnp == null )
                throw new UserException.BadInput("No dbSNP rod was provided, but one is needed to decipher the correct indel alleles from the HapMap records");

            RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),
                                                          getToolkit().getGenomeLocParser(),
                                                          getToolkit().getArguments().unsafe,
                                                          getToolkit().getArguments().disableAutoIndexCreationAndLockingWhenReadingRods,
                                                          null);
            dbsnpIterator = builder.createInstanceOfTrack(VCFCodec.class, new File(dbsnp.dbsnp.getSource())).getIterator();
            // Note that we should really use some sort of seekable iterator here so that the search doesn't take forever
            // (but it's complicated because the hapmap location doesn't match the dbsnp location, so we don't know where to seek to)
        }

        while ( dbsnpIterator.hasNext() ) {
            GATKFeature feature = dbsnpIterator.next();
            VariantContext vc = (VariantContext)feature.getUnderlyingObject();
            if ( vc.getID().equals(rsID) )
                return vc;
        }

        return null;
    }

    private void writeRecord(VariantContext vc, RefMetaDataTracker tracker, GenomeLoc loc) {
        if ( !wroteHeader ) {
            wroteHeader = true;

            // setup the header fields
            Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
            hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variants.getName())));
            hInfo.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));

            allowedGenotypeFormatStrings.add(VCFConstants.GENOTYPE_KEY);
            for ( VCFHeaderLine field : hInfo ) {
                if ( field instanceof VCFFormatHeaderLine) {
                    allowedGenotypeFormatStrings.add(((VCFFormatHeaderLine)field).getID());
                }
            }

            samples = new LinkedHashSet<String>();
            if ( sampleName != null ) {
                samples.add(sampleName);
            } else {
                // try VCF first
                samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(variants.getName()));

                if ( samples.isEmpty() ) {
                    List<Feature> features = tracker.getValues(variants, loc);
                    if ( features.size() == 0 )
                        throw new IllegalStateException("No rod data is present, but we just created a VariantContext");

                    Feature f = features.get(0);
                    if ( f instanceof RawHapMapFeature )
                        samples.addAll(Arrays.asList(((RawHapMapFeature)f).getSampleIDs()));
                    else
                        samples.addAll(vc.getSampleNames());
                }
            }

            vcfwriter.writeHeader(new VCFHeader(hInfo, samples));
        }

        vc = GATKVariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings);
        vcfwriter.add(vc);
    }

    public Integer reduceInit() {
        return 0;
    }

    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    public void onTraversalDone(Integer sum) {
        if ( dbsnpIterator != null )
            dbsnpIterator.close();
        vcfwriter.close();
    }
}
TOP

Related Classes of org.broadinstitute.gatk.tools.walkers.variantutils.VariantsToVCF

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.