Package org.broadinstitute.gatk.tools.walkers.beagle

Source Code of org.broadinstitute.gatk.tools.walkers.beagle.VariantsToBeagleUnphased

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.tools.walkers.beagle;

import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.Input;
import org.broadinstitute.gatk.utils.commandline.Output;
import org.broadinstitute.gatk.utils.commandline.RodBinding;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.walkers.RodWalker;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.SampleUtils;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.broadinstitute.gatk.utils.variant.GATKVCFUtils;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;

import java.io.PrintStream;
import java.util.Arrays;
import java.util.Set;

/**
* Produces an input file to Beagle imputation engine, listing unphased, hard-called genotypes for a single sample
* in input variant file.  Will additionally hold back a fraction of the sites for evaluation, marking the
* genotypes at that sites as missing, and writing the truth of these sites to a second VCF file
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} )
public class VariantsToBeagleUnphased extends RodWalker<Integer, Integer> {
    @Input(fullName="variants", shortName = "V", doc="Input VCF file", required=true)
    public RodBinding<VariantContext> variants;

    @Output(doc="File to which BEAGLE unphased genotypes should be written")
    protected PrintStream  beagleWriter = null;

    @Argument(fullName = "bootstrap_fraction", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
    public double bootstrap = 0.0;

    @Argument(fullName = "bootstrap_vcf",shortName = "bsvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
    VariantContextWriter bootstrapVCFOutput = null;

    @Argument(fullName = "missing", shortName = "missing", doc = "String to identify missing data in beagle output", required = false)
    public String MISSING = "?";

    private Set<String> samples = null;
    private int bootstrapSetSize = 0;
    private int testSetSize = 0;

    public void initialize() {
        samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(variants.getName()));

        beagleWriter.print("I marker alleleA alleleB");
        for ( String sample : samples )
            beagleWriter.print(String.format(" %s %s", sample, sample));

        beagleWriter.println();

        if ( bootstrap < 0.0 | bootstrap > 1.0 )
            throw new UserException.BadArgumentValue("bootstrap", "Bootstrap value must be fraction between 0 and 1");

        if ( bootstrapVCFOutput != null ) {
            Set<VCFHeaderLine> hInfo = GATKVCFUtils.getHeaderFields(getToolkit());
            bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit())));
        }
    }

    /**
     * Iterate over each site, emitting the BEAGLE unphased genotypes file format
     * @param tracker
     * @param ref
     * @param context
     * @return
     */
    public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
        if( tracker != null ) {
            GenomeLoc loc = context.getLocation();
            VariantContext vc = tracker.getFirstValue(variants, loc);

            if ( ProduceBeagleInput.canBeOutputToBeagle(vc) ) {
                // do we want to hold back this site?
                boolean makeMissing = dropSite(vc);

                // if we are holding it back and we are writing a bootstrap VCF, write it out
                if ( makeMissing && bootstrapVCFOutput != null ) {
                    bootstrapVCFOutput.add(vc);
                }

                // regardless, all sites are written to the unphased genotypes file, marked as missing if appropriate
                writeUnphasedBeagleOutput(vc, makeMissing);
            }
        }

        return 0;
    }

    /**
     * Do we want to hold back this site for bootstrap?  Considers the bootstrap fraction member variable
     *
     * @param vc
     * @return
     */
    public boolean dropSite(VariantContext vc) {
        if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) {
            bootstrapSetSize++;
            return true;
        } else {
            testSetSize++;
            return false;
        }
    }

    public void writeUnphasedBeagleOutput(VariantContext vc, boolean makeMissing) {
        GenomeLoc currentLoc = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc);
        StringBuffer beagleOut = new StringBuffer();

        String marker = String.format("%s:%d ",currentLoc.getContig(), currentLoc.getStart());
        beagleOut.append("M ").append(marker);

        // write out the alleles at this site
        for ( Allele allele : vc.getAlleles() ) {
            beagleOut.append(allele.isNoCall() ? "-" : allele.getBaseString()).append(" ");
        }

        // write out sample level genotypes
        for ( String sample : samples ) {
            Genotype genotype = vc.getGenotype(sample);
            if ( ! makeMissing && genotype.isCalled() ) {
                addAlleles(beagleOut, genotype);
            } else {
                addAlleles(beagleOut, MISSING, MISSING);
            }
        }

        beagleWriter.println(beagleOut.toString());
    }

    private void addAlleles(StringBuffer buf, Genotype g) {
        addAlleles(buf, g.getAllele(0).getBaseString(), g.getAllele(1).getBaseString());

    }

    private void addAlleles(StringBuffer buf, String a, String b) {
        buf.append(a).append(" ").append(b);
    }

    public Integer reduceInit() { return 0; }
    public Integer reduce( Integer value, Integer sum ) { return value + sum; }

    public void onTraversalDone( Integer includedSites ) {
        logger.info("Sites included in beagle genotypes file : " + includedSites);
    }
}
TOP

Related Classes of org.broadinstitute.gatk.tools.walkers.beagle.VariantsToBeagleUnphased

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.