Package org.broadinstitute.gatk.tools.walkers.beagle

Source Code of org.broadinstitute.gatk.tools.walkers.beagle.ProduceBeagleInput$CachingFormatter

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.tools.walkers.beagle;

import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.samples.Gender;
import org.broadinstitute.gatk.engine.walkers.RodWalker;
import org.broadinstitute.gatk.tools.walkers.variantrecalibration.VQSRCalibrationCurve;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.SampleUtils;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFUtils;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.vcf.VCFFilterHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;

import java.io.File;
import java.io.PrintStream;
import java.util.*;

/**
*  Converts the input VCF into a format accepted by the Beagle imputation/analysis program.
* <p>
*
* <h3>Input</h3>
* <p>
* A VCF with variants to convert to Beagle format
* </p>
*
* <h2>Outputs</h2>
* <p>
* A single text file which can be fed to Beagle
* </p>
* <p>
* Optional: A file with a list of markers
* </p>
  *
* <h3>Examples</h3>
* <pre>
*     java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
*      -R reffile.fasta -T ProduceBeagleInput \
*      -V path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
* </pre>
*
*/

@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} )
public class ProduceBeagleInput extends RodWalker<Integer, Integer> {

    @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();

    @Hidden
    @Input(fullName="validation", shortName = "validation", doc="Validation VCF file", required=false)
    public RodBinding<VariantContext> validation;


    @Output(doc="File to which BEAGLE input should be written")
    protected PrintStream  beagleWriter = null;

    @Hidden
    @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false, defaultToStdout = false)
    protected PrintStream  markers = null;
    int markerCounter = 1;

    @Hidden
    @Input(doc="VQSqual calibration file", shortName = "cc", required=false)
    protected File VQSRCalibrationFile = null;
    protected VQSRCalibrationCurve VQSRCalibrator = null;

    @Hidden
    @Argument(doc="VQSqual key", shortName = "vqskey", required=false)
    protected String VQSLOD_KEY = "VQSqual";

    @Hidden
     @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
    public double insertedNoCallRate  = 0;
    @Hidden
     @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
    public double validationPrior = -1.0;
    @Hidden
     @Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
    public double bootstrap = 0.0;
    @Hidden
     @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
    VariantContextWriter bootstrapVCFOutput = null;

    /**
     * If sample gender is known, this flag should be set to true to ensure that Beagle treats male Chr X properly.
     */
    @Argument(fullName = "checkIsMaleOnChrX", shortName = "checkIsMaleOnChrX", doc = "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.", required = false)
    public boolean CHECK_IS_MALE_ON_CHR_X = false;

    @Hidden
    @Argument(fullName = "variant_genotype_ptrue", shortName = "varp", doc = "Flat probability prior to assign to variant (not validation) genotypes. Does not override GL field.", required = false)
    public double variantPrior = 0.96;

    private Set<String> samples = null;
    private Set<String> BOOTSTRAP_FILTER = new HashSet<String>( Arrays.asList("bootstrap") );
    private int bootstrapSetSize = 0;
    private int testSetSize = 0;
    private CachingFormatter formatter = new CachingFormatter("%5.4f ", 100000);
    private int certainFPs = 0;

    public void initialize() {

        samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(variantCollection.variants.getName()));

        beagleWriter.print("marker alleleA alleleB");
        for ( String sample : samples )
            beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));

        beagleWriter.println();

        if ( bootstrapVCFOutput != null ) {
            initializeVcfWriter();
        }

        if ( VQSRCalibrationFile != null ) {
            VQSRCalibrator = VQSRCalibrationCurve.readFromFile(VQSRCalibrationFile);
            logger.info("Read calibration curve");
            VQSRCalibrator.printInfo(logger);
        }
    }

    public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
        if( tracker != null ) {
            GenomeLoc loc = context.getLocation();
            VariantContext variant_eval = tracker.getFirstValue(variantCollection.variants, loc);
            VariantContext validation_eval = tracker.getFirstValue(validation, loc);

            if ( goodSite(variant_eval,validation_eval) ) {
                if ( useValidation(validation_eval, ref) ) {
                    writeBeagleOutput(validation_eval, variant_eval, true, validationPrior);
                    return 1;
                } else {
                    if ( goodSite(variant_eval) ) {
                        writeBeagleOutput(variant_eval,validation_eval,false,variantPrior);
                        return 1;
                    } else { // todo -- if the variant site is bad, validation is good, but not in bootstrap set -- what do?
                        return 0;
                    }
                }
            } else {
                return 0;
            }
        } else {
            return 0;
        }
    }

    public boolean goodSite(VariantContext a, VariantContext b) {
        return goodSite(a) || goodSite(b);
    }

    public boolean goodSite(VariantContext v) {
        if ( canBeOutputToBeagle(v) ) {
            if ( VQSRCalibrator != null && VQSRCalibrator.certainFalsePositive(VQSLOD_KEY, v) ) {
                certainFPs++;
                return false;
            } else {
                return true;
            }
        } else {
            return false;
        }
    }

    public static boolean canBeOutputToBeagle(VariantContext v) {
        return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
    }

    public boolean useValidation(VariantContext validation, ReferenceContext ref) {
        if( goodSite(validation) ) {
            // if using record keeps us below expected proportion, use it
            logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1));
            if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) {
                if ( bootstrapVCFOutput != null ) {
                    bootstrapVCFOutput.add(new VariantContextBuilder(validation).filters(BOOTSTRAP_FILTER).make());
                }
                bootstrapSetSize++;
                return true;
            } else {
                if ( bootstrapVCFOutput != null ) {
                    bootstrapVCFOutput.add(validation);
                }
                testSetSize++;
                return false;
            }
        } else {
            if ( validation != null && bootstrapVCFOutput != null ) {
                bootstrapVCFOutput.add(validation);
            }
            return false;
        }
    }

    private final static double[] HAPLOID_FLAT_LOG10_LIKELIHOODS = MathUtils.toLog10(new double[]{ 0.5, 0.0, 0.5 });
    private final static double[] DIPLOID_FLAT_LOG10_LIKELIHOODS = MathUtils.toLog10(new double[]{ 0.33, 0.33, 0.33 });

    public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
        GenomeLoc currentLoc = GATKVariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), preferredVC);
        StringBuffer beagleOut = new StringBuffer();

        String marker = String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart());
        beagleOut.append(marker);
        if ( markers != null ) markers.append(marker).append("\t").append(Integer.toString(markerCounter++)).append("\t");
        for ( Allele allele : preferredVC.getAlleles() ) {
            String bglPrintString;
            if (allele.isNoCall())
                bglPrintString = "-";
            else
                bglPrintString = allele.getBaseString()// get rid of * in case of reference allele

            beagleOut.append(String.format("%s ", bglPrintString));
            if ( markers != null ) markers.append(bglPrintString).append("\t");
        }
        if ( markers != null ) markers.append("\n");

        GenotypesContext preferredGenotypes = preferredVC.getGenotypes();
        GenotypesContext otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
        for ( String sample : samples ) {
            boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE;

            Genotype genotype;
            boolean isValidation;
            // use sample as key into genotypes structure
            if ( preferredGenotypes.containsSample(sample) ) {
                genotype = preferredGenotypes.get(sample);
                isValidation = isValidationSite;
            } else if ( otherGenotypes != null && otherGenotypes.containsSample(sample) ) {
                genotype = otherGenotypes.get(sample);
                isValidation = ! isValidationSite;
            } else {
                // there is magically no genotype for this sample.
                throw new GATKException("Sample "+sample+" arose with no genotype in variant or validation VCF. This should never happen.");
            }

            /*
             * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
             */
            double [] log10Likelihoods = null;
            if ( (isValidation && prior < 0.0) || genotype.hasLikelihoods() ) {
                log10Likelihoods = genotype.getLikelihoods().getAsVector();

                // see if we need to randomly mask out genotype in this position.
                if ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate ) {
                    // we are masking out this genotype
                    log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
                }

                if( isMaleOnChrX ) {
                    log10Likelihoods[1] = -255// todo -- warning this is dangerous for multi-allele case
                }
            }
            /**
             * otherwise, use the prior uniformly
             */
            else if (! isValidation && genotype.isCalled() && ! genotype.hasLikelihoods() ) {
                // hack to deal with input VCFs with no genotype likelihoods.  Just assume the called genotype
                // is confident.  This is useful for Hapmap and 1KG release VCFs.
                double AA = (1.0-prior)/2.0;
                double AB = (1.0-prior)/2.0;
                double BB = (1.0-prior)/2.0;

                if (genotype.isHomRef()) { AA = prior; }
                else if (genotype.isHet()) { AB = prior; }
                else if (genotype.isHomVar()) { BB = prior; }

                log10Likelihoods = MathUtils.toLog10(new double[]{ AA, isMaleOnChrX ? 0.0 : AB, BB });
            }
            else  {
                log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
            }

            writeSampleLikelihoods(beagleOut, preferredVC, log10Likelihoods);
        }

        beagleWriter.println(beagleOut.toString());
    }

    private void writeSampleLikelihoods( StringBuffer out, VariantContext vc, double[] log10Likelihoods ) {
        if ( VQSRCalibrator != null ) {
            log10Likelihoods = VQSRCalibrator.includeErrorRateInLikelihoods(VQSLOD_KEY, vc, log10Likelihoods);
        }

        double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
        // see if we need to randomly mask out genotype in this position.
        for (double likeVal: normalizedLikelihoods) {
            out.append(formatter.format(likeVal));
//            out.append(String.format("%5.4f ",likeVal));
        }
    }


    public Integer reduceInit() {
        return 0; // Nothing to do here
    }

    public Integer reduce( Integer value, Integer sum ) {
        return value + sum; // count up the sites
    }

    public void onTraversalDone( Integer includedSites ) {
        logger.info("Sites included in beagle likelihoods file             : " + includedSites);
        logger.info(String.format("Certain false positive found from recalibration curve : %d (%.2f%%)",
                certainFPs, (100.0 * certainFPs) / (Math.max(certainFPs + includedSites, 1))));
    }

    private void initializeVcfWriter() {
        final List<String> inputNames = Arrays.asList(validation.getName());

        // setup the header fields
        Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
        hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames));
        hInfo.add(new VCFFilterHeaderLine("bootstrap","This site used for genotype bootstrapping with ProduceBeagleInputWalker"));

        bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
    }

    public static class CachingFormatter {
        private String format;
        private LRUCache<Double, String> cache;

        public String getFormat() {
            return format;
        }

        public String format(double value) {
            String f = cache.get(value);
            if ( f == null ) {
                f = String.format(format, value);
                cache.put(value, f);
//                if ( cache.usedEntries() < maxCacheSize ) {
//                    System.out.printf("CACHE size %d%n", cache.usedEntries());
//                } else {
//                    System.out.printf("CACHE is full %f%n", value);
//                }
//            }
//            } else {
//                System.out.printf("CACHE hit %f%n", value);
//            }
            }

            return f;
        }

        public CachingFormatter(String format, int maxCacheSize) {
            this.format = format;
            this.cache = new LRUCache<Double, String>(maxCacheSize);
        }
    }

    /**
    * An LRU cache, based on <code>LinkedHashMap</code>.
    *
    * <p>
    * This cache has a fixed maximum number of elements (<code>cacheSize</code>).
    * If the cache is full and another entry is added, the LRU (least recently used) entry is dropped.
    *
    * <p>
    * This class is thread-safe. All methods of this class are synchronized.
    *
    * <p>
    * Author: Christian d'Heureuse, Inventec Informatik AG, Zurich, Switzerland<br>
    * Multi-licensed: EPL / LGPL / GPL / AL / BSD.
    */
    public static class LRUCache<K,V> {

    private static final float   hashTableLoadFactor = 0.75f;

    private LinkedHashMap<K,V>   map;
    private int                  cacheSize;

    /**
    * Creates a new LRU cache.
    * @param cacheSize the maximum number of entries that will be kept in this cache.
    */
    public LRUCache (int cacheSize) {
       this.cacheSize = cacheSize;
       int hashTableCapacity = (int)Math.ceil(cacheSize / hashTableLoadFactor) + 1;
       map = new LinkedHashMap<K,V>(hashTableCapacity, hashTableLoadFactor, true) {
          // (an anonymous inner class)
          private static final long serialVersionUID = 1;
          @Override protected boolean removeEldestEntry (Map.Entry<K,V> eldest) {
             return size() > LRUCache.this.cacheSize; }}; }

    /**
    * Retrieves an entry from the cache.<br>
    * The retrieved entry becomes the MRU (most recently used) entry.
    * @param key the key whose associated value is to be returned.
    * @return    the value associated to this key, or null if no value with this key exists in the cache.
    */
    public synchronized V get (K key) {
       return map.get(key); }

    /**
    * Adds an entry to this cache.
    * The new entry becomes the MRU (most recently used) entry.
    * If an entry with the specified key already exists in the cache, it is replaced by the new entry.
    * If the cache is full, the LRU (least recently used) entry is removed from the cache.
    * @param key    the key with which the specified value is to be associated.
    * @param value  a value to be associated with the specified key.
    */
    public synchronized void put (K key, V value) {
       map.put (key, value); }

    /**
    * Clears the cache.
    */
    public synchronized void clear() {
       map.clear(); }

    /**
    * Returns the number of used entries in the cache.
    * @return the number of entries currently in the cache.
    */
    public synchronized int usedEntries() {
       return map.size(); }

    /**
    * Returns a <code>Collection</code> that contains a copy of all cache entries.
    * @return a <code>Collection</code> with a copy of the cache content.
    */
    public synchronized Collection<Map.Entry<K,V>> getAll() {
       return new ArrayList<Map.Entry<K,V>>(map.entrySet()); }

    } // end class LRUCache
}
TOP

Related Classes of org.broadinstitute.gatk.tools.walkers.beagle.ProduceBeagleInput$CachingFormatter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.