Package org.broadinstitute.gatk.utils.activeregion

Source Code of org.broadinstitute.gatk.utils.activeregion.BandPassActivityProfileUnitTest

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.utils.activeregion;


// the imports for unit testing.


import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.apache.commons.lang.ArrayUtils;
import htsjdk.tribble.readers.LineIterator;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.variant.GATKVCFUtils;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;


public class BandPassActivityProfileUnitTest extends BaseTest {
    private final static boolean DEBUG = false;
    private GenomeLocParser genomeLocParser;

    private final static int MAX_PROB_PROPAGATION_DISTANCE = 50;
    private final static double ACTIVE_PROB_THRESHOLD= 0.002;

    @BeforeClass
    public void init() throws FileNotFoundException {
        // sequence
        ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
        genomeLocParser = new GenomeLocParser(seq);
    }

    @DataProvider(name = "BandPassBasicTest")
    public Object[][] makeBandPassTest() {
        final List<Object[]> tests = new LinkedList<Object[]>();

        for ( int start : Arrays.asList(1, 10, 100, 1000) ) {
            for ( boolean precedingIsActive : Arrays.asList(true, false) ) {
                for ( int precedingSites: Arrays.asList(0, 1, 10, 100) ) {
                    for ( int bandPassSize : Arrays.asList(0, 1, 10, 100) ) {
                        for ( double sigma : Arrays.asList(1.0, 2.0, BandPassActivityProfile.DEFAULT_SIGMA) ) {
//        for ( int start : Arrays.asList(10) ) {
//            for ( boolean precedingIsActive : Arrays.asList(false) ) {
//                for ( int precedingSites: Arrays.asList(0) ) {
//                    for ( int bandPassSize : Arrays.asList(1) ) {
                            tests.add(new Object[]{ start, precedingIsActive, precedingSites, bandPassSize, sigma });
                        }
                    }
                }
            }
        }

        return tests.toArray(new Object[][]{});
    }

    @Test(enabled = ! DEBUG, dataProvider = "BandPassBasicTest")
    public void testBandPass(final int start, final boolean precedingIsActive, final int nPrecedingSites, final int bandPassSize, final double sigma) {
        final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, bandPassSize, sigma, false);

        final int expectedBandSize = bandPassSize * 2 + 1;
        Assert.assertEquals(profile.getFilteredSize(), bandPassSize, "Wrong filter size");
        Assert.assertEquals(profile.getSigma(), sigma, "Wrong sigma");
        Assert.assertEquals(profile.getBandSize(), expectedBandSize, "Wrong expected band size");

        final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName();
        final double precedingProb = precedingIsActive ? 1.0 : 0.0;
        for ( int i = 0; i < nPrecedingSites; i++ ) {
            final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start);
            final ActivityProfileState state = new ActivityProfileState(loc, precedingProb);
            profile.add(state);
        }

        final GenomeLoc nextLoc = genomeLocParser.createGenomeLoc(contig, nPrecedingSites + start);
        profile.add(new ActivityProfileState(nextLoc, 1.0));

        if ( precedingIsActive == false && nPrecedingSites >= bandPassSize && bandPassSize < start ) {
            // we have enough space that all probs fall on the genome
            final double[] probs = profile.getProbabilitiesAsArray();
            Assert.assertEquals(MathUtils.sum(probs), 1.0 * (nPrecedingSites * precedingProb + 1), 1e-3, "Activity profile doesn't sum to number of non-zero prob states");
        }
    }

    private double[] bandPassInOnePass(final BandPassActivityProfile profile, final double[] activeProbArray) {
        final double[] bandPassProbArray = new double[activeProbArray.length];

        // apply the band pass filter for activeProbArray into filteredProbArray
        final double[] GaussianKernel = profile.getKernel();
        for( int iii = 0; iii < activeProbArray.length; iii++ ) {
            final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(profile.getFilteredSize() - iii, 0), Math.min(GaussianKernel.length, profile.getFilteredSize() + activeProbArray.length - iii));
            final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - profile.getFilteredSize()), Math.min(activeProbArray.length,iii + profile.getFilteredSize() + 1));
            bandPassProbArray[iii] = dotProduct(activeProbSubArray, kernel);
        }

        return bandPassProbArray;
    }

    public static double dotProduct(double[] v1, double[] v2) {
        Assert.assertEquals(v1.length,v2.length,"Array lengths do not mach in dotProduct");
        double result = 0.0;
        for (int k = 0; k < v1.length; k++)
            result += v1[k] * v2[k];

        return result;
    }

    @DataProvider(name = "BandPassComposition")
    public Object[][] makeBandPassComposition() {
        final List<Object[]> tests = new LinkedList<Object[]>();

        for ( int bandPassSize : Arrays.asList(0, 1, 10, 100, BandPassActivityProfile.MAX_FILTER_SIZE) ) {
            for ( int integrationLength : Arrays.asList(1, 10, 100, 1000) ) {
                tests.add(new Object[]{ bandPassSize, integrationLength });
            }
        }

        return tests.toArray(new Object[][]{});
    }

    @Test( enabled = ! DEBUG, dataProvider = "BandPassComposition")
    public void testBandPassComposition(final int bandPassSize, final int integrationLength) {
        final int start = 1;
        final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE,
                ACTIVE_PROB_THRESHOLD, bandPassSize, BandPassActivityProfile.DEFAULT_SIGMA);
        final double[] rawActiveProbs = new double[integrationLength + bandPassSize * 2];

        // add a buffer so that we can get all of the band pass values
        final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName();
        int pos = start;
        int rawProbsOffset = 0;
        for ( int i = 0; i < bandPassSize; i++ ) {
            final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos++);
            final ActivityProfileState state = new ActivityProfileState(loc, 0.0);
            profile.add(state);
            rawActiveProbs[rawProbsOffset++] = 0.0;
            rawActiveProbs[rawActiveProbs.length - rawProbsOffset] = 0.0;
        }

        for ( int i = 0; i < integrationLength; i++ ) {
            final GenomeLoc nextLoc = genomeLocParser.createGenomeLoc(contig, pos++);
            profile.add(new ActivityProfileState(nextLoc, 1.0));
            rawActiveProbs[rawProbsOffset++] = 1.0;

            for ( int j = 0; j < profile.size(); j++ ) {
                Assert.assertTrue(profile.getStateList().get(j).isActiveProb >= 0.0, "State probability < 0 at " + j);
                Assert.assertTrue(profile.getStateList().get(j).isActiveProb <= 1.0 + 1e-3, "State probability > 1 at " + j);
            }
        }

        final double[] expectedProbs = bandPassInOnePass(profile, rawActiveProbs);
        for ( int j = 0; j < profile.size(); j++ ) {
            Assert.assertEquals(profile.getStateList().get(j).isActiveProb, expectedProbs[j], "State probability not expected at " + j);
        }
    }

    // ------------------------------------------------------------------------------------
    //
    // Code to test the creation of the kernels
    //
    // ------------------------------------------------------------------------------------

    /**

     kernel <- function(sd, pThres) {
     raw = dnorm(-80:81, mean=0, sd=sd)
     norm = raw / sum(raw)
     bad = norm < pThres
     paste(norm[! bad], collapse=", ")
     }

     print(kernel(0.01, 1e-5))
     print(kernel(1, 1e-5))
     print(kernel(5, 1e-5))
     print(kernel(17, 1e-5))

     * @return
     */

    @DataProvider(name = "KernelCreation")
    public Object[][] makeKernelCreation() {
        final List<Object[]> tests = new LinkedList<Object[]>();

        tests.add(new Object[]{ 0.01, 1000, new double[]{1.0}});
        tests.add(new Object[]{ 1.0, 1000, new double[]{0.0001338302, 0.004431848, 0.053990966, 0.241970723, 0.398942278, 0.241970723, 0.053990966, 0.004431848, 0.0001338302}});
        tests.add(new Object[]{ 1.0, 0, new double[]{1.0}});
        tests.add(new Object[]{ 1.0, 1, new double[]{0.2740686, 0.4518628, 0.2740686}});
        tests.add(new Object[]{ 1.0, 2, new double[]{0.05448868, 0.24420134, 0.40261995, 0.24420134, 0.05448868}});
        tests.add(new Object[]{ 1.0, 1000, new double[]{0.0001338302, 0.004431848, 0.053990966, 0.241970723, 0.398942278, 0.241970723, 0.053990966, 0.004431848, 0.0001338302}});
        tests.add(new Object[]{ 5.0, 1000, new double[]{1.1788613551308e-05, 2.67660451529771e-05, 5.83893851582921e-05, 0.000122380386022754, 0.000246443833694604, 0.000476817640292968, 0.000886369682387602, 0.00158309031659599, 0.00271659384673712, 0.00447890605896858, 0.00709491856924629, 0.0107981933026376, 0.0157900316601788, 0.0221841669358911, 0.029945493127149, 0.0388372109966426, 0.0483941449038287, 0.0579383105522965, 0.0666449205783599, 0.0736540280606647, 0.0782085387950912, 0.0797884560802865, 0.0782085387950912, 0.0736540280606647, 0.0666449205783599, 0.0579383105522965, 0.0483941449038287, 0.0388372109966426, 0.029945493127149, 0.0221841669358911, 0.0157900316601788, 0.0107981933026376, 0.00709491856924629, 0.00447890605896858, 0.00271659384673712, 0.00158309031659599, 0.000886369682387602, 0.000476817640292968, 0.000246443833694604, 0.000122380386022754, 5.83893851582921e-05, 2.67660451529771e-05, 1.1788613551308e-05}});
        tests.add(new Object[]{17.0, 1000, new double[]{1.25162575710745e-05, 1.57001772728555e-05, 1.96260034693739e-05, 2.44487374842009e-05, 3.03513668801384e-05, 3.75489089511911e-05, 4.62928204154855e-05, 5.68757597480354e-05, 6.96366758708924e-05, 8.49661819944029e-05, 0.000103312156275406, 0.000125185491708561, 0.000151165896477646, 0.000181907623161359, 0.000218144981137171, 0.000260697461819069, 0.000310474281706066, 0.000368478124457557, 0.000435807841336874, 0.00051365985048857, 0.000603327960854364, 0.000706201337376934, 0.000823760321812988, 0.000957569829285965, 0.00110927005589186, 0.00128056425833231, 0.00147320340358764, 0.00168896753568649, 0.00192964376796036, 0.00219700088266432, 0.00249276060490197, 0.00281856571330067, 0.00317594525418154, 0.00356627723683793, 0.00399074930220799, 0.00445031797242299, 0.00494566720070898, 0.00547716704583487, 0.00604483338842317, 0.00664828968356621, 0.00728673180099395, 0.00795889703644795, 0.00866303838230695, 0.00939690511889675, 0.0101577307281371, 0.010942229037054, 0.0117465993701676, 0.0125665413280325, 0.0133972796167302, 0.0142335991336574, 0.0150698902735454, 0.0159002041614507, 0.0167183172536454, 0.0175178044808441, 0.0182921198494897, 0.0190346831745763, 0.0197389714002676, 0.020398612780527, 0.0210074820484496, 0.0215597946062309, 0.0220501977225941, 0.022473856734247, 0.0228265343139947, 0.0231046609899767, 0.0233053952756892, 0.0234266719946158, 0.0234672376502799, 0.0234266719946158, 0.0233053952756892, 0.0231046609899767, 0.0228265343139947, 0.022473856734247, 0.0220501977225941, 0.0215597946062309, 0.0210074820484496, 0.020398612780527, 0.0197389714002676, 0.0190346831745763, 0.0182921198494897, 0.0175178044808441, 0.0167183172536454, 0.0159002041614507, 0.0150698902735454, 0.0142335991336574, 0.0133972796167302, 0.0125665413280325, 0.0117465993701676, 0.010942229037054, 0.0101577307281371, 0.00939690511889675, 0.00866303838230695, 0.00795889703644795, 0.00728673180099395, 0.00664828968356621, 0.00604483338842317, 0.00547716704583487, 0.00494566720070898, 0.00445031797242299, 0.00399074930220799, 0.00356627723683793, 0.00317594525418154, 0.00281856571330067, 0.00249276060490197, 0.00219700088266432, 0.00192964376796036, 0.00168896753568649, 0.00147320340358764, 0.00128056425833231, 0.00110927005589186, 0.000957569829285965, 0.000823760321812988, 0.000706201337376934, 0.000603327960854364, 0.00051365985048857, 0.000435807841336874, 0.000368478124457557, 0.000310474281706066, 0.000260697461819069, 0.000218144981137171, 0.000181907623161359, 0.000151165896477646, 0.000125185491708561, 0.000103312156275406, 8.49661819944029e-05, 6.96366758708924e-05, 5.68757597480354e-05, 4.62928204154855e-05, 3.75489089511911e-05, 3.03513668801384e-05, 2.44487374842009e-05, 1.96260034693739e-05, 1.57001772728555e-05, 1.25162575710745e-05}});

        return tests.toArray(new Object[][]{});
    }

    @Test( enabled = ! DEBUG, dataProvider = "KernelCreation")
    public void testKernelCreation(final double sigma, final int maxSize, final double[] expectedKernel) {
        final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD,
                maxSize, sigma, true);

        final double[] kernel = profile.getKernel();
        Assert.assertEquals(kernel.length, expectedKernel.length);
        for ( int i = 0; i < kernel.length; i++ )
            Assert.assertEquals(kernel[i], expectedKernel[i], 1e-3, "Kernels not equal at " + i);
    }

    // ------------------------------------------------------------------------------------
    //
    // Large-scale test, reading in 1000G Phase I chr20 calls and making sure that
    // the regions returned are the same if you run on the entire profile vs. doing it
    // incremental
    //
    // ------------------------------------------------------------------------------------

    @DataProvider(name = "VCFProfile")
    public Object[][] makeVCFProfile() {
        final List<Object[]> tests = new LinkedList<Object[]>();

        //tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 61000});
        //tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 100000});
        //tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 1000000});
        tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 1000000});
        tests.add(new Object[]{ privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf", "20", 1, 1000000});

        return tests.toArray(new Object[][]{});
    }

    @Test( dataProvider = "VCFProfile")
    public void testVCFProfile(final String path, final String contig, final int start, final int end) throws Exception {
        final int extension = 50;
        final int minRegionSize = 50;
        final int maxRegionSize = 300;

        final File file = new File(path);
        final VCFCodec codec = new VCFCodec();
        final Pair<VCFHeader, GATKVCFUtils.VCIterable<LineIterator>> reader = GATKVCFUtils.readAllVCs(file, codec);

        final List<ActiveRegion> incRegions = new ArrayList<ActiveRegion>();
        final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD);
        final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD);
        int pos = start;
        for ( final VariantContext vc : reader.getSecond() ) {
            if ( vc == null ) continue;
            while ( pos < vc.getStart() ) {
                final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos);
                //logger.warn("Adding 0.0 at " + loc + " because vc.getStart is " + vc.getStart());
                incProfile.add(new ActivityProfileState(loc, 0.0));
                fullProfile.add(new ActivityProfileState(loc, 0.0));
                pos++;
            }
            if ( vc.getStart() >= start && vc.getEnd() <= end ) {
                final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos);
                //logger.warn("Adding 1.0 at " + loc);
                ActivityProfileState.Type type = ActivityProfileState.Type.NONE;
                Number value = null;
                if ( vc.isBiallelic() && vc.isIndel() ) {
                    type = ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS;
                    value = Math.abs(vc.getIndelLengths().get(0));
                }
                final ActivityProfileState state = new ActivityProfileState(loc, 1.0, type, value);
                incProfile.add(state);
                fullProfile.add(state);
                pos++;
            }

            incRegions.addAll(incProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, false));

            if ( vc.getStart() > end )
                break;
        }

        incRegions.addAll(incProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, true));

        final List<ActiveRegion> fullRegions = fullProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, true);
        assertGoodRegions(fullRegions, start, end, maxRegionSize);
        assertGoodRegions(incRegions, start, end, maxRegionSize);

        Assert.assertEquals(incRegions.size(),  fullRegions.size(), "incremental and full region sizes aren't the same");
        for ( int i = 0; i < fullRegions.size(); i++ ) {
            final ActiveRegion incRegion = incRegions.get(i);
            final ActiveRegion fullRegion = fullRegions.get(i);
            Assert.assertTrue(incRegion.equalExceptReads(fullRegion), "Full and incremental regions are not equal: full = " + fullRegion + " inc = " + incRegion);
        }
    }

    private void assertGoodRegions(final List<ActiveRegion> regions, final int start, final int end, final int maxRegionSize) {
        int lastPosSeen = start - 1;
        for ( int regionI = 0; regionI < regions.size(); regionI++ ) {
            final ActiveRegion region = regions.get(regionI);
            Assert.assertEquals(region.getLocation().getStart(), lastPosSeen + 1, "discontinuous with previous region.  lastPosSeen " + lastPosSeen + " but region is " + region);
            Assert.assertTrue(region.getLocation().size() <= maxRegionSize, "Region is too big: " + region);
            lastPosSeen = region.getLocation().getStop();

            for ( final ActivityProfileState state : region.getSupportingStates() ) {
                Assert.assertEquals(state.isActiveProb > ACTIVE_PROB_THRESHOLD, region.isActive(),
                        "Region is active=" + region.isActive() + " but contains a state " + state + " with prob "
                                + state.isActiveProb + " not within expected values given threshold for activity of "
                                + ACTIVE_PROB_THRESHOLD);
            }
        }
    }
}
TOP

Related Classes of org.broadinstitute.gatk.utils.activeregion.BandPassActivityProfileUnitTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.