Package org.broadinstitute.gatk.utils.fasta

Source Code of org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFileUnitTest

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.utils.fasta;


// the imports for unit testing.


import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.SAMSequenceRecord;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Priority;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
* Basic unit test for CachingIndexedFastaSequenceFile
*/
public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
    private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta");
    private static final int STEP_SIZE = 1;
    private final static boolean DEBUG = false;

    //private static final List<Integer> QUERY_SIZES = Arrays.asList(1);
    private static final List<Integer> QUERY_SIZES = Arrays.asList(1, 10, 100);
    private static final List<Integer> CACHE_SIZES = Arrays.asList(-1, 100, 1000);

    @DataProvider(name = "fastas")
    public Object[][] createData1() {
        List<Object[]> params = new ArrayList<Object[]>();
        for ( File fasta : Arrays.asList(simpleFasta) ) {
            for ( int cacheSize : CACHE_SIZES ) {
                for ( int querySize : QUERY_SIZES ) {
                    params.add(new Object[]{fasta, cacheSize, querySize});
                }
            }
        }

        return params.toArray(new Object[][]{});
    }

    private static long getCacheSize(final long cacheSizeRequested) {
        return cacheSizeRequested == -1 ? CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE : cacheSizeRequested;
    }

    @Test(dataProvider = "fastas", enabled = true && ! DEBUG)
    public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
        final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);

        SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0);
        logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d",
                contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));
        testSequential(caching, fasta, querySize);
    }

    private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException {
        Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers");

        final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);

        SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
        for ( int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE ) {
            int start = i;
            int stop = start + querySize;
            if ( stop <= contig.getSequenceLength() ) {
                ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
                ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);

                Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
                Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
                Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
            }
        }

        // asserts for efficiency.  We are going to make contig.length / STEP_SIZE queries
        // at each of range: start -> start + querySize against a cache with size of X.
        // we expect to hit the cache each time range falls within X.  We expect a hit
        // on the cache if range is within X.  Which should happen at least (X - query_size * 2) / STEP_SIZE
        // times.
        final int minExpectedHits = (int)Math.floor((Math.min(caching.getCacheSize(), contig.getSequenceLength()) - querySize * 2.0) / STEP_SIZE);
        caching.printEfficiency(Priority.WARN);
        Assert.assertTrue(caching.getCacheHits() >= minExpectedHits, "Expected at least " + minExpectedHits + " cache hits but only got " + caching.getCacheHits());

    }

    // Tests grabbing sequences around a middle cached value.
    @Test(dataProvider = "fastas", enabled = true && ! DEBUG)
    public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
        final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
        final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);

        SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);

        int middleStart = (contig.getSequenceLength() - querySize) / 2;
        int middleStop = middleStart + querySize;

        logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d with intermediate query",
                contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize));

        for ( int i = 0; i < contig.getSequenceLength(); i += 10 ) {
            int start = i;
            int stop = start + querySize;
            if ( stop <= contig.getSequenceLength() ) {
                ReferenceSequence grabMiddle = caching.getSubsequenceAt(contig.getSequenceName(), middleStart, middleStop);
                ReferenceSequence cachedVal = caching.getSubsequenceAt(contig.getSequenceName(), start, stop);
                ReferenceSequence uncachedVal = uncached.getSubsequenceAt(contig.getSequenceName(), start, stop);

                Assert.assertEquals(cachedVal.getName(), uncachedVal.getName());
                Assert.assertEquals(cachedVal.getContigIndex(), uncachedVal.getContigIndex());
                Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases());
            }
        }
    }

    @DataProvider(name = "ParallelFastaTest")
    public Object[][] createParallelFastaTest() {
        List<Object[]> params = new ArrayList<Object[]>();

        for ( File fasta : Arrays.asList(simpleFasta) ) {
            for ( int cacheSize : CACHE_SIZES ) {
                for ( int querySize : QUERY_SIZES ) {
                    for ( int nt : Arrays.asList(1, 2, 3, 4) ) {
                        params.add(new Object[]{fasta, cacheSize, querySize, nt});
                    }
                }
            }
        }

        return params.toArray(new Object[][]{});
    }


    @Test(dataProvider = "ParallelFastaTest", enabled = true && ! DEBUG, timeOut = 60000)
    public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException {
        final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);

        logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt));
        for ( int iterations = 0; iterations < 1; iterations++ ) {
            final ExecutorService executor = Executors.newFixedThreadPool(nt);
            final Collection<Callable<Object>> tasks = new ArrayList<Callable<Object>>(nt);
            for ( int i = 0; i < nt; i++ )
                tasks.add(new Callable<Object>() {
                    @Override
                    public Object call() throws Exception {
                        testSequential(caching, fasta, querySize);
                        return null;
                    }
                });
            executor.invokeAll(tasks);
            executor.shutdownNow();
        }
    }

    // make sure some bases are lower case and some are upper case
    @Test(enabled = true)
    public void testMixedCasesInExample() throws FileNotFoundException, InterruptedException {
        final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA));
        final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(new File(exampleFASTA), true);
        final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(new File(exampleFASTA));

        int nMixedCase = 0;
        for ( SAMSequenceRecord contig : original.getSequenceDictionary().getSequences() ) {
            nMixedCase += testCases(original, casePreserving, allUpper, contig.getSequenceName(), -1, -1);

            final int step = 100;
            for ( int lastPos = step; lastPos < contig.getSequenceLength(); lastPos += step ) {
                testCases(original, casePreserving, allUpper, contig.getSequenceName(), lastPos - step, lastPos);
            }
        }

        Assert.assertTrue(nMixedCase > 0, "No mixed cases sequences found in file.  Unexpected test state");
    }

    private int testCases(final IndexedFastaSequenceFile original,
                          final IndexedFastaSequenceFile casePreserving,
                          final IndexedFastaSequenceFile allUpper,
                          final String contig, final int start, final int stop ) {
        final String orig = fetchBaseString(original, contig, start, stop);
        final String keptCase = fetchBaseString(casePreserving, contig, start, stop);
        final String upperCase = fetchBaseString(allUpper, contig, start, stop).toUpperCase();

        final String origToUpper = orig.toUpperCase();
        if ( ! orig.equals(origToUpper) ) {
            Assert.assertEquals(keptCase, orig, "Case preserving operation not equal to the original case for contig " + contig);
            Assert.assertEquals(upperCase, origToUpper, "All upper case reader not equal to the uppercase of original case for contig " + contig);
            return 1;
        } else {
            return 0;
        }
    }

    private String fetchBaseString(final IndexedFastaSequenceFile reader, final String contig, final int start, final int stop) {
        if ( start == -1 )
            return new String(reader.getSequence(contig).getBases());
        else
            return new String(reader.getSubsequenceAt(contig, start, stop).getBases());
    }

    @Test(enabled = true)
    public void testIupacChanges() throws FileNotFoundException, InterruptedException {
        final String testFasta = privateTestDir + "iupacFASTA.fasta";
        final CachingIndexedFastaSequenceFile iupacPreserving = new CachingIndexedFastaSequenceFile(new File(testFasta), CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE, false, true);
        final CachingIndexedFastaSequenceFile makeNs = new CachingIndexedFastaSequenceFile(new File(testFasta));

        int preservingNs = 0;
        int changingNs = 0;
        for ( SAMSequenceRecord contig : iupacPreserving.getSequenceDictionary().getSequences() ) {
            final String sPreserving = fetchBaseString(iupacPreserving, contig.getSequenceName(), 0, 15000);
            preservingNs += StringUtils.countMatches(sPreserving, "N");

            final String sChanging = fetchBaseString(makeNs, contig.getSequenceName(), 0, 15000);
            changingNs += StringUtils.countMatches(sChanging, "N");
        }

        Assert.assertEquals(changingNs, preservingNs + 4);
    }

    @Test(enabled = true, expectedExceptions = {UserException.class})
    public void testFailOnBadBase() throws FileNotFoundException, InterruptedException {
        final String testFasta = privateTestDir + "problematicFASTA.fasta";
        final CachingIndexedFastaSequenceFile fasta = new CachingIndexedFastaSequenceFile(new File(testFasta));

        for ( SAMSequenceRecord contig : fasta.getSequenceDictionary().getSequences() ) {
            fetchBaseString(fasta, contig.getSequenceName(), -1, -1);
        }
    }
}
TOP

Related Classes of org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFileUnitTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.