/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.utils;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.tribble.BasicFeature;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
/**
 * Unit tests for {@link GenomeLocParser}.
 * <p>
 * Covers contig lookup, parsing of interval strings, the various
 * {@code createGenomeLoc} overloads, validation, flanking-interval creation,
 * clipping to contig bounds and padding.
 * <p>
 * Most tests share the fixture built in {@link #init()}; the assertions below
 * rely on {@code chr1} having contig index 0 and a length of 10 in that fixture.
 * <p>
 * All assertions follow the TestNG argument order {@code (actual, expected)}.
 *
 * @author aaron
 */
public class GenomeLocParserUnitTest extends BaseTest {
    /** Parser under test, built from the dictionary of {@link #header}. */
    private GenomeLocParser genomeLocParser;

    /** Artificial SAM header backing the shared parser. */
    private SAMFileHeader header;

    /** Builds the shared artificial header and the parser used by most tests. */
    @BeforeClass
    public void init() {
        header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10);
        genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
    }

    // ------------------------------------------------------------------
    // contig lookup
    // ------------------------------------------------------------------

    /** Looking up the index of a contig that is not in the reference must throw. */
    @Test(expectedExceptions = UserException.MalformedGenomeLoc.class)
    public void testGetContigIndex() {
        genomeLocParser.getContigIndex("blah"); // should not be in the reference
    }

    /** A known contig resolves to its dictionary index. */
    @Test
    public void testGetContigIndexValid() {
        assertEquals(genomeLocParser.getContigIndex("chr1"), 0); // should be in the reference
    }

    /** Requesting the record of an unknown contig must throw. */
    @Test(expectedExceptions = UserException.class)
    public void testGetContigInfoUnknownContig1() {
        genomeLocParser.getContigInfo("blah"); // should *not* be in the reference
    }

    /** Requesting the record of a null contig must throw. */
    @Test(expectedExceptions = UserException.class)
    public void testGetContigInfoUnknownContig2() {
        genomeLocParser.getContigInfo(null); // should *not* be in the reference
    }

    /** An unknown contig name is reported as absent without throwing. */
    @Test
    public void testHasContigInfoUnknownContig1() {
        Assert.assertFalse(genomeLocParser.contigIsInDictionary("blah")); // should *not* be in the reference
    }

    /** A null contig name is reported as absent without throwing. */
    @Test
    public void testHasContigInfoUnknownContig2() {
        Assert.assertFalse(genomeLocParser.contigIsInDictionary(null)); // should *not* be in the reference
    }

    /** A known contig name is reported as present. */
    @Test
    public void testHasContigInfoKnownContig() {
        assertTrue(genomeLocParser.contigIsInDictionary("chr1")); // should be in the reference
    }

    /** The record returned for a known contig carries that contig's name. */
    @Test
    public void testGetContigInfoKnownContig() {
        assertEquals(genomeLocParser.getContigInfo("chr1").getSequenceName(), "chr1"); // should be in the reference
    }

    // ------------------------------------------------------------------
    // parsing and creation
    // ------------------------------------------------------------------

    /** Parsing an interval on an unknown contig must throw. */
    @Test(expectedExceptions = ReviewedGATKException.class)
    public void testParseBadString() {
        genomeLocParser.parseGenomeLoc("Bad:0-1");
    }

    /** Contig names that themselves contain ':' must still parse into contig/start/stop. */
    @Test
    public void testContigHasColon() {
        final SAMFileHeader colonHeader = new SAMFileHeader();
        colonHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        final SAMSequenceDictionary dict = new SAMSequenceDictionary();
        // the two-argument constructor already records the sequence length
        dict.addSequence(new SAMSequenceRecord("c:h:r1", 10));
        colonHeader.setSequenceDictionary(dict);

        final GenomeLocParser colonParser = new GenomeLocParser(colonHeader.getSequenceDictionary());
        final GenomeLoc loc = colonParser.parseGenomeLoc("c:h:r1:4-5");
        assertEquals(loc.getContigIndex(), 0);
        assertEquals(loc.getStart(), 4);
        assertEquals(loc.getStop(), 5);
    }

    /** A well-formed "contig:start-stop" string parses to the given bounds. */
    @Test
    public void testParseGoodString() {
        final GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1-10");
        assertEquals(loc.getContigIndex(), 0);
        assertEquals(loc.getStop(), 10);
        assertEquals(loc.getStart(), 1);
    }

    /** createGenomeLoc(contig, start, stop) resolves the contig index and keeps the bounds. */
    @Test
    public void testCreateGenomeLoc1() {
        final GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 1, 100);
        assertEquals(loc.getContigIndex(), 0);
        assertEquals(loc.getStop(), 100);
        assertEquals(loc.getStart(), 1);
    }

    /** A "contig:pos" string parses to a single-base interval. */
    @Test
    public void testCreateGenomeLoc1point5() { // in honor of VAAL!
        final GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1");
        assertEquals(loc.getContigIndex(), 0);
        assertEquals(loc.getStop(), 1);
        assertEquals(loc.getStart(), 1);
    }

    /** createGenomeLoc(contig, start, stop) keeps the contig name and the bounds. */
    @Test
    public void testCreateGenomeLoc2() {
        final GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 1, 100);
        assertEquals(loc.getContig(), "chr1");
        assertEquals(loc.getStop(), 100);
        assertEquals(loc.getStart(), 1);
    }

    /** createGenomeLoc(contig, pos) yields a single-base interval with the contig name. */
    @Test
    public void testCreateGenomeLoc3() {
        final GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 1);
        assertEquals(loc.getContig(), "chr1");
        assertEquals(loc.getStop(), 1);
        assertEquals(loc.getStart(), 1);
    }

    /** createGenomeLoc(contig, pos) yields a single-base interval with the contig index. */
    @Test
    public void testCreateGenomeLoc4() {
        final GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 1);
        assertEquals(loc.getContigIndex(), 0);
        assertEquals(loc.getStop(), 1);
        assertEquals(loc.getStart(), 1);
    }

    /** A location rebuilt from another location's contig/start/stop matches the source. */
    @Test
    public void testCreateGenomeLoc5() {
        final GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 1, 100);
        final GenomeLoc copy = genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart(), loc.getStop());
        assertEquals(copy.getContigIndex(), 0);
        assertEquals(copy.getStop(), 100);
        assertEquals(copy.getStart(), 1);
    }

    /** "contig:start+" extends to the end of the contig. */
    @Test
    public void testGenomeLocPlusSign() {
        final GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1:1+");
        assertEquals(loc.getContigIndex(), 0);
        assertEquals(loc.getStop(), 10); // the size
        assertEquals(loc.getStart(), 1);
    }

    /** A bare contig name parses to the whole contig. */
    @Test
    public void testGenomeLocParseOnlyChrome() {
        final GenomeLoc loc = genomeLocParser.parseGenomeLoc("chr1");
        assertEquals(loc.getContigIndex(), 0);
        assertEquals(loc.getStop(), 10); // the size
        assertEquals(loc.getStart(), 1);
    }

    /** A bare contig name that is not in the dictionary must throw. */
    @Test(expectedExceptions = ReviewedGATKException.class)
    public void testGenomeLocParseOnlyBadChrome() {
        genomeLocParser.parseGenomeLoc("chr12");
    }

    /** A dangling '-' with no stop position must throw. */
    @Test(expectedExceptions = ReviewedGATKException.class)
    public void testGenomeLocBad() {
        genomeLocParser.parseGenomeLoc("chr1:1-");
    }

    /** More than one '-' separated stop value must throw. */
    @Test(expectedExceptions = UserException.class)
    public void testGenomeLocBad2() {
        genomeLocParser.parseGenomeLoc("chr1:1-500-0");
    }

    /** A doubled '-' separator must throw. */
    @Test(expectedExceptions = UserException.class)
    public void testGenomeLocBad3() {
        genomeLocParser.parseGenomeLoc("chr1:1--0");
    }

    // ------------------------------------------------------------------
    // validation
    // ------------------------------------------------------------------

    /** Exercises isValidGenomeLoc with in-range, out-of-range and inverted coordinates. */
    @Test
    public void testValidationOfGenomeLocs() {
        assertTrue(genomeLocParser.isValidGenomeLoc("chr1", 1, 1));
        Assert.assertFalse(genomeLocParser.isValidGenomeLoc("chr2", 1, 1));   // shouldn't have an entry
        Assert.assertFalse(genomeLocParser.isValidGenomeLoc("chr1", 1, 11));  // past the end of the contig
        Assert.assertFalse(genomeLocParser.isValidGenomeLoc("chr1", -1, 10)); // bad start
        Assert.assertFalse(genomeLocParser.isValidGenomeLoc("chr1", 1, -2));  // bad stop
        assertTrue(genomeLocParser.isValidGenomeLoc("chr1", -1, 2, false));   // bad start, accepted when the trailing flag is false
        Assert.assertFalse(genomeLocParser.isValidGenomeLoc("chr1", 10, 11)); // stop past the end of the contig
        assertTrue(genomeLocParser.isValidGenomeLoc("chr1", 10, 11, false));  // stop past the end, accepted when the trailing flag is false
        Assert.assertFalse(genomeLocParser.isValidGenomeLoc("chr1", 2, 1));   // stop < start
    }

    /** validateGenomeLoc must throw when handed inconsistent arguments. */
    @Test(expectedExceptions = ReviewedGATKException.class)
    public void testValidateGenomeLoc() {
        // bad contig index
        genomeLocParser.validateGenomeLoc("chr1", 1, 1, 2, false);
    }

    // ------------------------------------------------------------------
    // flanking intervals
    // ------------------------------------------------------------------

    /**
     * One flanking-interval scenario: an original location, the number of flanking
     * base pairs requested, and the expected flanks before the start and after the
     * stop ({@code null} when no flank is expected).
     */
    private static class FlankingGenomeLocTestData extends TestDataProvider {
        final GenomeLocParser parser;
        final int basePairs;
        final GenomeLoc original, flankStart, flankStop;

        private FlankingGenomeLocTestData(String name, GenomeLocParser parser, int basePairs, String original, String flankStart, String flankStop) {
            super(FlankingGenomeLocTestData.class, name);
            this.parser = parser;
            this.basePairs = basePairs;
            this.original = parse(parser, original);
            this.flankStart = flankStart == null ? null : parse(parser, flankStart);
            this.flankStop = flankStop == null ? null : parse(parser, flankStop);
        }

        /** Parses {@code str} with {@code parser}, mapping the literal "unmapped" to {@link GenomeLoc#UNMAPPED}. */
        private static GenomeLoc parse(GenomeLocParser parser, String str) {
            if ("unmapped".equals(str)) {
                return GenomeLoc.UNMAPPED;
            }
            return parser.parseGenomeLoc(str);
        }
    }

    /**
     * Builds the flanking scenarios against a 10000 bp contig.
     * <p>
     * The instances are created only for their constructor side effect; they are
     * presumably registered by the {@code TestDataProvider} super constructor and
     * then collected by {@code getTests} (not visible in this file).
     */
    @DataProvider(name = "flankingGenomeLocs")
    public Object[][] getFlankingGenomeLocs() {
        final int contigLength = 10000;
        final SAMFileHeader flankHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigLength);
        final GenomeLocParser parser = new GenomeLocParser(flankHeader.getSequenceDictionary());

        // original sits on the first base(s) of the contig: no flank before the start
        new FlankingGenomeLocTestData("atStartBase1", parser, 1,
                "chr1:1", null, "chr1:2");
        new FlankingGenomeLocTestData("atStartBase50", parser, 50,
                "chr1:1", null, "chr1:2-51");
        new FlankingGenomeLocTestData("atStartRange50", parser, 50,
                "chr1:1-10", null, "chr1:11-60");

        // original sits on the last base(s) of the contig: no flank after the stop
        new FlankingGenomeLocTestData("atEndBase1", parser, 1,
                "chr1:" + contigLength, "chr1:" + (contigLength - 1), null);
        new FlankingGenomeLocTestData("atEndBase50", parser, 50,
                "chr1:" + contigLength, String.format("chr1:%d-%d", contigLength - 50, contigLength - 1), null);
        new FlankingGenomeLocTestData("atEndRange50", parser, 50,
                String.format("chr1:%d-%d", contigLength - 10, contigLength),
                String.format("chr1:%d-%d", contigLength - 60, contigLength - 11),
                null);

        // original is close enough to an edge that the flank is truncated or just fits
        new FlankingGenomeLocTestData("nearStartBase1", parser, 1,
                "chr1:2", "chr1:1", "chr1:3");
        new FlankingGenomeLocTestData("nearStartRange50", parser, 50,
                "chr1:21-30", "chr1:1-20", "chr1:31-80");
        new FlankingGenomeLocTestData("nearEndBase1", parser, 1,
                "chr1:" + (contigLength - 1), "chr1:" + (contigLength - 2), "chr1:" + contigLength);
        new FlankingGenomeLocTestData("nearEndRange50", parser, 50,
                String.format("chr1:%d-%d", contigLength - 30, contigLength - 21),
                String.format("chr1:%d-%d", contigLength - 80, contigLength - 31),
                String.format("chr1:%d-%d", contigLength - 20, contigLength));

        // original is far enough from both edges that both flanks are full size
        new FlankingGenomeLocTestData("beyondStartBase1", parser, 1,
                "chr1:3", "chr1:2", "chr1:4");
        new FlankingGenomeLocTestData("beyondStartRange50", parser, 50,
                "chr1:101-200", "chr1:51-100", "chr1:201-250");
        new FlankingGenomeLocTestData("beyondEndBase1", parser, 1,
                "chr1:" + (contigLength - 3),
                "chr1:" + (contigLength - 4),
                "chr1:" + (contigLength - 2));
        new FlankingGenomeLocTestData("beyondEndRange50", parser, 50,
                String.format("chr1:%d-%d", contigLength - 200, contigLength - 101),
                String.format("chr1:%d-%d", contigLength - 250, contigLength - 201),
                String.format("chr1:%d-%d", contigLength - 100, contigLength - 51));

        // degenerate originals: neither flank is expected
        new FlankingGenomeLocTestData("unmapped", parser, 50,
                "unmapped", null, null);
        new FlankingGenomeLocTestData("fullContig", parser, 50,
                "chr1", null, null);

        return FlankingGenomeLocTestData.getTests(FlankingGenomeLocTestData.class);
    }

    /** The flank before the start must match the scenario's expected start flank. */
    @Test(dataProvider = "flankingGenomeLocs")
    public void testCreateGenomeLocAtStart(FlankingGenomeLocTestData data) {
        final GenomeLoc actual = data.parser.createGenomeLocAtStart(data.original, data.basePairs);
        final String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
                data.toString(), data.original, actual, data.flankStart);
        assertEquals(actual, data.flankStart, description);
    }

    /** The flank after the stop must match the scenario's expected stop flank. */
    @Test(dataProvider = "flankingGenomeLocs")
    public void testCreateGenomeLocAtStop(FlankingGenomeLocTestData data) {
        final GenomeLoc actual = data.parser.createGenomeLocAtStop(data.original, data.basePairs);
        final String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n",
                data.toString(), data.original, actual, data.flankStop);
        assertEquals(actual, data.flankStop, description);
    }

    // ------------------------------------------------------------------
    // position parsing with thousands separators
    // ------------------------------------------------------------------

    /** Rows of {interval string, expected contig, expected position}, with and without commas. */
    @DataProvider(name = "parseGenomeLoc")
    public Object[][] makeParsingTest() {
        final List<Object[]> tests = new LinkedList<Object[]>();
        tests.add(new Object[]{ "chr1:10", "chr1", 10 });
        tests.add(new Object[]{ "chr1:100", "chr1", 100 });
        tests.add(new Object[]{ "chr1:1000", "chr1", 1000 });
        tests.add(new Object[]{ "chr1:1,000", "chr1", 1000 });
        tests.add(new Object[]{ "chr1:10000", "chr1", 10000 });
        tests.add(new Object[]{ "chr1:10,000", "chr1", 10000 });
        tests.add(new Object[]{ "chr1:100000", "chr1", 100000 });
        tests.add(new Object[]{ "chr1:100,000", "chr1", 100000 });
        tests.add(new Object[]{ "chr1:1000000", "chr1", 1000000 });
        tests.add(new Object[]{ "chr1:1,000,000", "chr1", 1000000 });
        tests.add(new Object[]{ "chr1:1000,000", "chr1", 1000000 });
        tests.add(new Object[]{ "chr1:1,000000", "chr1", 1000000 });
        return tests.toArray(new Object[tests.size()][]);
    }

    /** A single-position string parses to a one-base interval at the expected position. */
    @Test(dataProvider = "parseGenomeLoc")
    public void testParsingPositions(final String string, final String contig, final int start) {
        // a dedicated, larger contig is needed: the shared fixture's chr1 is only 10 bp long
        final SAMFileHeader bigHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10000000);
        final GenomeLocParser bigParser = new GenomeLocParser(bigHeader.getSequenceDictionary());
        final GenomeLoc loc = bigParser.parseGenomeLoc(string);
        Assert.assertEquals(loc.getContig(), contig);
        Assert.assertEquals(loc.getStart(), start);
        Assert.assertEquals(loc.getStop(), start);
    }

    // ------------------------------------------------------------------
    // creation from other objects
    // ------------------------------------------------------------------

    /** A mapped read yields a location spanning its alignment. */
    @Test
    public void testCreationFromSAMRecord() {
        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5);
        final GenomeLoc loc = genomeLocParser.createGenomeLoc(read);
        Assert.assertEquals(loc.getContig(), read.getReferenceName());
        Assert.assertEquals(loc.getContigIndex(), (int) read.getReferenceIndex());
        Assert.assertEquals(loc.getStart(), read.getAlignmentStart());
        Assert.assertEquals(loc.getStop(), read.getAlignmentEnd());
    }

    /** A read flagged unmapped with no reference index yields an unmapped location. */
    @Test
    public void testCreationFromSAMRecordUnmapped() {
        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5);
        read.setReadUnmappedFlag(true);
        read.setReferenceIndex(-1);
        final GenomeLoc loc = genomeLocParser.createGenomeLoc(read);
        Assert.assertTrue(loc.isUnmapped());
    }

    /** A read flagged unmapped but still placed on a contig yields a one-base location at its start. */
    @Test
    public void testCreationFromSAMRecordUnmappedButOnGenome() {
        final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5);
        read.setReadUnmappedFlag(true);
        read.setCigarString("*");
        final GenomeLoc loc = genomeLocParser.createGenomeLoc(read);
        Assert.assertEquals(loc.getContig(), read.getReferenceName());
        Assert.assertEquals(loc.getContigIndex(), (int) read.getReferenceIndex());
        Assert.assertEquals(loc.getStart(), read.getAlignmentStart());
        // stop is compared against the alignment *start*: the expected location is a single base
        Assert.assertEquals(loc.getStop(), read.getAlignmentStart());
    }

    /** A tribble Feature yields a location with the same contig, start and end. */
    @Test
    public void testCreationFromFeature() {
        final Feature feature = new BasicFeature("chr1", 1, 5);
        final GenomeLoc loc = genomeLocParser.createGenomeLoc(feature);
        Assert.assertEquals(loc.getContig(), feature.getChr());
        Assert.assertEquals(loc.getStart(), feature.getStart());
        Assert.assertEquals(loc.getStop(), feature.getEnd());
    }

    /** A VariantContext yields a location with the same contig, start and end. */
    @Test
    public void testCreationFromVariantContext() {
        final VariantContext feature = new VariantContextBuilder("x", "chr1", 1, 5, Arrays.asList(Allele.create("AAAAA", true))).make();
        final GenomeLoc loc = genomeLocParser.createGenomeLoc(feature);
        Assert.assertEquals(loc.getContig(), feature.getChr());
        Assert.assertEquals(loc.getStart(), feature.getStart());
        Assert.assertEquals(loc.getStop(), feature.getEnd());
    }

    /** For every contig of the b37 reference, createOverEntireContig spans 1..contig length. */
    @Test
    public void testcreateGenomeLocOnContig() throws FileNotFoundException {
        // NOTE(review): seq is never closed here; consider closing it if the reader API permits
        final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
        final SAMSequenceDictionary dict = seq.getSequenceDictionary();
        final GenomeLocParser b37Parser = new GenomeLocParser(dict);
        for ( final SAMSequenceRecord rec : dict.getSequences() ) {
            final GenomeLoc loc = b37Parser.createOverEntireContig(rec.getSequenceName());
            Assert.assertEquals(loc.getContig(), rec.getSequenceName());
            Assert.assertEquals(loc.getStart(), 1);
            Assert.assertEquals(loc.getStop(), rec.getSequenceLength());
        }
    }

    // ------------------------------------------------------------------
    // clipping to contig bounds
    // ------------------------------------------------------------------

    /** Rows of {contig, start, stop} sweeping from before the contig start to past its end. */
    @DataProvider(name = "GenomeLocOnContig")
    public Object[][] makeGenomeLocOnContig() {
        final List<Object[]> tests = new LinkedList<Object[]>();
        final int contigLength = header.getSequence(0).getSequenceLength();
        final int[] lengths = { 1, 10, 20 };
        for ( int start = -10; start < contigLength + 10; start++ ) {
            for ( final int len : lengths ) {
                tests.add(new Object[]{ "chr1", start, start + len });
            }
        }
        return tests.toArray(new Object[tests.size()][]);
    }

    /**
     * createGenomeLocOnContig returns null when the interval lies wholly off the
     * contig, and otherwise clips start/stop to [1, contig length].
     */
    @Test(dataProvider = "GenomeLocOnContig")
    public void testGenomeLocOnContig(final String contig, final int start, final int stop) {
        final int contigLength = header.getSequence(0).getSequenceLength();
        final GenomeLoc loc = genomeLocParser.createGenomeLocOnContig(contig, start, stop);
        if ( stop < 1 || start > contigLength ) {
            Assert.assertNull(loc, "GenomeLoc should be null if the start/stops are not meaningful");
        } else {
            Assert.assertNotNull(loc);
            Assert.assertEquals(loc.getContig(), contig);
            Assert.assertEquals(loc.getStart(), Math.max(start, 1));
            Assert.assertEquals(loc.getStop(), Math.min(stop, contigLength));
        }
    }

    // ------------------------------------------------------------------
    // padding
    // ------------------------------------------------------------------

    /**
     * Rows of {location, pad} for every start &lt;= stop pair on chr1 and every pad
     * from 0 to the contig length.
     */
    @DataProvider(name = "GenomeLocPadding")
    public Object[][] makeGenomeLocPadding() {
        final List<Object[]> tests = new LinkedList<Object[]>();
        final int contigLength = header.getSequence(0).getSequenceLength();
        // NOTE(review): both inner loops stop short of contigLength, so a location ending on the last base is never generated
        for ( int pad = 0; pad < contigLength + 1; pad++ ) {
            for ( int start = 1; start < contigLength; start++ ) {
                for ( int stop = start; stop < contigLength; stop++ ) {
                    tests.add(new Object[]{ genomeLocParser.createGenomeLoc("chr1", start, stop), pad });
                }
            }
        }
        return tests.toArray(new Object[tests.size()][]);
    }

    /** Padding widens the interval by {@code pad} on both sides, clipped to [1, contig length]. */
    @Test(dataProvider = "GenomeLocPadding")
    public void testGenomeLocPadding(final GenomeLoc input, final int pad) {
        final int contigLength = header.getSequence(0).getSequenceLength();
        final GenomeLoc padded = genomeLocParser.createPaddedGenomeLoc(input, pad);
        Assert.assertNotNull(padded);
        Assert.assertEquals(padded.getContig(), input.getContig());
        Assert.assertEquals(padded.getStart(), Math.max(input.getStart() - pad, 1));
        Assert.assertEquals(padded.getStop(), Math.min(input.getStop() + pad, contigLength));
    }
}