Package org.broadinstitute.gatk.engine.datasources.reference

Examples of org.broadinstitute.gatk.engine.datasources.reference.ReferenceDataSource


     * @return the sharding strategy
     */
    protected Iterable<Shard> getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
        ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
        DownsamplingMethod downsamplingMethod = readsDataSource != null ? readsDataSource.getReadsInfo().getDownsamplingMethod() : null;
        ReferenceDataSource referenceDataSource = this.getReferenceDataSource();

        // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
        if(!readsDataSource.isEmpty()) {
            if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
                throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed.  The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");
            if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
                throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");

            if(walker instanceof LocusWalker) {
                if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
                    throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data.  Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
                if(intervals == null)
                    return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer());
                else
                    return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
            }
            else if(walker instanceof ActiveRegionWalker) {
                if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
                    throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data.  Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
                if(intervals == null)
                    return readsDataSource.createShardIteratorOverMappedReads(new ActiveRegionShardBalancer());
                else
                    return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer());
            }
            else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
                // Apply special validation to read pair walkers.
                if(walker instanceof ReadPairWalker) {
                    if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
                        throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files.  You will need to resort your input BAM file in query name order to use this walker.");
                    if(intervals != null && !intervals.isEmpty())
                        throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
                }

                if(intervals == null)
                    return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
                else
                    return readsDataSource.createShardIteratorOverIntervals(intervals, new ReadShardBalancer());
            }
            else
                throw new ReviewedGATKException("Unable to determine walker type for walker " + walker.getClass().getName());
        }
        else {
            // TODO -- Determine what the ideal shard size should be here.  Matt suggested that a multiple of 16K might work well
            // TODO --  (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard
            // TODO --  size the more efficient the traversal (at least for RODWalkers).  Keeping the previous values for now.  [EB]
            final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000;
            if(intervals == null)
                return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE);
            else
                return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE);
        }
    }
View Full Code Here


     * Opens a reference sequence file paired with an index.  Only public for testing purposes
     *
     * @param refFile Handle to a reference sequence file.  Non-null.
     */
    public void setReferenceDataSource(File refFile) {
        this.referenceDataSource = new ReferenceDataSource(refFile);
        genomeLocParser = new GenomeLocParser(referenceDataSource.getReference());
    }
View Full Code Here

    @BeforeClass
    public void init() {
        // Build the hg18 test fixtures: a SAM header carrying the hg18 sequence
        // dictionary, a GenomeLocParser over the cached fasta, and an immutable list
        // of per-contig GenomeLocs covering the whole reference.
        File hg18Ref = new File(BaseTest.hg18Reference);
        try {
            ReferenceDataSource referenceDataSource = new ReferenceDataSource(hg18Ref);
            hg18Header = new SAMFileHeader();
            hg18Header.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
            ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg18Ref);
            hg18GenomeLocParser = new GenomeLocParser(seq);
            // One GenomeLoc per contig, wrapped unmodifiable so tests cannot mutate the fixture.
            hg18ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ;
        }
        catch(FileNotFoundException ex) {
            throw new UserException.CouldNotReadInputFile(hg18Ref,ex);
        }

        // Same fixtures for hg19, plus the parsed exome interval list used by interval tests.
        File hg19Ref = new File(BaseTest.hg19Reference);
        try {
            ReferenceDataSource referenceDataSource = new ReferenceDataSource(hg19Ref);
            hg19Header = new SAMFileHeader();
            hg19Header.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
            ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref);
            hg19GenomeLocParser = new GenomeLocParser(seq);
            hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ;

            hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals)));
        }
        catch(FileNotFoundException ex) {
            // NOTE(review): method is truncated in this view — the closing braces lie beyond it.
            throw new UserException.CouldNotReadInputFile(hg19Ref,ex);
View Full Code Here

     * Returns a map of contig names with their sizes.
     * @param reference The reference for the intervals.
     * @return A map of contig names with their sizes.
     */
    public static Map<String, Integer> getContigSizes(File reference) {
        ReferenceDataSource referenceSource = new ReferenceDataSource(reference);
        List<GenomeLoc> locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSource.getReference().getSequenceDictionary()).toList();
        Map<String, Integer> lengths = new LinkedHashMap<String, Integer>();
        for (GenomeLoc loc: locs)
            lengths.put(loc.getContig(), loc.size());
        return lengths;
    }
View Full Code Here

            size += loc.size();
        return size;
    }

    // Computes flanking intervals of the given width around each input interval and
    // writes them, numbered, to the output file in Picard IntervalList format.
    // NOTE(review): method is truncated in this view — its closing brace lies beyond it.
    public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) {
        ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference);
        // Parser is built from the reference so interval parsing validates against its dictionary.
        GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference());
        List<GenomeLoc> originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath());

        if (originalList.isEmpty())
            throw new UserException.MalformedFile(inputIntervals, "File contains no intervals");

        List<GenomeLoc> flankingList = getFlankingIntervals(parser, originalList, basePairs);

        if (flankingList.isEmpty())
            throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals");

        // The IntervalList output needs a header carrying the reference's sequence dictionary.
        SAMFileHeader samFileHeader = new SAMFileHeader();
        samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
        IntervalList intervalList = new IntervalList(samFileHeader);
        int i = 0;
        // Intervals are numbered from 1 in output order.
        for (GenomeLoc loc: flankingList)
            intervalList.add(toInterval(loc, ++i));
        intervalList.write(flankingIntervals);
View Full Code Here

TOP

Related Classes of org.broadinstitute.gatk.engine.datasources.reference.ReferenceDataSource

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Oracle Corporation (originally developed by Sun Microsystems, Inc.). Contact coftware#gmail.com.