Package org.broadinstitute.gatk.engine.datasources.reference

Examples of org.broadinstitute.gatk.engine.datasources.reference.ReferenceDataSource


     * @return the sharding strategy
     */
    protected Iterable<Shard> getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
        ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
        DownsamplingMethod downsamplingMethod = readsDataSource != null ? readsDataSource.getReadsInfo().getDownsamplingMethod() : null;
        ReferenceDataSource referenceDataSource = this.getReferenceDataSource();

        // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
        if(!readsDataSource.isEmpty()) {
            if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM))
                throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed.  The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported.");
            if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
                throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");

            if(walker instanceof LocusWalker) {
                if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
                    throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data.  Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
                if(intervals == null)
                    return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer());
                else
                    return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
            }
            else if(walker instanceof ActiveRegionWalker) {
                if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
                    throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data.  Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
                if(intervals == null)
                    return readsDataSource.createShardIteratorOverMappedReads(new ActiveRegionShardBalancer());
                else
                    return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer());
            }
            else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
                // Apply special validation to read pair walkers.
                if(walker instanceof ReadPairWalker) {
                    if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
                        throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files.  You will need to resort your input BAM file in query name order to use this walker.");
                    if(intervals != null && !intervals.isEmpty())
                        throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
                }

                if(intervals == null)
                    return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
                else
                    return readsDataSource.createShardIteratorOverIntervals(intervals, new ReadShardBalancer());
            }
            else
                throw new ReviewedGATKException("Unable to determine walker type for walker " + walker.getClass().getName());
        }
        else {
            // TODO -- Determine what the ideal shard size should be here.  Matt suggested that a multiple of 16K might work well
            // TODO --  (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard
            // TODO --  size the more efficient the traversal (at least for RODWalkers).  Keeping the previous values for now.  [EB]
            final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000;
            if(intervals == null)
                return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE);
            else
                return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE);
        }
    }
View Full Code Here


     * Opens a reference sequence file paired with an index.  Only public for testing purposes
     *
     * @param refFile Handle to a reference sequence file.  Non-null.
     */
    public void setReferenceDataSource(File refFile) {
        this.referenceDataSource = new ReferenceDataSource(refFile);
        genomeLocParser = new GenomeLocParser(referenceDataSource.getReference());
    }
View Full Code Here

    @BeforeClass
    public void init() {
        // Build the hg18 test fixtures: a SAM header carrying the hg18 sequence
        // dictionary, a GenomeLocParser over the cached fasta, and an immutable list
        // of per-contig GenomeLocs covering the whole reference.
        File hg18Ref = new File(BaseTest.hg18Reference);
        try {
            ReferenceDataSource referenceDataSource = new ReferenceDataSource(hg18Ref);
            hg18Header = new SAMFileHeader();
            hg18Header.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
            ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg18Ref);
            hg18GenomeLocParser = new GenomeLocParser(seq);
            // One GenomeLoc per contig, wrapped unmodifiable so tests cannot mutate the fixture.
            hg18ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ;
        }
        catch(FileNotFoundException ex) {
            throw new UserException.CouldNotReadInputFile(hg18Ref,ex);
        }

        // Same fixtures for hg19, plus the parsed exome interval list used by interval tests.
        File hg19Ref = new File(BaseTest.hg19Reference);
        try {
            ReferenceDataSource referenceDataSource = new ReferenceDataSource(hg19Ref);
            hg19Header = new SAMFileHeader();
            hg19Header.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
            ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref);
            hg19GenomeLocParser = new GenomeLocParser(seq);
            hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ;

            hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals)));
        }
        catch(FileNotFoundException ex) {
            // NOTE(review): method is truncated in this view — the closing braces lie beyond it.
            throw new UserException.CouldNotReadInputFile(hg19Ref,ex);
View Full Code Here

     * Returns a map of contig names with their sizes.
     * @param reference The reference for the intervals.
     * @return A map of contig names with their sizes.
     */
    public static Map<String, Integer> getContigSizes(File reference) {
        ReferenceDataSource referenceSource = new ReferenceDataSource(reference);
        List<GenomeLoc> locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSource.getReference().getSequenceDictionary()).toList();
        Map<String, Integer> lengths = new LinkedHashMap<String, Integer>();
        for (GenomeLoc loc: locs)
            lengths.put(loc.getContig(), loc.size());
        return lengths;
    }
View Full Code Here

            size += loc.size();
        return size;
    }

    // Computes flanking intervals of the given width around each input interval and
    // writes them, numbered, to the output file in Picard IntervalList format.
    // NOTE(review): method is truncated in this view — its closing brace lies beyond it.
    public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) {
        ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference);
        // Parser is built from the reference so interval parsing validates against its dictionary.
        GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference());
        List<GenomeLoc> originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath());

        if (originalList.isEmpty())
            throw new UserException.MalformedFile(inputIntervals, "File contains no intervals");

        List<GenomeLoc> flankingList = getFlankingIntervals(parser, originalList, basePairs);

        if (flankingList.isEmpty())
            throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals");

        // The IntervalList output needs a header carrying the reference's sequence dictionary.
        SAMFileHeader samFileHeader = new SAMFileHeader();
        samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
        IntervalList intervalList = new IntervalList(samFileHeader);
        int i = 0;
        // Intervals are numbered from 1 in output order.
        for (GenomeLoc loc: flankingList)
            intervalList.add(toInterval(loc, ++i));
        intervalList.write(flankingIntervals);
View Full Code Here

TOP

Related Classes of org.broadinstitute.gatk.engine.datasources.reference.ReferenceDataSource

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Oracle Corporation (originally developed by Sun Microsystems, Inc.). Contact coftware#gmail.com.