Package org.broadinstitute.gatk.engine.alignment.reference.bwt

Source Code of org.broadinstitute.gatk.engine.alignment.reference.bwt.CreateBWTFromReference

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.engine.alignment.reference.bwt;

import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import org.broadinstitute.gatk.engine.alignment.reference.packing.PackUtils;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;

import java.io.File;
import java.io.IOException;

/**
* Create a suffix array data structure.
*
* @author mhanna
* @version 0.1
*/
public class CreateBWTFromReference {
    private byte[] loadReference( File inputFile ) {
        // Read in the first sequence in the input file
        ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
        ReferenceSequence sequence = reference.nextSequence();
        return sequence.getBases();
    }

    private byte[] loadReverseReference( File inputFile ) {
        ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
        ReferenceSequence sequence = reference.nextSequence();
        PackUtils.reverse(sequence.getBases());
        return sequence.getBases();
    }

    private Counts countOccurrences( byte[] sequence ) {
        Counts occurrences = new Counts();
        for( byte base: sequence )
            occurrences.increment(base);
        return occurrences;
    }

    private long[] createSuffixArray( byte[] sequence ) {
        return SuffixArray.createFromReferenceSequence(sequence).sequence;
    }

    private long[] invertSuffixArray( long[] suffixArray ) {
        long[] inverseSuffixArray = new long[suffixArray.length];
        for( int i = 0; i < suffixArray.length; i++ )
            inverseSuffixArray[(int)suffixArray[i]] = i;
        return inverseSuffixArray;
    }

    private long[] createCompressedSuffixArray( int[] suffixArray, int[] inverseSuffixArray ) {
        long[] compressedSuffixArray = new long[suffixArray.length];
        compressedSuffixArray[0] = inverseSuffixArray[0];
        for( int i = 1; i < suffixArray.length; i++ )
            compressedSuffixArray[i] = inverseSuffixArray[suffixArray[i]+1];
        return compressedSuffixArray;
    }

    private long[] createInversedCompressedSuffixArray( int[] compressedSuffixArray ) {
        long[] inverseCompressedSuffixArray = new long[compressedSuffixArray.length];
        for( int i = 0; i < compressedSuffixArray.length; i++ )
            inverseCompressedSuffixArray[compressedSuffixArray[i]] = i;
        return inverseCompressedSuffixArray;
    }

    public static void main( String argv[] ) throws IOException {
        if( argv.length != 5 ) {
            System.out.println("USAGE: CreateBWTFromReference <input>.fasta <output bwt> <output rbwt> <output sa> <output rsa>");
            return;
        }

        String inputFileName = argv[0];
        File inputFile = new File(inputFileName);

        String bwtFileName = argv[1];
        File bwtFile = new File(bwtFileName);

        String rbwtFileName = argv[2];
        File rbwtFile = new File(rbwtFileName);

        String saFileName = argv[3];
        File saFile = new File(saFileName);

        String rsaFileName = argv[4];
        File rsaFile = new File(rsaFileName);

        CreateBWTFromReference creator = new CreateBWTFromReference();

        byte[] sequence = creator.loadReference(inputFile);
        byte[] reverseSequence = creator.loadReverseReference(inputFile);

        // Count the occurences of each given base.
        Counts occurrences = creator.countOccurrences(sequence);
        System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences.getCumulative(Bases.A),
                                                                  occurrences.getCumulative(Bases.C),
                                                                  occurrences.getCumulative(Bases.G),
                                                                  occurrences.getCumulative(Bases.T));

        // Generate the suffix array and print diagnostics.
        long[] suffixArrayData = creator.createSuffixArray(sequence);
        long[] reverseSuffixArrayData = creator.createSuffixArray(reverseSequence);

        // Invert the suffix array and print diagnostics.
        long[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData);
        long[] reverseInverseSuffixArray = creator.invertSuffixArray(reverseSuffixArrayData);

        SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData );
        SuffixArray reverseSuffixArray = new SuffixArray( reverseInverseSuffixArray[0], occurrences, reverseSuffixArrayData );

        /*
        // Create the data structure for the compressed suffix array and print diagnostics.
        int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray);
        int reconstructedInverseSA = compressedSuffixArray[0];
        for( int i = 0; i < 8; i++ ) {
            System.out.printf("compressedSuffixArray[%d] = %d (SA-1[%d] = %d)%n", i, compressedSuffixArray[i], i, reconstructedInverseSA);
            reconstructedInverseSA = compressedSuffixArray[reconstructedInverseSA];
        }

        // Create the data structure for the inverse compressed suffix array and print diagnostics.
        int[] inverseCompressedSuffixArray = creator.createInversedCompressedSuffixArray(compressedSuffixArray);
        for( int i = 0; i < 8; i++ ) {
            System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]);
        }
        */

        // Create the BWT.
        BWT bwt = BWT.createFromReferenceSequence(sequence);
        BWT reverseBWT = BWT.createFromReferenceSequence(reverseSequence);

        byte[] bwtSequence = bwt.getSequence();
        System.out.printf("BWT: %s... (length = %d)%n", new String(bwtSequence,0,80),bwt.length());

        BWTWriter bwtWriter = new BWTWriter(bwtFile);
        bwtWriter.write(bwt);
        bwtWriter.close();

        BWTWriter reverseBWTWriter = new BWTWriter(rbwtFile);
        reverseBWTWriter.write(reverseBWT);
        reverseBWTWriter.close();

        /*
        SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
        saWriter.write(suffixArray);
        saWriter.close();

        SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile);
        reverseSAWriter.write(reverseSuffixArray);
        reverseSAWriter.close();
        */

        File existingBWTFile = new File(inputFileName+".bwt");
        BWTReader existingBWTReader = new BWTReader(existingBWTFile);
        BWT existingBWT = existingBWTReader.read();

        byte[] existingBWTSequence = existingBWT.getSequence();
        System.out.printf("Existing BWT: %s... (length = %d)%n",new String(existingBWTSequence,0,80),existingBWT.length());

        for( int i = 0; i < bwt.length(); i++ ) {
            if( bwtSequence[i] != existingBWTSequence[i] )
                throw new ReviewedGATKException("BWT mismatch at " + i);
        }

        File existingSAFile = new File(inputFileName+".sa");
        SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile,existingBWT);
        SuffixArray existingSuffixArray = existingSuffixArrayReader.read();

        for(int i = 0; i < suffixArray.length(); i++) {
            if( i % 10000 == 0 )
                System.out.printf("Validating suffix array entry %d%n", i);
            if( suffixArray.get(i) != existingSuffixArray.get(i) )
                throw new ReviewedGATKException(String.format("Suffix array mismatch at %d; SA is %d; should be %d",i,existingSuffixArray.get(i),suffixArray.get(i)));
        }
    }

}
TOP

Related Classes of org.broadinstitute.gatk.engine.alignment.reference.bwt.CreateBWTFromReference

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.