Package picard.fastq

Source Code of picard.fastq.BamToBfqWriter

/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.fastq;

import htsjdk.samtools.ReservedTagConstants;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileReader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.filter.AggregateFilter;
import htsjdk.samtools.filter.FailsVendorReadQualityFilter;
import htsjdk.samtools.filter.FilteringIterator;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.filter.TagFilter;
import htsjdk.samtools.filter.WholeReadClippedFilter;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.PeekableIterator;
import picard.PicardException;

import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
* Class to take unmapped reads in BAM file format and create Maq binary fastq format file(s) --
* one or two of them, depending on whether it's a paired-end read.  This relies on the unmapped
* BAM file having all paired reads together in order.
*/
public class BamToBfqWriter {

    private static final int SEED_REGION_LENGTH = 28;
    private static final int MAX_SEED_REGION_NOCALL_FIXES = 2;

    private final File bamFile;
    private final String outputPrefix;
    private final String namePrefix;
    private final int nameTrim;
    private boolean pairedReads = false;
    private int wrote = 0;
    private int increment = 1;
    private int chunk = 0;
    private BinaryCodec codec1;
    private BinaryCodec codec2;
    private final Log log = Log.getInstance(BamToBfqWriter.class);
    private final boolean includeNonPfReads;
    private final boolean clipAdapters;
    private final Integer basesToWrite;

    /**
     * Constructor
     *
     * @param bamFile        the BAM file to read from
     * @param outputPrefix   the directory and file prefix for the binary fastq files
     * @param total          the total number of records that should be written, drawn evenly
     *                       from throughout the file (null for all).
     * @param chunk          the maximum number of records that should be written to any one file
     * @param pairedReads    whether these reads are from  a paired-end run
     * @param namePrefix     The string to be stripped off the read name
     *                       before writing to the bfq file. May be null, in which case
     *                       the name will not be trimmed.
     * @param includeNonPfReads whether to include non pf-reads
     * @param clipAdapters    whether to replace adapters as marked with XT:i clipping position attribute
     */
    public BamToBfqWriter(final File bamFile, final String outputPrefix, final Integer total,
                          final Integer chunk, final boolean pairedReads, String namePrefix,
                          boolean includeNonPfReads, boolean clipAdapters, Integer basesToWrite) {

        IOUtil.assertFileIsReadable(bamFile);
        this.bamFile = bamFile;
        this.outputPrefix = outputPrefix;
        this.pairedReads = pairedReads;
        if (total != null) {
            final double writeable = (double)countWritableRecords();
            this.increment = (int)Math.floor(writeable/total.doubleValue());
            if (this.increment == 0) {
                this.increment = 1;
            }
        }
        if (chunk != null) {
            this.chunk = chunk;
        }
        this.namePrefix = namePrefix;
        this.nameTrim = namePrefix != null ? namePrefix.length() : 0;
        this.includeNonPfReads = includeNonPfReads;
        this.clipAdapters = clipAdapters;
        this.basesToWrite = basesToWrite;
    }

    /**
     * Constructor
     *
     * @param bamFile   the BAM file to read from
     * @param outputPrefix   the directory and file prefix for the binary fastq files
     * @param pairedReads    whether these reads are from  a paired-end run
     * @param namePrefix     the barcode of the run (to be stripped off the read name
     *                       before writing to the bfq file)
     * @param includeNonPfReads whether to include non pf-reads
     */
    public BamToBfqWriter(final File bamFile, final String outputPrefix, final boolean pairedReads,
                          String namePrefix, boolean includeNonPfReads) {
        this(bamFile, outputPrefix, null, null, pairedReads, namePrefix, includeNonPfReads, true, null);
    }
    /**
     * Writes the binary fastq file(s) to the output directory
     */
    public void writeBfqFiles() {

        final Iterator<SAMRecord> iterator = (new SAMFileReader(IOUtil.openFileForReading(this.bamFile))).iterator();

        // Filter out noise reads and reads that fail the quality filter
        final TagFilter tagFilter = new TagFilter(ReservedTagConstants.XN, 1);
        final FailsVendorReadQualityFilter qualityFilter = new FailsVendorReadQualityFilter();
        final WholeReadClippedFilter clippedFilter = new WholeReadClippedFilter();


        if (!pairedReads) {
            List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
            filters.add(tagFilter);
            filters.add(clippedFilter);
            if (!this.includeNonPfReads) {
                filters.add(qualityFilter);
            }
            writeSingleEndBfqs(iterator, filters);
            codec1.close();
        }
        else {
            writePairedEndBfqs(iterator, tagFilter, qualityFilter, clippedFilter);
            codec1.close();
            codec2.close();
        }
        log.info("Wrote " + wrote + " bfq records.");

    }

    /**
     * Path for writing bfqs for paired-end reads
     *
     * @param iterator      the iterator witht he SAM Records to write
     * @param tagFilter     the filter for noise reads
     * @param qualityFilter the filter for PF reads
     */
    private void writePairedEndBfqs(final Iterator<SAMRecord> iterator, final TagFilter tagFilter,
                                    final FailsVendorReadQualityFilter qualityFilter,
                                    SamRecordFilter ... otherFilters) {
        // Open the codecs for writing
        int fileIndex = 0;
        initializeNextBfqFiles(fileIndex++);

        int records = 0;

        RECORD_LOOP: while (iterator.hasNext()) {
            final SAMRecord first = iterator.next();
            if (!iterator.hasNext()) {
                throw new PicardException("Mismatched number of records in " + this.bamFile.getAbsolutePath());
            }
            final SAMRecord second = iterator.next();
            if (!second.getReadName().equals(first.getReadName()) ||
                first.getFirstOfPairFlag() == second.getFirstOfPairFlag()) {
                throw new PicardException("Unmatched read pairs in " + this.bamFile.getAbsolutePath() +
                    ": " + first.getReadName() + ", " + second.getReadName() + ".");
            }

            // If *both* are noise reads, filter them out
            if (tagFilter.filterOut(first) && tagFilter.filterOut(second))  {
                continue;
            }

            // If either fails to pass filter, then exclude them as well
            if (!includeNonPfReads && (qualityFilter.filterOut(first) || qualityFilter.filterOut(second))) {
                continue;
            }

            // If either fails any of the other filters, exclude them both
            for (SamRecordFilter filter : otherFilters) {
                if (filter.filterOut(first) || filter.filterOut(second)) {
                    continue RECORD_LOOP;
                }
            }

            // Otherwise, write them out
            records++;
            if (records % increment == 0) {
                first.setReadName(first.getReadName() + "/1");
                writeFastqRecord(first.getFirstOfPairFlag() ? codec1 : codec2, first);
                second.setReadName(second.getReadName() + "/2");
                writeFastqRecord(second.getFirstOfPairFlag() ? codec1 : codec2, second);
                wrote++;
                if (wrote % 1000000 == 0) {
                    log.info(wrote + " records written.");
                }
                if (chunk > 0 && wrote % chunk == 0) {
                    initializeNextBfqFiles(fileIndex++);
                }
            }
        }
    }

    /**
     * Path for writing bfqs for single-end reads
     *
     * @param iterator  the iterator with he SAM Records to write
     * @param filters   the list of filters to be applied
     */
    private void writeSingleEndBfqs(final Iterator<SAMRecord> iterator, final List<SamRecordFilter> filters) {

        // Open the codecs for writing
        int fileIndex = 0;
        initializeNextBfqFiles(fileIndex++);

        int records = 0;

        final FilteringIterator it = new FilteringIterator(iterator, new AggregateFilter(filters));
        while (it.hasNext()) {
            final SAMRecord record = it.next();
            records++;
            if (records % increment == 0) {

                record.setReadName(record.getReadName() + "/1");
                writeFastqRecord(codec1, record);
                wrote++;
                if (wrote % 1000000 == 0) {
                    log.info(wrote + " records processed.");
                }
                if (chunk > 0 && wrote % chunk == 0) {
                    initializeNextBfqFiles(fileIndex++);
                }
            }
        }
    }

    /**
     * Closes any the open bfq file(s), if any, and opens the new one(s)
     *
     * @param fileIndex the index (counter) of the files to write
     */
    private void initializeNextBfqFiles(final int fileIndex) {
        // Close the codecs if they were writing before
        if (codec1 != null) {
            codec1.close();
            if (pairedReads) {
                codec2.close();
            }
        }

        // Open new file, using the fileIndex.
        final File bfq1 = getOutputFile(this.outputPrefix , 1, fileIndex);
        codec1 = new BinaryCodec(IOUtil.openFileForWriting(bfq1));
        log.info("Now writing to file " + bfq1.getAbsolutePath());
        if (pairedReads) {
            final File bfq2 = getOutputFile(this.outputPrefix , 2, fileIndex);
            codec2 = new BinaryCodec(IOUtil.openFileForWriting(bfq2));
            log.info("Now writing to file " + bfq2.getAbsolutePath());
        }
    }

    /**
     * Writes out a SAMRecord in Maq fastq format
     *
     * @param codec the code to write to
     * @param rec   the SAMRecord to write
     */
    private void writeFastqRecord(final BinaryCodec codec, final SAMRecord rec) {

        // Trim the run barcode off the read name
        String readName = rec.getReadName();
        if (namePrefix != null && readName.startsWith(namePrefix)) {
            readName = readName.substring(nameTrim);
        }
        // Writes the length of the read name and then the name (null-terminated)
        codec.writeString(readName, true, true);

        final char[] seqs = rec.getReadString().toCharArray();
        final char[] quals = rec.getBaseQualityString().toCharArray();

        int retainedLength = seqs.length;
        if (clipAdapters){
            // adjust to a shorter length iff clipping tag exists
            Integer trimPoint = rec.getIntegerAttribute(ReservedTagConstants.XT);
            if (trimPoint != null) {
                assert (rec.getReadLength() == seqs.length);
                retainedLength = Math.min(seqs.length, Math.max(SEED_REGION_LENGTH, trimPoint -1));
            }
        }

        // Write the length of the sequence
        codec.writeInt(basesToWrite != null ? basesToWrite : seqs.length);

        // Calculate and write the sequence and qualities
        final byte[] seqsAndQuals = encodeSeqsAndQuals(seqs, quals, retainedLength);
        codec.writeBytes(seqsAndQuals);
    }

    private byte[] encodeSeqsAndQuals(char[] seqs, char[] quals, int retainedLength) {
        final byte[] seqsAndQuals = new byte[basesToWrite == null ? seqs.length : basesToWrite];

        int seedRegionNoCallFixes = 0;
        for (int i = 0; i < retainedLength && i < seqsAndQuals.length; i++) {
            int quality = Math.min(quals[i]-33, 63);
            final int base;
            switch(seqs[i]) {
                case 'A':
                case 'a':
                    base = 0;
                    break;
                case 'C':
                case 'c':
                    base = 1;
                    break;
                case 'G':
                case 'g':
                    base = 2;
                    break;
                case 'T':
                case 't':
                    base = 3;
                    break;
                case 'N':
                case 'n':
                case '.':
                    base = 0;
                    if (i < SEED_REGION_LENGTH ) {
                        if (seedRegionNoCallFixes < MAX_SEED_REGION_NOCALL_FIXES) {
                            quality = 1;
                            seedRegionNoCallFixes++;
                        }
                        else {
                            quality = 0;
                        }
                    }
                    else {
                        quality = 1;
                    }
                    break;
                default:
                    throw new PicardException("Unknown base when writing bfq file: " + seqs[i]);
            }
            seqsAndQuals[i] = encodeBaseAndQuality(base, quality);
        }
        // rewrite clipped adapter with all A's of quality 1
        for (int i = retainedLength; i < seqsAndQuals.length; i++) {
            seqsAndQuals[i] = encodeBaseAndQuality(0, 1);
        }

        return seqsAndQuals;
    }

    private byte encodeBaseAndQuality(int base, int quality) {
        return (byte) ((base << 6) | quality);
    }

    /**
     * Count the number of records in the bamFile that could potentially be written
     *
     * @return  the number of records in the Bam file
     */
    private int countWritableRecords() {
        int count = 0;
       
        SAMFileReader reader = new SAMFileReader(IOUtil.openFileForReading(this.bamFile));
        if(!reader.getFileHeader().getSortOrder().equals(SAMFileHeader.SortOrder.queryname)) {
          //this is a fix for issue PIC-274: It looks like BamToBfqWriter requires that the input BAM is queryname sorted,
          //but it doesn't check this early, nor produce an understandable error message."
          throw new PicardException("Input file (" + this.bamFile.getAbsolutePath() +") needs to be sorted by queryname.");
        }
        final PeekableIterator<SAMRecord> it = new PeekableIterator<SAMRecord>(reader.iterator());
        if (!this.pairedReads) {
            // Filter out noise reads and reads that fail the quality filter
            final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
            filters.add(new TagFilter(ReservedTagConstants.XN, 1));
            if (!this.includeNonPfReads) {
                filters.add(new FailsVendorReadQualityFilter());
            }
            final FilteringIterator itr = new FilteringIterator(it, new AggregateFilter(filters));
            while (itr.hasNext()) {
                itr.next();
                count++;
            }
        }
        else {
            while (it.hasNext()) {
                final SAMRecord first = it.next();
                final SAMRecord second = it.next();
                // If both are noise reads, filter them out
                if (first.getAttribute(ReservedTagConstants.XN) != null &&
                    second.getAttribute(ReservedTagConstants.XN) != null)  {
                    // skip it
                }
                // If either fails to pass filter, then exclude them as well
                else if (!this.includeNonPfReads && (first.getReadFailsVendorQualityCheckFlag() || second.getReadFailsVendorQualityCheckFlag()) ) {
                    // skip it
                }
                // Otherwise, write them out
                else {
                    count++;
                }
            }
        }
        it.close();
        return count;
    }

    /**
     * Constructs the name for the output file and returns the file
     *
     * @param outputPrefix        the directory and file prefix for the output bfq file
     * @param read                whether this is the file for the first or second read
     * @param index               used in file name
     * @return                    a new File object for the bfq file.
     */
    private File getOutputFile(final String outputPrefix, final int read, final int index) {
        final File result = new File(outputPrefix + index + "." + read + ".bfq");
        IOUtil.assertFileIsWritable(result);
        return result;
    }

}
TOP

Related Classes of picard.fastq.BamToBfqWriter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.