Package org.broadinstitute.gatk.engine.filters

Source Code of org.broadinstitute.gatk.engine.filters.MalformedReadFilter

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.engine.filters;

import htsjdk.samtools.*;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.ReadProperties;
import org.broadinstitute.gatk.engine.arguments.ValidationExclusion;
import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource;
import org.broadinstitute.gatk.utils.exceptions.UserException;

import java.util.Collections;

/**
* Filter out malformed reads.
*
* @author mhanna
* @version 0.1
*/
public class MalformedReadFilter extends ReadFilter {


    private static final String FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME = "filter_reads_with_N_cigar" ;

    private SAMFileHeader header;

    @Argument(fullName = FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME, shortName = "filterRNC", doc = "filter out reads with CIGAR containing the N operator, instead of stop processing and report an error.", required = false)
    boolean filterReadsWithNCigar = false;


    @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required = false)
    boolean filterMismatchingBaseAndQuals = false;

    @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "if a read has no stored bases (i.e. a '*'), filter out the read instead of blowing up.", required = false)
    boolean filterBasesNotStored = false;

    /**
     * Indicates the applicable validation exclusions
     */
    private boolean allowNCigars;

    @Override
    public void initialize(final GenomeAnalysisEngine engine) {
        header = engine.getSAMFileHeader();
        ValidationExclusion validationExclusions = null;
        final SAMDataSource rds = engine.getReadsDataSource();
        if (rds != null) {
          final ReadProperties rps = rds.getReadsInfo();
          if (rps != null) {
            validationExclusions = rps.getValidationExclusionList();
          }
        }
        if (validationExclusions == null) {
            allowNCigars = false;
        } else {
            allowNCigars = validationExclusions.contains(ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS);
        }
    }

    public boolean filterOut(final SAMRecord read) {
        // slowly changing the behavior to blow up first and filtering out if a parameter is explicitly provided
        return  !checkInvalidAlignmentStart(read) ||
                !checkInvalidAlignmentEnd(read) ||
                !checkAlignmentDisagreesWithHeader(this.header,read) ||
                !checkHasReadGroup(read) ||
                !checkMismatchingBasesAndQuals(read, filterMismatchingBaseAndQuals) ||
                !checkCigarDisagreesWithAlignment(read) ||
                !checkSeqStored(read, filterBasesNotStored) ||
                !checkCigarIsSupported(read,filterReadsWithNCigar,allowNCigars);
    }

    private static boolean checkHasReadGroup(final SAMRecord read) {
        if ( read.getReadGroup() == null ) {
            // there are 2 possibilities: either the RG tag is missing or it is not defined in the header
            final String rgID = (String)read.getAttribute(SAMTagUtil.getSingleton().RG);
            if ( rgID == null )
                throw new UserException.ReadMissingReadGroup(read);
            throw new UserException.ReadHasUndefinedReadGroup(read, rgID);
        }
        return true;
    }

    /**
     * Check for the case in which the alignment start is inconsistent with the read unmapped flag.
     * @param read The read to validate.
     * @return true if read start is valid, false otherwise.
     */
    private static boolean checkInvalidAlignmentStart(final SAMRecord read ) {
        // read is not flagged as 'unmapped', but alignment start is NO_ALIGNMENT_START
        if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START )
            return false;
        // Read is not flagged as 'unmapped', but alignment start is -1
        if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == -1 )
            return false;
        return true;
    }

    /**
     * Check for invalid end of alignments.
     * @param read The read to validate.
     * @return true if read end is valid, false otherwise.
     */
    private static boolean checkInvalidAlignmentEnd(final SAMRecord read ) {
        // Alignment aligns to negative number of bases in the reference.
        if( !read.getReadUnmappedFlag() && read.getAlignmentEnd() != -1 && (read.getAlignmentEnd()-read.getAlignmentStart()+1)<0 )
            return false;
        return true;
    }

    /**
     * Check to ensure that the alignment makes sense based on the contents of the header.
     * @param header The SAM file header.
     * @param read The read to verify.
     * @return true if alignment agrees with header, false othrewise.
     */
    private static boolean checkAlignmentDisagreesWithHeader(final SAMFileHeader header, final SAMRecord read ) {
        // Read is aligned to nonexistent contig
        if( read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START )
            return false;
        final SAMSequenceRecord contigHeader = header.getSequence( read.getReferenceIndex() );
        // Read is aligned to a point after the end of the contig
        if( !read.getReadUnmappedFlag() && read.getAlignmentStart() > contigHeader.getSequenceLength() )
            return false;
        return true;
    }

    /**
     * Check for inconsistencies between the cigar string and the
     * @param read The read to validate.
     * @return true if cigar agrees with alignment, false otherwise.
     */
    private static boolean checkCigarDisagreesWithAlignment(final SAMRecord read) {
        // Read has a valid alignment start, but the CIGAR string is empty
        if( !read.getReadUnmappedFlag() &&
            read.getAlignmentStart() != -1 &&
            read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START &&
            read.getAlignmentBlocks().size() < 0 )
            return false;
        return true;
    }

    /**
     * Check for unsupported CIGAR operators.
     * Currently the N operator is not supported.
     * @param read The read to validate.
     * @param filterReadsWithNCigar whether the offending read should just
     *                              be silently filtered or not.
     * @param allowNCigars whether reads that contain N operators in their CIGARs
     *                     can be processed or an exception should be thrown instead.
     * @throws UserException.UnsupportedCigarOperatorException
     *   if {@link #filterReadsWithNCigar} is <code>false</code> and
     *   the input read has some unsupported operation.
     * @return <code>true</code> if the read CIGAR operations are
     * fully supported, otherwise <code>false</code>, as long as
     * no exception has been thrown.
     */
    private static boolean checkCigarIsSupported(final SAMRecord read, final boolean filterReadsWithNCigar, final boolean allowNCigars) {
        if( containsNOperator(read)) {
            if (! filterReadsWithNCigar && !allowNCigars) {
                throw new UserException.UnsupportedCigarOperatorException(
                        CigarOperator.N,read,
                        "Perhaps you are"
                        + " trying to use RNA-Seq data?"
                        + " While we are currently actively working to"
                        + " support this data type unfortunately the"
                        + " GATK cannot be used with this data in its"
                        + " current form. You have the option of either"
                        + " filtering out all reads with operator "
                        + CigarOperator.N + " in their CIGAR string"
                        + " (please add --"
                        +  FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME
                        + " to your command line) or"
                        + " assume the risk of processing those reads as they"
                        + " are including the pertinent unsafe flag (please add -U"
                        + ' ' + ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS
                        + " to your command line). Notice however that if you were"
                        + " to choose the latter, an unspecified subset of the"
                        + " analytical outputs of an unspecified subset of the tools"
                        + " will become unpredictable. Consequently the GATK team"
                        + " might well not be able to provide you with the usual support"
                        + " with any issue regarding any output");
            }
            return ! filterReadsWithNCigar;
        }
        return true;
    }

    private static boolean containsNOperator(final SAMRecord read) {
        final Cigar cigar = read.getCigar();
        if (cigar == null)   {
            return false;
        }
        for (final CigarElement ce : cigar.getCigarElements()) {
            if (ce.getOperator() == CigarOperator.N) {
                return true;
            }
        }
        return false;
    }

    /**
     * Check if the read has the same number of bases and base qualities
     * @param read the read to validate
     * @return true if they have the same number. False otherwise.
     */
    private static boolean checkMismatchingBasesAndQuals(final SAMRecord read, final boolean filterMismatchingBaseAndQuals) {
        final boolean result;
        if (read.getReadLength() == read.getBaseQualities().length)
            result = true;
        else if (filterMismatchingBaseAndQuals)
            result = false;
        else
            throw new UserException.MalformedBAM(read,
                    String.format("BAM file has a read with mismatching number of bases and base qualities. Offender: %s [%d bases] [%d quals].%s",
                            read.getReadName(), read.getReadLength(), read.getBaseQualities().length,
                            read.getBaseQualities().length == 0 ? " You can use --defaultBaseQualities to assign a default base quality for all reads, but this can be dangerous in you don't know what you are doing." : ""));

        return result;
    }

    /**
     * Check if the read has its base sequence stored
     * @param read the read to validate
     * @return true if the sequence is stored and false otherwise ("*" in the SEQ field).
     */
    protected static boolean checkSeqStored(final SAMRecord read, final boolean filterBasesNotStored) {

        if ( read.getReadBases() != SAMRecord.NULL_SEQUENCE )
            return true;

        if ( filterBasesNotStored )
            return false;

        throw new UserException.MalformedBAM(read, String.format("the BAM file has a read with no stored bases (i.e. it uses '*') which is not supported in the GATK; see the --filter_bases_not_stored argument. Offender: %s", read.getReadName()));
    }
}
TOP

Related Classes of org.broadinstitute.gatk.engine.filters.MalformedReadFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.