Package picard.analysis

Source Code of picard.analysis.CpgLocation

/*
* The MIT License
*
* Copyright (c) 2013 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package picard.analysis;


import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.SequenceUtil;
import picard.metrics.PerUnitMetricCollector;
import picard.metrics.SAMRecordAndReference;
import picard.metrics.SAMRecordAndReferenceMultiLevelCollector;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;

public class RrbsMetricsCollector extends SAMRecordAndReferenceMultiLevelCollector<RrbsMetrics, Comparable<?>> {
  private final int minReadLength;
  private final double maxMismatchRate;
  private final int cQualityThreshold;
  private final int nextBaseQualityThreshold;

  public RrbsMetricsCollector(final Set<MetricAccumulationLevel> accumulationLevels, final List<SAMReadGroupRecord> samRgRecords,
                final int cQualityThreshold, final int nextBaseQualityThreshold, final int minReadLength,
                final double maxMismatchRate) {
    this.cQualityThreshold = cQualityThreshold;
    this.nextBaseQualityThreshold = nextBaseQualityThreshold;
    this.minReadLength = minReadLength;
    this.maxMismatchRate = maxMismatchRate;
    setup(accumulationLevels, samRgRecords);
  }

  @Override
  protected PerUnitMetricCollector<RrbsMetrics, Comparable<?>, SAMRecordAndReference> makeChildCollector(final String sample, final String library, final String readGroup) {
    return new PerUnitRrbsMetricsCollector(sample, library, readGroup);
  }

  private class PerUnitRrbsMetricsCollector implements PerUnitMetricCollector<RrbsMetrics, Comparable<?>, SAMRecordAndReference> {
    final String sample;
    final String library;
    final String readGroup;

    // Counters for CpG & non-CpG seen/converted sites
    int nCytoConverted = 0;
    int nCytoTotal = 0;
    final Histogram<CpgLocation> cpgTotal = new Histogram<CpgLocation>();
    final Histogram<CpgLocation> cpgConverted = new Histogram<CpgLocation>();

    // Counters for QC filters used in the final metrics
    int mappedRecordCount = 0;
    int smallReadCount = 0;
    int mismatchCount = 0;
    int noCpgCount = 0;

    // Final metrics calculated once all the reads are done
    double cytoConversionRate;
    double cpgConversionRate;
    int nCpgSeen;
    int nCpgConverted;
    double coverageMean;
    int coverageMedian;

    public PerUnitRrbsMetricsCollector(final String sample, final String library, final String readGroup) {
      this.sample = sample;
      this.library = library;
      this.readGroup = readGroup;
    }

    public void acceptRecord(final SAMRecordAndReference args) {
      mappedRecordCount++;

      final SAMRecord samRecord = args.getSamRecord();
      final ReferenceSequence referenceSequence = args.getReferenceSequence();

      final byte[] readBases = samRecord.getReadBases();
      final byte[] readQualities = samRecord.getBaseQualities();
      final byte[] refBases = referenceSequence.getBases();

      if (samRecord.getReadLength() < minReadLength) {
        smallReadCount++;
        return;
      } else if (SequenceUtil.countMismatches(samRecord, refBases, true) > Math.round(samRecord.getReadLength() * maxMismatchRate)) {
        mismatchCount++;
        return;
      }

      // We only record non-CpG C sites if there was at least one CpG in the read, keep track of
      // the values for this record and then apply to the global totals if valid
      int recordCpgs = 0;

      for (final AlignmentBlock alignmentBlock : samRecord.getAlignmentBlocks()) {
        final int blockLength = alignmentBlock.getLength();
        final int refFragmentStart = alignmentBlock.getReferenceStart() - 1;
        final int readFragmentStart = alignmentBlock.getReadStart() - 1;

        final byte[] refFragment = getFragment(refBases, refFragmentStart, blockLength);
        final byte[] readFragment = getFragment(readBases, readFragmentStart, blockLength);
        final byte[] readQualityFragment = getFragment(readQualities, readFragmentStart, blockLength);

        if (samRecord.getReadNegativeStrandFlag()) {
          // In the case of a negative strand, reverse (and complement for base arrays) the reference,
          // reads & qualities so that it can be treated as a positive strand for the rest of the process
          SequenceUtil.reverseComplement(refFragment);
          SequenceUtil.reverseComplement(readFragment);
          SequenceUtil.reverseQualities(readQualityFragment);
        }

        for (int i=0; i < blockLength-1; i++) {
          final int curRefIndex = getCurRefIndex(refFragmentStart, blockLength, i, samRecord.getReadNegativeStrandFlag());

          // Look at a 2-base window to see if we're on a CpG site, and if so check for conversion
          // (CG -> TG). We do not consider ourselves to be on a CpG site if we're on the last base of a read
          if ((SequenceUtil.basesEqual(refFragment[i], SequenceUtil.C)) &&
              (SequenceUtil.basesEqual(refFragment[i+1], SequenceUtil.G))) {
            // We want to catch the case where there's a CpG in the reference, even if it is not valid
            // to prevent the C showing up as a non-CpG C down below. Otherwise this could have been all
            // in one if statement
            if (isValidCpg(refFragment, readFragment, readQualityFragment, i)) {
              recordCpgs++;
              final CpgLocation curLocation = new CpgLocation(samRecord.getReferenceName(), curRefIndex);
              cpgTotal.increment(curLocation);
              if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) {
                cpgConverted.increment(curLocation);
              }
            }
            i++;
          } else if (isC(refFragment[i], readFragment[i]) && isAboveCytoQcThreshold(readQualities, i) &&
              SequenceUtil.bisulfiteBasesEqual(false, readFragment[i+1], refFragment[i+1])) {
            // C base in the reference that's not associated with a CpG
            nCytoTotal++;
            if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) {
              nCytoConverted++;
            }
          }
        }
      }

      if (recordCpgs == 0) {
        noCpgCount++;
      }
    }

    public void finish() {
      cytoConversionRate = nCytoTotal == 0 ? 0 : nCytoConverted / (double)nCytoTotal;
      nCpgSeen = (int)cpgTotal.getSumOfValues();
      nCpgConverted = (int)cpgConverted.getSumOfValues();
      cpgConversionRate = nCpgSeen == 0 ? 0 : nCpgConverted / (double)nCpgSeen;
      coverageMean = cpgTotal.getMeanBinSize();
      coverageMedian = (int)cpgTotal.getMedianBinSize();
    }

    @Override
    public void addMetricsToFile(final MetricsFile<RrbsMetrics, Comparable<?>> metricsFile) {
      // Create both the summary and detail metrics & add them to the RrbsMetrics container class for
      // the downstream code to use as desired
      final RrbsSummaryMetrics summaryMetrics = buildSummaryMetrics();
      final List<RrbsCpgDetailMetrics> detailMetrics = buildDetailMetrics();
      final RrbsMetrics rrbsMetrics = new RrbsMetrics(summaryMetrics, detailMetrics);
      metricsFile.addMetric(rrbsMetrics);
    }

    private RrbsSummaryMetrics buildSummaryMetrics() {
      final RrbsSummaryMetrics summaryMetrics = new RrbsSummaryMetrics();
      summaryMetrics.SAMPLE = sample;
      summaryMetrics.READ_GROUP = readGroup;
      summaryMetrics.LIBRARY = library;
      summaryMetrics.READS_ALIGNED = mappedRecordCount;
      summaryMetrics.NON_CPG_BASES = nCytoTotal;
      summaryMetrics.NON_CPG_CONVERTED_BASES = nCytoConverted;
      summaryMetrics.PCT_NON_CPG_BASES_CONVERTED = cytoConversionRate;
      summaryMetrics.CPG_BASES_SEEN = nCpgSeen;
      summaryMetrics.CPG_BASES_CONVERTED = nCpgConverted;
      summaryMetrics.PCT_CPG_BASES_CONVERTED = cpgConversionRate;
      summaryMetrics.MEAN_CPG_COVERAGE = coverageMean;
      summaryMetrics.MEDIAN_CPG_COVERAGE = coverageMedian;
      summaryMetrics.READS_IGNORED_SHORT = smallReadCount;
      summaryMetrics.READS_WITH_NO_CPG = noCpgCount;
      summaryMetrics.READS_IGNORED_MISMATCHES = mismatchCount;
      return summaryMetrics;
    }

    private List<RrbsCpgDetailMetrics> buildDetailMetrics() {
      final List<RrbsCpgDetailMetrics> detailMetrics = new ArrayList<RrbsCpgDetailMetrics>();
      for (final CpgLocation key : cpgTotal.keySet()) {
        final RrbsCpgDetailMetrics cpgMetric = new RrbsCpgDetailMetrics();
        cpgMetric.SAMPLE = sample;
        cpgMetric.READ_GROUP = readGroup;
        cpgMetric.LIBRARY = library;
        cpgMetric.SEQUENCE_NAME = key.getSequence();
        cpgMetric.POSITION = key.getPosition();
        cpgMetric.TOTAL_SITES = (int)cpgTotal.get(key).getValue();
        cpgMetric.CONVERTED_SITES = cpgConverted.containsKey(key) ? (int)cpgConverted.get(key).getValue() : 0;
        cpgMetric.PCT_CONVERTED = cpgMetric.CONVERTED_SITES == 0 ? 0 : cpgMetric.CONVERTED_SITES / (double)cpgMetric.TOTAL_SITES;
        detailMetrics.add(cpgMetric);
      }
      return detailMetrics;
    }
  }

  private byte[] getFragment(final byte[] fullArray, final int fragmentStart, final int length) {
    return Arrays.copyOfRange(fullArray, fragmentStart, fragmentStart + length);
  }

  /**
   * True if there's a C in the reference as well as read (possibly bisulfite converted)
   */
  private boolean isC(final byte refBase, final byte readBase) {
    return (SequenceUtil.basesEqual(refBase, SequenceUtil.C) && SequenceUtil.bisulfiteBasesEqual(readBase, refBase));
  }

  /**
   * Checks a pair of bases for CpG status & quality thresholds
   */
  private boolean isValidCpg(final byte[] refBases, final byte[] readBases, final byte[] readQualities, final int index) {
    return isC(refBases[index], readBases[index]) && SequenceUtil.basesEqual(refBases[index+1], readBases[index+1]) &&
        isAboveCytoQcThreshold(readQualities, index);
  }

  /**
   * Any cyto base (CpG context or not) needs to be above a specified quality threshold. Similarly, the neighboring
   * base on the 3' side must also be above another threshold unless the cyto base is the final base in the 3'
   * direction.
   */
  private boolean isAboveCytoQcThreshold(final byte[] readQualities, final int index) {
    return ((index < readQualities.length - 1) && (readQualities[index] >= cQualityThreshold) &&
        (readQualities[index+1] >= nextBaseQualityThreshold));
  }

  /**
   * Accounts for the fact that negative strand counts have been reversed
   */
  private int getCurRefIndex(final int refStart, final int blockLength, final int idx, final boolean isNegative) {
    return isNegative ? refStart + (blockLength - 1) - idx - 1 : refStart + idx;
  }
}

/**
* Used to keep track of the location of CpG sites
*/
class CpgLocation implements Comparable<CpgLocation> {
  private final String sequence;
  private final Integer position;

  public CpgLocation(final String sequence, final int position) {
    this.sequence = sequence;
    this.position = position;
  }

  @Override
  public int compareTo(final CpgLocation other) {
    final int seqComp = sequence.compareTo(other.sequence);
    return seqComp == 0 ? position.compareTo(other.position) : seqComp;
  }

  @Override
  public boolean equals(final Object other) {
    if (this == other) {
      return true;
    }

    if (other == null || getClass() != other.getClass()) {
      return false;
    }

    final CpgLocation that = (CpgLocation)other;

    return (sequence.equals(that.sequence)) && (position.equals(that.position));
  }

  @Override
  public int hashCode() {
    int result = sequence.hashCode();
    result = 31 * result + position;
    return result;
  }

  public String getSequence() {
    return sequence;
  }

  public Integer getPosition() {
    return position;
  }
}
TOP

Related Classes of picard.analysis.CpgLocation

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.