Package org.broadinstitute.gatk.engine.walkers.diffengine

Source Code of org.broadinstitute.gatk.engine.walkers.diffengine.DiffEngine$SummaryReportParams

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.engine.walkers.diffengine;

import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.report.GATKReport;
import org.broadinstitute.gatk.engine.report.GATKReportTable;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.classloader.PluginManager;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;

import java.io.File;
import java.io.PrintStream;
import java.util.*;

/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: 7/4/11
* Time: 12:51 PM
* A generic engine for comparing tree-structured objects
*
*/
public class DiffEngine {
    final protected static Logger logger = Logger.getLogger(DiffEngine.class);

    private final Map<String, DiffableReader> readers = new HashMap<String, DiffableReader>();

    public DiffEngine() {
        loadDiffableReaders();
    }

    // --------------------------------------------------------------------------------
    //
    // difference calculation
    //
    // --------------------------------------------------------------------------------

    public List<Difference> diff(DiffElement master, DiffElement test) {
        DiffValue masterValue = master.getValue();
        DiffValue testValue = test.getValue();

        if ( masterValue.isCompound() && masterValue.isCompound() ) {
            return diff(master.getValueAsNode(), test.getValueAsNode());
        } else if ( masterValue.isAtomic() && testValue.isAtomic() ) {
            return diff(masterValue, testValue);
        } else {
            // structural difference in types.  one is node, other is leaf
            return Arrays.asList(new Difference(master, test));
        }
    }

    public List<Difference> diff(DiffNode master, DiffNode test) {
        Set<String> allNames = new HashSet<String>(master.getElementNames());
        allNames.addAll(test.getElementNames());
        List<Difference> diffs = new ArrayList<Difference>();

        for ( String name : allNames ) {
            DiffElement masterElt = master.getElement(name);
            DiffElement testElt = test.getElement(name);
            if ( masterElt == null && testElt == null ) {
                throw new ReviewedGATKException("BUG: unexpectedly got two null elements for field: " + name);
            } else if ( masterElt == null || testElt == null ) { // if either is null, we are missing a value
                // todo -- should one of these be a special MISSING item?
                diffs.add(new Difference(masterElt, testElt));
            } else {
                diffs.addAll(diff(masterElt, testElt));
            }
        }

        return diffs;
    }

    public List<Difference> diff(DiffValue master, DiffValue test) {
        if ( master.getValue().equals(test.getValue()) ) {
            return Collections.emptyList();
        } else {
            return Arrays.asList(new Difference(master.getBinding(), test.getBinding()));
        }
    }

    // --------------------------------------------------------------------------------
    //
    // Summarizing differences
    //
    // --------------------------------------------------------------------------------

    /**
     * Emits a summary of the diffs to out.  Suppose you have the following three differences:
     *
     *   A.X.Z:1!=2
     *   A.Y.Z:3!=4
     *   B.X.Z:5!=6
     *
     * The above is the itemized list of the differences.  The summary looks for common differences
     * in the name hierarchy, counts those shared elements, and emits the differences that occur
     * in order of decreasing counts.
     *
     * So, in the above example, what are the shared elements?
     *
     * A.X.Z and B.X.Z share X.Z, so there's a *.X.Z with count 2
     * A.X.Z, A.Y.Z, and B.X.Z all share *.*.Z, with count 3
     * Each of A.X.Z, A.Y.Z, and B.X.Z are individually unique, with count 1
     *
     * So we would emit the following summary:
     *
     * *.*.Z: 3
     * *.X.Z: 2
     * A.X.Z: 1 [specific difference: 1!=2]
     * A.Y.Z: 1 [specific difference: 3!=4]
     * B.X.Z: 1 [specific difference: 5!=6]
     *
     * The algorithm to accomplish this calculation is relatively simple. Start with all of the
     * concrete differences.  For each pair of differences A1.A2....AN and B1.B2....BN:
     *
     * find the longest common subsequence Si.Si+1...SN where Ai = Bi = Si
     * If i == 0, then there's no shared substructure
     * If i > 0, then generate the summarized value X = *.*...Si.Si+1...SN
     * if X is a known summary, increment it's count, otherwise set its count to 1
     *
     * Not that only pairs of the same length are considered as potentially equivalent
     *
     * @param params determines how we display the items
     * @param diffs the list of differences to summarize
     */
    public void reportSummarizedDifferences(List<Difference> diffs, SummaryReportParams params ) {
        printSummaryReport(summarizedDifferencesOfPaths(diffs, params.doPairwise, params.maxRawDiffsToSummarize), params );
    }

    final protected static String[] diffNameToPath(String diffName) {
        return diffName.split("\\.");
    }

    protected List<Difference> summarizedDifferencesOfPathsFromString(List<String> singletonDiffs) {
        List<Difference> diffs = new ArrayList<Difference>();

        for ( String diff : singletonDiffs ) {
            diffs.add(new Difference(diff));
        }

        return summarizedDifferencesOfPaths(diffs, true, -1);
    }

    /**
     * Computes a minimum set of potential differences between all singleton differences
     * in singletonDiffs.  Employs an expensive pairwise O(n^2) algorithm.
     *
     * @param singletonDiffs
     * @param maxRawDiffsToSummarize
     * @return
     */
    private Map<String, Difference> initialPairwiseSummaries(final List<? extends Difference> singletonDiffs,
                                                             final int maxRawDiffsToSummarize) {
        Map<String, Difference> summaries = new HashMap<String, Difference>();

        // create the initial set of differences
        for ( int i = 0; i < singletonDiffs.size(); i++ ) {
            for ( int j = 0; j <= i; j++ ) {
                Difference diffPath1 = singletonDiffs.get(i);
                Difference diffPath2 = singletonDiffs.get(j);
                if ( diffPath1.length() == diffPath2.length() ) {
                    int lcp = longestCommonPostfix(diffPath1.getParts(), diffPath2.getParts());
                    String path = diffPath2.getPath();
                    if ( lcp != 0 && lcp != diffPath1.length() )
                        path = summarizedPath(diffPath2.getParts(), lcp);
                    Difference sumDiff = new Difference(path, diffPath2.getMaster(), diffPath2.getTest());
                    sumDiff.setCount(0);
                    addSummaryIfMissing(summaries, sumDiff);

                    if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize)
                        return summaries;
                }
            }
        }

        return summaries;
    }

    /**
     * Computes the possible leaf differences among the singleton diffs.
     *
     * The leaf differences are all of the form *.*...*.X where all internal
     * differences are wildcards and the only summarized difference considered
     * interesting to compute is
     *
     * @param singletonDiffs
     * @param maxRawDiffsToSummarize
     * @return
     */
    private Map<String, Difference> initialLeafSummaries(final List<? extends Difference> singletonDiffs,
                                                         final int maxRawDiffsToSummarize) {
        Map<String, Difference> summaries = new HashMap<String, Difference>();

        // create the initial set of differences
        for ( final Difference d : singletonDiffs ) {
            final String path = summarizedPath(d.getParts(), 1);
            Difference sumDiff = new Difference(path, d.getMaster(), d.getTest());
            sumDiff.setCount(0);
            addSummaryIfMissing(summaries, sumDiff);

            if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize)
                return summaries;
        }

        return summaries;
    }

    protected List<Difference> summarizedDifferencesOfPaths(final List<? extends Difference> singletonDiffs,
                                                            final boolean doPairwise,
                                                            final int maxRawDiffsToSummarize) {
        final Map<String, Difference> summaries = doPairwise
                ? initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize)
                : initialLeafSummaries(singletonDiffs, maxRawDiffsToSummarize);

        // count differences
        for ( Difference diffPath : singletonDiffs ) {
            for ( Difference sumDiff : summaries.values() ) {
                if ( sumDiff.matches(diffPath.getParts()) )
                    sumDiff.incCount();
            }
        }

        List<Difference> sortedSummaries = new ArrayList<Difference>(summaries.values());
        Collections.sort(sortedSummaries);
        return sortedSummaries;
    }

    protected void addSummaryIfMissing(Map<String, Difference> summaries, Difference diff) {
        if ( ! summaries.containsKey(diff.getPath()) ) {
            summaries.put(diff.getPath(), diff);
        }
    }

    protected void printSummaryReport(List<Difference> sortedSummaries, SummaryReportParams params ) {
        List<Difference> toShow = new ArrayList<Difference>();
        int count = 0, count1 = 0;
        for ( Difference diff : sortedSummaries ) {
            if ( diff.getCount() < params.minSumDiffToShow )
                // in order, so break as soon as the count is too low
                break;

            if ( params.maxItemsToDisplay != 0 && count++ > params.maxItemsToDisplay )
                break;

            if ( diff.getCount() == 1 ) {
                count1++;
                if ( params.maxCountOneItems != 0 && count1 > params.maxCountOneItems )
                    break;
            }

            toShow.add(diff);
        }

        // if we want it in descending order, reverse the list
        if ( ! params.descending ) {
            Collections.reverse(toShow);
        }

        // now that we have a specific list of values we want to show, display them
        GATKReport report = new GATKReport();
        final String tableName = "differences";
        report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gatk/guide/article?id=1299 for more information", 3);
        final GATKReportTable table = report.getTable(tableName);
        table.addColumn("Difference");
        table.addColumn("NumberOfOccurrences");
        table.addColumn("ExampleDifference");
        for ( final Difference diff : toShow ) {
            final String key = diff.getPath();
            table.addRowID(key, true);
            table.set(key, "NumberOfOccurrences", diff.getCount());
            table.set(key, "ExampleDifference", diff.valueDiffString());
        }
        GATKReport output = new GATKReport(table);
        output.print(params.out);
    }

    protected static int longestCommonPostfix(String[] diffPath1, String[] diffPath2) {
        int i = 0;
        for ( ; i < diffPath1.length; i++ ) {
            int j = diffPath1.length - i - 1;
            if ( ! diffPath1[j].equals(diffPath2[j]) )
                break;
        }
        return i;
    }

    /**
     * parts is [A B C D]
     * commonPostfixLength: how many parts are shared at the end, suppose its 2
     * We want to create a string *.*.C.D
     *
     * @param parts the separated path values [above without .]
     * @param commonPostfixLength
     * @return
     */
    protected static String summarizedPath(String[] parts, int commonPostfixLength) {
        int stop = parts.length - commonPostfixLength;
        if ( stop > 0 ) parts = parts.clone();
        for ( int i = 0; i < stop; i++ ) {
            parts[i] = "*";
        }
        return Utils.join(".", parts);
    }

    // --------------------------------------------------------------------------------
    //
    // plugin manager
    //
    // --------------------------------------------------------------------------------

    public void loadDiffableReaders() {
        List<Class<? extends DiffableReader>> drClasses = new PluginManager<DiffableReader>( DiffableReader.class ).getPlugins();

        logger.info("Loading diffable modules:");
        for (Class<? extends DiffableReader> drClass : drClasses ) {
            logger.info("\t" + drClass.getSimpleName());

            try {
                DiffableReader dr = drClass.newInstance();
                readers.put(dr.getName(), dr);
            } catch (InstantiationException e) {
                throw new ReviewedGATKException("Unable to instantiate module '" + drClass.getSimpleName() + "'");
            } catch (IllegalAccessException e) {
                throw new ReviewedGATKException("Illegal access error when trying to instantiate '" + drClass.getSimpleName() + "'");
            }
        }
    }

    protected Map<String, DiffableReader> getReaders() {
        return readers;
    }

    protected DiffableReader getReader(String name) {
        return readers.get(name);
    }

    /**
     * Returns a reader appropriate for this file, or null if no such reader exists
     * @param file
     * @return
     */
    public DiffableReader findReaderForFile(File file) {
        for ( DiffableReader reader : readers.values() )
            if (reader.canRead(file) )
                return reader;

        return null;
    }

    /**
     * Returns true if reader appropriate for this file, or false if no such reader exists
     * @param file
     * @return
     */
    public boolean canRead(File file) {
        return findReaderForFile(file) != null;
    }


    public DiffElement createDiffableFromFile(File file) {
        return createDiffableFromFile(file, -1);
    }

    public DiffElement createDiffableFromFile(File file, int maxElementsToRead) {
        DiffableReader reader = findReaderForFile(file);
        if ( reader == null )
            throw new UserException("Unsupported file type: " + file);
        else
            return reader.readFromFile(file, maxElementsToRead);
    }

    public static boolean simpleDiffFiles(File masterFile, File testFile, int maxElementsToRead, DiffEngine.SummaryReportParams params) {
        DiffEngine diffEngine = new DiffEngine();

        if ( diffEngine.canRead(masterFile) && diffEngine.canRead(testFile) ) {
            DiffElement master = diffEngine.createDiffableFromFile(masterFile, maxElementsToRead);
            DiffElement test = diffEngine.createDiffableFromFile(testFile, maxElementsToRead);
            List<Difference> diffs = diffEngine.diff(master, test);
            diffEngine.reportSummarizedDifferences(diffs, params);
            return true;
        } else {
            return false;
        }
    }

    public static class SummaryReportParams {
        final PrintStream out;
        final int maxItemsToDisplay;
        final int maxCountOneItems;
        final int minSumDiffToShow;
        final int maxRawDiffsToSummarize;
        final boolean doPairwise;
        boolean descending = true;

        public SummaryReportParams(PrintStream out,
                                   int maxItemsToDisplay,
                                   int maxCountOneItems,
                                   int minSumDiffToShow,
                                   int maxRawDiffsToSummarize,
                                   final boolean doPairwise) {
            this.out = out;
            this.maxItemsToDisplay = maxItemsToDisplay;
            this.maxCountOneItems = maxCountOneItems;
            this.minSumDiffToShow = minSumDiffToShow;
            this.maxRawDiffsToSummarize = maxRawDiffsToSummarize;
            this.doPairwise = doPairwise;
        }

        public void setDescending(boolean descending) {
            this.descending = descending;
        }
    }
}
TOP

Related Classes of org.broadinstitute.gatk.engine.walkers.diffengine.DiffEngine$SummaryReportParams

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.