Package org.broad.igv.maf

Source Code of org.broad.igv.maf.MAFIndex$RunningAverage

/*
* Copyright (c) 2007-2013 by The Broad Institute of MIT and Harvard.  All Rights Reserved.
*
* This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
* Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
*
* THE SOFTWARE IS PROVIDED "AS IS." THE BROAD AND MIT MAKE NO REPRESENTATIONS OR
* WARRANTES OF ANY KIND CONCERNING THE SOFTWARE, EXPRESS OR IMPLIED, INCLUDING,
* WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
* PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER
* OR NOT DISCOVERABLE.  IN NO EVENT SHALL THE BROAD OR MIT, OR THEIR RESPECTIVE
* TRUSTEES, DIRECTORS, OFFICERS, EMPLOYEES, AND AFFILIATES BE LIABLE FOR ANY DAMAGES
* OF ANY KIND, INCLUDING, WITHOUT LIMITATION, INCIDENTAL OR CONSEQUENTIAL DAMAGES,
* ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER
* THE BROAD OR MIT SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT
* SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
*/
package org.broad.igv.maf;

import org.broad.igv.Globals;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import org.broad.igv.util.index.Interval;
import org.broad.igv.util.index.IntervalTree;
import htsjdk.tribble.readers.AsciiLineReader;

import java.io.*;
import java.util.*;

/**
* @author jrobinso
*         Date: 2/8/13
*         Time: 7:23 AM
*/
public class MAFIndex {

    private List<String> species;

    /**
     * Map of chromosome name -> interval tree
     */
    private Map<String, IntervalTree> intervalTrees;

    /**
     * The # of alignments represented by an interval in the tree.
     * Note: This is not private so that it can be manipulated by unit tests.
     */
    public static int blockSize = 50;

    public MAFIndex() {
        intervalTrees = new HashMap<String, IntervalTree>();
    }

    /**
     * Species represented in this file.
     */
    public List<String> getSpecies() {
        return species;
    }

    public void setSpecies(List<String> species) {
        this.species = species;
    }

    /**
     * The reference species.
     */
    public String getRefId() {
        return species == null || species.isEmpty() ? null : species.get(0);
    }

    public Collection<String> getChromosomes() {
        return intervalTrees.keySet();
    }

    public IntervalTree getIntervalTree(String chr) {

        IntervalTree iv = intervalTrees.get(chr);
        if(iv == null) {
            iv = intervalTrees.get("*"); // To support legacy MAF indeces, files are split by chromosome
        }
        return iv;
    }


    public void putIntervalTree(String s, IntervalTree iv) {

        intervalTrees.put(s, iv);
    }

    public void insertInterval(String lastChr, int intervalStart, int intervalEnd, long value) {
        IntervalTree iv = intervalTrees.get(lastChr);
        if (iv == null) {
            iv = new IntervalTree();
            intervalTrees.put(lastChr, iv);
        }
        iv.insert(new Interval(intervalStart, intervalEnd, value));
    }


    /**
     * @param idxFile
     * @throws java.io.IOException
     */
    public static MAFIndex loadIndex(String idxFile) throws IOException {

        MAFIndex index = new MAFIndex();
        index.species = new ArrayList<String>();

        //index = new IntervalTree<Long>();
        BufferedReader br = null;
        try {
            br = ParsingUtils.openBufferedReader(idxFile);

            // Peak at first line
            String line = br.readLine();
            if (line.startsWith("#species")) {
                // Reference species is first -- this is required.
                while ((line = br.readLine()) != null) {
                    if (line.startsWith("#end")) {
                        break;
                    }
                    index.species.add(line.trim());
                }

                IntervalTree iv = null;
                while ((line = br.readLine()) != null) {
                    if (line.trim().length() == 0) continue;
                    if (line.startsWith("#chr=")) {
                        String chr = ParsingUtils.EQ_PATTERN.split(line)[1];
                        iv = new IntervalTree();
                        index.putIntervalTree(chr, iv);
                    } else if (iv != null) {
                        String[] info = Globals.tabPattern.split(line);
                        int start = Integer.parseInt(info[0]);
                        int end = Integer.parseInt(info[1]) + start;
                        long offset = Long.parseLong(info[2]);
                        iv.insert(new Interval(start, end, offset));
                    } else {
                        // log.info("Skipping line " + line);
                    }
                }

            } else {
                // A "legacy" index, created for Broad hosted files that are separated by chromosome.
                // Every alignment is indexed, which is overkill.  Below we lump them into blocks of 50.
                IntervalTree iv = new IntervalTree();
                int l = 0;
                int intervalStart = 0;
                int intervalEnd = 0;
                long lastOffset = 0;
                while ((line = br.readLine()) != null) {
                    String[] info = Globals.tabPattern.split(line);
                    int start = Integer.parseInt(info[0]);
                    intervalEnd = Integer.parseInt(info[1]) + start;
                    if (l % 50 == 0) {
                        iv.insert(new Interval(intervalStart, intervalEnd, lastOffset));
                        intervalStart = intervalEnd;
                        lastOffset = Long.parseLong(info[2]);
                    }
                    l++;
                }

                if(intervalEnd > intervalStart) {
                    iv.insert(new Interval(intervalStart, intervalEnd, lastOffset));
                }

                index.putIntervalTree("*", iv);
            }
        } finally {
            if (br != null) br.close();
        }
        return index;
    }


    /**
     * Create an index for the MAF file.
     * <p/>
     * Example MAF lines:
     * a score=34237.000000
     * s hg19.chr1     10917 479 + 249250621 gagaggc
     * s panTro2.chr15 13606 455 - 100063
     *
     * @param alignmentFile
     * @throws IOException
     */
    public static MAFIndex createIndex(String alignmentFile) throws IOException {

        MAFIndex index = new MAFIndex();
        AsciiLineReader reader = ParsingUtils.openAsciiReader(new ResourceLocator(alignmentFile));


        try {

            String lastChr = null;
            int intervalStart = 0;
            int intervalEnd = 0;
            int blockCount = 0;
            String line;
            long lastOffset = 0;
            boolean newBlock = false;

            Set<String> allSpecies = new HashSet<String>();
            List<String> blockSpecies = new ArrayList<String>();
            Map<String, RunningAverage> speciesRanks = new HashMap<String, RunningAverage>();

            while ((line = reader.readLine()) != null) {
                //Ignore all comment lines
                if (line.startsWith("#") || line.trim().length() == 0) {
                    continue;
                }

                if (line.startsWith("a ")) {
                    newBlock = true;
                    blockCount++;

                    // Merge species list, if any, from previous block and start new one
                    mergeSpecies(blockSpecies, allSpecies, speciesRanks);
                    blockSpecies.clear();

                } else if (line.startsWith("s ")) {

                    String[] tokens = Globals.whitespacePattern.split(line);

                    String src = tokens[1];
                    String species = src;
                    String chr = src;
                    if (src.contains(".")) {
                        String[] srcTokens = ParsingUtils.PERIOD_PATTERN.split(src);
                        species = srcTokens[0];
                        chr = srcTokens[1];
                    }
                    if (lastChr == null) lastChr = chr;

                    blockSpecies.add(species);

                    if (newBlock) {
                        // This will be the reference sequence line (its always first after the "a")
                        int start = Integer.parseInt(tokens[2]);
                        int end = Integer.parseInt(tokens[3]) + start;

                        if ((!chr.equals(lastChr) && blockCount > 0) ||
                                blockCount > blockSize) {

                            // Record previous interval and start a new one.
                            index.insertInterval(lastChr, intervalStart, intervalEnd, lastOffset);

                            blockCount = 1;
                            lastOffset = reader.getPosition();
                            intervalStart = start;
                        }

                        lastChr = chr;
                        intervalEnd = end;
                        newBlock = false;
                    }

                } else if (line.startsWith("i ")) {
                    //We do not handle information lines yet.
                    continue;
                } else if (line.startsWith("q ")) {
                    //We do not handle quality lines yet.
                    continue;
                }
            }


            if (blockCount > 0) {
                index.insertInterval(lastChr, intervalStart, intervalEnd, lastOffset);
            }

            // Merge species list, if any, from previous block and start new one
            mergeSpecies(blockSpecies, allSpecies, speciesRanks);
            index.setSpecies(sortSpecies(allSpecies, speciesRanks));

            return index;


        } finally {
            reader.close();

        }

    }

    private static class RunningAverage {
        int nPts = 1;
        double average = 0;

        void addValue(double value) {
            average = (nPts * average + value) / nPts++;
        }
    }

    private static void mergeSpecies(List<String> blockSpecies, Set<String> allSpecies, Map<String, RunningAverage> speciesRank) {
        allSpecies.addAll(blockSpecies);
        for (int i = 0; i < blockSpecies.size(); i++) {
            String sp = blockSpecies.get(i);
            RunningAverage rank = speciesRank.get(sp);
            if (rank == null) {
                rank = new RunningAverage();
                speciesRank.put(sp, rank);
            }
            rank.addValue(i);
            speciesRank.put(sp, rank);
        }
    }

    private static List<String> sortSpecies(final Collection<String> allSpecies, final Map<String, RunningAverage> speciesRank) {
        List<String> speciesList = new ArrayList<String>(allSpecies);
        Collections.sort(speciesList, new Comparator<String>() {
            @Override
            public int compare(String o1, String o2) {
                double v = speciesRank.get(o1).average - speciesRank.get(o2).average;
                return v > 0 ? 1 : (v < 0 ? -1 : 0);
            }
        });
        return speciesList;
    }


    public static void writeIndex(MAFIndex index, String indexFileName) throws IOException {

        PrintWriter pw = null;

        try {
            pw = new PrintWriter(new BufferedWriter(new FileWriter(indexFileName)));

            pw.println("#species");
            List<String> species = index.species;
            for (String sp : species) {
                pw.println(sp);
            }
            pw.println("#endSpecies");

            Collection<String> chrList = index.getChromosomes();
            for (String chr : chrList) {
                pw.println("#chr=" + chr);
                IntervalTree tree = index.getIntervalTree(chr);
                Collection<Interval> intervals = tree.getIntervals();
                for (Interval node : intervals) {
                    pw.print(String.valueOf(node.getLow()));
                    pw.print("\t");
                    pw.print(String.valueOf(node.getHigh() - node.getLow()));
                    pw.print("\t");
                    pw.println(String.valueOf(node.getValue()));
                }
            }

        } finally {
            if (pw != null) pw.close();
        }
    }

}
TOP

Related Classes of org.broad.igv.maf.MAFIndex$RunningAverage

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.