Package org.broad.igv.data.rnai

Source Code of org.broad.igv.data.rnai.RNAIGCTDatasetParser

/*
* Copyright (c) 2007-2012 The Broad Institute, Inc.
* SOFTWARE COPYRIGHT NOTICE
* This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
*
* This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
* Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
*/

package org.broad.igv.data.rnai;

import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.PreferenceManager;
import org.broad.igv.data.expression.ProbeToLocusMap;
import org.broad.igv.exceptions.LoadResourceFromServerException;
import org.broad.igv.feature.FeatureDB;
import org.broad.igv.feature.NamedFeature;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.ui.IGV;
import org.broad.igv.util.HttpUtils;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import htsjdk.tribble.readers.AsciiLineReader;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.*;
import java.util.zip.GZIPInputStream;

/**
* Created by IntelliJ IDEA.
* User: nazaire
* Date: Feb 25, 2009
*/
public class RNAIGCTDatasetParser {

    private static Logger log = Logger.getLogger(RNAIGCTDatasetParser.class);
    private ResourceLocator dataFileLocator;
    private int dataStartColumn = 2;
    private int descriptionColumn = 1;
    Genome genome;


    /**
     * @param gctFile
     * @param genome
     */
    public RNAIGCTDatasetParser(ResourceLocator gctFile, Genome genome) {
        this.dataFileLocator = gctFile;
        this.genome = genome;
        dataStartColumn = 2;
    }

    public Collection<RNAIDataSource> parse() {
        // Create a buffer for the string split utility.  We use  a custom utility as opposed
        AsciiLineReader reader = null;
        List dataSources = null;
        String nextLine = null;
        InputStream probeMappingStream = null;

        try {
            reader = ParsingUtils.openAsciiReader(dataFileLocator);

            String headerLine = null;

            // Skip header rows

            nextLine = reader.readLine();
            nextLine = reader.readLine();

            headerLine = reader.readLine();

            // Parse column headings
            int skip = 1;

            String[] tokens = Globals.tabPattern.split(headerLine, -1);
            int nTokens = tokens.length;

            String description = (nTokens > descriptionColumn)
                    ? new String(tokens[descriptionColumn]) : null;

            int nColumns = (nTokens - dataStartColumn) / skip;
            String[] columnHeadings = new String[nColumns];
            for (int i = 0; i < nColumns; i++) {
                String heading = tokens[dataStartColumn + i * skip].replace('\"', ' ').trim();
                columnHeadings[i] = heading;
            }


            Map<String, String[]> rnaiProbeMap = getProbeMap();

            HashMap<String, HashMap<String, Float>> sampleGeneScoreMap = new HashMap();
            while ((nextLine = reader.readLine()) != null) {
                tokens = Globals.tabPattern.split(nextLine, -1);
                nTokens = tokens.length;
                String probeId = new String(tokens[0]);
                float[] values = new float[nColumns];

                String[] identifiers = (String[]) rnaiProbeMap.get(probeId);
                String identifier = null;
                if (identifiers == null || identifiers.length == 0) {
                    log.info("Could not find mapping for: " + probeId);
                    continue;
                } else {
                    identifier = identifiers[0];
                }

                NamedFeature gene = FeatureDB.getFeature(identifier.toUpperCase());
                if (gene == null) {
                    log.debug("Unknown identifier: " + identifier);
                    continue;
                }

                for (int i = 0; i < nColumns; i++) {
                    try {
                        int dataIndex = dataStartColumn + i * skip;

                        // If we are out of value tokens, or the cell is blank, assign NAN to the cell.
                        if ((dataIndex >= nTokens) || (tokens[dataIndex].length() == 0)) {
                            values[i] = Float.NaN;
                        } else {
                            values[i] = Float.parseFloat(tokens[dataIndex]);
                        }

                        String sample = columnHeadings[i];
                        RNAIHairpinValue hairpin = new RNAIHairpinValue(probeId, values[i]);
                        RNAIHairpinCache.getInstance().addHairpinScore(sample, gene.getName(),
                                hairpin);

                        HashMap<String, Float> geneScoreMap = sampleGeneScoreMap.get(sample);

                        if (geneScoreMap == null) {
                            geneScoreMap = new HashMap();
                            sampleGeneScoreMap.put(sample, geneScoreMap);
                        }

                        Float geneScore = geneScoreMap.get(gene.getName());
                        if (geneScore == null) {
                            geneScore = values[i];
                            geneScoreMap.put(gene.getName(), geneScore);
                        } else {

                            geneScore = new Float(Math.min(values[i], geneScore.floatValue()));
                            geneScoreMap.put(gene.getName(), geneScore);
                        }
                    } catch (NumberFormatException numberFormatException) {

                        // This is an expected condition.  IGV uses NaN to
                        // indicate non numbers (missing data values)
                        values[i] = Float.NaN;
                    }
                }
            }

            dataSources = computeGeneScores(sampleGeneScoreMap);

        } catch (IOException ex) {
            log.error("Error parsing RNAi file", ex);
            throw new RuntimeException(ex);
        } finally {
            if (probeMappingStream != null) {
                try {
                    probeMappingStream.close();
                } catch (IOException e) {
                    log.error("Error closing probe mapping stream", e);
                }
            }
            if (reader != null) {
                reader.close();
            }
        }

        return dataSources;
    }

    private List computeGeneScores(HashMap<String, HashMap<String, Float>> sampleGeneScoreMap) {
        int confidence = 3;
        List dataSources = new ArrayList();
        Iterator samplesIt = sampleGeneScoreMap.keySet().iterator();
        while (samplesIt.hasNext()) {
            String sample = (String) samplesIt.next();
            HashMap geneMap = sampleGeneScoreMap.get(sample);
            RNAIDataSource ds = new RNAIDataSource(sample, "", genome);
            Iterator geneScoreIt = geneMap.keySet().iterator();
            while (geneScoreIt.hasNext()) {
                String gene = (String) geneScoreIt.next();
                Float score = (Float) geneMap.get(gene);
                int numHairpins;
                Collection hairpins = RNAIHairpinCache.getInstance().getHairpinScores(sample, gene);
                if (hairpins == null) {
                    numHairpins = 0;
                } else {
                    numHairpins = hairpins.size();
                }
                ds.addGeneScore(new RNAIGeneScore(sample,
                        FeatureDB.getFeature(gene), score.floatValue(), numHairpins));
            }
            dataSources.add(ds);
        }
        return dataSources;
    }


    public final static String RNAI_MAPPING_URL_KEY = "RNAI_MAPPING_URL";
    private final static String DEFAULT_RNAI_MAPPING_URL = "http://www.broadinstitute.org/igv/resources/probes/rnai/RNAI_probe_mapping.txt.gz";
    private static String RNAI_MAPPING_URL;

    static {
        if (IGV.hasInstance()) {
            RNAI_MAPPING_URL = IGV.getInstance().getSession().getPersistent(RNAI_MAPPING_URL_KEY, DEFAULT_RNAI_MAPPING_URL);
        } else {
            RNAI_MAPPING_URL = PreferenceManager.getInstance().getPersistent(RNAI_MAPPING_URL_KEY, DEFAULT_RNAI_MAPPING_URL);
        }
    }

    private static Map<String, String[]> rnaiProbeMap = null;

    private synchronized static Map<String, String[]> getProbeMap() throws IOException {
        if (rnaiProbeMap == null) {
            rnaiProbeMap = Collections.synchronizedMap(new HashMap<String, String[]>(20000));
            URL url = new URL(RNAI_MAPPING_URL);

            InputStream probeMappingStream = null;
            try {
                probeMappingStream = new GZIPInputStream(HttpUtils.getInstance().openConnectionStream(url));
                AsciiLineReader br = new AsciiLineReader(probeMappingStream);

                ProbeToLocusMap.getInstance().loadMapping(br, rnaiProbeMap);
            } catch (Exception e) {
                throw new LoadResourceFromServerException(e.getMessage(), RNAI_MAPPING_URL, e.getClass().getSimpleName());
            } finally {
                if (probeMappingStream != null) {
                    probeMappingStream.close();
                }
            }
        }

        return rnaiProbeMap;
    }
}
TOP

Related Classes of org.broad.igv.data.rnai.RNAIGCTDatasetParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.