Package edu.msu.cme.rdp.classifier.train.validation.distance

Source Code of edu.msu.cme.rdp.classifier.train.validation.distance.CompareTrainingSets

/*
* Copyright (C) 2014 wangqion
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/
package edu.msu.cme.rdp.classifier.train.validation.distance;

import edu.msu.cme.rdp.classifier.train.LineageSequence;
import edu.msu.cme.rdp.classifier.train.LineageSequenceParser;
import edu.msu.cme.rdp.classifier.train.validation.HierarchyTree;
import edu.msu.cme.rdp.classifier.train.validation.TreeFactory;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

/**
*
* @author wangqion
*/
public class CompareTrainingSets {
    private ArrayList<HierarchyTreeExtend> trainsets = new ArrayList<HierarchyTreeExtend>();
    private ArrayList<String> ranks = new ArrayList<String>()// an  order list of ranks
   
    public class HierarchyTreeExtend {
        private HierarchyTree root;
        private String trainsetName;
        private HashMap<String, String> seqMap ; // seqID, desc
       
        public HierarchyTreeExtend(HierarchyTree root, String name){
            this.root = root;
            trainsetName = name;
        }
       
        public HierarchyTree getRoot(){
            return root;
        }
       
        public String getTrainsetName(){
            return trainsetName;
        }
    }
   
    public CompareTrainingSets(String rankFile, String[] files) throws IOException{
        BufferedReader reader = new BufferedReader(new FileReader(new File(rankFile)));
        String line;
        while ( (line = reader.readLine()) != null){
            ranks.add(line.trim());
        }
        reader.close();
       
        for ( int i = 0; i < files.length; i+=2 ){
            HierarchyTreeExtend trainset = parseOneTraining(files[i], files[i+1], i/2, "", "");
           
            trainsets.add(trainset);
        }
       
    }
   
    private HierarchyTreeExtend parseOneTraining(String taxFile, String seqFile, int trainset_no, String version, String modification) throws IOException{
        File temp = new File(taxFile);
        int index = temp.getName().indexOf(".");
        String trainsetName = temp.getName();
        if ( index != -1){
            trainsetName = trainsetName.substring(0, index);
        }
        // need to use ISO encoding for UNITE
        FileReader tax  = new FileReader(new File(taxFile));
        TreeFactory factory = new TreeFactory(tax);
              
        LineageSequenceParser parser = new LineageSequenceParser(new File(seqFile));
        LineageSequence seq;
        HashMap<String, String> seqMap = new HashMap<String, String>(); // seqID, desc
        while ( parser.hasNext()){
            seq = parser.next();
            factory.addSequence(seq, false); // donot check the kmers
           
            if ( seq.getSeqName().contains("|S00") ){ // rdpID
                String[] values = seq.getSeqName().split("\\|");
                seqMap.put(values[0], seq.getDesc());  
            }else if (seq.getSeqName().contains("|SH") ){  // if it's seq from UNITE, we need to do something with the seqID            
                String[] values = seq.getSeqName().split("\\|");
                seqMap.put(values[1], seq.getDesc());
            }else {
                seqMap.put(seq.getSeqName(), seq.getDesc());
            }
           
        }
        parser.close();
        HierarchyTreeExtend retVal = new HierarchyTreeExtend(factory.getRoot(), trainsetName);
        retVal.seqMap = seqMap;
        return retVal;
       
    }
  
   
    public void compare(String summaryOutFile, String detailOutFile) throws IOException{
        PrintStream outStream = new PrintStream(summaryOutFile);
        PrintStream detailOutStream = new PrintStream(detailOutFile);
        outStream.println("## data for Taxonomic Composition");
        outStream.print("Rank"  );
        for ( HierarchyTreeExtend factory: this.trainsets ){
            outStream.print("\t" + factory.trainsetName);
        }
        outStream.println();
        for ( int i = 0; i < ranks.size(); i++){
            outStream.print(ranks.get(i));
            for ( int t = 0; t < this.trainsets.size(); t++){                  
                HierarchyTree root = trainsets.get(t).getRoot();
                HashMap<String, HierarchyTree> nodeMap = new HashMap<String, HierarchyTree>();
                root.getNodeMap(ranks.get(i), nodeMap);
                outStream.print("\t" + nodeMap.size());
            }
            outStream.println();
        }
        outStream.print("All Seqs");
        for ( HierarchyTreeExtend factory: this.trainsets ){
            HierarchyTree root = factory.getRoot();
            outStream.print("\t" + root.getTotalSeqs());
        }
        outStream.println();
       
        // data for Venn Diagram if less than 3 sets      
        if ( this.trainsets.size() == 2 || this.trainsets.size() == 3){
            outStream.println("\n## data for Venn Diagram");
            for ( int i = 0; i < ranks.size(); i++){
                outStream.println("\n## Rank " + ranks.get(i));
                ArrayList<Set<String>> taxaList = new ArrayList<Set<String>>();
                for ( int t = 0; t < this.trainsets.size(); t++){                  
                    HierarchyTree root = trainsets.get(t).getRoot();
                    HashMap<String, HierarchyTree> nodeMap = new HashMap<String, HierarchyTree>();
                    root.getNodeMap(ranks.get(i), nodeMap);
                    taxaList.add(nodeMap.keySet());
                }
               
                Set<String> tempSet = new HashSet<String>();
                tempSet.addAll(taxaList.get(0));
                for ( int t = 1; t < taxaList.size(); t++){
                    tempSet.retainAll(taxaList.get(t));
                }
                outStream.println("Shared by all:\t" + tempSet.size());
               
                for ( int t = 0; t < taxaList.size(); t++){
                    tempSet.clear();
                    tempSet.addAll(taxaList.get(t));                   
                    for ( int k = 1; k < taxaList.size(); k++){
                        tempSet.removeAll(taxaList.get( (k+t)%(taxaList.size())));                       
                    }
                    outStream.println("Unique to " + trainsets.get(t).getTrainsetName()+ ":\t" + tempSet.size());
                    print(detailOutStream, ranks.get(i) + " unique to " + trainsets.get(t).getTrainsetName(), tempSet);
                }
               
                for ( int t = 0; t < taxaList.size(); t++){
                    tempSet.clear();
                    tempSet.addAll(taxaList.get(t));                   
                    tempSet.retainAll(taxaList.get( (t+1)%(taxaList.size()) ));
                   
                    if ( taxaList.size() == 3){
                        tempSet.removeAll(taxaList.get( (t+2)%(taxaList.size()) ));                       
                    }
                    outStream.println("Shared only by " + trainsets.get(t).getTrainsetName()+ " and " + trainsets.get((t+1)%(taxaList.size())).getTrainsetName()+ ":\t"+ tempSet.size());
                }
               
            }
           
            // shared seqs
            outStream.println("\n## Shared Sequences (by seqID)" );
            Set<String> tempSet = new HashSet<String>();
            tempSet.addAll(trainsets.get(0).seqMap.keySet());
            for ( int t = 1; t < trainsets.size(); t++){
                tempSet.retainAll(trainsets.get(t).seqMap.keySet());
            }
            outStream.println("Shared seqs by all:\t" + tempSet.size());

            for ( int t = 0; t < trainsets.size(); t++){
                tempSet.clear();
                tempSet.addAll(trainsets.get(t).seqMap.keySet());
                for ( int k = 1; k < trainsets.size(); k++){
                    tempSet.removeAll(trainsets.get( (t+k)%(trainsets.size())).seqMap.keySet());
                }
                outStream.println("Unique to " + trainsets.get(t).getTrainsetName()+ ":\t" + tempSet.size());
                printSeqs(detailOutStream, "Unique seqs to " + trainsets.get(t).getTrainsetName(), tempSet,trainsets.get(t));
            }

            for ( int t = 0; t < trainsets.size(); t++){
                tempSet.clear();
                tempSet.addAll(trainsets.get(t).seqMap.keySet());
                tempSet.retainAll(trainsets.get( (t+1)%(trainsets.size()) ).seqMap.keySet());
                if ( trainsets.size() == 3){
                    tempSet.removeAll(trainsets.get( (t+2)%(trainsets.size()) ).seqMap.keySet());
                }
                outStream.println("Shared only by " + trainsets.get(t).getTrainsetName()+ " and " + trainsets.get((t+1)%(trainsets.size())).getTrainsetName()+ ":\t"+ tempSet.size());
            }
        }
       
        outStream.close();
        detailOutStream.close();
    }
   
    private void print(PrintStream out, String message, Set<String> tempSet){
        out.println("##" + message);
        for ( String s: tempSet){
            out.println(s);
        }
        out.println();
    }
   
    private void printSeqs(PrintStream out, String message, Set<String> tempSet, HierarchyTreeExtend trainset){
        out.println("##" + message);
        for ( String s: tempSet){
            out.println(s + "\t" + trainset.seqMap.get(s));
        }
        out.println();
    }
   
    public static void main(String[] args) throws Exception{
        String usage = "Usage: rank.txt summary_out.txt detail_out.txt set1_taxon.txt set1_seq.fasta set2_taxon.txt set2_seq.fasta ...\n" +
                "rank.txt contains an ordered list of ranks to be compared, from the highest rank down to lowest rank. one per line\n" +
                "Each input training set requires a taxonomy file and a sequences file with lineage information as the description\n" +
                "  This program compares multiple training sets and generates the taxonomic composition data at each rank.\n" +
                "  For two or three sets, it produces an summary output data suitable to generate Venn diagrams\n" +
                "  and a detailed output contained the detailed list of taxa or sequences unique to each training set";
        if (args.length < 5 || args.length %2 != 1){
            System.err.println(usage);
            System.exit(1);
        }
       
        CompareTrainingSets theObj = new CompareTrainingSets(args[0], Arrays.copyOfRange(args, 3, args.length));
        theObj.compare(args[1], args[2]);
    }
   
}
TOP

Related Classes of edu.msu.cme.rdp.classifier.train.validation.distance.CompareTrainingSets

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.