Source Code of edu.msu.cme.rdp.classifier.train.validation.leaveoneout.LeaveOneOutTester

/*
 * Tester.java
 *
 * Created on June 24, 2002, 6:26 PM
 */
/**
 *
 * @author  wangqion
 * @version
 */
package edu.msu.cme.rdp.classifier.train.validation.leaveoneout;


import java.util.*;
import java.io.*;
import java.text.*;


import edu.msu.cme.rdp.classifier.train.validation.ValidationClassificationResult;
import edu.msu.cme.rdp.classifier.train.validation.ValidClassificationResultFacade;
import edu.msu.cme.rdp.classifier.train.validation.CorrectAssignment;
import edu.msu.cme.rdp.classifier.train.validation.DecisionMaker;
import edu.msu.cme.rdp.classifier.train.LineageSequence;
import edu.msu.cme.rdp.classifier.train.LineageSequenceParser;
import edu.msu.cme.rdp.classifier.train.validation.*;
import edu.msu.cme.rdp.readseq.utils.orientation.GoodWordIterator;


/** */
public class LeaveOneOutTester {


    BufferedWriter outFile;
    private Map<String, CorrectAssignment> num_hierLevel = new HashMap<String, CorrectAssignment>();  // key is the hierarchy level, value is CorrectAssignment
    private String testRank = Taxonomy.GENUS;
    private int numGoodBases = 0;  // 0 indicates all the bases from the query sequence


    /**
     * 
     * @param writer
     * @param numGoodBases 0 for all bases from the query, positive integer for partial length testing
     * @throws IOException
     */
    public LeaveOneOutTester(Writer writer, int numGoodBases) throws IOException {
        outFile = new BufferedWriter(writer);
        this.numGoodBases = numGoodBases;
    }


    /**
     * perform leave-one-out test. For each query sequence, remove it from the training set and test the classifier,
     * put it back after each testing.
     * 
     * The accuracy, misclassified sequences and the misclassified count for each taxon is written into the output file.
     * @param factory
     * @param parser
     * @param useSeed
     * @throws IOException
     */
    public ArrayList<HashMap<String, StatusCount>> classify(TreeFactory factory, LineageSequenceParser parser, 
            boolean useSeed, int min_bootstrap_words, boolean hideTaxon)throws IOException {


        testRank = factory.getLowestRank();
        List<ValidClassificationResultFacade> resultList = new ArrayList<ValidClassificationResultFacade>();
        HashMap<String, HierarchyTree> nodeMap = new HashMap<String, HierarchyTree>();


        DecisionMaker dm = new DecisionMaker(factory);
        HierarchyTree root = factory.getRoot();
        root.getNodeMap(testRank, nodeMap);


        if (nodeMap.isEmpty()) {
            throw new IllegalArgumentException("\nThere is no node in " + testRank + " level!");
        }
       
        ArrayList<HashMap<String, StatusCount>> statusCountList = new ArrayList<HashMap<String, StatusCount>>();
        // initialize a list of statusCount, one for each bootstrap from 0 to 100
        for ( int b = 0; b <= 100; b++){
             HashMap<String, StatusCount> statusCountMap = new HashMap<String, StatusCount>();
             statusCountList.add(statusCountMap);
             for (String rank: factory.getRankSet()){
                 statusCountMap.put(rank, new StatusCount());
             }
         }        
        
        int i = 0;
        while (parser.hasNext()) {
            LineageSequence pSeq = parser.next();
            if (pSeq.getSeqString().length() == 0) {
                continue;
            }
            GoodWordIterator wordIterator = null;
            if (numGoodBases > 0) {
                wordIterator = pSeq.getPartialSeqIteratorbyGoodBases(numGoodBases);  // test partial sequences with good words only


            } else {
                wordIterator = new GoodWordIterator(pSeq.getSeqString()); // full sequence  
            }


            if (wordIterator == null || wordIterator.getNumofWords() == 0) {
                //System.err.println(pSeq.getSeqName() + " unable to find good subsequence with length " + numGoodBases);
                continue;
            }


            //for leave-one-out testing, we need to remove the word occurrence for
            //the current sequence. This is similar to hide a sequence leaf.
            HierarchyTree curTree = nodeMap.get((String) pSeq.getAncestors().get(pSeq.getAncestors().size() - 1));
            if ( !hideTaxon){
                if ( curTree.isSingleton()){
                    nodeMap.remove(curTree.getName());
                }else {
                    curTree.hideSeq(wordIterator);
                }
            }else {
                nodeMap.remove(curTree.getName());
            }
            
            List<ValidationClassificationResult> result = dm.getBestClasspath( wordIterator, nodeMap, useSeed, min_bootstrap_words);
            ValidClassificationResultFacade resultFacade = new ValidClassificationResultFacade(pSeq, result);
            resultFacade.setLabeledNode(curTree);
            compareClassificationResult(resultFacade);


            // for ROC curve
            labelAssignmentStatus(factory, resultFacade, statusCountList);
          
            resultList.add(resultFacade);


            System.out.print(i++ + " ");
            if ( !hideTaxon){                
                if ( curTree.isSingleton()){
                    nodeMap.put(curTree.getName(), curTree);
                }else {
                    // recover the wordOccurrence of the genus node, unhide the sequence leaf
                    curTree.unhideSeq(wordIterator);
                }
            }else {
                nodeMap.put(curTree.getName(), curTree);
            }
        }




        displayClassification(resultList);
        //
        outFile.write("\n**misclassified sequences group by taxon\n");
        outFile.write("Rank" + "\t" + "Name" + "\t" + "Total Seqs" + "\t" + "Tested Seqs (non-singleton)" + "\t" + "misclassified" +"\t" + "pct misclassified"+ "\n");


        displayTreeErrorRate(root, 0);
        
        outFile.write("\n**ROC matrix\n");
        outFile.write( StatusCountUtils.calROCMatrix(Integer.toString(min_bootstrap_words), statusCountList));
        outFile.write("\n**Area under curve\n");
        outFile.write( StatusCountUtils.calAUC(Integer.toString(min_bootstrap_words), statusCountList));


        outFile.close();
        return statusCountList;
    }




    public void displayTreeErrorRate(HierarchyTree root, int indent) throws IOException {


        double pct_miss = 0;
        if ( root.getNumTotalTestedseq() > 0){
            pct_miss = (double)root.getMissCount()/ (double)root.getNumTotalTestedseq();
        }
        outFile.write(root.getTaxonomy().getHierLevel() + "\t" + root.getName() + "\t" + root.getTotalSeqs() 
                + "\t" + root.getNumTotalTestedseq() + "\t" + root.getMissCount() + "\t" + pct_miss + "\n");
        Iterator i = root.getSubclasses().iterator();


        while (i.hasNext()) {
            displayTreeErrorRate((HierarchyTree) i.next(), indent + 1);
        }
    }


    /** Displays the results of the classification, true path, assigned path and the
     * error rate
     */
    private void displayClassification(List seqs) throws IOException {
        int i = 0;


        DecimalFormat df = new DecimalFormat("0.###");


        outFile.write("\n\n**The statistics for each hierarchy level: \n 1: number of correct assigned sequences / number of total sequences for that bin rage\n");
        outFile.write("Level\t100-95\t\t94-90");
        for (i = 9; i > 0; i--) {
            outFile.write("\t\t" + (i * 10 - 1) + "-" + (i - 1) * 10);
        }


        outFile.write("\t\t" + "Correct Assigned\tTotal Seqs\tCorrect/Total" + "\n");


        Iterator it = num_hierLevel.keySet().iterator();
        while (it.hasNext()) {
            String name = (String) it.next();
            outFile.write(name + "\t");
            CorrectAssignment assign = (CorrectAssignment) num_hierLevel.get(name);


            calStandardError(assign);
            int total = 0;
            int correct = 0;
            for (i = assign.bins - 1; i >= 0; i--) {
                outFile.write(assign.numCorrect[i] + "\t" + assign.numTotal[i] + "\t");
                total += assign.numTotal[i];
                correct += assign.numCorrect[i];
            }
            if (total > 0) {
                outFile.write(correct + "\t" + total + "\t" + (double) correct / ((double) total) + "\t");
            } else {
                outFile.write(correct + "\t" + total + "\t" + 0 + "\t");
            }
            outFile.write("\n");
        }
                


        outFile.write("\n\n** 2. The average votes for each bin range \n");
        it = num_hierLevel.keySet().iterator();
        while (it.hasNext()) {
            String name = (String) it.next();
            outFile.write(name + " \t ");
            CorrectAssignment assign = (CorrectAssignment) num_hierLevel.get(name);


            for (i = assign.bins - 1; i >= 0; i--) {
                if (assign.numTotal[i] == 0) {
                    outFile.write("0\t");
                } else {
                    outFile.write(df.format(assign.sumOfVotes[i] * 100 / assign.numTotal[i]) + "\t");
                }
            }
            outFile.write(" \n");
        }


        outFile.write("\n\n** 3. The percentage of correctness for each bin range (the percentage of #1)\n");
        it = num_hierLevel.keySet().iterator();
        while (it.hasNext()) {
            String name = (String) it.next();
            outFile.write(name + " \t ");
            CorrectAssignment assign = (CorrectAssignment) num_hierLevel.get(name);


            for (i = assign.bins - 1; i >= 0; i--) {
                if (assign.numTotal[i] == 0) {
                    outFile.write("0\t");
                } else {
                    outFile.write(df.format((float) assign.numCorrect[i] / (float) assign.numTotal[i]) + "\t");
                }
            }
            outFile.write(" \n");
        }


        outFile.write("\n\n** 4. The standard error for each bin range \n");


        it = num_hierLevel.keySet().iterator();
        while (it.hasNext()) {
            String name = (String) it.next();
            outFile.write(name + " \t ");
            CorrectAssignment assign = (CorrectAssignment) num_hierLevel.get(name);


            for (i = assign.bins - 1; i >= 0; i--) {
                outFile.write(df.format(assign.standardError[i]) + "\t");
            }
            outFile.write(" \n");


        }


        outFile.write("\n**misclassified sequences: \n");
        for (i = 0; i < seqs.size(); i++) {
            ValidClassificationResultFacade resultFacade = (ValidClassificationResultFacade) seqs.get(i);
            if (resultFacade.isMissed()) {
                outFile.write(resultFacade.getPath() + "\n");


            }
        }
        
        outFile.write("\n**singleton sequences: \n");
        for (i = 0; i < seqs.size(); i++) {
            ValidClassificationResultFacade resultFacade = (ValidClassificationResultFacade) seqs.get(i);
            if (resultFacade.getLabeledNode().isSingleton()) {
                outFile.write(resultFacade.getPath() + "\n");


            }
        }


    }


    private void compareClassificationResult(ValidClassificationResultFacade resultFacade) {
        HierarchyTree trueParent = resultFacade.getLabeledNode();
        List hitList = resultFacade.getRankAssignment();


        // compare the true taxon and the hit taxon with same rank. mark the sequence that is misclassified
        // update the total count and misclassified count


        while (trueParent != null) {


            if (!trueParent.isSingleton()) {
                ValidationClassificationResult hit = null;
                for (int i = 0; i < hitList.size(); i++) {
                    ValidationClassificationResult tmp = (ValidationClassificationResult) hitList.get(i);
                    if ((trueParent.getTaxonomy().getHierLevel()).equals(tmp.getBestClass().getTaxonomy().getHierLevel())) {
                        hit = tmp;
                        break;
                    }
                }


                if (trueParent.getTaxonomy().getHierLevel().equalsIgnoreCase(testRank)) {
                    trueParent.incNumTotalTestedseq();
                }


                boolean correct = false;


                if ( hit != null){
                    if( trueParent.getTaxonomy().getTaxID() == hit.getBestClass().getTaxonomy().getTaxID()){
                         correct = true;
                    }else {
                        resultFacade.setMissedRank(trueParent.getTaxonomy().getHierLevel());
                    }
                    increaseCount(hit, trueParent.getTaxonomy().getHierLevel(), correct, num_hierLevel);
                }
                if ( !correct && trueParent.getTaxonomy().getHierLevel().equalsIgnoreCase(testRank)) {
                    trueParent.incMissCount();
                }
                               


            } else {
                // System.err.println(" singleton: " + trueParent.getName() + " " + trueParent.getTaxonomy().getHierLevel() );
            }


            trueParent = trueParent.getParent();


        }


    }


    private void increaseCount(ValidationClassificationResult aResult, String level, boolean correct, Map aMap) {
        // bin range: 100-95, 94-90, 89-80, 79-70.....9-0
        CorrectAssignment assign = (CorrectAssignment) aMap.get(level);
        if (assign == null) {
            assign = new CorrectAssignment();
            aMap.put(level, assign);
        }


        int binIndex = (int) Math.floor(aResult.getNumOfVotes() * 10);
        if (binIndex == 9) {
            if (aResult.getNumOfVotes() >= 0.95) {
                binIndex = 10;   // we put 100-95 to the bin #10
            }
        }
        assign.numTotal[binIndex]++;
        assign.sumOfVotes[binIndex] += aResult.getNumOfVotes();


        if (correct) {
            assign.numCorrect[binIndex]++;
        }


    }


    private void calStandardError(CorrectAssignment assign) {
        for (int i = 0; i < assign.bins; i++) {
            if (assign.numTotal[i] == 0) {
                assign.standardError[i] = (float) 0.0;
            } else {
                float p = (float) assign.numCorrect[i] / (float) assign.numTotal[i];
                assign.standardError[i] = (float) Math.sqrt(p * (1 - p) / assign.numTotal[i]);
            }


        }
    }




    /** there are four status
     * TP: bootstrap above cutoff, labeled taxon matches the label in training set
     * FN: bootstrap below cutoff, labeled taxon matches the label in training set
     * FP: bootstrap above cutoff, labeled taxon NOT match the label in in training set
     * TN: bootstrap below cutoff, labeled taxon NOT match the label in training set
     * 
     **/
     private void labelAssignmentStatus( TreeFactory factory, ValidClassificationResultFacade resultFacade,
        ArrayList<HashMap<String, StatusCount>> statusCountList ) throws IOException{
         // determine assignment status
        //System.err.println(resultFacade.getPath());


        // find all the labeled taxa in the training set
        HierarchyTree trueParent = resultFacade.getLabeledNode();
        HashMap<String, HierarchyTree> labeledTreeMap = new HashMap<String, HierarchyTree>();
        while (trueParent != null) {
            labeledTreeMap.put(trueParent.getTaxonomy().getHierLevel(), trueParent);
            trueParent= trueParent.getParent();
        }
       
        List<ValidationClassificationResult> hitList = resultFacade.getRankAssignment();
        for ( ValidationClassificationResult curRankResult: hitList){
            
            
            String curRank = curRankResult.getBestClass().getTaxonomy().getHierLevel();
            boolean match = true;
            // find the corresponding ancestor at the current rank
            HierarchyTree matchingRankTreeNode = labeledTreeMap.get(curRank);
            
            if ( matchingRankTreeNode != null &&  matchingRankTreeNode.isSingleton()){  // ignore singletons
                continue;
            }
            if ( matchingRankTreeNode == null) {  // no TreeNode with matching rank found
                match = false;
            }else if (!curRankResult.getBestClass().getName().equalsIgnoreCase(matchingRankTreeNode.getName()) ){
                match = false;
            }
                  
            int bootstrap = (int)(curRankResult.getNumOfVotes()*100);
            //System.err.println( "assigned rank: " + curRank + "\t" + curRankResult.getBestClass().getName() + "\t match=" + match + "\t" + bootstrap +"\t");
            if ( match){  // TP or FN
                for( int b = 0; b <= bootstrap; b++){
                    ((StatusCount)statusCountList.get(b).get(curRank)).incNumTP(1);
                }
                for( int b = bootstrap+1; b < statusCountList.size(); b++){
                    ((StatusCount)statusCountList.get(b).get(curRank)).incNumFN(1);
                }


            }else {// TN or FP
                for( int b = 0; b <= bootstrap; b++){
                    ((StatusCount)statusCountList.get(b).get(curRank)).incNumFP(1);
                }
                for( int b = bootstrap+1; b < statusCountList.size(); b++){
                    ((StatusCount)statusCountList.get(b).get(curRank)).incNumTN(1);
                }
            }


        }
     }
     
   
    protected Map getNum_hierLevelMap() {
        return num_hierLevel;
    }
}
Source Code of edu.msu.cme.rdp.classifier.train.validation.leaveoneout.LeaveOneOutTester

Related Classes of edu.msu.cme.rdp.classifier.train.validation.leaveoneout.LeaveOneOutTester