Examples of edu.msu.cme.rdp.classifier.train.LineageSequenceParser

Package edu.msu.cme.rdp.classifier.train

Examples of edu.msu.cme.rdp.classifier.train.LineageSequenceParser

edu.msu.cme.rdp.classifier.train.LineageSequenceParser
A parser to parse a reader containing the raw sequences. @author wangqion @version

         // create a tree
        createTree(factory, new File(inFile));                            
        outWriter = new BufferedWriter(new FileWriter(outFile));
        outWriter.write("taxon file: " + taxFile + "\n" + "train sequence file: " + inFile +"\n");        
        
        LineageSequenceParser parser = new LineageSequenceParser(new File(testFile) ); 
        while ( parser.hasNext()){
            seqList.add(parser.next());
        }
        
        outWriter.write("query sequence file: " + testFile + "\n" + "classify moving window of size " + FindWindowFrame.window_size + ", step " + FindWindowFrame.step +"\n");
               
        Iterator windowIt = windowFrames.iterator();

View Full Code Here

    }
    
    /** reads from the stream, parses the sequence and creates the tree */
    private void createTree(TreeFactory factory, File input) throws IOException{
      long startTime = System.currentTimeMillis(); 
      LineageSequenceParser parser = new LineageSequenceParser(input);     
      
      while ( parser.hasNext() ){
        factory.addSequence( (LineageSequence)parser.next()); 
        
      }   
      //after all the training set is being parsed, calculate the prior probability for all the words.
      factory.calculateWordPrior();
      // create the word occurrence for all the nodes

View Full Code Here

        TreeFactory factory = new TreeFactory(new FileReader(taxFile));
        // create a tree
        createTree(factory, trainseqFile);


        BufferedWriter outWriter = new BufferedWriter(new FileWriter(outFile));
        LineageSequenceParser parser = new LineageSequenceParser(new File(testFile));
        LeaveOneOutTester tester = new LeaveOneOutTester(outWriter, numGoodBases);


        outWriter.write("taxon file: " + taxFile + "\n" + "train sequence file: " + trainseqFile + "\n");
        outWriter.write("word size: " + GoodWordIterator.getWordsize() + "\n");
        outWriter.write("minimum number of words for bootstrap: " + min_bootstrap_words + "\n");

View Full Code Here


    }


    /** reads from the stream, parses the sequences and creates the tree */
    private void createTree(TreeFactory factory, String input) throws IOException {
        LineageSequenceParser parser = new LineageSequenceParser(new File(input));


        while (parser.hasNext()) {
            factory.addSequence((LineageSequence) parser.next());
        }


        //after all the training set is being parsed, calculate the prior probability for all the words.
        factory.calculateWordPrior();
        // create the word occurrence for all the nodes, this is necessary if test on different level

View Full Code Here

    
    public void calSabSimilarity(String taxonFile, String trainSeqFile, String testSeqFile) throws IOException{        
        TreeFactory factory = new TreeFactory(new FileReader(taxonFile));
        factory.buildTree();
        // get the lineage of the trainSeqFile  
        LineageSequenceParser trainParser = new LineageSequenceParser(new File(trainSeqFile));
        HashMap<String, List<String>> lineageMap = new HashMap<String, List<String>>();
        while (trainParser.hasNext()) {
            LineageSequence seq = (LineageSequence) trainParser.next();
            lineageMap.put(seq.getSeqName(), seq.getAncestors());
            
         }
        trainParser.close();
        NuclSeqMatch sabCal = new NuclSeqMatch(trainSeqFile);
        LineageSequenceParser parser = new LineageSequenceParser(new File(testSeqFile));


        int count = 0;
        while (parser.hasNext()) {
            LineageSequence seq = (LineageSequence) parser.next();
            HashMap<String,HierarchyTree> queryAncestorNodes = getAncestorNodes(factory.getRoot(), seq.getSeqName(), seq.getAncestors());
           TreeSet<KmerMatchCore.BestMatch> matchResults = sabCal.findAllMatches(seq);
            
            short withinLowestRankSab = -1;
            short diffLowestRankSab = -1;  
            String bestDiffLowestRankMatch = null;
            for (KmerMatchCore.BestMatch match: matchResults){
                if ( match.getBestMatch().getSeqName().equals(seq.getSeqName())) continue;
                short sab = (short)(Math.round(100*match.getSab()));
                HashMap<String,HierarchyTree> matchAncestorNodes = getAncestorNodes(factory.getRoot(), match.getBestMatch().getSeqName(), lineageMap.get(match.getBestMatch().getSeqName()));
                boolean withinTaxon = false; 
                for (int i = ranks.size() -1; i >=0; i--){                    
                    HierarchyTree queryTaxon = queryAncestorNodes.get( ranks.get(i));
                    HierarchyTree matchTaxon = matchAncestorNodes.get( ranks.get(i));
                    if ( queryTaxon != null && matchTaxon != null){
                        if ( queryTaxon.getName().equals(matchTaxon.getName())){
                            if ( !withinTaxon){  // if the query and match are not in the same child taxon, add sab to the current taxon
                                (sabCoutMap.get(ranks.get(i)))[sab]++; 
                            }
                            withinTaxon = true;                            
                        }else {
                            withinTaxon = false;
                        }
                    }
                    
                }  
                
                // find within or different lowest level rank sab score, be either species or genus or any rank
                HierarchyTree speciesQueryTaxon = queryAncestorNodes.get( ranks.get(ranks.size()-1));    
                HierarchyTree speciesMatchTaxon = matchAncestorNodes.get( ranks.get(ranks.size()-1)); 
                
                if ( speciesQueryTaxon != null && speciesMatchTaxon != null && speciesQueryTaxon.getName().equals(speciesMatchTaxon.getName())){
                    withinLowestRankSab = sab >= withinLowestRankSab ? sab: withinLowestRankSab;
                }else {
                    
                    if ( sab >= diffLowestRankSab ){
                        bestDiffLowestRankMatch = match.getBestMatch().getSeqName();
                        diffLowestRankSab = sab;
                    }
                }
            }
            if ( withinLowestRankSab > 0){
                withinLowestRankSabSet.add(withinLowestRankSab);
            }
            if ( diffLowestRankSab > 0 ){
                diffLowestRankSabSet.add(diffLowestRankSab);
            }
            //System.out.println(seq.getSeqName() + "\t" + withinLowestRankSab + "\t" + diffLowestRankSab );
            count++;
            if ( count % 100 == 0){
                System.out.println(count);
            }
        }
        parser.close();
    
    }

View Full Code Here

    
    public void calPairwiseSimilaritye(String taxonFile, String trainSeqFile, String testSeqFile) throws IOException, OverlapCheckFailedException{        
        TreeFactory factory = new TreeFactory(new FileReader(taxonFile));
        factory.buildTree();
        // get the lineage of the trainSeqFile  
        LineageSequenceParser trainParser = new LineageSequenceParser(new File(trainSeqFile));
        ArrayList<LineageSequence> trainSeqList = new ArrayList<LineageSequence>();
        while (trainParser.hasNext()) {
            LineageSequence seq = (LineageSequence) trainParser.next();
            trainSeqList.add(seq);
         }
        trainParser.close();
        LineageSequenceParser parser = new LineageSequenceParser(new File(testSeqFile));


        while (parser.hasNext()) {
            LineageSequence seq = (LineageSequence) parser.next();
            HashMap<String,HierarchyTree> queryAncestorNodes = getAncestorNodes(factory.getRoot(), seq.getSeqName(), seq.getAncestors());
            
            for (LineageSequence trainSeq: trainSeqList){
                if ( trainSeq.getSeqName().equals(seq.getSeqName())) continue;
                
                HashMap<String,HierarchyTree> matchAncestorNodes = getAncestorNodes(factory.getRoot(), trainSeq.getSeqName(), trainSeq.getAncestors());
                boolean withinTaxon = false;
                String lowestCommonRank = null;
                for (int i = ranks.size() -1; i >=0; i--){                    
                    HierarchyTree queryTaxon = queryAncestorNodes.get( ranks.get(i));
                    HierarchyTree matchTaxon = matchAncestorNodes.get( ranks.get(i));
                    if ( queryTaxon != null && matchTaxon != null){
                        if ( queryTaxon.getName().equals(matchTaxon.getName())){
                            if ( !withinTaxon){  // if the query and match are not in the same child taxon, add sab to the current taxon
                                lowestCommonRank = ranks.get(i);
                                //(sabCoutMap.get(ranks.get(i)))[sab]++; 
                            }
                            withinTaxon = true;                            
                        }else {
                            withinTaxon = false;
                        }
                    }
                }  
                
                if ( lowestCommonRank == null){  // not the rank we care
                    continue;
                }


                // we need to use overlap_trim mode and calculate distance as metric to count insertions, deletions and mismatches.
                PairwiseAlignment result = PairwiseAligner.align(seq.getSeqString().replaceAll("U", "T"), trainSeq.getSeqString().replaceAll("U", "T"), scoringMatrix, mode);
                short sab = (short) (100 - 100*dist.getDistance(result.getAlignedSeqj().getBytes(), result.getAlignedSeqi().getBytes(), 0));
                sabCoutMap.get(lowestCommonRank)[sab]++; 
                                
            }           
        }
        parser.close();
    
    }

View Full Code Here

             }
         }


        int totalTest = 0;
        int totalSeq = 0;
        LineageSequenceParser parser = new LineageSequenceParser(source_file);
        while ( parser.hasNext()){
            totalSeq ++;
          LineageSequence pSeq = parser.next();
          if ( !selectedTestSeqIDs.contains(pSeq.getSeqName()) || pSeq.getSeqString().length() == 0){
              continue;
          }
          GoodWordIterator wordIterator = null ;
          if ( partialLength != null ){
                wordIterator = pSeq.getPartialSeqIteratorbyGoodBases(partialLength.intValue());  // test partial sequences with good words only


          }else {
                wordIterator = new GoodWordIterator(pSeq.getSeqString()); // full sequence
          }


          if (wordIterator == null || wordIterator.getNumofWords() == 0){
            //System.err.println(pSeq.getSeqName() + " unable to find good sequence");
            continue;
          }
        
          List result = dm.getBestClasspath( wordIterator, genusNodeMap, useSeed, min_bootstrap_words);


          //xxx
          ValidClassificationResultFacade resultFacade = new ValidClassificationResultFacade(pSeq, result);


          compareClassificationResult(factory, resultFacade, rankNodeMap, statusCountList);


          totalTest++;
        }
        parser.close();


        outWriter.write("taxon file\t" + tax_file.getName() + "\n" + "train sequence file\t" + source_file.getName() + "\n");
        outWriter.write("word size\t" + GoodWordIterator.getWordsize() + "\n");
        outWriter.write("minimum number of words for bootstrap\t" + min_bootstrap_words + "\n");
        if ( partialLength != null){

View Full Code Here

     * @throws IOException
     */
    private TreeFactory setup(File tax_file, File source_file, Set<String> selectedTestSeqIDs) throws IOException {


        TreeFactory factory = new TreeFactory(new FileReader(tax_file));
        LineageSequenceParser parser = new LineageSequenceParser(source_file);


        while ( parser.hasNext() ){
            LineageSequence pSeq = parser.next();
            if ( !selectedTestSeqIDs.contains(pSeq.getSeqName())){
              factory.addSequence( pSeq);
            }
        }
        parser.close();


        //after all the training set is being parsed, calculate the prior probability for all the words.
        factory.calculateWordPrior();
        return factory;
      }

View Full Code Here

        }
        // need to use ISO encoding for UNITE
        FileReader tax  = new FileReader(new File(taxFile));
        TreeFactory factory = new TreeFactory(tax);
               
        LineageSequenceParser parser = new LineageSequenceParser(new File(seqFile));
        LineageSequence seq;
        HashMap<String, String> seqMap = new HashMap<String, String>(); // seqID, desc
        while ( parser.hasNext()){
            seq = parser.next();
            factory.addSequence(seq, false); // donot check the kmers
            
            if ( seq.getSeqName().contains("|S00") ){ // rdpID
                String[] values = seq.getSeqName().split("\\|");
                seqMap.put(values[0], seq.getDesc());   
            }else if (seq.getSeqName().contains("|SH") ){  // if it's seq from UNITE, we need to do something with the seqID             
                String[] values = seq.getSeqName().split("\\|");
                seqMap.put(values[1], seq.getDesc()); 
            }else {
                seqMap.put(seq.getSeqName(), seq.getDesc()); 
            }
            
        }
        parser.close();
        HierarchyTreeExtend retVal = new HierarchyTreeExtend(factory.getRoot(), trainsetName);
        retVal.seqMap = seqMap;
        return retVal;
        
    }

View Full Code Here

    
    public PairwiseSeqDistance(String trainseqFile, String taxFile, AlignmentMode mode, boolean show_alignment ) throws IOException, OverlapCheckFailedException{
        this.mode = mode;
        this.show_alignment = show_alignment;
        factory = new TreeFactory(new FileReader(taxFile));
        LineageSequenceParser parser = new LineageSequenceParser(new File(trainseqFile));
        
        while (parser.hasNext()) {
            LineageSequence tmp = (LineageSequence) parser.next();
            seqList.add(tmp);
            factory.addSequence((LineageSequence) parser.next());
        }
        parser.close();
        
        calDist();
    }

View Full Code Here

0 1

TOP

Related Classes of edu.msu.cme.rdp.classifier.train.LineageSequenceParser

edu.msu.cme.rdp.classifier.train.validation.crossvalidate.CrossValidate

edu.msu.cme.rdp.classifier.train.validation.crossvalidate.RdmSelectTaxon

edu.msu.cme.rdp.classifier.train.validation.DecisionMakerTest

edu.msu.cme.rdp.classifier.train.validation.distance.CompareTrainingSets

edu.msu.cme.rdp.classifier.train.validation.distance.PairwiseSeqDistance

edu.msu.cme.rdp.classifier.train.validation.distance.TaxaSimilarityMain

edu.msu.cme.rdp.classifier.train.validation.HierarchyTreeTest

edu.msu.cme.rdp.classifier.train.validation.leaveoneout.LeaveOneOutTesterMain

edu.msu.cme.rdp.classifier.train.validation.movingwindow.MainMovingWindow

edu.msu.cme.rdp.classifier.train.validation.NBClassifierTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.