final List<Haplotype> activeAlleleHaplotypes,
final boolean allowLowComplexityGraphs,
final boolean allowNonUniqueKmersInRef) {
if ( refHaplotype.length() < kmerSize ) {
// happens in cases where the assembled region is just too small
return new AssemblyResult(AssemblyResult.Status.FAILED, null);
}
if ( !allowNonUniqueKmersInRef && !ReadThreadingGraph.determineNonUniqueKmers(new SequenceForKmers("ref", refHaplotype.getBases(), 0, refHaplotype.getBases().length, 1, true), kmerSize).isEmpty() ) {
if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because reference contains non-unique kmers");
return null;
}
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples);
rtgraph.setThreadingStartOnlyAtExistingVertex(!recoverDanglingBranches);
// add the reference sequence to the graph
rtgraph.addSequence("ref", refHaplotype.getBases(), true);
// add the artificial GGA haplotypes to the graph
int hapCount = 0;
for ( final Haplotype h : activeAlleleHaplotypes ) {
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), GGA_MODE_ARTIFICIAL_COUNTS, false);
}
// Next pull kmers out of every read and throw them on the graph
for( final GATKSAMRecord read : reads ) {
rtgraph.addRead(read);
}
// actually build the read threading graph
rtgraph.buildGraphIfNecessary();
// sanity check: make sure there are no cycles in the graph
if ( rtgraph.hasCycles() ) {
if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle");
return null;
}
// sanity check: make sure the graph had enough complexity with the given kmer
if ( ! allowLowComplexityGraphs && rtgraph.isLowComplexity() ) {
if ( debug ) logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity");
return null;
}
printDebugGraphTransform(rtgraph, new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot"));
// go through and prune all of the chains where all edges have <= pruneFactor. This must occur
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
// tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
rtgraph.pruneLowWeightChains(pruneFactor);
// look at all chains in the graph that terminate in a non-ref node (dangling sources and sinks) and see if
// we can recover them by merging some N bases from the chain back into the reference
if ( recoverDanglingBranches ) {
rtgraph.recoverDanglingTails(pruneFactor, minDanglingBranchLength);
rtgraph.recoverDanglingHeads(pruneFactor, minDanglingBranchLength);
}
// remove all heading and trailing paths
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
printDebugGraphTransform(rtgraph, new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot"));
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
if (debugGraphTransformations) initialSeqGraph.printGraph(new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.1.initial_seqgraph.dot"),10000);
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
if ( justReturnRawGraph ) return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph);
if (debug) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
printDebugGraphTransform(initialSeqGraph, new File( "" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot"));
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph);
final AssemblyResult.Status status = cleaned.getStatus();
final AssemblyResult result = new AssemblyResult(status, cleaned.getGraph());
result.setThreadingGraph(rtgraph);
return result;
}