diff --git a/src/multipath_alignment_graph.cpp b/src/multipath_alignment_graph.cpp index 6cc2620f91..c01635c2e8 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -22,10 +22,10 @@ using namespace std; namespace vg { - unordered_multimap> MultipathAlignmentGraph::create_injection_trans(const unordered_map>& projection_trans) { + unordered_multimap> MultipathAlignmentGraph::create_injection_trans(const unordered_map>& projection_trans) { // create the injection translator, which maps a node in the original graph to every one of its occurrences // in the dagified graphfs - unordered_multimap > injection_trans; + unordered_multimap > injection_trans; for (const auto& trans_record : projection_trans) { #ifdef debug_multipath_alignment cerr << trans_record.second.first << " -> " << trans_record.first << (trans_record.second.second ? "-" : "+") << endl; @@ -36,23 +36,23 @@ namespace vg { return injection_trans; } - function(id_t)> MultipathAlignmentGraph::create_projector(const unordered_map>& projection_trans) { - return [&](id_t node_id) { return projection_trans.at(node_id); }; + function(nid_t)> MultipathAlignmentGraph::create_projector(const unordered_map>& projection_trans) { + return [&](nid_t node_id) { return projection_trans.at(node_id); }; } - unordered_multimap> MultipathAlignmentGraph::create_injection_trans(const HandleGraph& graph, - const function(id_t)>& project) { - unordered_multimap> injection_trans; + unordered_multimap> MultipathAlignmentGraph::create_injection_trans(const HandleGraph& graph, + const function(nid_t)>& project) { + unordered_multimap> injection_trans; graph.for_each_handle([&](const handle_t& handle) { - id_t node_id = graph.get_id(handle); + nid_t node_id = graph.get_id(handle); auto proj = project(node_id); injection_trans.emplace(proj.first, make_pair(node_id, proj.second)); }); return injection_trans; } - unordered_map> MultipathAlignmentGraph::create_identity_projection_trans(const HandleGraph& graph) { - unordered_map> to_return; + unordered_map> MultipathAlignmentGraph::create_identity_projection_trans(const HandleGraph& graph) { + unordered_map> to_return; graph.for_each_handle([&](const handle_t& handle) { // Each node just projects from itself forward. @@ -64,8 +64,8 @@ namespace vg { MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, const vector, path_t>>& path_chunks, - const Alignment& alignment, const function(id_t)>& project, - const unordered_multimap>& injection_trans, bool realign_Ns, + const Alignment& alignment, const function(nid_t)>& project, + const unordered_multimap>& injection_trans, bool realign_Ns, bool preserve_tail_anchors, vector* path_node_provenance) { // Set up the initial multipath graph from the given path chunks. @@ -81,7 +81,7 @@ namespace vg { MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, const vector, path_t>>& path_chunks, - const Alignment& alignment, const function(id_t)>& project, bool realign_Ns, + const Alignment& alignment, const function(nid_t)>& project, bool realign_Ns, bool preserve_tail_anchors, vector* path_node_provenance) : MultipathAlignmentGraph(graph, path_chunks, alignment, project, create_injection_trans(graph, project), realign_Ns, preserve_tail_anchors, @@ -92,7 +92,7 @@ namespace vg { MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, const vector, path_t>>& path_chunks, - const Alignment& alignment, const unordered_map>& projection_trans, bool realign_Ns, + const Alignment& alignment, const unordered_map>& projection_trans, bool realign_Ns, bool preserve_tail_anchors, vector* path_node_provenance) : MultipathAlignmentGraph(graph, path_chunks, alignment, create_projector(projection_trans), create_injection_trans(projection_trans), realign_Ns, preserve_tail_anchors, @@ -102,8 +102,8 @@ namespace vg { } MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, - const function(id_t)>& project, - const unordered_multimap>& injection_trans, + const function(nid_t)>& project, + const unordered_multimap>& injection_trans, vector& path_node_provenance, size_t max_branch_trim_length, gcsa::GCSA* gcsa, const MultipathMapper::match_fanouts_t* fanout_breaks) { @@ -140,7 +140,7 @@ namespace vg { } MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, - const function(id_t)>& project, + const function(nid_t)>& project, vector& path_node_provenance, size_t max_branch_trim_length, gcsa::GCSA* gcsa, const MultipathMapper::match_fanouts_t* fanout_breaks) : @@ -153,7 +153,7 @@ namespace vg { } MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, - const unordered_map>& projection_trans, + const unordered_map>& projection_trans, vector& path_node_provenance, size_t max_branch_trim_length, gcsa::GCSA* gcsa, const MultipathMapper::match_fanouts_t* fanout_breaks) : @@ -167,8 +167,8 @@ namespace vg { MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, const Alignment& alignment, SnarlManager* snarl_manager, SnarlDistanceIndex* dist_index, size_t max_snarl_cut_size, - const function(id_t)>& project, - const unordered_multimap>& injection_trans) { + const function(nid_t)>& project, + const unordered_multimap>& injection_trans) { // this can only be done on aligned sequences if (!alignment.has_path() || alignment.path().mapping_size() == 0) { @@ -196,7 +196,7 @@ namespace vg { MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, const Alignment& alignment, SnarlManager* snarl_manager, SnarlDistanceIndex* dist_index, size_t max_snarl_cut_size, - const unordered_map>& projection_trans) : + const unordered_map>& projection_trans) : MultipathAlignmentGraph(graph, alignment, snarl_manager, dist_index, max_snarl_cut_size, create_projector(projection_trans), create_injection_trans(projection_trans)) { @@ -205,15 +205,15 @@ namespace vg { MultipathAlignmentGraph::MultipathAlignmentGraph(const HandleGraph& graph, const Alignment& alignment, SnarlManager* snarl_manager, SnarlDistanceIndex* dist_index, size_t max_snarl_cut_size, - const function(id_t)>& project) : + const function(nid_t)>& project) : MultipathAlignmentGraph(graph, alignment, snarl_manager, dist_index, max_snarl_cut_size, project, create_injection_trans(graph, project)) { // Nothing to do } void MultipathAlignmentGraph::create_path_chunk_nodes(const HandleGraph& graph, const vector, path_t>>& path_chunks, - const Alignment& alignment, const function(id_t)>& project, - const unordered_multimap>& injection_trans, + const Alignment& alignment, const function(nid_t)>& project, + const unordered_multimap>& injection_trans, vector* path_node_provenance) { for (const auto& path_chunk : path_chunks) { @@ -227,7 +227,7 @@ namespace vg { auto range = injection_trans.equal_range(path.mapping(0).position().node_id()); for (auto iter = range.first; iter != range.second; iter++) { - id_t injected_id = iter->second.first; + nid_t injected_id = iter->second.first; if (iter->second.second != path.mapping(0).position().is_reverse()) { continue; @@ -252,7 +252,7 @@ namespace vg { #ifdef debug_multipath_alignment cerr << "checking node " << graph.get_id(trav) << endl; #endif - pair projected_trav = project(graph.get_id(trav)); + pair projected_trav = project(graph.get_id(trav)); const auto& pos = path.mapping(stack.size() - 1).position(); if (projected_trav.first == pos.node_id() && @@ -635,8 +635,8 @@ namespace vg { } void MultipathAlignmentGraph::create_match_nodes(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, - const function(id_t)>& project, - const unordered_multimap>& injection_trans, + const function(nid_t)>& project, + const unordered_multimap>& injection_trans, vector& path_node_provenance, int64_t max_branch_trim_length, const MultipathMapper::match_fanouts_t* fanout_breaks) { @@ -661,7 +661,7 @@ namespace vg { // performs a check to see if a hit is a redundant sub-MEM auto is_redundant = [&](string::const_iterator begin, string::const_iterator end, - const pos_t& hit_pos, id_t injected_id) { + const pos_t& hit_pos, nid_t injected_id) { // check all MEMs that traversed this node to see if this is a redundant sub-MEM if (node_matches.count(injected_id)) { @@ -717,7 +717,7 @@ namespace vg { if (prefix_through_length > relative_offset) { // we cross the relative offset on this node, so check if the path is in the predicted // position for a redundant sub-MEM - id_t node_id_here = mapping.position().node_id(); + nid_t node_id_here = mapping.position().node_id(); #ifdef debug_multipath_alignment cerr << "this mapping crosses where we would expect a child to be: " << node_id_here << (project(node_id_here).second ? "-" : "+") << ":" << mapping.position().offset() + relative_offset - prefix_length << endl; @@ -798,7 +798,7 @@ namespace vg { } // an id that corresponds to the original node - id_t injected_id = iter->second.first; + nid_t injected_id = iter->second.first; #ifdef debug_multipath_alignment cerr << "hit node exists in graph as " << injected_id << endl; @@ -2033,7 +2033,7 @@ namespace vg { vector> MultipathAlignmentGraph::get_cut_segments(path_t& path, SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, - const function(id_t)>& project, + const function(nid_t)>& project, int64_t max_snarl_cut_size) const { // this list holds the beginning of the current segment at each depth in the snarl hierarchy @@ -2049,7 +2049,7 @@ namespace vg { for (size_t j = 0, last = path.mapping_size() - 1; j <= last; j++) { const position_t& position = path.mapping(j).position(); auto projection = project(position.node_id()); - id_t projected_id = projection.first; + nid_t projected_id = projection.first; bool projected_rev = (projection.second != position.is_reverse()); if (j > 0) { @@ -2115,7 +2115,7 @@ namespace vg { void MultipathAlignmentGraph::resect_snarls_from_paths(SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, - const function(id_t)>& project, + const function(nid_t)>& project, int64_t max_snarl_cut_size) { #ifdef debug_multipath_alignment cerr << "cutting with snarls" << endl; @@ -2479,9 +2479,354 @@ namespace vg { } void MultipathAlignmentGraph::add_reachability_edges(const HandleGraph& graph, - const function(id_t)>& project, - const unordered_multimap>& injection_trans, + const function(nid_t)>& project, + const unordered_multimap>& injection_trans, vector* path_node_provenance) { + + // Don't let people do this twice. + assert(!has_reachability_edges); + + // optimization: we never add edges unless there are multiple nodes, and frequently there is only one + // so we can skip traversing over the entire graph + if (path_nodes.size() <= 1) { +#ifdef debug_multipath_alignment + cerr << "skipping reachability computation because there are " << path_nodes.size() << " path nodes" << endl; +#endif + has_reachability_edges = true; + return; + } + +#ifdef debug_multipath_alignment + cerr << "computing reachability" << endl; +#endif + +#ifdef debug_multipath_alignment + SubgraphExplainer graph_dump(true); + graph_dump.subgraph(graph); +#endif + + // get a topological order over the nodes in the graph to iterate over + vector topological_order = handlealgs::lazier_topological_order(&graph); + + // Check if the graph (which is known to be a DAG) is a stick. A stick + // will be connected straight through in topological order. + bool is_stick = true; + for (size_t i = 0; is_stick && i + 1 < topological_order.size(); i++) { + // To be a stick, the connecting edge must exist. + bool saw_edge = false; + graph.follow_edges(topological_order[i], false, [&](const handle_t next_handle) { + if (next_handle != topological_order[i + 1]) { + // If we're connected to anything other than the next + // handle in the order, we're not a stick. + is_stick = false; + return false; + } + saw_edge = true; + return true; + }); + if (!saw_edge) { + is_stick = false; + } + } + // We don't need to worry about the left-side edges, or edges on the + // final node in the topological order,b ecause the graph is known to + // be a DAG. + + if (is_stick) { + // We can use the easy implementation + this->add_reachability_edges_easy(graph, project, injection_trans, topological_order, path_node_provenance); + } else { + // We need to use the full-power, slower implementation + this->add_reachability_edges_general(graph, project, injection_trans, topological_order, path_node_provenance); + } + +#ifdef debug_multipath_alignment + cerr << "final graph after adding reachability edges:" << endl; + for (size_t i = 0; i < path_nodes.size(); i++) { + PathNode& path_node = path_nodes.at(i); + cerr << i; + if (path_node_provenance) { + cerr << " (hit " << path_node_provenance->at(i) << ")"; + } + cerr << " " << debug_string(path_node.path) << " " << string(path_node.begin, path_node.end) << endl; + cerr << "\t"; + for (auto edge : path_node.edges) { + cerr << "(to:" << edge.first << ", graph dist:" << edge.second << ", read dist: " << (path_nodes.at(edge.first).begin - path_node.end) << ") "; + } + cerr << endl; + } +#endif + + + } + + void MultipathAlignmentGraph::add_reachability_edges_easy(const HandleGraph& graph, + const function(nid_t)>& project, + const unordered_multimap>& injection_trans, + const vector& topological_order, + vector* path_node_provenance) { + + // MEMs are colinear if they are colinear in the read and colinear in + // the graph. + // + // MEMs are overlap-colinear if they overlap in the read and the + // corresponding parts of their paths also coincide. + + // First, we need to treat the graph as a linear space. So, we walk the + // topological order's path through the graph, and make a map from + // graph node ID to offset in the linear graph walk. + + // Build a map from node ID to offset in the linear graph walk + unordered_map node_to_offset; + size_t linear_offset = 0; + for (const handle_t& handle : topological_order) { + node_to_offset[graph.get_id(handle)] = linear_offset; + linear_offset += graph.get_length(handle); + } + + // Helper functions to get MEM positions in read and graph space + auto start_offset_read = [&](size_t idx) -> size_t { + return path_nodes[idx].begin - path_nodes[0].begin; + }; + + auto end_offset_read = [&](size_t idx) -> size_t { + return path_nodes[idx].end - path_nodes[0].begin; + }; + + auto start_offset_graph = [&](size_t idx) -> size_t { + const path_t& path = path_nodes[idx].path; + nid_t start_node_id = path.mapping(0).position().node_id(); + return node_to_offset[start_node_id] + path.mapping(0).position().offset(); + }; + + auto end_offset_graph = [&](size_t idx) -> size_t { + const path_t& path = path_nodes[idx].path; + const path_mapping_t& last_mapping = path.mapping(path.mapping_size() - 1); + nid_t end_node_id = last_mapping.position().node_id(); + return node_to_offset[end_node_id] + last_mapping.position().offset() + mapping_from_length(last_mapping); + }; + + // Then we sort everything along the read, first by start, then by end, + // and get a vector of indexes of MEMs, ordered in that sort order. + vector read_order = sort_permutation(path_nodes.begin(), path_nodes.end(), [&](const PathNode& a, const PathNode& b) { + if (a.begin != b.begin) { + return a.begin < b.begin; + } + return a.end < b.end; + }); + + // Then we do the same thing for the graph. + // TODO: is this better than using the accessors? + vector graph_order = sort_permutation(path_nodes.begin(), path_nodes.end(), [&](const PathNode& a, const PathNode& b) { + auto& a_path = a.path; + auto& b_path = b.path; + + auto& a_start_pos = a_path.mapping(0).position(); + auto& b_start_pos = b_path.mapping(0).position(); + if (a_start_pos.node_id() != b_start_pos.node_id()) { + return node_to_offset[a_start_pos.node_id()] < node_to_offset[b_start_pos.node_id()]; + } + if (a_start_pos.offset() != b_start_pos.offset()) { + return a_start_pos.offset() < b_start_pos.offset(); + } + + auto& a_end = a_path.mapping(a_path.mapping_size() - 1); + auto& b_end = b_path.mapping(b_path.mapping_size() - 1); + auto a_end_pos = a_end.position(); + auto b_end_pos = b_end.position(); + if (a_end_pos.node_id() != b_end_pos.node_id()) { + return node_to_offset[a_end_pos.node_id()] < node_to_offset[b_end_pos.node_id()]; + } + // Adjust the offsets to account for the bases used on the node. + // TODO: Are we really allowed to jump to the middle of a node in a mapping here? + a_end_pos.set_offset(a_end_pos.offset() + mapping_from_length(a_end)); + b_end_pos.set_offset(b_end_pos.offset() + mapping_from_length(b_end)); + return a_end_pos.offset() < b_end_pos.offset(); + }); + +#ifdef debug_multipath_alignment + for (size_t i = 0; i + 1 < graph_order.size(); i++) { + crash_unless(start_offset_graph(graph_order[i]) <= start_offset_graph(graph_order[i+1])); + std::cerr << "Graph order " << i << ": MEM " << graph_order[i] << " at graph offset " << start_offset_graph(graph_order[i]) << " is at or before MEM " << graph_order[i+1] << " at graph offset " << start_offset_graph(graph_order[i+1]) << std::endl; + } +#endif + + // Now we make a lookup table to find each MEM's position in the graph order + std::vector graph_index_of(graph_order.size()); + for (size_t i = 0; i < graph_order.size(); i++) { + graph_index_of[graph_order[i]] = i; + } + + // Map from index_onto to maps of index_from to (read_overlap, graph_overlap, dist) + // TODO: dist is always 0 for overlap-colinear MEMs + unordered_map>> confirmed_overlaps; + + for (size_t read_index = 0; read_index < read_order.size(); read_index++) { + // Then we walk through all the MEMs in the read. + size_t predecessor = read_order[read_index]; + +#ifdef debug_multipath_alignment + std::cerr << "Look for transitions from " << predecessor << std::endl; +#endif + + // Record a search start + build_queue_count++; + + // For each, we find its position in the graph. + size_t graph_index = graph_index_of[predecessor]; + crash_unless(graph_order[graph_index] == predecessor); + + // We put an entry in here if we see a potential successor at all, + // and a true if that successor was acceptable. When we again see a + // successor that was acceptable, we can stop. + std::unordered_map acceptable_successors; + + for (size_t i = 2; read_index + (i / 2) < read_order.size() && graph_index + (i / 2) < graph_order.size(); i++) { + // Then we loop over successive offsets until we find an + // acceptable successor on both the read and graph sides, or + // one runs out. (We only need to wait for one to run out, + // because if it wasn't in the smaller set of candidates it + // can't be in the intersection.) We use a hacky loop that + // visits a successor in the read, then a successor in the + // graph, etc. so that we can break out of it with one break + // when we find something. + + // For each potential successor in the graph or the read + size_t successor = (i % 2 == 0) ? read_order.at(read_index + (i/2)) : graph_order.at(graph_index + (i/2)); + +#ifdef debug_multipath_alignment + std::cerr << "\tFound successor " << successor << " at order offset " << (i / 2) << " in " << ((i % 2 == 0) ? "read" : "graph") << std::endl; +#endif + + // We should never see something as a successor of itself. + crash_unless(successor != predecessor); + + auto found = acceptable_successors.find(successor); + if (found != acceptable_successors.end()) { +#ifdef debug_multipath_alignment + std::cerr << "\t\tHas been seen before" << std::endl; +#endif + // When we find a successor we've seen before, we don't have to process it. + if (found->second) { + // But if it was an acceptable successor the first time + // we saw it, we now have an acceptable successor in + // both the graph and the read and we've handled + // everything before it in both the graph and the read, + // so it blocks further edges and we can stop. +#ifdef debug_multipath_alignment + std::cerr << "\t\t\tWas acceptable. Stop." << std::endl; +#endif + break; + } + } else { + // We found a novel successor +#ifdef debug_multipath_alignment + std::cerr << "\t\tIs novel" << std::endl; +#endif + + // Remember we found it. Assume it's not acceptable for now. + auto acceptable_entry = acceptable_successors.emplace_hint(found, successor, false); + + size_t read_from = end_offset_read(predecessor); + size_t read_to = start_offset_read(successor); + size_t graph_from = end_offset_graph(predecessor); + size_t graph_to = start_offset_graph(successor); + if (read_from <= read_to && graph_from <= graph_to) { + // The predecessor ends before the successor starts on + // both sides, so it is colinear. + +#ifdef debug_multipath_alignment + std::cerr << "\t\tPredecessor ends in time in both read (" << read_from << "<=" << read_to << ") and graph (" << graph_from << "<=" << graph_to << "): colinear" << std::endl; +#endif + + path_nodes[predecessor].edges.emplace_back(successor, graph_to - graph_from); + + // Record as an acceptable successor + acceptable_entry->second = true; + } else if (read_from > read_to && + graph_from > graph_to && + start_offset_read(predecessor) <= start_offset_read(successor) && + start_offset_graph(predecessor) <= start_offset_graph(successor) + ) { + // The predecessor overlaps the successor in the read + // and the graph, and starts at or before it in both. + // They might be overlap-colinear. + +#ifdef debug_multipath_alignment + std::cerr << "\t\tMay be overlap-colinear" << std::endl; +#endif + + // TODO: We assume there aren't any duplicate, + // coinciding paths. + + // Figure out how much path needs to match + size_t read_overlap = read_from - read_to; + size_t predecessor_read_length = path_nodes[predecessor].end - path_nodes[predecessor].begin; + size_t predecessor_read_overlap_start = predecessor_read_length - read_overlap; + + // Get and compare the partial paths + path_t predecessor_overlap = extract_subpath( + path_nodes[predecessor].path, + predecessor_read_overlap_start, + predecessor_read_length + ); + path_t successor_overlap = extract_subpath( + path_nodes[successor].path, + 0, + read_overlap + ); + + if (predecessor_overlap == successor_overlap) { + // The overlapping part in the read is the same + // path in the graph. These are overlap-colinear. + +#ifdef debug_multipath_alignment + std::cerr << "\t\t\tSame path in overlapping read region: overlap-colinear" << std::endl; +#endif + + // The distance is 0 for overlap-colinear MEMs. + confirmed_overlaps[predecessor][successor] = make_tuple(read_overlap, path_from_length(predecessor_overlap), 0); + + // Record that we found an acceptable successor. + acceptable_entry->second = true; + } else { +#ifdef debug_multipath_alignment + std::cerr << "\t\t\tDifferent paths in overlapping read region: no relationship" << std::endl; +#endif + } + } else { +#ifdef debug_multipath_alignment + std::cerr << "\t\tNo sensible overlap in read (" << read_from << " vs. " << read_to << ") or graph (" << graph_from << " vs. " << graph_to << "): no relationship" << std::endl; +#endif + } + // If neither, it's not an acceptable successor. Don't mark + // it and keep going. + } + } + } + +#ifdef debug_multipath_alignment + std::cerr << "Found " << confirmed_overlaps.size() << " prececessors with overlap-colinear successors" << std::endl; +#endif + + // When we get here, we've found all the successor relationships that + // are in both the read order and the graph order (either the direct + // edge ones or the overlap-colinear ones). + + // Make the call to split up the overlap-colinear MEMs and add the + // reachability edges between them. + + this->split_at_overlap_edges(confirmed_overlaps, path_node_provenance); + + // Mark that reachability edges have been added + has_reachability_edges = true; + } + + void MultipathAlignmentGraph::add_reachability_edges_general(const HandleGraph& graph, + const function(nid_t)>& project, + const unordered_multimap>& injection_trans, + const vector& topological_order, + vector* path_node_provenance) { // We're going to make "reachability" edges, which connect MEMs (which @@ -2505,23 +2850,6 @@ namespace vg { // there. -#ifdef debug_multipath_alignment - cerr << "computing reachability" << endl; -#endif - - // Don't let people do this twice. - assert(!has_reachability_edges); - - // optimization: we never add edges unless there are multiple nodes, and frequently there is only one - // so we can skip traversing over the entire graph - if (path_nodes.size() <= 1) { -#ifdef debug_multipath_alignment - cerr << "skipping reachability computation because there are " << path_nodes.size() << " path nodes" << endl; -#endif - has_reachability_edges = true; - return; - } - // now we calculate reachability between the walked paths so we know which ones // to connect with intervening alignments @@ -2573,17 +2901,35 @@ namespace vg { return false; }; + + // record the start and end node ids of every path // Maps from node ID to the list of MEM numbers that start on that node. - unordered_map> path_starts; + unordered_map> path_starts; // Maps from node ID to the list of MEM numbers that end on that node. - unordered_map> path_ends; + unordered_map> path_ends; for (size_t i = 0; i < path_nodes.size(); i++) { path_t& path = path_nodes[i].path; path_starts[path.mapping(0).position().node_id()].push_back(i); path_ends[path.mapping(path.mapping_size() - 1).position().node_id()].push_back(i); } - + + // Sort the MEMs starting and ending on each node in node sequence order. + // MEMs that start/end earlier will appear earlier in the vector for the node they start/end on. + // We don't need to worry about strand because the graph is a DAG. + for (pair>& node_starts : path_starts) { + std::sort(node_starts.second.begin(), node_starts.second.end(), + [&](const size_t idx_1, const size_t idx_2) { + return start_offset(idx_1) < start_offset(idx_2); + }); + } + for (pair>& node_ends : path_ends) { + std::sort(node_ends.second.begin(), node_ends.second.end(), + [&](const size_t idx_1, const size_t idx_2) { + return end_offset(idx_1) < end_offset(idx_2); + }); + } + #ifdef debug_multipath_alignment cerr << "recorded starts: " << endl; for (const auto& rec : path_starts) { @@ -2602,24 +2948,8 @@ namespace vg { } cerr << endl; } -#endif - - - // Sort the MEMs starting and ending on each node in node sequence order. - // MEMs that start/end earlier will appear earlier in the vector for the node they start/end on. - for (pair>& node_starts : path_starts) { - std::sort(node_starts.second.begin(), node_starts.second.end(), - [&](const size_t idx_1, const size_t idx_2) { - return start_offset(idx_1) < start_offset(idx_2); - }); - } - for (pair>& node_ends : path_ends) { - std::sort(node_ends.second.begin(), node_ends.second.end(), - [&](const size_t idx_1, const size_t idx_2) { - return end_offset(idx_1) < end_offset(idx_2); - }); - } - +#endif + // The "ranges" that are used below (range_start, range_end, etc.) // refer to intervals in these sorted per-node lists in path_starts and // path_ends corresponding to sets of MEMs that all start or end at the @@ -2635,8 +2965,8 @@ namespace vg { // for each node, the starts and ends of MEMs that can reach this node // without passing any other MEM starts or ends. TODO: What do the // unordered_maps map from/to? - unordered_map> reachable_ends; - unordered_map> reachable_starts; + unordered_map> reachable_ends; + unordered_map> reachable_starts; // for the start of each MEM, the starts and ends of other MEMs that can reach it without passing any // other start or end @@ -2654,10 +2984,9 @@ namespace vg { unordered_map>> reachable_ends_from_end; unordered_map>> reachable_starts_from_end; - // get a topological order over the nodes in the graph to iterate over - vector topological_order = handlealgs::lazier_topological_order(&graph); + for (int64_t i = 0; i < topological_order.size(); i++) { - id_t node_id = graph.get_id(topological_order[i]); + nid_t node_id = graph.get_id(topological_order[i]); #ifdef debug_multipath_alignment cerr << "DP step for graph node " << node_id << endl; @@ -2717,7 +3046,7 @@ namespace vg { bool at_end = (curr_end_offset <= curr_start_offset); // TODO: What exactly do these variables hold? - unordered_map>* reachable_endpoints; + unordered_map>* reachable_endpoints; unordered_map>>* reachable_ends_from_endpoint; unordered_map>>* reachable_starts_from_endpoint; vector* endpoints; @@ -2973,7 +3302,7 @@ namespace vg { // this nodes contains at least one start or end, but either all starts or all ends // record the incoming starts/ends for the starts/ends on this node - unordered_map>* reachable_endpoints; + unordered_map>* reachable_endpoints; unordered_map>>* reachable_ends_from_endpoint; unordered_map>>* reachable_starts_from_endpoint; unordered_map>>* reachable_endpoints_from_endpoint; @@ -3164,15 +3493,16 @@ namespace vg { // will use this to navigate between the MEMs in a way that respects graph reachability so that this // phase of the algorithm only needs to pay attention to read colinearity and transitive reducibility + // TODO: What exactly is a noncolinear shell??? vector> noncolinear_shells(path_nodes.size()); // map from index_from to maps of index_onto to (overlap to length, overlap from length, dist) unordered_map>> confirmed_overlaps; // map from path index to set of indexes whose start occurs on the path unordered_map> path_starts_on_path; - + for (size_t i = 0; i < topological_order.size(); i++) { - id_t node_id = graph.get_id(topological_order[i]); + nid_t node_id = graph.get_id(topological_order[i]); #ifdef debug_multipath_alignment cerr << "looking for edges for starts on node " << node_id << endl; @@ -3236,6 +3566,7 @@ namespace vg { pair start_here = start_queue.top(); start_queue.pop(); + build_queue_count++; // don't keep looking backward earlier than the start of the current path if (start_here.first == end) { @@ -3258,11 +3589,15 @@ namespace vg { pair end_here = end_queue.top(); end_queue.pop(); + build_queue_count++; PathNode& next_end_node = path_nodes[end_here.first]; // these are non-colinear, so add it to the non-colinear shell if (next_end_node.begin >= end_node.begin || next_end_node.end >= end_node.end) { +#ifdef debug_multipath_alignment + std::cerr << "Definitely noncolinear from an end: make sure " << end_here.first << " is in noncolinear shell for " << end << std::endl; +#endif if (noncolinear_shell.count(end_here.first)) { noncolinear_shell[end_here.first] = std::min(end_here.second, noncolinear_shell[end_here.first]); } @@ -3276,6 +3611,8 @@ namespace vg { // non-colinear shell. now we need to decide whether to keep searching backward. we'll check a // few conditions that will guarantee that the rest of the search is redundant + // TODO: we actually still might add it to the non-colinear shell. Is that correct? + // TODO: this actually isn't a full set of criteria, we don't just want to know if there is an // edge, we want to know if it is reachable along any series of edges... // at least this will only cause a few false positive edges that we can remove later with @@ -3308,6 +3645,9 @@ namespace vg { // we can't easily guarantee that this is non-colinear or colinear, so we're just going to treat it // as non-colinear and accept some risk of this creating redundant edges to later nodes +#ifdef debug_multipath_alignment + std::cerr << "Ambiguous colinearity from an end: make sure " << end_here.first << " is in noncolinear shell for " << end << std::endl; +#endif if (noncolinear_shell.count(end_here.first)) { noncolinear_shell[end_here.first] = std::min(end_here.second, noncolinear_shell[end_here.first]); } @@ -3345,6 +3685,7 @@ namespace vg { pair start_here = start_queue.top(); start_queue.pop(); + build_queue_count++; #ifdef debug_multipath_alignment cerr << "traversing start " << start_here.first << " at distance " << start_here.second << endl; @@ -3371,6 +3712,7 @@ namespace vg { size_t candidate_end, candidate_dist; tie(candidate_end, candidate_dist) = end_queue.top(); end_queue.pop(); + build_queue_count++; #ifdef debug_multipath_alignment cerr << "considering end " << candidate_end << " as candidate for edge of dist " << candidate_dist << endl; @@ -3451,6 +3793,10 @@ namespace vg { if (start_node_from_length == 0) { start_node_from_length = path_from_length(start_node.path); } + +#ifdef debug_multipath_alignment + std::cerr << "Noncolinear from a start: make sure " << candidate_end << " is in noncolinear shell for " << start << std::endl; +#endif if (noncolinear_shell.count(candidate_end)) { noncolinear_shell[candidate_end] = std::min(candidate_dist + start_node_from_length, noncolinear_shell[candidate_end]); @@ -3539,7 +3885,7 @@ namespace vg { size_t traversed_length = mapping_from_length(path.mapping(0)); for (size_t j = 1; j + 1 < path.mapping_size(); j++) { - id_t path_node_id = path.mapping(j).position().node_id(); + nid_t path_node_id = path.mapping(j).position().node_id(); // record which starts are on the path on this node for (size_t path_start : path_starts[path_node_id]) { path_starts_on_path[start].insert(path_start); @@ -3552,7 +3898,7 @@ namespace vg { traversed_length += mapping_from_length(path.mapping(j)); } - id_t final_node_id = path.mapping(path.mapping_size() - 1).position().node_id(); + nid_t final_node_id = path.mapping(path.mapping_size() - 1).position().node_id(); vector& final_starts = path_starts[final_node_id]; vector& final_ends = path_ends[final_node_id]; @@ -3641,6 +3987,41 @@ namespace vg { } } + this->split_at_overlap_edges(confirmed_overlaps, path_node_provenance); + + // Go to the state where we know the reachability edges exist. + has_reachability_edges = true; + } + + void MultipathAlignmentGraph::clear_reachability_edges() { + // Don't let people clear the edges if they are clear already. + // That suggests that someone has gotten confused over whether they should exist or not. + assert(has_reachability_edges); + + for (auto& node : path_nodes) { + // Just clear all the edges from each node. + // add_reachability_edges can rebuild them all. + node.edges.clear(); + } + + // Go to the state where reachability edges don't exist + has_reachability_edges = false; + + } + + size_t MultipathAlignmentGraph::count_reachability_edges() const { + if (!has_reachability_edges) { + return 0; + } + size_t count = 0; + for (auto& node : path_nodes) { + count += node.edges.size(); + } + return count; + } + + void MultipathAlignmentGraph::split_at_overlap_edges(const unordered_map>>& confirmed_overlaps, + vector* path_node_provenance) { #ifdef debug_multipath_alignment cerr << "breaking nodes at overlap edges" << endl; #endif @@ -3884,54 +4265,6 @@ namespace vg { } } } - - // Go to the state where we know the reachability edges exist. - has_reachability_edges = true; - -#ifdef debug_multipath_alignment - cerr << "final graph after adding reachability edges:" << endl; - for (size_t i = 0; i < path_nodes.size(); i++) { - PathNode& path_node = path_nodes.at(i); - cerr << i; - if (path_node_provenance) { - cerr << " (hit " << path_node_provenance->at(i) << ")"; - } - cerr << " " << debug_string(path_node.path) << " " << string(path_node.begin, path_node.end) << endl; - cerr << "\t"; - for (auto edge : path_node.edges) { - cerr << "(to:" << edge.first << ", graph dist:" << edge.second << ", read dist: " << (path_nodes.at(edge.first).begin - path_node.end) << ") "; - } - cerr << endl; - } -#endif - - } - - void MultipathAlignmentGraph::clear_reachability_edges() { - // Don't let people clear the edges if they are clear already. - // That suggests that someone has gotten confused over whether they should exist or not. - assert(has_reachability_edges); - - for (auto& node : path_nodes) { - // Just clear all the edges from each node. - // add_reachability_edges can rebuild them all. - node.edges.clear(); - } - - // Go to the state where reachability edges don't exist - has_reachability_edges = false; - - } - - size_t MultipathAlignmentGraph::count_reachability_edges() const { - if (!has_reachability_edges) { - return 0; - } - size_t count = 0; - for (auto& node : path_nodes) { - count += node.edges.size(); - } - return count; } // Kahn's algorithm @@ -4233,7 +4566,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length, bool simplify_topologies, size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls, - SnarlDistanceIndex* dist_index, const function(id_t)>* project, + SnarlDistanceIndex* dist_index, const function(nid_t)>* project, bool allow_negative_scores, bool align_in_reverse, uint64_t max_band_cells) { align(alignment, @@ -5188,7 +5521,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap size_t max_tail_length, bool simplify_topologies, size_t unmergeable_len, const function& band_padding_function, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls, - SnarlDistanceIndex* dist_index, const function(id_t)>* project, + SnarlDistanceIndex* dist_index, const function(nid_t)>* project, bool allow_negative_scores, bool align_in_reverse, uint64_t max_band_cells) { // TODO: magic number @@ -5668,7 +6001,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap size_t attachment_idx, bool to_left, size_t unmergeable_len, const GSSWAligner* aligner, SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, - const function(id_t)>* project) { + const function(nid_t)>* project) { // TODO: i wonder if it would be cleaner/more general to use branches rather than snarls // as the condition here... @@ -6048,7 +6381,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap } return gap_length; } - + unordered_map>> MultipathAlignmentGraph::align_tails(const Alignment& alignment, const HandleGraph& align_graph, const GSSWAligner* scoring_aligner, const GSSWAligner* dp_aligner, size_t max_alt_alns, bool dynamic_alt_alns, @@ -6056,8 +6389,14 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap unordered_set* sources) { #ifdef debug_multipath_alignment - cerr << "doing tail alignments to:" << endl; - to_dot(cerr); + cerr << "doing tail alignments to " << path_nodes.size() << " path nodes" << endl; + size_t tail_count = 0; + for (auto& path_node : path_nodes) { + if (path_node.edges.empty() && path_node.end != alignment.sequence().end()) { + tail_count++; + } + } + std::cerr << "\tof which " << tail_count << " will be tails" << std::endl; #endif // TODO: magic number @@ -6088,7 +6427,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap vector is_source_node(path_nodes.size(), true); for (size_t j = 0; j < path_nodes.size(); j++) { PathNode& path_node = path_nodes.at(j); -#ifdef debug_multipath_alignment +#ifdef debug_trace_tail_alignment cerr << "Visit PathNode " << j << " with " << path_node.edges.size() << " outbound edges" << endl; #endif if (!path_node.edges.empty()) { @@ -6096,7 +6435,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap for (const pair& edge : path_node.edges) { // Make everywhere we go as not a source is_source_node[edge.first] = false; -#ifdef debug_multipath_alignment +#ifdef debug_trace_tail_alignment cerr << "Edge " << j << " -> " << edge.first << " makes " << edge.first << " not a source" << endl; #endif } @@ -6104,7 +6443,10 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap else if (path_node.end != alignment.sequence().end()) { #ifdef debug_multipath_alignment - cerr << "doing right end alignment from sink node " << j << " with path " << debug_string(path_node.path) << " and sequence "; + cerr << "doing right end alignment from sink node " << j << endl; +#endif +#ifdef debug_trace_tail_alignment + cerr << "\twith path " << debug_string(path_node.path) << " and sequence "; for (auto iter = path_node.begin; iter != path_node.end; iter++) { cerr << *iter; } @@ -6129,13 +6471,19 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap pos_t end_pos = final_position(path_node.path); bdsg::HashGraph tail_graph; - unordered_map tail_trans = algorithms::extract_extending_graph(&align_graph, + unordered_map tail_trans = algorithms::extract_extending_graph(&align_graph, &tail_graph, target_length, end_pos, false, // search forward false); // no need to preserve cycles (in a DAG) +#ifdef debug_multipath_alignment + std::cerr << "Going to align " << aligning_tail_length << "/" << tail_length + << " bp of tail against " << tail_graph.get_total_length() + << " bp of graph extracted for target length " << target_length << std::endl; +#endif + size_t num_alt_alns; if (dynamic_alt_alns) { size_t num_paths = handlealgs::count_walks(&tail_graph); @@ -6168,6 +6516,8 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap #ifdef debug_multipath_alignment cerr << "making " << num_alt_alns << " alignments of sequence: " << right_tail_sequence.sequence() << endl << "to right tail graph extracted from " << end_pos << endl; +#endif +#ifdef debug_trace_tail_alignment tail_graph.for_each_handle([&](const handle_t& handle) { cerr << tail_graph.get_id(handle) << " " << tail_graph.get_sequence(handle) << endl; tail_graph.follow_edges(handle, true, [&](const handle_t& prev) { @@ -6247,6 +6597,8 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap } #ifdef debug_multipath_alignment cerr << "made " << alt_alignments.size() << " tail alignments" << endl; +#endif +#ifdef debug_trace_tail_alignment for (size_t i = 0; i < alt_alignments.size(); ++i) { cerr << i << ": " << pb2json(alt_alignments[i]) << endl; } @@ -6322,7 +6674,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap pos_t begin_pos = initial_position(path_node.path); bdsg::HashGraph tail_graph; - unordered_map tail_trans = algorithms::extract_extending_graph(&align_graph, + unordered_map tail_trans = algorithms::extract_extending_graph(&align_graph, &tail_graph, target_length, begin_pos, @@ -6361,6 +6713,8 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap #ifdef debug_multipath_alignment cerr << "making " << num_alt_alns << " alignments of sequence: " << left_tail_sequence.sequence() << endl << "to left tail graph extracted from " << begin_pos << endl; +#endif +#ifdef debug_trace_tail_alignment tail_graph.for_each_handle([&](const handle_t& handle) { cerr << tail_graph.get_id(handle) << " " << tail_graph.get_sequence(handle) << endl; tail_graph.follow_edges(handle, false, [&](const handle_t& next) { @@ -6428,6 +6782,8 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap } #ifdef debug_multipath_alignment cerr << "made " << alt_alignments.size() << " tail alignments" << endl; +#endif +#ifdef debug_trace_tail_alignment for (size_t i = 0; i < alt_alignments.size(); ++i) { cerr << i << ": " << pb2json(alt_alignments[i]) << endl; } @@ -6480,17 +6836,16 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap } } } - #ifdef debug_multipath_alignment cerr << "made alignments for " << to_return[false].size() << " source PathNodes and " << to_return[true].size() << " sink PathNodes" << endl; if (sources != nullptr) { cerr << "Identified " << sources->size() << " source PathNodes" << endl; } #endif - // Return all the alignments, organized by tail and subpath return to_return; } + bool MultipathAlignmentGraph::empty() const { return path_nodes.empty(); @@ -6600,11 +6955,11 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap } } } - + void MultipathAlignmentGraph::to_dot(ostream& out, const Alignment* alignment) const { // We track the VG graph nodes we talked about already. - set mentioned_nodes; - set> mentioned_edges; + set mentioned_nodes; + set> mentioned_edges; out << "digraph graphname {" << endl; out << "rankdir=\"LR\";" << endl; @@ -6639,7 +6994,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap if (j != 0) { // We have a previous node in this segment of path. What is it? auto prev_id = path.mapping(j-1).position().node_id(); - pair edge_pair{prev_id, node_id}; + pair edge_pair{prev_id, node_id}; if (!mentioned_edges.count(edge_pair)) { // This graph edge needs to be made @@ -6653,7 +7008,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap out << "}" << endl; } - bool MultipathAlignmentGraph::into_cutting_snarl(id_t node_id, bool is_rev, + bool MultipathAlignmentGraph::into_cutting_snarl(nid_t node_id, bool is_rev, SnarlManager* snarl_manager, SnarlDistanceIndex* dist_index) const { if (dist_index) { @@ -6671,7 +7026,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap } } - vector> MultipathAlignmentGraph::get_connected_components() const { + vector> MultipathAlignmentGraph::get_connected_components() const { // Brea all the path_nodes into components using this union-find structures::UnionFind unionfind(path_nodes.size()); @@ -6686,13 +7041,13 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap } // Assemble the vectors of ids - vector> to_return; + vector> to_return; for (auto& component : unionfind.all_groups()) { // For each connected component // Make a set of vg graph IDs in it - set vg_ids; + set vg_ids; for (auto& path_node : component) { // For each path in the vg graph used by the component diff --git a/src/multipath_alignment_graph.hpp b/src/multipath_alignment_graph.hpp index 598353a5db..fb29d69d73 100644 --- a/src/multipath_alignment_graph.hpp +++ b/src/multipath_alignment_graph.hpp @@ -21,6 +21,10 @@ namespace vg { // TODO: put in MultipathAlignmentGraph namespace + + /// Represents a single component alignment between part of a string and a + /// path through the target graph, withing a MultipathAlignmentGraph. Also + /// holds edge data for which other PathNodes can succeed it. class PathNode { public: string::const_iterator begin; @@ -40,19 +44,19 @@ namespace vg { /// is needed to construct the MultipathAlignmentGraph, and to perform /// some other operations on it, but is big enough that it is worth not /// making it a member. - static unordered_multimap> create_injection_trans(const unordered_map>& projection_trans); + static unordered_multimap> create_injection_trans(const unordered_map>& projection_trans); /// Create the constant injection translation data from a function instead /// of a map - static unordered_multimap> create_injection_trans(const HandleGraph& graph, - const function(id_t)>& project); + static unordered_multimap> create_injection_trans(const HandleGraph& graph, + const function(nid_t)>& project); /// Create an identity projection translation from a DAG that did not /// need to be modified during dagification. - static unordered_map> create_identity_projection_trans(const HandleGraph& graph); + static unordered_map> create_identity_projection_trans(const HandleGraph& graph); /// Create a lambda function that projects using a map that projects - static function(id_t)> create_projector(const unordered_map>& projection_trans); + static function(nid_t)> create_projector(const unordered_map>& projection_trans); /// Construct a graph of the reachability between MEMs in a DAG-ified /// graph. If a GCSA is specified, use it to collapse MEMs whose @@ -62,15 +66,15 @@ namespace vg { /// read interval. /// If a hit fails to be walked ouut in the graph, it is removed from hits. MultipathAlignmentGraph(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, - const function(id_t)>& project, - const unordered_multimap>& injection_trans, + const function(nid_t)>& project, + const unordered_multimap>& injection_trans, vector& path_node_provenance, size_t max_branch_trim_length = 0, gcsa::GCSA* gcsa = nullptr, const MultipathMapper::match_fanouts_t* fanout_breaks = nullptr); /// Same as the previous constructor, but construct injection_trans implicitly and temporarily. MultipathAlignmentGraph(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, - const unordered_map>& projection_trans, + const unordered_map>& projection_trans, vector& path_node_provenance, size_t max_branch_trim_length = 0, gcsa::GCSA* gcsa = nullptr, const MultipathMapper::match_fanouts_t* fanout_breaks = nullptr); @@ -78,7 +82,7 @@ namespace vg { /// Same as the previous constructor, but construct injection_trans implicitly and temporarily /// using a lambda for a projector MultipathAlignmentGraph(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, - const function(id_t)>& project, + const function(nid_t)>& project, vector& path_node_provenance, size_t max_branch_trim_length = 0, gcsa::GCSA* gcsa = nullptr, const MultipathMapper::match_fanouts_t* fanout_breaks = nullptr); @@ -86,38 +90,38 @@ namespace vg { /// Construct a graph of the reachability between aligned chunks in a linearized /// path graph. Produces a graph with reachability edges. MultipathAlignmentGraph(const HandleGraph& graph, const vector, path_t>>& path_chunks, - const Alignment& alignment, const function(id_t)>& project, - const unordered_multimap>& injection_trans, bool realign_Ns = true, + const Alignment& alignment, const function(nid_t)>& project, + const unordered_multimap>& injection_trans, bool realign_Ns = true, bool preserve_tail_anchors = false, vector* path_node_provenance = nullptr); /// Same as the previous constructor, but construct injection_trans implicitly and temporarily MultipathAlignmentGraph(const HandleGraph& graph, const vector, path_t>>& path_chunks, - const Alignment& alignment, const unordered_map>& projection_trans, bool realign_Ns = true, + const Alignment& alignment, const unordered_map>& projection_trans, bool realign_Ns = true, bool preserve_tail_anchors = false, vector* path_node_provenance = nullptr); /// Same as the previous constructor, but construct injection_trans implicitly and temporarily /// and using a lambda for a projector MultipathAlignmentGraph(const HandleGraph& graph, const vector, path_t>>& path_chunks, - const Alignment& alignment, const function(id_t)>& project, bool realign_Ns = true, + const Alignment& alignment, const function(nid_t)>& project, bool realign_Ns = true, bool preserve_tail_anchors = false, vector* path_node_provenance = nullptr); /// Make a multipath alignment graph using the path of a single-path alignment. Only /// one of snarl_manager and dist_index need be supplied. MultipathAlignmentGraph(const HandleGraph& graph, const Alignment& alignment, SnarlManager* snarl_manager, SnarlDistanceIndex* dist_index, size_t max_snarl_cut_size, - const function(id_t)>& project, - const unordered_multimap>& injection_trans); + const function(nid_t)>& project, + const unordered_multimap>& injection_trans); /// Same as the previous constructor, but construct injection_trans implicitly and temporarily MultipathAlignmentGraph(const HandleGraph& graph, const Alignment& alignment, SnarlManager* snarl_manager, SnarlDistanceIndex* dist_index, size_t max_snarl_cut_size, - const unordered_map>& projection_trans); + const unordered_map>& projection_trans); /// Same as the previous constructor, but construct injection_trans implicitly and temporarily /// using a function instead of a map MultipathAlignmentGraph(const HandleGraph& graph, const Alignment& alignment, SnarlManager* snarl_manager, SnarlDistanceIndex* dist_index, size_t max_snarl_cut_size, - const function(id_t)>& project); + const function(nid_t)>& project); ~MultipathAlignmentGraph(); @@ -145,13 +149,6 @@ namespace vg { double max_suboptimal_score_ratio, const vector& topological_order, vector& path_node_provenance); - /// Clear reachability edges, so that add_reachability_edges can be run - /// (possibly after modifying the graph). - void clear_reachability_edges(); - - /// Get the number of reachability edges in the graph. - size_t count_reachability_edges() const; - /// Remove the ends of paths, up to a maximum length, if they cause the path /// to extend past a branch point in the graph. void trim_to_branch_points(const HandleGraph* graph, size_t max_trim_length = 1); @@ -161,7 +158,7 @@ namespace vg { /// cut size. Snarls can be stored either in a SnarlManager or a /// SnarlDistanceIndex (only one need be supplied). void resect_snarls_from_paths(SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, - const function(id_t)>& project, int64_t max_snarl_cut_size = 5); + const function(nid_t)>& project, int64_t max_snarl_cut_size = 5); /// Do some exploratory alignments of the tails of the graph, outside /// the outermost existing anchors, and define new anchoring paths from @@ -177,12 +174,32 @@ namespace vg { size_t min_anchor_size, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier); - /// Add edges between reachable nodes and split nodes at overlaps + /// Add edges between reachable nodes and split nodes at overlaps. + /// + /// On completion, all overlap-colinear MEMs have been broken up into + /// colinear MEMs, and all colinear MEMs have been connected with + /// edges. + /// + /// MEMs are "colinear" if the first comes before the second in both + /// the read and the graph, and there is no intervening MEM such that + /// the first and intervening MEM, and the intervening and second MEM, + /// are colinear. + /// + /// MEMs are "overlap-colinear" if a suffix of the first MEM's + /// alignment is a prefix of the second MEM's alignment. TODO: Is that + /// true or does it also cover containment? void add_reachability_edges(const HandleGraph& vg, - const function(id_t)>& project, - const unordered_multimap>& injection_trans, + const function(nid_t)>& project, + const unordered_multimap>& injection_trans, vector* path_node_provenance = nullptr); - + + /// Clear reachability edges, so that add_reachability_edges can be run + /// (possibly after modifying the graph). + void clear_reachability_edges(); + + /// Get the number of reachability edges in the graph. + size_t count_reachability_edges() const; + /// Do intervening and tail alignments between the anchoring paths and /// store the result in a multipath_alignment_t. Reachability edges must /// be in the graph. The Alignment passed *must* be the same Alignment @@ -201,7 +218,7 @@ namespace vg { bool score_anchors_as_matches, size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length, bool simplify_topologies, size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, - SnarlDistanceIndex* dist_index = nullptr, const function(id_t)>* project = nullptr, + SnarlDistanceIndex* dist_index = nullptr, const function(nid_t)>* project = nullptr, bool allow_negative_scores = false, bool align_in_reverse = false, uint64_t max_band_cells = std::numeric_limits::max()); @@ -226,7 +243,7 @@ namespace vg { size_t max_tail_length, bool simplify_topologies, size_t unmergeable_len, const function& band_padding_function, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, - SnarlDistanceIndex* dist_index = nullptr, const function(id_t)>* project = nullptr, + SnarlDistanceIndex* dist_index = nullptr, const function(nid_t)>* project = nullptr, bool allow_negative_scores = false, bool align_in_reverse = false, uint64_t max_band_cells = std::numeric_limits::max()); @@ -235,7 +252,7 @@ namespace vg { void to_dot(ostream& out, const Alignment* alignment = nullptr) const; /// Get lists of the vg node IDs that participate in each connected component in the MultipathAlignmentGraph - vector> get_connected_components() const; + vector> get_connected_components() const; /// Does the multipath alignment graph have any nodes? bool empty() const; @@ -247,8 +264,12 @@ namespace vg { size_t max_shift() const; void prune_high_shift_edges(size_t prune_diff, bool prohibit_new_sources, bool prohibit_new_sinks); - + protected: + + /// Default constructor for unit testing. + /// After using it, you need to create_match_nodes() and add_reachability_edges() yourself. + MultipathAlignmentGraph() = default; /// Nodes representing walked MEMs in the graph vector path_nodes; @@ -260,6 +281,9 @@ namespace vg { /// If this is set and you want it unset, use clear_reachability_edges(). /// If this is unset and you want it set, use add_reachability_edges(). bool has_reachability_edges = false; + + /// Field for tracking amount of work done during construction, for testability. + size_t build_queue_count = 0; /// Trim down the given PathNode of everything except softclips. /// Return true if it all gets trimmed away and should be removed. @@ -274,14 +298,14 @@ namespace vg { /// Add the path chunks as nodes to the connectivity graph void create_path_chunk_nodes(const HandleGraph& graph, const vector, path_t>>& path_chunks, - const Alignment& alignment, const function(id_t)>& project, - const unordered_multimap>& injection_trans, + const Alignment& alignment, const function(nid_t)>& project, + const unordered_multimap>& injection_trans, vector* path_node_provenance = nullptr); /// Walk out MEMs into match nodes and filter out redundant sub-MEMs void create_match_nodes(const HandleGraph& graph, MultipathMapper::memcluster_t& hits, - const function(id_t)>& project, - const unordered_multimap>& injection_trans, + const function(nid_t)>& project, + const unordered_multimap>& injection_trans, vector& path_node_provenance, int64_t max_branch_trim_length, const MultipathMapper::match_fanouts_t* fanout_breaks); @@ -289,6 +313,56 @@ namespace vg { /// If path nodes partially overlap, merge the sections that overlap into a single path node void merge_partially_redundant_match_nodes(const unordered_map>& node_matches, vector& path_node_provenance); + + + /// Add reachability edges and split and connect overlap-colinear MEMs + /// in the case where the underlying target graph is a stick. + /// + /// Use a stick-specific algorithm to detect colinear and + /// overlap-colinear MEMs. + /// + /// Note that this may produce some transitive edges, which may be + /// different than the ones the general algorithm produces. + /// + /// topological_order gives a topological order of the target graph + void add_reachability_edges_easy(const HandleGraph& vg, + const function(nid_t)>& project, + const unordered_multimap>& injection_trans, + const vector& topological_order, + vector* path_node_provenance = nullptr); + + /// Add reachability edges and split and connect overlap-cominear MEMs + /// in the general case. + /// + /// Note that this may produce some transitive edges. + /// + /// topological_order gives a topological order of the target graph + void add_reachability_edges_general(const HandleGraph& vg, + const function(nid_t)>& project, + const unordered_multimap>& injection_trans, + const vector& topological_order, + vector* path_node_provenance = nullptr); + + /// Split path_nodes nodes at overlaps. + /// + /// confirmed_overlaps is a map from index_from to maps of index_onto + /// to (overlap to length, overlap from length, dist). This means that + /// it is keyed by source path_nodes index for the earlier MEM, and + /// then destination path_nodes index for the later MEM. The values are + /// the amount of overlaps between the alignment fragments in the read, + /// then in the graph, and then the path length for the overlap edge. + /// + /// TODO: what is that really? + /// + /// The path length for the overlap edge is 0 when the MEMs are + /// "overlap colinear", where a suffix of the first is a prefix of the + /// second. + /// + /// If path_node_provenance is set, it will be updated as path_nodes is + /// modified, to maintain provenance information about which value in + /// path_nodes came from what. + void split_at_overlap_edges(const unordered_map>>& confirmed_overlaps, + vector* path_node_provenance = nullptr); void jitter_homopolymer_ends(const HandleGraph& graph, vector& path_node_provenance, @@ -310,12 +384,12 @@ namespace vg { int64_t pessimistic_tail_gap(int64_t tail_length, double multiplier); /// Returns true if we're pointing into a snarl that we want to cut out of paths - bool into_cutting_snarl(id_t node_id, bool is_rev, + bool into_cutting_snarl(nid_t node_id, bool is_rev, SnarlManager* snarl_manager, SnarlDistanceIndex* dist_index) const; /// Returns the intervals of the path that lie inside of snarls vector> get_cut_segments(path_t& path, SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, - const function(id_t)>& project, int64_t max_snarl_cut_size) const; + const function(nid_t)>& project, int64_t max_snarl_cut_size) const; /// Generate alignments of the tails of the query sequence, beyond the /// sources and sinks. The Alignment passed *must* be the one that owns @@ -378,7 +452,7 @@ namespace vg { const GSSWAligner* aligner, SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr, - const function(id_t)>* project = nullptr); + const function(nid_t)>* project = nullptr); /// Memo for the transcendental pessimistic tail gap function (thread local to maintain thread-safety) static thread_local unordered_map> pessimistic_tail_gap_memo; diff --git a/src/path.cpp b/src/path.cpp index 83188da55c..a1b9c1a4da 100644 --- a/src/path.cpp +++ b/src/path.cpp @@ -2233,6 +2233,130 @@ pair cut_path(const Path& path, size_t offset) { return make_pair(p1, p2); } +// cut a path_mapping_t at a to_length offset +pair cut_mapping(const path_mapping_t& m, size_t offset) { + path_mapping_t left, right; + + // both result mappings will be in the same orientation as the original + left.mutable_position()->set_is_reverse(m.position().is_reverse()); + right.mutable_position()->set_is_reverse(m.position().is_reverse()); + + // left always has the position of the input mapping + *left.mutable_position() = m.position(); + + // nothing to cut + if (offset == 0) { + // we will get a 0-length left + right = m; + } else if (offset >= mapping_to_length(m)) { + // or a 0-length right + left = m; + } else { + // we need to cut the mapping + + // find the cut point and build the two mappings + size_t seen = 0; + size_t j = 0; + // loop over those before our position + for ( ; j < m.edit_size() && seen < offset; ++j) { + const edit_t& e = m.edit(j); + if (seen + e.to_length() > offset) { + // we need to divide this edit + // Convert to protobuf, cut, then convert back + Edit proto_edit; + to_proto_edit(e, proto_edit); + auto proto_edits = cut_edit_at_to(proto_edit, seen + e.to_length() - offset); + from_proto_edit(proto_edits.first, *left.add_edit()); + from_proto_edit(proto_edits.second, *right.add_edit()); + } else { + // this would be the last edit before the target position + *left.add_edit() = e; + } + seen += e.to_length(); + } + // now we add to the second path + for ( ; j < m.edit_size(); ++j) { + *right.add_edit() = m.edit(j); + } + } + // The right mapping has a position on this same node + right.mutable_position()->set_node_id(m.position().node_id()); + right.mutable_position()->set_offset(left.position().offset() + mapping_from_length(left)); + + return make_pair(left, right); +} + +// divide the path_t at a path-relative offset as measured in to_length from start +pair cut_path(const path_t& path, size_t offset) { + path_t p1, p2; + size_t seen = 0; + size_t i = 0; + if (!path.mapping_size()) { + return make_pair(path, path); + } + + // seek forward to the cut point + for ( ; i < path.mapping_size() && seen < offset; ++i) { + const path_mapping_t& m = path.mapping(i); + // the position is in this node, so make the cut + if (seen + mapping_to_length(m) == offset) { + *p1.add_mapping() = m; + } else if (seen + mapping_to_length(m) > offset) { + auto mappings = cut_mapping(m, offset - seen); + // and save the cuts + *p1.add_mapping() = mappings.first; + *p2.add_mapping() = mappings.second; + ++i; // we don't increment our mapping index when we break here + seen += mapping_to_length(m); // same problem + break; + } else { + // otherwise keep adding the mappings onto our first path + *p1.add_mapping() = m; + } + seen += mapping_to_length(m); + } + + // add in the rest of the edits + for ( ; i < path.mapping_size(); ++i) { + const path_mapping_t& m = path.mapping(i); + *p2.add_mapping() = m; + } + + return make_pair(p1, p2); +} + +path_t extract_subpath(const path_t& path, size_t start_offset, size_t end_offset) { + // Handle edge cases + if (start_offset >= end_offset) { + return path_t(); // return empty path + } + + size_t path_len = path_to_length(path); + if (start_offset >= path_len) { + return path_t(); // return empty path + } + + // Clamp end_offset to path length + if (end_offset > path_len) { + end_offset = path_len; + } + + // If we're extracting the whole path, just return it + if (start_offset == 0 && end_offset == path_len) { + return path; + } + + // Cut at start to get everything from start onward + auto first_cut = cut_path(path, start_offset); + path_t suffix = first_cut.second; + + // Cut the suffix at (end_offset - start_offset) to get just the middle part + size_t middle_length = end_offset - start_offset; + auto second_cut = cut_path(suffix, middle_length); + + return second_cut.first; +} + bool maps_to_node(const Path& p, id_t id) { for (size_t i = 0; i < p.mapping_size(); ++i) { if (p.mapping(i).position().node_id() == id) return true; diff --git a/src/path.hpp b/src/path.hpp index 57e0f76a93..72fb603c01 100644 --- a/src/path.hpp +++ b/src/path.hpp @@ -558,6 +558,12 @@ pos_t final_position(const path_t& path); int corresponding_to_length(const path_t& path, int from_length, bool from_end); int corresponding_from_length(const path_t& path, int to_length, bool from_end); +pair cut_path(const path_t& path, size_t offset); +pair cut_mapping(const path_mapping_t& m, size_t offset); +/// Extract a subpath from start_offset to end_offset (measured in to_length, +/// along the read side) +path_t extract_subpath(const path_t& path, size_t start_offset, size_t end_offset); + string debug_string(const path_t& path); string debug_string(const path_mapping_t& mapping); string debug_string(const edit_t& edit); diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index 889a6d607b..1bc0a07a78 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -26,6 +26,7 @@ #include "../multipath_alignment_emitter.hpp" #include "../crash.hpp" #include "../watchdog.hpp" +#include "../explainer.hpp" using namespace std; @@ -57,6 +58,8 @@ void help_surject(char** argv) { << " -l, --subpath-local let the multipath mapping surjection produce local" << endl << " (rather than global) alignments" << endl << " -T, --max-tail-len N only align up to N bases of read tails [10000]" << endl + << " -e, --max-tail-cells N only fill up to N alignment matrix cells to align tails" << endl + << " (default: " << Surjector::DEFAULT_MAX_TAIL_CELLS << ")" << endl << " -g, --max-graph-scale X make reads unmapped if alignment target subgraph" << endl << " size exceeds read length by a factor of X " << endl << " (default: " << Surjector::DEFAULT_SUBGRAPH_LIMIT @@ -160,7 +163,8 @@ int main_surject(int argc, char** argv) { int64_t min_splice_length = 20; size_t watchdog_timeout = 10; bool subpath_global = true; // force full length alignments in mpmap resolution - size_t max_tail_len = 10000; + size_t max_tail_length = 10000; + uint64_t max_tail_cells = Surjector::DEFAULT_MAX_TAIL_CELLS; // This needs to be nullable so that we can use the default for spliced if doing spliced mode. std::unique_ptr max_graph_scale; bool qual_adj = false; @@ -190,6 +194,7 @@ int main_surject(int argc, char** argv) { {"ref-sample", required_argument, 0, 'n'}, // Provide an alias to match Giraffe {"subpath-local", no_argument, 0, 'l'}, {"max-tail-len", required_argument, 0, 'T'}, + {"max-tail-cells", required_argument, 0, 'e'}, {"max-graph-scale", required_argument, 0, 'g'}, {"interleaved", no_argument, 0, 'i'}, {"multimap", no_argument, 0, 'M'}, @@ -218,7 +223,7 @@ int main_surject(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "h?x:p:F:n:lT:g:iGmcbsuN:R:f:C:t:SPI:a:AE:LHMVw:r", + c = getopt_long (argc, argv, "h?x:p:F:n:lT:e:g:iGmcbsuN:R:f:C:t:SPI:a:AE:LHMVw:r", long_options, &option_index); // Detect the end of the options. @@ -249,7 +254,11 @@ int main_surject(int argc, char** argv) { break; case 'T': - max_tail_len = parse(optarg); + max_tail_length = parse(optarg); + break; + + case 'e': + max_tail_cells = parse(optarg); break; case 'g': @@ -364,6 +373,8 @@ int main_surject(int argc, char** argv) { } } + Explainer::save_explanations = true; + string file_name = get_input_file_name(optind, argc, argv); if (have_input_file(optind, argc, argv)) { @@ -449,7 +460,8 @@ int main_surject(int argc, char** argv) { else { surjector.min_splice_length = numeric_limits::max(); } - surjector.max_tail_length = max_tail_len; + surjector.max_tail_length = max_tail_length; + surjector.max_tail_cells = max_tail_cells; surjector.annotate_with_all_path_scores = annotate_with_all_path_scores; surjector.annotate_with_graph_alignment = annotate_with_graph_alignment; if (max_graph_scale) { diff --git a/src/surjector.hpp b/src/surjector.hpp index 552fb504ad..9564b7c99b 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -134,6 +134,11 @@ using namespace std; /// the maximum length of a tail that we will try to align size_t max_tail_length = 10000; + + /// An accessible default max_tail_cells + static constexpr double DEFAULT_MAX_TAIL_CELLS = 25000000000; + /// The maximum number of cells that we are willing to fill when aligning a tail + uint64_t max_tail_cells = DEFAULT_MAX_TAIL_CELLS; // the maximum number of estimated band cells that we are willing to try to fill when connecting anchors uint64_t max_band_cells = 8000000000; @@ -144,7 +149,7 @@ using namespace std; static constexpr double DEFAULT_SUBGRAPH_LIMIT = 100 * 1024 / 125.0; /// How big of a graph (in graph bases per read base) should we ever try to align against for realigning surjection? double max_subgraph_bases_per_read_base = DEFAULT_SUBGRAPH_LIMIT; - /// Don't refuse to align (graph size) * (read size) is at least this size (overrides max_subgraph_bases_per_read_base) + /// Don't refuse to align unless (graph size) * (read size) is at least this size (overrides max_subgraph_bases_per_read_base) int64_t min_absolute_align_size_to_refuse = 1024; /// in spliced surject, downsample if the base-wise average coverage by chunks is this high diff --git a/src/unittest/multipath_alignment_graph.cpp b/src/unittest/multipath_alignment_graph.cpp index c3ed8c568e..1a235f09dd 100644 --- a/src/unittest/multipath_alignment_graph.cpp +++ b/src/unittest/multipath_alignment_graph.cpp @@ -19,7 +19,13 @@ namespace unittest { class TestMultipathAlignmentGraph : public MultipathAlignmentGraph { public: + using MultipathAlignmentGraph::MultipathAlignmentGraph; using MultipathAlignmentGraph::decompose_alignments; + using MultipathAlignmentGraph::build_queue_count; + using MultipathAlignmentGraph::add_reachability_edges_easy; + using MultipathAlignmentGraph::add_reachability_edges_general; + using MultipathAlignmentGraph::create_match_nodes; + using MultipathAlignmentGraph::path_nodes; }; TEST_CASE( "MultipathAlignmentGraph::align handles tails correctly", "[multipath][mapping][multipathalignmentgraph]" ) { @@ -763,5 +769,213 @@ TEST_CASE("Tail alignments can be decomposed", "[multipathalignmentgraph]") { // TODO: should check the unshared blocks, but i'm too lazy } +TEST_CASE( "MultipathAlignmentGraph construction doesn't have exploding noncolinear shells", "[multipath][mapping][multipathalignmentgraph]" ) { + + size_t scale = 50; + + // Generate a stick graph + HashGraph graph; + handle_t prev; + for (size_t i = 0; i < scale; i++) { + handle_t h = graph.create_handle("A"); + if (i > 0) { + graph.create_edge(prev, h); + } + prev = h; + } + + // Make snarls on it + CactusSnarlFinder bubble_finder(graph); + IntegratedSnarlFinder snarl_finder(graph); + SnarlManager snarl_manager = bubble_finder.find_snarls(); + SnarlDistanceIndex dist_index; + fill_in_distance_index(&dist_index, &graph, &snarl_finder); + + // We need a fake read + string read(scale, 'A'); + + // Pack it into an Alignment. + // Note that we need to use the Alignment's copy for getting iterators for the MEMs. + Alignment query; + query.set_sequence(read); + + // Make an Aligner to use for the actual aligning and the scores + Aligner aligner; + + // Make an identity projection translation + auto identity = MultipathAlignmentGraph::create_identity_projection_trans(graph); + + // Make up a fake MEM + // GCSA range_type is just a pair of [start, end], so we can fake them. + + // This will actually own the MEMs (so they don't move) + std::list mems; + + // This will hold our MEMs and their start positions in the imaginary graph. + // Note that this is also a memcluster_t + pair>, double> mem_hits; + mem_hits.second = 1.0; + + for (size_t i = 0; i < scale; i += 5) { + auto query_start = query.sequence().begin() + (i % (scale)); + auto query_end = query_start + 1; + // Make a MEM in the read of 1 bp, all across the length + mems.emplace_back(query_start, query_end, make_pair(1, 1), 1); + // Put it on a node's forward strand, doubled up over the first half as if the read loops + mem_hits.first.emplace_back(&mems.back(), make_pos_t(1 + (i % (scale / 4)), false, 0)); + } + + // Make the MultipathAlignmentGraph to test + vector provenance; + TestMultipathAlignmentGraph mpg(graph, mem_hits, identity, provenance); + + // Make sure we didn't do huge amounts of work + REQUIRE(mpg.build_queue_count < (scale / 5 * 2)); + +} + +TEST_CASE( "MultipathAlignmentGraph easy and general reachability edge construction produce the same non-transitive edges", "[multipath][mapping][multipathalignmentgraph]" ) { + + size_t scale = 250; + + // Generate a stick graph + HashGraph graph; + handle_t prev; + for (size_t i = 0; i < scale; i++) { + handle_t h = graph.create_handle("A"); + if (i > 0) { + graph.create_edge(prev, h); + } + prev = h; + } + + // Make snarls on it + CactusSnarlFinder bubble_finder(graph); + IntegratedSnarlFinder snarl_finder(graph); + SnarlManager snarl_manager = bubble_finder.find_snarls(); + SnarlDistanceIndex dist_index; + fill_in_distance_index(&dist_index, &graph, &snarl_finder); + + // We need a fake read + string read(scale, 'A'); + + // Pack it into an Alignment. + // Note that we need to use the Alignment's copy for getting iterators for the MEMs. + Alignment query; + query.set_sequence(read); + + // Make an Aligner to use for the actual aligning and the scores + Aligner aligner; + + // Make an identity projection translation + auto identity_trans = MultipathAlignmentGraph::create_identity_projection_trans(graph); + auto identity = MultipathAlignmentGraph::create_projector(identity_trans); + + // Make up a fake MEM + // GCSA range_type is just a pair of [start, end], so we can fake them. + + // This will actually own the MEMs (so they don't move) + std::list mems; + + // This will hold our MEMs and their start positions in the imaginary graph. + // Note that this is also a memcluster_t + pair>, double> mem_hits; + mem_hits.second = 1.0; + + for (size_t i = 0; i < scale; i += 5) { + auto query_start = query.sequence().begin() + (i % (scale)); + auto query_end = query_start + 1; + // Make a MEM in the read of 1 bp, all across the length + mems.emplace_back(query_start, query_end, make_pair(1, 1), 1); + // Put it on a node's forward strand, doubled up over the first half as if the read loops + mem_hits.first.emplace_back(&mems.back(), make_pos_t(1 + (i % (scale / 4)), false, 0)); + } + + auto injection_trans = TestMultipathAlignmentGraph::create_injection_trans(identity_trans); + + vector topological_order = handlealgs::lazier_topological_order(&graph); + + // Create two identical MultipathAlignmentGraphs + TestMultipathAlignmentGraph mpg_easy; + TestMultipathAlignmentGraph mpg_general; + + vector provenance_easy; + mpg_easy.create_match_nodes(graph, mem_hits, identity, injection_trans, provenance_easy, 0, nullptr); + + vector provenance_general; + mpg_general.create_match_nodes(graph, mem_hits, identity, injection_trans, provenance_easy, 0, nullptr); + +#ifdef debug + std::cerr << "Computing easy result" << std::endl; +#endif + + // Add reachability edges using the easy method + mpg_easy.add_reachability_edges_easy(graph, identity, injection_trans, topological_order, &provenance_easy); + +#ifdef debug + std::cerr << "Computing general result" << std::endl; +#endif + + // Add reachability edges using the general method + mpg_general.add_reachability_edges_general(graph, identity, injection_trans, topological_order, &provenance_general); + + // Neityher method guarantees not to produce transitive edges, and they + // might produce different transitive edges. So make sure to remove + // transitive edges. This needs a topological order of MEMs. + std::vector mem_order_general; + mpg_general.topological_sort(mem_order_general); + mpg_general.remove_transitive_edges(mem_order_general); + + std::vector mem_order_easy; + mpg_easy.topological_sort(mem_order_easy); + mpg_easy.remove_transitive_edges(mem_order_easy); + +#ifdef debug + std::cerr << "Easy Result" << std::endl; + mpg_easy.to_dot(std::cerr, &query); + + std::cerr << "General Result" << std::endl; + mpg_general.to_dot(std::cerr, &query); +#endif + + // Compare the number of path nodes (in case splitting happened differently) + REQUIRE(mpg_easy.path_nodes.size() == mpg_general.path_nodes.size()); + + // Compare the number of edges + size_t easy_edge_count = mpg_easy.count_reachability_edges(); + size_t general_edge_count = mpg_general.count_reachability_edges(); + + REQUIRE(easy_edge_count == general_edge_count); + + // Compare the edges for each path node + for (size_t i = 0; i < mpg_easy.path_nodes.size(); i++) { + auto& easy_node = mpg_easy.path_nodes[i]; + auto& general_node = mpg_general.path_nodes[i]; + + // Same number of edges + REQUIRE(easy_node.edges.size() == general_node.edges.size()); + + // Sort edges for comparison, to allow different orders + auto easy_edges = easy_node.edges; + auto general_edges = general_node.edges; + + sort(easy_edges.begin(), easy_edges.end()); + sort(general_edges.begin(), general_edges.end()); + + // Compare each edge + for (size_t j = 0; j < easy_edges.size(); j++) { + REQUIRE(easy_edges[j].first == general_edges[j].first); // target node + REQUIRE(easy_edges[j].second == general_edges[j].second); // distance + } + + // Compare read intervals + REQUIRE(easy_node.begin == general_node.begin); + REQUIRE(easy_node.end == general_node.end); + + // Compare paths + REQUIRE(easy_node.path == general_node.path); + } +} + } }