From b52fcd2f3fbd33bb3469698254803582b0cb2659 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 12 Feb 2026 13:32:33 +0000 Subject: [PATCH 1/5] WIP faster topsort --- mypy/graph_utils.py | 54 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/mypy/graph_utils.py b/mypy/graph_utils.py index 154efcef48a93..1e52226612582 100644 --- a/mypy/graph_utils.py +++ b/mypy/graph_utils.py @@ -115,3 +115,57 @@ def topsort(data: dict[T, set[T]]) -> Iterable[set[T]]: yield ready data = {item: (dep - ready) for item, dep in data.items() if item not in ready} assert not data, f"A cyclic dependency exists amongst {data!r}" + + +def topsort2(data: dict[T, set[T]]) -> Iterable[set[T]]: + """Topological sort using Kahn's algorithm. + + This is functionally equivalent to topsort() but avoids rebuilding + the full dict and set objects on each iteration. Instead it uses + in-degree counters and a reverse adjacency list, so the total work + is O(V + E) rather than O(depth * V). + + Args: + data: A map from vertices to all vertices that it has an edge + connecting it to. NOTE: This data structure + is modified in place -- for normalization purposes, + self-dependencies are removed and entries representing + orphans are added. + + Returns: + An iterator yielding sets of vertices that have an equivalent + ordering. + """ + for k, v in data.items(): + v.discard(k) # Ignore self dependencies. + for item in set.union(*data.values()) - set(data.keys()): + data[item] = set() + + # Build reverse adjacency list and in-degree counts. + in_degree: dict[T, int] = {} + rev: dict[T, list[T]] = {} + for item in data: + in_degree[item] = len(data[item]) + rev[item] = [] + for item, deps in data.items(): + for dep in deps: + rev[dep].append(item) + + ready = {item for item, deg in in_degree.items() if deg == 0} + remaining = len(in_degree) - len(ready) + + while ready: + yield ready + new_ready: set[T] = set() + for item in ready: + for dependent in rev[item]: + in_degree[dependent] -= 1 + if in_degree[dependent] == 0: + new_ready.add(dependent) + remaining -= len(new_ready) + ready = new_ready + + assert remaining == 0, ( + f"A cyclic dependency exists amongst " + f"{[k for k, deg in in_degree.items() if deg > 0]!r}" + ) From 6f1249a7387573c2a366a31fbd6e37bae2821246 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 12 Feb 2026 13:51:15 +0000 Subject: [PATCH 2/5] Switch to a class instead of iterator function --- mypy/graph_utils.py | 77 ++++++++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/mypy/graph_utils.py b/mypy/graph_utils.py index 1e52226612582..6d87c52298a3e 100644 --- a/mypy/graph_utils.py +++ b/mypy/graph_utils.py @@ -117,7 +117,7 @@ def topsort(data: dict[T, set[T]]) -> Iterable[set[T]]: assert not data, f"A cyclic dependency exists amongst {data!r}" -def topsort2(data: dict[T, set[T]]) -> Iterable[set[T]]: +class topsort2(Iterator[set[T]]): """Topological sort using Kahn's algorithm. This is functionally equivalent to topsort() but avoids rebuilding @@ -125,47 +125,58 @@ def topsort2(data: dict[T, set[T]]) -> Iterable[set[T]]: in-degree counters and a reverse adjacency list, so the total work is O(V + E) rather than O(depth * V). + Implemented as a class rather than a generator for better mypyc + compilation. + Args: data: A map from vertices to all vertices that it has an edge connecting it to. NOTE: This data structure is modified in place -- for normalization purposes, self-dependencies are removed and entries representing orphans are added. - - Returns: - An iterator yielding sets of vertices that have an equivalent - ordering. """ - for k, v in data.items(): - v.discard(k) # Ignore self dependencies. - for item in set.union(*data.values()) - set(data.keys()): - data[item] = set() - - # Build reverse adjacency list and in-degree counts. - in_degree: dict[T, int] = {} - rev: dict[T, list[T]] = {} - for item in data: - in_degree[item] = len(data[item]) - rev[item] = [] - for item, deps in data.items(): - for dep in deps: - rev[dep].append(item) - - ready = {item for item, deg in in_degree.items() if deg == 0} - remaining = len(in_degree) - len(ready) - while ready: - yield ready + def __init__(self, data: dict[T, set[T]]) -> None: + for k, v in data.items(): + v.discard(k) # Ignore self dependencies. + for item in set.union(*data.values()) - set(data.keys()): + data[item] = set() + + # Build reverse adjacency list and in-degree counts. + in_degree: dict[T, int] = {} + rev: dict[T, list[T]] = {} + for item in data: + in_degree[item] = len(data[item]) + rev[item] = [] + for item, deps in data.items(): + for dep in deps: + rev[dep].append(item) + + self.in_degree = in_degree + self.rev = rev + self.ready = {item for item, deg in in_degree.items() if deg == 0} + self.remaining = len(in_degree) - len(self.ready) + + def __iter__(self) -> Iterator[set[T]]: + return self + + def __next__(self) -> set[T]: + ready = self.ready + if not ready: + assert self.remaining == 0, ( + f"A cyclic dependency exists amongst " + f"{[k for k, deg in self.in_degree.items() if deg > 0]!r}" + ) + raise StopIteration + in_degree = self.in_degree + rev = self.rev new_ready: set[T] = set() for item in ready: for dependent in rev[item]: - in_degree[dependent] -= 1 - if in_degree[dependent] == 0: + new_deg = in_degree[dependent] - 1 + in_degree[dependent] = new_deg + if new_deg == 0: new_ready.add(dependent) - remaining -= len(new_ready) - ready = new_ready - - assert remaining == 0, ( - f"A cyclic dependency exists amongst " - f"{[k for k, deg in in_degree.items() if deg > 0]!r}" - ) + self.remaining -= len(new_ready) + self.ready = new_ready + return ready From a9ecf42bfc43f90c53ef0b2dc6200a1d2954e335 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 12 Feb 2026 14:01:15 +0000 Subject: [PATCH 3/5] Optimize init into a single pass --- mypy/graph_utils.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/mypy/graph_utils.py b/mypy/graph_utils.py index 6d87c52298a3e..93c1b59e51c07 100644 --- a/mypy/graph_utils.py +++ b/mypy/graph_utils.py @@ -137,25 +137,33 @@ class topsort2(Iterator[set[T]]): """ def __init__(self, data: dict[T, set[T]]) -> None: - for k, v in data.items(): - v.discard(k) # Ignore self dependencies. - for item in set.union(*data.values()) - set(data.keys()): - data[item] = set() - - # Build reverse adjacency list and in-degree counts. + # Single pass: remove self-deps, build reverse adjacency list, + # compute in-degree counts, detect orphans, and find initial ready set. in_degree: dict[T, int] = {} rev: dict[T, list[T]] = {} - for item in data: - in_degree[item] = len(data[item]) - rev[item] = [] + ready: set[T] = set() for item, deps in data.items(): + deps.discard(item) # Ignore self dependencies. + deg = len(deps) + in_degree[item] = deg + if deg == 0: + ready.add(item) + if item not in rev: + rev[item] = [] for dep in deps: - rev[dep].append(item) + if dep in rev: + rev[dep].append(item) + else: + rev[dep] = [item] + if dep not in data: + # Orphan: appears as dependency but has no entry in data. + in_degree[dep] = 0 + ready.add(dep) self.in_degree = in_degree self.rev = rev - self.ready = {item for item, deg in in_degree.items() if deg == 0} - self.remaining = len(in_degree) - len(self.ready) + self.ready = ready + self.remaining = len(in_degree) - len(ready) def __iter__(self) -> Iterator[set[T]]: return self From c9d2e1ba5da50cca7a2dfa6f2614e8ca30caf294 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 12 Feb 2026 14:14:36 +0000 Subject: [PATCH 4/5] Add tests --- mypy/test/testgraph.py | 79 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 71 insertions(+), 8 deletions(-) diff --git a/mypy/test/testgraph.py b/mypy/test/testgraph.py index 29696c760b9c2..b1d4daf079815 100644 --- a/mypy/test/testgraph.py +++ b/mypy/test/testgraph.py @@ -8,7 +8,7 @@ from mypy.build import BuildManager, BuildSourceSet, State, order_ascc, sorted_components from mypy.errors import Errors from mypy.fscache import FileSystemCache -from mypy.graph_utils import strongly_connected_components, topsort +from mypy.graph_utils import strongly_connected_components, topsort, topsort2 from mypy.modulefinder import SearchPaths from mypy.options import Options from mypy.plugin import Plugin @@ -18,14 +18,77 @@ class GraphSuite(Suite): + def test_topsort_empty(self) -> None: + data: dict[AbstractSet[str], set[AbstractSet[str]]] = {} + assert_equal(list(topsort2(data)), []) + def test_topsort(self) -> None: - a = frozenset({"A"}) - b = frozenset({"B"}) - c = frozenset({"C"}) - d = frozenset({"D"}) - data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: {b, c}, b: {d}, c: {d}} - res = list(topsort(data)) - assert_equal(res, [{d}, {b, c}, {a}]) + for topsort_func in [topsort, topsort2]: + a = frozenset({"A"}) + b = frozenset({"B"}) + c = frozenset({"C"}) + d = frozenset({"D"}) + data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: {b, c}, b: {d}, c: {d}} + res = list(topsort_func(data)) + assert_equal(res, [{d}, {b, c}, {a}]) + + def test_topsort_orphan(self) -> None: + for topsort_func in [topsort, topsort2]: + a = frozenset({"A"}) + b = frozenset({"B"}) + data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: {b}} + res = list(topsort_func(data)) + assert_equal(res, [{b}, {a}]) + + def test_topsort_independent(self) -> None: + for topsort_func in [topsort, topsort2]: + a = frozenset({"A"}) + b = frozenset({"B"}) + c = frozenset({"C"}) + data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: set(), b: set(), c: set()} + res = list(topsort_func(data)) + assert_equal(res, [{a, b, c}]) + + def test_topsort_linear_chain(self) -> None: + for topsort_func in [topsort, topsort2]: + a = frozenset({"A"}) + b = frozenset({"B"}) + c = frozenset({"C"}) + d = frozenset({"D"}) + data: dict[AbstractSet[str], set[AbstractSet[str]]] = { + a: {b}, + b: {c}, + c: {d}, + d: set(), + } + res = list(topsort_func(data)) + assert_equal(res, [{d}, {c}, {b}, {a}]) + + def test_topsort_self_dependency(self) -> None: + for topsort_func in [topsort, topsort2]: + a = frozenset({"A"}) + b = frozenset({"B"}) + data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: {a, b}, b: set()} + res = list(topsort_func(data)) + assert_equal(res, [{b}, {a}]) + + def test_topsort_orphan_diamond(self) -> None: + for topsort_func in [topsort, topsort2]: + a = frozenset({"A"}) + b = frozenset({"B"}) + c = frozenset({"C"}) + # B and C are orphans -- they appear only in values, not as keys. + data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: {b, c}} + res = list(topsort_func(data)) + assert_equal(res, [{b, c}, {a}]) + + def test_topsort_cycle(self) -> None: + for topsort_func in [topsort, topsort2]: + a = frozenset({"A"}) + b = frozenset({"B"}) + data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: {b}, b: {a}} + with self.assertRaises(AssertionError): + list(topsort_func(data)) def test_scc(self) -> None: vertices = {"A", "B", "C", "D"} From 6d9890e66c82f046f92a8ed14b3f311633f55bc0 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 12 Feb 2026 14:16:50 +0000 Subject: [PATCH 5/5] Update code to use topsort2 --- mypy/build.py | 6 +++--- mypy/graph_utils.py | 2 +- mypy/solve.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 93180e1eed5e9..b7561a57770e1 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -94,7 +94,7 @@ ErrorTupleRaw, report_internal_error, ) -from mypy.graph_utils import prepare_sccs, strongly_connected_components, topsort +from mypy.graph_utils import prepare_sccs, strongly_connected_components, topsort2 from mypy.indirection import TypeIndirectionVisitor from mypy.ipc import BadStatus, IPCClient, IPCMessage, read_status, ready_to_read, receive, send from mypy.messages import MessageBuilder @@ -4236,7 +4236,7 @@ def sorted_components(graph: Graph) -> list[SCC]: scc_dep_map = prepare_sccs_full(strongly_connected_components(vertices, edges), edges) # Topsort. res = [] - for ready in topsort(scc_dep_map): + for ready in topsort2(scc_dep_map): # Sort the sets in ready by reversed smallest State.order. Examples: # # - If ready is [{x}, {y}], x.order == 1, y.order == 2, we get @@ -4271,7 +4271,7 @@ def sorted_components_inner( edges = {id: deps_filtered(graph, vertices, id, pri_max) for id in vertices} sccs = list(strongly_connected_components(vertices, edges)) res = [] - for ready in topsort(prepare_sccs(sccs, edges)): + for ready in topsort2(prepare_sccs(sccs, edges)): res.extend(sorted(ready, key=lambda scc: -min(graph[id].order for id in scc))) return res diff --git a/mypy/graph_utils.py b/mypy/graph_utils.py index 93c1b59e51c07..30d1660e4c0a5 100644 --- a/mypy/graph_utils.py +++ b/mypy/graph_utils.py @@ -117,7 +117,7 @@ def topsort(data: dict[T, set[T]]) -> Iterable[set[T]]: assert not data, f"A cyclic dependency exists amongst {data!r}" -class topsort2(Iterator[set[T]]): +class topsort2(Iterator[set[T]]): # noqa: N801 """Topological sort using Kahn's algorithm. This is functionally equivalent to topsort() but avoids rebuilding diff --git a/mypy/solve.py b/mypy/solve.py index e3709106996cd..57c002ff9b55c 100644 --- a/mypy/solve.py +++ b/mypy/solve.py @@ -8,7 +8,7 @@ from mypy.constraints import SUBTYPE_OF, SUPERTYPE_OF, Constraint, infer_constraints, neg_op from mypy.expandtype import expand_type -from mypy.graph_utils import prepare_sccs, strongly_connected_components, topsort +from mypy.graph_utils import prepare_sccs, strongly_connected_components, topsort2 from mypy.join import join_type_list from mypy.meet import meet_type_list, meet_types from mypy.subtypes import is_subtype @@ -147,7 +147,7 @@ def solve_with_dependent( sccs = list(strongly_connected_components(set(vars), dmap)) if not all(check_linear(scc, lowers, uppers) for scc in sccs): return {}, [] - raw_batches = list(topsort(prepare_sccs(sccs, dmap))) + raw_batches = list(topsort2(prepare_sccs(sccs, dmap))) free_vars = [] free_solutions = {}