Skip to content

Commit 2343f8e

Browse files
test and dLo999 committed
feat: generic package/module resolution for IMPORTS edges across 10 languages
Bare import specifiers (@myorg/pkg, github.com/foo/bar, use my_crate::foo) previously produced zero IMPORTS edges. This adds manifest-aware resolution during parallel extraction for JS/TS (package.json), Go (go.mod), Rust (Cargo.toml), Python (pyproject.toml), PHP (composer.json), Java (pom.xml), Gradle (build.gradle), Dart (pubspec.yaml), Elixir (mix.exs), and Ruby (*.gemspec). Workers parse manifest files during extraction with zero extra I/O, entries are merged into a hash table before registry build, and cbm_pipeline_resolve_module() provides unified resolution with prefix matching for Go/Java/PHP path-based specifiers. Closes #180. Based on #184 by dLo999 (JS/TS package map approach). Co-Authored-By: Dustin Obrecht <dustin@kurtnoble.com>
1 parent 9021c22 commit 2343f8e

6 files changed

Lines changed: 1042 additions & 26 deletions

File tree

Makefile.cbm

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,8 @@ PIPELINE_SRCS = \
194194
src/pipeline/pass_similarity.c \
195195
src/pipeline/pass_semantic_edges.c \
196196
src/pipeline/pass_cross_repo.c \
197-
src/pipeline/artifact.c
197+
src/pipeline/artifact.c \
198+
src/pipeline/pass_pkgmap.c
198199

199200
# SimHash / MinHash module
200201
SIMHASH_SRCS = src/simhash/minhash.c

src/pipeline/pass_definitions.c

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -304,13 +304,7 @@ static int create_import_edges_for_file(cbm_pipeline_ctx_t *ctx, const CBMFileRe
304304
continue;
305305
}
306306
char *target_qn = NULL;
307-
char *resolved = cbm_pipeline_resolve_relative_import(rel, imp->module_path);
308-
if (resolved) {
309-
target_qn = cbm_pipeline_fqn_module(ctx->project_name, resolved);
310-
free(resolved);
311-
} else {
312-
target_qn = cbm_pipeline_fqn_module(ctx->project_name, imp->module_path);
313-
}
307+
target_qn = cbm_pipeline_resolve_module(ctx, rel, imp->module_path);
314308
const cbm_gbuf_node_t *target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn);
315309
char *file_qn = cbm_pipeline_fqn_compute(ctx->project_name, rel, "__file__");
316310
const cbm_gbuf_node_t *source_node = cbm_gbuf_find_by_qn(ctx->gbuf, file_qn);

src/pipeline/pass_parallel.c

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,8 @@ typedef struct {
399399
_Atomic int64_t *shared_ids;
400400
_Atomic int *cancelled;
401401
_Atomic int next_file_idx;
402+
403+
cbm_pkg_entries_t *pkg_entries; /* per-worker manifest arrays (separate allocation) */
402404
} extract_ctx_t;
403405

404406
/* Insert one definition node (and its route if present) into the local gbuf. */
@@ -506,6 +508,13 @@ static void extract_worker(int worker_id, void *ctx_ptr) {
506508
* are released before the slab is bulk-reclaimed. */
507509
cbm_free_tree(result);
508510

511+
/* Detect and parse manifest files for package map */
512+
{
513+
const char *bn = strrchr(fi->rel_path, '/');
514+
cbm_pkgmap_try_parse(bn ? bn + SKIP_ONE : fi->rel_path, fi->rel_path, source,
515+
source_len, &ec->pkg_entries[worker_id]);
516+
}
517+
509518
/* Free source buffer — extraction captured everything needed. */
510519
free_source(source);
511520

@@ -539,6 +548,28 @@ static void extract_worker(int worker_id, void *ctx_ptr) {
539548
cbm_slab_destroy_thread();
540549
}
541550

551+
static void merge_pkg_entries(cbm_pipeline_ctx_t *ctx, cbm_pkg_entries_t *pkg_entries,
552+
int worker_count) {
553+
if (!pkg_entries) {
554+
return;
555+
}
556+
cbm_pipeline_set_pkgmap(cbm_pkgmap_build(pkg_entries, worker_count, ctx->project_name));
557+
for (int i = 0; i < worker_count; i++) {
558+
cbm_pkg_entries_free(&pkg_entries[i]);
559+
}
560+
free(pkg_entries);
561+
}
562+
563+
static void log_extract_mem_stats(int worker_count) {
564+
if (cbm_mem_budget() > 0) {
565+
size_t mb = (size_t)CBM_SZ_1K * CBM_SZ_1K;
566+
cbm_log_info("parallel.extract.mem", "rss_mb", itoa_log((int)(cbm_mem_rss() / mb)),
567+
"peak_mb", itoa_log((int)(cbm_mem_peak_rss() / mb)), "budget_mb",
568+
itoa_log((int)(cbm_mem_budget() / mb)), "per_worker_mb",
569+
itoa_log((int)(cbm_mem_worker_budget(worker_count) / mb)));
570+
}
571+
}
572+
542573
int cbm_parallel_extract(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files, int file_count,
543574
CBMFileResult **result_cache, _Atomic int64_t *shared_ids,
544575
int worker_count) {
@@ -585,6 +616,9 @@ int cbm_parallel_extract(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files,
585616
}
586617
memset(workers, 0, (size_t)worker_count * sizeof(extract_worker_state_t));
587618

619+
/* Per-worker manifest entry arrays (separate from cache-line-aligned worker state) */
620+
cbm_pkg_entries_t *pkg_entries = calloc(worker_count, sizeof(cbm_pkg_entries_t));
621+
588622
extract_ctx_t ec = {
589623
.files = files,
590624
.sorted = sorted,
@@ -596,6 +630,7 @@ int cbm_parallel_extract(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files,
596630
.result_cache = result_cache,
597631
.shared_ids = shared_ids,
598632
.cancelled = ctx->cancelled,
633+
.pkg_entries = pkg_entries,
599634
};
600635
atomic_init(&ec.next_worker_id, 0);
601636
atomic_init(&ec.next_file_idx, 0);
@@ -620,23 +655,16 @@ int cbm_parallel_extract(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *files,
620655
}
621656
CBM_PROF_END_N("parallel_extract", "4_merge_gbufs_seq", t_merge, total_nodes);
622657

658+
merge_pkg_entries(ctx, pkg_entries, worker_count);
659+
623660
cbm_aligned_free(workers);
624661
free(sorted);
625662

626663
if (atomic_load(ctx->cancelled)) {
627664
return CBM_NOT_FOUND;
628665
}
629666

630-
/* RSS-based memory stats after extraction */
631-
if (cbm_mem_budget() > 0) {
632-
size_t rss_mb = cbm_mem_rss() / ((size_t)CBM_SZ_1K * CBM_SZ_1K);
633-
size_t peak_mb = cbm_mem_peak_rss() / ((size_t)CBM_SZ_1K * CBM_SZ_1K);
634-
size_t budget_mb = cbm_mem_budget() / ((size_t)CBM_SZ_1K * CBM_SZ_1K);
635-
size_t worker_mb = cbm_mem_worker_budget(worker_count) / ((size_t)CBM_SZ_1K * CBM_SZ_1K);
636-
cbm_log_info("parallel.extract.mem", "rss_mb", itoa_log((int)rss_mb), "peak_mb",
637-
itoa_log((int)peak_mb), "budget_mb", itoa_log((int)budget_mb), "per_worker_mb",
638-
itoa_log((int)worker_mb));
639-
}
667+
log_extract_mem_stats(worker_count);
640668

641669
cbm_log_info("parallel.extract.done", "nodes", itoa_log(total_nodes), "errors",
642670
itoa_log(total_errors));
@@ -684,14 +712,7 @@ static int create_imports_edges(cbm_pipeline_ctx_t *ctx, const CBMFileResult *re
684712
if (!imp->module_path) {
685713
continue;
686714
}
687-
char *target_qn = NULL;
688-
char *resolved = cbm_pipeline_resolve_relative_import(rel, imp->module_path);
689-
if (resolved) {
690-
target_qn = cbm_pipeline_fqn_module(ctx->project_name, resolved);
691-
free(resolved);
692-
} else {
693-
target_qn = cbm_pipeline_fqn_module(ctx->project_name, imp->module_path);
694-
}
715+
char *target_qn = cbm_pipeline_resolve_module(ctx, rel, imp->module_path);
695716
const cbm_gbuf_node_t *target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn);
696717
char *file_qn = cbm_pipeline_fqn_compute(ctx->project_name, rel, "__file__");
697718
const cbm_gbuf_node_t *source_node = cbm_gbuf_find_by_qn(ctx->gbuf, file_qn);

0 commit comments

Comments
 (0)