From ff1668a3f49c55ee0968ac6a789800f6fea31a87 Mon Sep 17 00:00:00 2001 From: HONGDAE KIM Date: Sun, 15 Feb 2026 14:12:09 +0900 Subject: [PATCH] Add schema semantics alias normalization and UI mapping output --- bitnet_tools/analysis.py | 9 ++- bitnet_tools/schema_semantics.py | 115 +++++++++++++++++++++++++++++ bitnet_tools/ui/app.js | 19 +++++ bitnet_tools/ui/index.html | 2 + resources/schema_semantics_ko.json | 14 ++++ tests/test_analysis.py | 11 +++ tests/test_schema_semantics.py | 37 ++++++++++ 7 files changed, 205 insertions(+), 2 deletions(-) create mode 100644 bitnet_tools/schema_semantics.py create mode 100644 resources/schema_semantics_ko.json create mode 100644 tests/test_schema_semantics.py diff --git a/bitnet_tools/analysis.py b/bitnet_tools/analysis.py index 684d3a8..7efc055 100644 --- a/bitnet_tools/analysis.py +++ b/bitnet_tools/analysis.py @@ -7,6 +7,8 @@ from pathlib import Path from typing import Any +from .schema_semantics import load_schema_semantics, normalize_question_entities + VALID_INPUT_TYPES = {"csv", "excel", "document"} @@ -229,13 +231,16 @@ def build_analysis_payload_from_normalized_input( columns = [str(c) for c in reader.fieldnames] summary = summarize_reader(reader, columns) + planner = normalize_question_entities(question, columns, load_schema_semantics()) csv_path = csv_path_override or normalized_input.source_name return { "csv_path": csv_path, - "question": question, + "question": planner["normalized_question"], + "original_question": question, "summary": summary.to_dict(), - "prompt": build_prompt(summary, question), + "prompt": build_prompt(summary, planner["normalized_question"]), + "schema_semantics_mappings": planner["mappings"], "input": normalized_input.to_dict(), } diff --git a/bitnet_tools/schema_semantics.py b/bitnet_tools/schema_semantics.py new file mode 100644 index 0000000..174820c --- /dev/null +++ b/bitnet_tools/schema_semantics.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +from dataclasses import 
dataclass +import json +from pathlib import Path +from typing import Any + +SCHEMA_SEMANTICS_PATH = Path(__file__).resolve().parents[1] / "resources" / "schema_semantics_ko.json" + + +@dataclass +class AliasConcept: + canonical: str + aliases: list[str] + column_aliases: list[str] + + +@dataclass +class AliasMatch: + user_term: str + status: str + canonical: str | None = None + matched_column: str | None = None + candidates: list[str] | None = None + + def to_dict(self) -> dict[str, Any]: + return { + "user_term": self.user_term, + "status": self.status, + "canonical": self.canonical, + "matched_column": self.matched_column, + "candidates": self.candidates or [], + } + + +def _norm(text: str) -> str: + return "".join(ch for ch in str(text).strip().lower() if not ch.isspace() and ch not in "_-") + + +def load_schema_semantics(path: str | Path | None = None) -> list[AliasConcept]: + semantics_path = Path(path) if path else SCHEMA_SEMANTICS_PATH + raw = json.loads(semantics_path.read_text(encoding="utf-8")) + items = raw.get("concepts", []) if isinstance(raw, dict) else [] + + concepts: list[AliasConcept] = [] + for item in items: + if not isinstance(item, dict): + continue + canonical = str(item.get("canonical", "")).strip() + if not canonical: + continue + aliases = [str(x).strip() for x in item.get("aliases", []) if str(x).strip()] + col_aliases = [str(x).strip() for x in item.get("column_aliases", []) if str(x).strip()] + concepts.append(AliasConcept(canonical=canonical, aliases=aliases, column_aliases=col_aliases)) + return concepts + + +def match_alias_to_column(user_term: str, columns: list[str], concepts: list[AliasConcept]) -> AliasMatch: + term_key = _norm(user_term) + if not term_key: + return AliasMatch(user_term=user_term, status="failed") + + concept: AliasConcept | None = None + for c in concepts: + vocab = {_norm(c.canonical), *(_norm(a) for a in c.aliases)} + if term_key in vocab: + concept = c + break + + if concept is None: + return 
AliasMatch(user_term=user_term, status="failed") + + candidate_keys = {_norm(concept.canonical), *(_norm(a) for a in concept.column_aliases), *(_norm(a) for a in concept.aliases)} + candidates = [col for col in columns if _norm(col) in candidate_keys] + + if len(candidates) == 1: + return AliasMatch( + user_term=user_term, + status="success", + canonical=concept.canonical, + matched_column=candidates[0], + candidates=candidates, + ) + if len(candidates) > 1: + return AliasMatch( + user_term=user_term, + status="ambiguous", + canonical=concept.canonical, + candidates=candidates, + ) + return AliasMatch(user_term=user_term, status="failed", canonical=concept.canonical) + + +def normalize_question_entities(question: str, columns: list[str], concepts: list[AliasConcept]) -> dict[str, Any]: + normalized_question = question + mappings: list[AliasMatch] = [] + + for concept in concepts: + terms = [concept.canonical, *concept.aliases] + for term in terms: + if term and term in question: + match = match_alias_to_column(term, columns, concepts) + mappings.append(match) + if match.status == "success" and match.matched_column: + normalized_question = normalized_question.replace(term, match.matched_column) + break + + deduped: dict[str, AliasMatch] = {} + for item in mappings: + deduped[item.user_term] = item + + return { + "normalized_question": normalized_question, + "mappings": [m.to_dict() for m in deduped.values()], + } diff --git a/bitnet_tools/ui/app.js b/bitnet_tools/ui/app.js index c367aee..0859e7e 100644 --- a/bitnet_tools/ui/app.js +++ b/bitnet_tools/ui/app.js @@ -12,6 +12,7 @@ const UI = { quickAnalyzeBtn: document.getElementById('quickAnalyzeBtn'), runBtn: document.getElementById('runBtn'), summary: document.getElementById('summary'), + schemaMappings: document.getElementById('schemaMappings'), prompt: document.getElementById('prompt'), answer: document.getElementById('answer'), analyzeAssist: document.getElementById('analyzeAssist'), @@ -217,6 +218,22 @@ async 
function buildMultiPayloadFiles(files) { return payloadFiles; } + +function renderSchemaMappings(data) { + if (!UI.schemaMappings) return; + const mappings = Array.isArray(data?.schema_semantics_mappings) ? data.schema_semantics_mappings : []; + if (!mappings.length) { + UI.schemaMappings.textContent = '자동 매핑 결과가 없습니다.'; + return; + } + const lines = mappings.map((m) => { + if (m.status === 'success') return `${m.user_term} → ${m.matched_column}`; + if (m.status === 'ambiguous') return `${m.user_term} → 후보: ${(m.candidates || []).join(', ')}`; + return `${m.user_term} → 매핑 실패`; + }); + UI.schemaMappings.textContent = lines.join('\n'); +} + function setStatus(message) { if (UI.statusBox) UI.statusBox.textContent = message; } @@ -677,6 +694,7 @@ async function runAnalyzeFromPreprocessed(result, fallbackQuestion = '') { const data = await postJson('/api/analyze', body, '분석'); appState.latestPrompt = data.prompt; UI.summary.textContent = JSON.stringify(data.summary, null, 2); + renderSchemaMappings(data); renderAnalyzeAssist(data); if (UI.prompt) UI.prompt.textContent = data.prompt; if (UI.answer) UI.answer.textContent = ''; @@ -833,6 +851,7 @@ async function runAnalyze() { resetAnalyzeAssist(); setStatus(STATUS.analyzing); UI.summary.textContent = STATUS.analyzing; + if (UI.schemaMappings) UI.schemaMappings.textContent = '자동 매핑 결과를 계산 중입니다...'; toggleBusy(true); try { const body = await buildAnalyzeRequest(); diff --git a/bitnet_tools/ui/index.html b/bitnet_tools/ui/index.html index 7ed2212..c259afb 100644 --- a/bitnet_tools/ui/index.html +++ b/bitnet_tools/ui/index.html @@ -97,6 +97,8 @@

4) 결과

후보 테이블 미리보기(상위 5행)
+        <h3>스키마 자동 매핑</h3>
+        <pre id="schemaMappings">자동 매핑 결과가 여기에 표시됩니다.</pre>

데이터 요약


       
diff --git a/resources/schema_semantics_ko.json b/resources/schema_semantics_ko.json
new file mode 100644
index 0000000..638f2dd
--- /dev/null
+++ b/resources/schema_semantics_ko.json
@@ -0,0 +1,14 @@
+{
+  "concepts": [
+    {
+      "canonical": "시군구",
+      "aliases": ["구군", "지역구"],
+      "column_aliases": ["sigungu_col", "sigungu", "시군구명", "구군명"]
+    },
+    {
+      "canonical": "세차유형",
+      "aliases": ["서비스타입"],
+      "column_aliases": ["service_type_col", "service_type", "세차유형코드", "서비스타입"]
+    }
+  ]
+}
diff --git a/tests/test_analysis.py b/tests/test_analysis.py
index da8ae62..d80072e 100644
--- a/tests/test_analysis.py
+++ b/tests/test_analysis.py
@@ -220,3 +220,14 @@ def test_normalize_analysis_input_rejects_unsupported_type():
 
     with pytest.raises(ValueError):
         normalize_analysis_input({"input_type": "json", "normalized_csv_text": "a\n1\n"})
+
+
+def test_build_analysis_payload_normalizes_question_by_schema_semantics():
+    """Payload keeps the raw question and exposes the column-normalized one."""
+    csv_text = "sigungu_col,service_type_col\n강남,셀프\n"
+    asked = "시군구 별 세차유형 통계를 보여줘"
+    result = build_analysis_payload_from_csv_text(csv_text, asked)
+
+    assert result["original_question"] == asked
+    assert result["question"] == "sigungu_col 별 service_type_col 통계를 보여줘"
+    assert "success" in [entry["status"] for entry in result["schema_semantics_mappings"]]
diff --git a/tests/test_schema_semantics.py b/tests/test_schema_semantics.py
new file mode 100644
index 0000000..28f03f1
--- /dev/null
+++ b/tests/test_schema_semantics.py
@@ -0,0 +1,37 @@
+from bitnet_tools.schema_semantics import (
+    load_schema_semantics,
+    match_alias_to_column,
+    normalize_question_entities,
+)
+
+
+def test_schema_semantics_success_mapping():
+    """A known concept term resolves to the single matching column."""
+    columns = ['sigungu_col', 'service_type_col']
+    result = match_alias_to_column('시군구', columns, load_schema_semantics())
+    assert result.status == 'success'
+    assert result.matched_column == 'sigungu_col'
+
+
+def test_schema_semantics_failed_mapping():
+    """A term absent from the loaded vocabulary yields a failed match with no column."""
+    columns = ['sigungu_col', 'service_type_col']
+    result = match_alias_to_column('없는용어', columns, load_schema_semantics())
+    assert result.status == 'failed'
+    assert result.matched_column is None
+
+
+def test_schema_semantics_ambiguous_mapping():
+    """Two columns matching the same concept are reported as ambiguous candidates."""
+    columns = ['service_type_col', 'service_type']
+    result = match_alias_to_column('세차유형', columns, load_schema_semantics())
+    assert result.status == 'ambiguous'
+    assert sorted(result.candidates or []) == ['service_type', 'service_type_col']
+
+
+def test_normalize_question_entities_replaces_with_column_name():
+    """Concept terms in the question are replaced by the matched column names."""
+    columns = ['sigungu_col', 'service_type_col']
+    outcome = normalize_question_entities('시군구 별 세차유형 비율을 보여줘', columns, load_schema_semantics())
+    assert outcome['normalized_question'] == 'sigungu_col 별 service_type_col 비율을 보여줘'
+    assert len(outcome['mappings']) == 2