From ff1668a3f49c55ee0968ac6a789800f6fea31a87 Mon Sep 17 00:00:00 2001
From: HONGDAE KIM <rad174951@gmail.com>
Date: Sun, 15 Feb 2026 14:12:09 +0900
Subject: [PATCH] Add schema semantics alias normalization and UI mapping
 output

---
 bitnet_tools/analysis.py           |   9 ++-
 bitnet_tools/schema_semantics.py   | 115 +++++++++++++++++++++++++++++
 bitnet_tools/ui/app.js             |  19 +++++
 bitnet_tools/ui/index.html         |   2 +
 resources/schema_semantics_ko.json |  14 ++++
 tests/test_analysis.py             |  11 +++
 tests/test_schema_semantics.py     |  37 ++++++++++
 7 files changed, 205 insertions(+), 2 deletions(-)
 create mode 100644 bitnet_tools/schema_semantics.py
 create mode 100644 resources/schema_semantics_ko.json
 create mode 100644 tests/test_schema_semantics.py

diff --git a/bitnet_tools/analysis.py b/bitnet_tools/analysis.py
index 684d3a8..7efc055 100644
--- a/bitnet_tools/analysis.py
+++ b/bitnet_tools/analysis.py
@@ -7,6 +7,8 @@
 from pathlib import Path
 from typing import Any
 
+from .schema_semantics import load_schema_semantics, normalize_question_entities
+
 
 VALID_INPUT_TYPES = {"csv", "excel", "document"}
 
@@ -229,13 +231,16 @@ def build_analysis_payload_from_normalized_input(
 
     columns = [str(c) for c in reader.fieldnames]
     summary = summarize_reader(reader, columns)
+    planner = normalize_question_entities(question, columns, load_schema_semantics())
     csv_path = csv_path_override or normalized_input.source_name
 
     return {
         "csv_path": csv_path,
-        "question": question,
+        "question": planner["normalized_question"],
+        "original_question": question,
         "summary": summary.to_dict(),
-        "prompt": build_prompt(summary, question),
+        "prompt": build_prompt(summary, planner["normalized_question"]),
+        "schema_semantics_mappings": planner["mappings"],
         "input": normalized_input.to_dict(),
     }
 
diff --git a/bitnet_tools/schema_semantics.py b/bitnet_tools/schema_semantics.py
new file mode 100644
index 0000000..174820c
--- /dev/null
+++ b/bitnet_tools/schema_semantics.py
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+import json
+from pathlib import Path
+from typing import Any
+
+SCHEMA_SEMANTICS_PATH = Path(__file__).resolve().parents[1] / "resources" / "schema_semantics_ko.json"  # default concepts file, in "resources/" next to (not inside) this package's directory; NOTE(review): confirm it ships with installs
+
+
+@dataclass
+class AliasConcept:  # One semantic concept and the names it may appear under.
+    canonical: str  # concept name; accepted both as a user term and as a column name
+    aliases: list[str]  # alternative user terms (also compared against column names)
+    column_aliases: list[str]  # names compared only against dataset column names
+
+
+@dataclass
+class AliasMatch:  # Outcome of resolving one user term against the dataset columns.
+    user_term: str
+    status: str  # match_alias_to_column produces "success", "ambiguous" or "failed"
+    canonical: str | None = None
+    matched_column: str | None = None
+    candidates: list[str] | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return every field as a plain dict, in declaration order."""
+        plain = ("user_term", "status", "canonical", "matched_column")
+        payload: dict[str, Any] = {name: getattr(self, name) for name in plain}
+
+        # Falsy candidates (None or an empty list) are reported as a new empty list.
+        payload["candidates"] = self.candidates if self.candidates else []
+        return payload
+
+
+def _norm(text: str) -> str:  # Matching key: stripped, lower-cased, with whitespace, "_" and "-" removed.
+    return "".join(c for c in str(text).strip().lower() if not (c.isspace() or c in "_-"))
+
+
+def load_schema_semantics(path: str | Path | None = None) -> list[AliasConcept]:
+    """Load alias concepts from JSON (default ``SCHEMA_SEMANTICS_PATH``); I/O and JSON errors propagate."""
+    raw = json.loads(Path(path or SCHEMA_SEMANTICS_PATH).read_text(encoding="utf-8"))
+    items = raw.get("concepts") if isinstance(raw, dict) else None
+
+    def _clean(values: Any) -> list[str]:
+        # Only real lists are read: null would raise and a bare string would split into characters.
+        return [str(v).strip() for v in values if str(v).strip()] if isinstance(values, list) else []
+
+    concepts: list[AliasConcept] = []
+    for item in (items if isinstance(items, list) else []):
+        canonical = str(item.get("canonical", "")).strip() if isinstance(item, dict) else ""
+        if canonical:  # entries that are not objects or have no canonical name are ignored
+            aliases, col_aliases = _clean(item.get("aliases")), _clean(item.get("column_aliases"))
+            concepts.append(AliasConcept(canonical=canonical, aliases=aliases, column_aliases=col_aliases))
+    return concepts
+
+
+def match_alias_to_column(user_term: str, columns: list[str], concepts: list[AliasConcept]) -> AliasMatch:
+    """Resolve ``user_term`` to a concept, then to the columns that name that concept.
+
+    The first concept whose canonical name or aliases equal the term (after
+    ``_norm``) wins. Columns are compared, also after ``_norm``, against that
+    concept's canonical name, column aliases and aliases.
+
+    Returns an ``AliasMatch`` whose status is ``"success"`` for exactly one
+    matching column, ``"ambiguous"`` for several, and ``"failed"`` when the
+    term is blank, belongs to no concept, or no column matches.
+    """
+    wanted = _norm(user_term)
+    if not wanted:
+        return AliasMatch(user_term=user_term, status="failed")
+
+    def _names(concept: AliasConcept) -> set[str]:
+        return {_norm(name) for name in (concept.canonical, *concept.aliases)}
+
+    # Concepts are scanned in their given order; the earliest hit is kept.
+    found = next((c for c in concepts if wanted in _names(c)), None)
+    if found is None:
+        return AliasMatch(user_term=user_term, status="failed")
+
+    keys = _names(found) | {_norm(alias) for alias in found.column_aliases}
+    hits = [column for column in columns if _norm(column) in keys]
+
+    if not hits:
+        return AliasMatch(user_term=user_term, status="failed", canonical=found.canonical)
+    if len(hits) > 1:
+        return AliasMatch(user_term=user_term, status="ambiguous", canonical=found.canonical, candidates=hits)
+    return AliasMatch(
+        user_term=user_term, status="success", canonical=found.canonical,
+        matched_column=hits[0], candidates=hits,
+    )
+
+
+def normalize_question_entities(question: str, columns: list[str], concepts: list[AliasConcept]) -> dict[str, Any]:
+    """Swap concept terms found in ``question`` for matched columns; return the text and per-term mappings."""
+    import re  # local import: only this function needs it
+
+    mappings: dict[str, AliasMatch] = {}
+    swaps: dict[str, str] = {}
+    for concept in concepts:
+        # Only the first term of a concept that occurs in the question is considered.
+        term = next((t for t in (concept.canonical, *concept.aliases) if t and t in question), None)
+        if term is None:
+            continue
+        match = mappings[term] = match_alias_to_column(term, columns, concepts)
+        if match.status == "success" and match.matched_column:
+            swaps.setdefault(term, match.matched_column)
+
+    normalized_question = question
+    if swaps:
+        # Substitute in one pass over the original text so that an inserted column name is never
+        # rewritten by another term; longer terms are tried first where several start at one spot.
+        pattern = re.compile("|".join(re.escape(t) for t in sorted(swaps, key=len, reverse=True)))
+        normalized_question = pattern.sub(lambda m: swaps[m.group(0)], question)
+    return {"normalized_question": normalized_question, "mappings": [m.to_dict() for m in mappings.values()]}
diff --git a/bitnet_tools/ui/app.js b/bitnet_tools/ui/app.js
index c367aee..0859e7e 100644
--- a/bitnet_tools/ui/app.js
+++ b/bitnet_tools/ui/app.js
@@ -12,6 +12,7 @@ const UI = {
   quickAnalyzeBtn: document.getElementById('quickAnalyzeBtn'),
   runBtn: document.getElementById('runBtn'),
   summary: document.getElementById('summary'),
+  schemaMappings: document.getElementById('schemaMappings'),
   prompt: document.getElementById('prompt'),
   answer: document.getElementById('answer'),
   analyzeAssist: document.getElementById('analyzeAssist'),
@@ -217,6 +218,22 @@ async function buildMultiPayloadFiles(files) {
   return payloadFiles;
 }
 
+
+function renderSchemaMappings(data) {
+  // Lists one "term → outcome" line per entry of data.schema_semantics_mappings.
+  const box = UI.schemaMappings;
+  if (!box) return;
+  const received = data?.schema_semantics_mappings;
+  const mappings = Array.isArray(received) ? received : [];
+  const outcome = (m) => {
+    if (m.status === 'success') return m.matched_column;
+    return m.status === 'ambiguous' ? `후보: ${(m.candidates || []).join(', ')}` : '매핑 실패';
+  };
+  box.textContent = mappings.length
+    ? mappings.map((m) => `${m.user_term} → ${outcome(m)}`).join('\n')
+    : '자동 매핑 결과가 없습니다.';
+}
+
 function setStatus(message) {
   if (UI.statusBox) UI.statusBox.textContent = message;
 }
@@ -677,6 +694,7 @@ async function runAnalyzeFromPreprocessed(result, fallbackQuestion = '') {
   const data = await postJson('/api/analyze', body, '분석');
   appState.latestPrompt = data.prompt;
   UI.summary.textContent = JSON.stringify(data.summary, null, 2);
+  renderSchemaMappings(data);
   renderAnalyzeAssist(data);
   if (UI.prompt) UI.prompt.textContent = data.prompt;
   if (UI.answer) UI.answer.textContent = '';
@@ -833,6 +851,7 @@ async function runAnalyze() {
   resetAnalyzeAssist();
   setStatus(STATUS.analyzing);
   UI.summary.textContent = STATUS.analyzing;
+  if (UI.schemaMappings) UI.schemaMappings.textContent = '자동 매핑 결과를 계산 중입니다...';
   toggleBusy(true);
   try {
     const body = await buildAnalyzeRequest();
diff --git a/bitnet_tools/ui/index.html b/bitnet_tools/ui/index.html
index 7ed2212..c259afb 100644
--- a/bitnet_tools/ui/index.html
+++ b/bitnet_tools/ui/index.html
@@ -97,6 +97,8 @@ <h2>4) 결과</h2>
             <pre id="candidateTablePreview">후보 테이블 미리보기(상위 5행)</pre>
           </div>
         </div>
+        <h3>스키마 자동 매핑</h3>
+        <pre id="schemaMappings">자동 매핑 결과가 여기에 표시됩니다.</pre>
         <h3>데이터 요약</h3>
         <pre id="summary"></pre>
       </section>
diff --git a/resources/schema_semantics_ko.json b/resources/schema_semantics_ko.json
new file mode 100644
index 0000000..638f2dd
--- /dev/null
+++ b/resources/schema_semantics_ko.json
@@ -0,0 +1,14 @@
+{
+  "concepts": [
+    {
+      "canonical": "시군구",
+      "aliases": ["구군", "지역구"],
+      "column_aliases": ["sigungu_col", "sigungu", "시군구명", "구군명"]
+    },
+    {
+      "canonical": "세차유형",
+      "aliases": ["서비스타입"],
+      "column_aliases": ["service_type_col", "service_type", "세차유형코드", "서비스타입"]
+    }
+  ]
+}
diff --git a/tests/test_analysis.py b/tests/test_analysis.py
index da8ae62..d80072e 100644
--- a/tests/test_analysis.py
+++ b/tests/test_analysis.py
@@ -220,3 +220,14 @@ def test_normalize_analysis_input_rejects_unsupported_type():
 
     with pytest.raises(ValueError):
         normalize_analysis_input({"input_type": "json", "normalized_csv_text": "a\n1\n"})
+
+
+def test_build_analysis_payload_normalizes_question_by_schema_semantics():
+    """The payload keeps the raw question and exposes the column-name rewrite."""
+    csv_text = "sigungu_col,service_type_col\n강남,셀프\n"
+    asked = "시군구 별 세차유형 통계를 보여줘"
+    result = build_analysis_payload_from_csv_text(csv_text, asked)
+
+    assert result["original_question"] == asked
+    assert result["question"] == "sigungu_col 별 service_type_col 통계를 보여줘"
+    assert any(entry["status"] == "success" for entry in result["schema_semantics_mappings"])
diff --git a/tests/test_schema_semantics.py b/tests/test_schema_semantics.py
new file mode 100644
index 0000000..28f03f1
--- /dev/null
+++ b/tests/test_schema_semantics.py
@@ -0,0 +1,37 @@
+from bitnet_tools.schema_semantics import (
+    load_schema_semantics,
+    match_alias_to_column,
+    normalize_question_entities,
+)
+
+
+def test_schema_semantics_success_mapping():
+    """A canonical term resolves to the single column that names its concept."""
+    outcome = match_alias_to_column('시군구', ['sigungu_col', 'service_type_col'], load_schema_semantics())
+
+    assert outcome.status == 'success'
+    assert outcome.matched_column == 'sigungu_col'
+
+
+def test_schema_semantics_failed_mapping():
+    """A term that belongs to no concept fails and reports no column."""
+    outcome = match_alias_to_column('없는용어', ['sigungu_col', 'service_type_col'], load_schema_semantics())
+
+    assert outcome.status == 'failed'
+    assert outcome.matched_column is None
+
+
+def test_schema_semantics_ambiguous_mapping():
+    """Two columns naming the same concept make the match ambiguous."""
+    outcome = match_alias_to_column('세차유형', ['service_type_col', 'service_type'], load_schema_semantics())
+
+    assert outcome.status == 'ambiguous'
+    assert sorted(outcome.candidates or []) == ['service_type', 'service_type_col']
+
+
+def test_normalize_question_entities_replaces_with_column_name():
+    """Both concept terms in the question are swapped for the matching column names."""
+    asked = '시군구 별 세차유형 비율을 보여줘'
+    result = normalize_question_entities(asked, ['sigungu_col', 'service_type_col'], load_schema_semantics())
+    assert result['normalized_question'] == 'sigungu_col 별 service_type_col 비율을 보여줘'
+    assert len(result['mappings']) == 2