diff --git a/bitnet_tools/analysis.py b/bitnet_tools/analysis.py
index 684d3a8..7efc055 100644
--- a/bitnet_tools/analysis.py
+++ b/bitnet_tools/analysis.py
@@ -7,6 +7,8 @@ from pathlib import Path
 from typing import Any
 
 
+from .schema_semantics import load_schema_semantics, normalize_question_entities
+
 VALID_INPUT_TYPES = {"csv", "excel", "document"}
 
 
@@ -229,13 +231,16 @@ def build_analysis_payload_from_normalized_input(
 
     columns = [str(c) for c in reader.fieldnames]
     summary = summarize_reader(reader, columns)
+    planner = normalize_question_entities(question, columns, load_schema_semantics())
 
     csv_path = csv_path_override or normalized_input.source_name
     return {
         "csv_path": csv_path,
-        "question": question,
+        "question": planner["normalized_question"],
+        "original_question": question,
         "summary": summary.to_dict(),
-        "prompt": build_prompt(summary, question),
+        "prompt": build_prompt(summary, planner["normalized_question"]),
+        "schema_semantics_mappings": planner["mappings"],
         "input": normalized_input.to_dict(),
     }
 
diff --git a/bitnet_tools/schema_semantics.py b/bitnet_tools/schema_semantics.py
new file mode 100644
index 0000000..174820c
--- /dev/null
+++ b/bitnet_tools/schema_semantics.py
@@ -0,0 +1,187 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+import json
+import logging
+from pathlib import Path
+import re
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+SCHEMA_SEMANTICS_PATH = Path(__file__).resolve().parents[1] / "resources" / "schema_semantics_ko.json"
+
+
+@dataclass
+class AliasConcept:
+    """A semantic concept: canonical name, user-facing aliases, column-name aliases."""
+
+    canonical: str
+    aliases: list[str]
+    column_aliases: list[str]
+
+
+@dataclass
+class AliasMatch:
+    """Result of resolving one user term against the available columns.
+
+    ``status`` is ``"success"``, ``"ambiguous"`` or ``"failed"``.
+    """
+
+    user_term: str
+    status: str
+    canonical: str | None = None
+    matched_column: str | None = None
+    candidates: list[str] | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return a JSON-serializable dict; ``candidates`` is always a list."""
+        return {
+            "user_term": self.user_term,
+            "status": self.status,
+            "canonical": self.canonical,
+            "matched_column": self.matched_column,
+            "candidates": self.candidates or [],
+        }
+
+
+def _norm(text: str) -> str:
+    """Return a lookup key: lowercased, with whitespace, ``_`` and ``-`` removed."""
+    return "".join(ch for ch in str(text).strip().lower() if not ch.isspace() and ch not in "_-")
+
+
+def _clean_terms(value: Any) -> list[str]:
+    """Coerce a raw JSON alias field into a list of non-empty, stripped strings."""
+    if isinstance(value, str):
+        # A bare string is a single alias; iterating it would yield characters.
+        value = [value]
+    if not isinstance(value, list):
+        return []
+    stripped = (str(x).strip() for x in value if x is not None)
+    return [term for term in stripped if term]
+
+
+def load_schema_semantics(path: str | Path | None = None) -> list[AliasConcept]:
+    """Load alias concepts from a schema-semantics JSON file.
+
+    Args:
+        path: JSON file to read; defaults to ``SCHEMA_SEMANTICS_PATH``.
+
+    Returns:
+        One ``AliasConcept`` per object under the top-level ``"concepts"`` key
+        that has a non-empty ``canonical`` name. An empty list is returned when
+        the default file is missing, so callers still work without aliases.
+
+    Raises:
+        FileNotFoundError: If an explicitly given ``path`` does not exist.
+        json.JSONDecodeError: If the file is not valid JSON.
+    """
+    semantics_path = Path(path) if path else SCHEMA_SEMANTICS_PATH
+    try:
+        raw = json.loads(semantics_path.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        if path:
+            raise
+        logger.warning("schema semantics file not found: %s", semantics_path)
+        return []
+
+    items = raw.get("concepts") if isinstance(raw, dict) else None
+    if not isinstance(items, list):
+        return []
+
+    concepts: list[AliasConcept] = []
+    for item in items:
+        if not isinstance(item, dict):
+            continue
+        canonical = str(item.get("canonical") or "").strip()
+        if not canonical:
+            continue
+        concepts.append(
+            AliasConcept(
+                canonical=canonical,
+                aliases=_clean_terms(item.get("aliases")),
+                column_aliases=_clean_terms(item.get("column_aliases")),
+            )
+        )
+    return concepts
+
+
+def match_alias_to_column(user_term: str, columns: list[str], concepts: list[AliasConcept]) -> AliasMatch:
+    """Resolve ``user_term`` to a column via the first concept whose vocabulary contains it.
+
+    Comparison uses ``_norm`` keys. Returns ``"success"`` for exactly one
+    matching column, ``"ambiguous"`` for several, and ``"failed"`` otherwise.
+    """
+    term_key = _norm(user_term)
+    if not term_key:
+        return AliasMatch(user_term=user_term, status="failed")
+
+    concept: AliasConcept | None = None
+    for c in concepts:
+        vocab = {_norm(c.canonical), *(_norm(a) for a in c.aliases)}
+        if term_key in vocab:
+            concept = c
+            break
+
+    if concept is None:
+        return AliasMatch(user_term=user_term, status="failed")
+
+    candidate_keys = {_norm(concept.canonical), *(_norm(a) for a in concept.column_aliases), *(_norm(a) for a in concept.aliases)}
+    candidates = [col for col in columns if _norm(col) in candidate_keys]
+
+    if len(candidates) == 1:
+        return AliasMatch(
+            user_term=user_term,
+            status="success",
+            canonical=concept.canonical,
+            matched_column=candidates[0],
+            candidates=candidates,
+        )
+    if len(candidates) > 1:
+        return AliasMatch(
+            user_term=user_term,
+            status="ambiguous",
+            canonical=concept.canonical,
+            candidates=candidates,
+        )
+    return AliasMatch(user_term=user_term, status="failed", canonical=concept.canonical)
+
+
+def normalize_question_entities(question: str, columns: list[str], concepts: list[AliasConcept]) -> dict[str, Any]:
+    """Rewrite concept terms found in ``question`` to their matching column names.
+
+    For each concept, the longest canonical/alias term present in the question
+    is resolved with ``match_alias_to_column``; successful matches are then
+    substituted in one pass over the original text.
+
+    Returns:
+        A dict with ``"normalized_question"`` (the rewritten text) and
+        ``"mappings"`` (one ``AliasMatch.to_dict()`` per resolved term).
+    """
+    text = question or ""
+    mappings: dict[str, AliasMatch] = {}
+    replacements: dict[str, str] = {}
+
+    for concept in concepts:
+        present = [term for term in (concept.canonical, *concept.aliases) if term and term in text]
+        if not present:
+            continue
+        # Longest term wins so a short alias cannot pre-empt a longer one containing it.
+        term = max(present, key=len)
+        match = match_alias_to_column(term, columns, concepts)
+        mappings[term] = match
+        if match.status == "success" and match.matched_column:
+            replacements[term] = match.matched_column
+
+    normalized_question = text
+    if replacements:
+        # Single pass over the original text: inserted column names are never
+        # rescanned, so one replacement cannot corrupt another.
+        ordered = sorted(replacements, key=len, reverse=True)
+        pattern = re.compile("|".join(re.escape(term) for term in ordered))
+        normalized_question = pattern.sub(lambda m: replacements[m.group(0)], text)
+
+    return {
+        "normalized_question": normalized_question,
+        "mappings": [m.to_dict() for m in mappings.values()],
+    }
diff --git a/bitnet_tools/ui/app.js b/bitnet_tools/ui/app.js
index c367aee..0859e7e 100644
--- a/bitnet_tools/ui/app.js
+++ b/bitnet_tools/ui/app.js
@@ -12,6 +12,7 @@ const UI = {
   quickAnalyzeBtn: document.getElementById('quickAnalyzeBtn'),
   runBtn: document.getElementById('runBtn'),
   summary: document.getElementById('summary'),
+  schemaMappings: document.getElementById('schemaMappings'),
   prompt: document.getElementById('prompt'),
   answer: document.getElementById('answer'),
   analyzeAssist: document.getElementById('analyzeAssist'),
@@ -217,6 +218,22 @@ async function buildMultiPayloadFiles(files) {
   return payloadFiles;
 }
 
+
+function renderSchemaMappings(data) {
+  if (!UI.schemaMappings) return;
+  const mappings = Array.isArray(data?.schema_semantics_mappings) ? data.schema_semantics_mappings : [];
+  if (!mappings.length) {
+    UI.schemaMappings.textContent = '자동 매핑 결과가 없습니다.';
+    return;
+  }
+  const lines = mappings.map((m) => {
+    if (m.status === 'success') return `${m.user_term} → ${m.matched_column}`;
+    if (m.status === 'ambiguous') return `${m.user_term} → 후보: ${(m.candidates || []).join(', ')}`;
+    return `${m.user_term} → 매핑 실패`;
+  });
+  UI.schemaMappings.textContent = lines.join('\n');
+}
+
 function setStatus(message) {
   if (UI.statusBox) UI.statusBox.textContent = message;
 }
@@ -677,6 +694,7 @@ async function runAnalyzeFromPreprocessed(result, fallbackQuestion = '') {
   const data = await postJson('/api/analyze', body, '분석');
   appState.latestPrompt = data.prompt;
   UI.summary.textContent = JSON.stringify(data.summary, null, 2);
+  renderSchemaMappings(data);
   renderAnalyzeAssist(data);
   if (UI.prompt) UI.prompt.textContent = data.prompt;
   if (UI.answer) UI.answer.textContent = '';
@@ -833,6 +851,7 @@ async function runAnalyze() {
   resetAnalyzeAssist();
   setStatus(STATUS.analyzing);
   UI.summary.textContent = STATUS.analyzing;
+  if (UI.schemaMappings) UI.schemaMappings.textContent = '자동 매핑 결과를 계산 중입니다...';
   toggleBusy(true);
   try {
     const body = await buildAnalyzeRequest();
diff --git a/bitnet_tools/ui/index.html b/bitnet_tools/ui/index.html
index 7ed2212..c259afb 100644
--- a/bitnet_tools/ui/index.html
+++ b/bitnet_tools/ui/index.html
@@ -97,6 +97,8 @@
후보 테이블 미리보기(상위 5행)+
자동 매핑 결과가 여기에 표시됩니다.