9 changes: 7 additions & 2 deletions bitnet_tools/analysis.py
@@ -7,6 +7,8 @@
from pathlib import Path
from typing import Any

+from .schema_semantics import load_schema_semantics, normalize_question_entities


VALID_INPUT_TYPES = {"csv", "excel", "document"}

@@ -229,13 +231,16 @@ def build_analysis_payload_from_normalized_input(

    columns = [str(c) for c in reader.fieldnames]
    summary = summarize_reader(reader, columns)
+    planner = normalize_question_entities(question, columns, load_schema_semantics())
    csv_path = csv_path_override or normalized_input.source_name

    return {
        "csv_path": csv_path,
-        "question": question,
+        "question": planner["normalized_question"],
+        "original_question": question,
        "summary": summary.to_dict(),
-        "prompt": build_prompt(summary, question),
+        "prompt": build_prompt(summary, planner["normalized_question"]),
+        "schema_semantics_mappings": planner["mappings"],
        "input": normalized_input.to_dict(),
    }

115 changes: 115 additions & 0 deletions bitnet_tools/schema_semantics.py
@@ -0,0 +1,115 @@
from __future__ import annotations

from dataclasses import dataclass
import json
from pathlib import Path
from typing import Any

SCHEMA_SEMANTICS_PATH = Path(__file__).resolve().parents[1] / "resources" / "schema_semantics_ko.json"


@dataclass
class AliasConcept:
    canonical: str
    aliases: list[str]
    column_aliases: list[str]


@dataclass
class AliasMatch:
    user_term: str
    status: str
    canonical: str | None = None
    matched_column: str | None = None
    candidates: list[str] | None = None

    def to_dict(self) -> dict[str, Any]:
        return {
            "user_term": self.user_term,
            "status": self.status,
            "canonical": self.canonical,
            "matched_column": self.matched_column,
            "candidates": self.candidates or [],
        }


def _norm(text: str) -> str:
return "".join(ch for ch in str(text).strip().lower() if not ch.isspace() and ch not in "_-")


def load_schema_semantics(path: str | Path | None = None) -> list[AliasConcept]:
    semantics_path = Path(path) if path else SCHEMA_SEMANTICS_PATH
    raw = json.loads(semantics_path.read_text(encoding="utf-8"))
    items = raw.get("concepts", []) if isinstance(raw, dict) else []

    concepts: list[AliasConcept] = []
    for item in items:
        if not isinstance(item, dict):
            continue
        canonical = str(item.get("canonical", "")).strip()
        if not canonical:
            continue
        aliases = [str(x).strip() for x in item.get("aliases", []) if str(x).strip()]
        col_aliases = [str(x).strip() for x in item.get("column_aliases", []) if str(x).strip()]
        concepts.append(AliasConcept(canonical=canonical, aliases=aliases, column_aliases=col_aliases))
    return concepts


def match_alias_to_column(user_term: str, columns: list[str], concepts: list[AliasConcept]) -> AliasMatch:
    term_key = _norm(user_term)
    if not term_key:
        return AliasMatch(user_term=user_term, status="failed")

    concept: AliasConcept | None = None
    for c in concepts:
        vocab = {_norm(c.canonical), *(_norm(a) for a in c.aliases)}
        if term_key in vocab:
            concept = c
            break

    if concept is None:
        return AliasMatch(user_term=user_term, status="failed")

    candidate_keys = {_norm(concept.canonical), *(_norm(a) for a in concept.column_aliases), *(_norm(a) for a in concept.aliases)}
    candidates = [col for col in columns if _norm(col) in candidate_keys]

    if len(candidates) == 1:
        return AliasMatch(
            user_term=user_term,
            status="success",
            canonical=concept.canonical,
            matched_column=candidates[0],
            candidates=candidates,
        )
    if len(candidates) > 1:
        return AliasMatch(
            user_term=user_term,
            status="ambiguous",
            canonical=concept.canonical,
            candidates=candidates,
        )
    return AliasMatch(user_term=user_term, status="failed", canonical=concept.canonical)


def normalize_question_entities(question: str, columns: list[str], concepts: list[AliasConcept]) -> dict[str, Any]:
    normalized_question = question
    mappings: list[AliasMatch] = []

    for concept in concepts:
        terms = [concept.canonical, *concept.aliases]
        for term in terms:
            if term and term in question:
                match = match_alias_to_column(term, columns, concepts)
                mappings.append(match)
                if match.status == "success" and match.matched_column:
                    normalized_question = normalized_question.replace(term, match.matched_column)
Comment on lines +101 to +105

P1: Replace only whole entity tokens during normalization

normalize_question_entities matches aliases with plain substring checks and then runs a global str.replace, so terms that appear inside longer words are rewritten incorrectly. For example, with column 시군구명 and question 시군구명 별 ..., the canonical term 시군구 matches and replacement produces 시군구명명 ..., which then feeds a corrupted question into the prompt. This can silently degrade the analysis whenever a concept term is a prefix or substring of another token.
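A minimal sketch of the suggested whole-token replacement, assuming a regex guard around each alias; the helper name and the lookaround-based pattern are illustrative, not part of this PR:

    import re

    def replace_whole_term(question: str, term: str, column: str) -> str:
        # Only substitute `term` when it is not embedded in a longer token,
        # so 시군구명 stays intact while a standalone 시군구 becomes the column name.
        pattern = re.compile(rf"(?<!\w){re.escape(term)}(?!\w)")
        return pattern.sub(column, question)

    # replace_whole_term("시군구명 별 통계", "시군구", "sigungu_col")  -> unchanged
    # replace_whole_term("시군구 별 통계", "시군구", "sigungu_col")    -> "sigungu_col 별 통계"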

                break

    deduped: dict[str, AliasMatch] = {}
    for item in mappings:
        deduped[item.user_term] = item

    return {
        "normalized_question": normalized_question,
        "mappings": [m.to_dict() for m in deduped.values()],
    }
19 changes: 19 additions & 0 deletions bitnet_tools/ui/app.js
@@ -12,6 +12,7 @@ const UI = {
  quickAnalyzeBtn: document.getElementById('quickAnalyzeBtn'),
  runBtn: document.getElementById('runBtn'),
  summary: document.getElementById('summary'),
  schemaMappings: document.getElementById('schemaMappings'),
  prompt: document.getElementById('prompt'),
  answer: document.getElementById('answer'),
  analyzeAssist: document.getElementById('analyzeAssist'),
@@ -217,6 +218,22 @@ async function buildMultiPayloadFiles(files) {
  return payloadFiles;
}


function renderSchemaMappings(data) {
  if (!UI.schemaMappings) return;
  const mappings = Array.isArray(data?.schema_semantics_mappings) ? data.schema_semantics_mappings : [];
  if (!mappings.length) {
    UI.schemaMappings.textContent = '자동 매핑 결과가 없습니다.';
    return;
  }
  const lines = mappings.map((m) => {
    if (m.status === 'success') return `${m.user_term} → ${m.matched_column}`;
    if (m.status === 'ambiguous') return `${m.user_term} → 후보: ${(m.candidates || []).join(', ')}`;
    return `${m.user_term} → 매핑 실패`;
  });
  UI.schemaMappings.textContent = lines.join('\n');
}

function setStatus(message) {
  if (UI.statusBox) UI.statusBox.textContent = message;
}
@@ -677,6 +694,7 @@ async function runAnalyzeFromPreprocessed(result, fallbackQuestion = '') {
  const data = await postJson('/api/analyze', body, '분석');
  appState.latestPrompt = data.prompt;
  UI.summary.textContent = JSON.stringify(data.summary, null, 2);
  renderSchemaMappings(data);
  renderAnalyzeAssist(data);
  if (UI.prompt) UI.prompt.textContent = data.prompt;
  if (UI.answer) UI.answer.textContent = '';
@@ -833,6 +851,7 @@ async function runAnalyze() {
  resetAnalyzeAssist();
  setStatus(STATUS.analyzing);
  UI.summary.textContent = STATUS.analyzing;
  if (UI.schemaMappings) UI.schemaMappings.textContent = '자동 매핑 결과를 계산 중입니다...';
  toggleBusy(true);
  try {
    const body = await buildAnalyzeRequest();
2 changes: 2 additions & 0 deletions bitnet_tools/ui/index.html
@@ -97,6 +97,8 @@ <h2>4) 결과</h2>
<pre id="candidateTablePreview">후보 테이블 미리보기(상위 5행)</pre>
</div>
</div>
<h3>스키마 자동 매핑</h3>
<pre id="schemaMappings">자동 매핑 결과가 여기에 표시됩니다.</pre>
<h3>데이터 요약</h3>
<pre id="summary"></pre>
</section>
14 changes: 14 additions & 0 deletions resources/schema_semantics_ko.json
@@ -0,0 +1,14 @@
{
  "concepts": [
    {
      "canonical": "시군구",
      "aliases": ["구군", "지역구"],
      "column_aliases": ["sigungu_col", "sigungu", "시군구명", "구군명"]
    },
    {
      "canonical": "세차유형",
      "aliases": ["서비스타입"],
      "column_aliases": ["service_type_col", "service_type", "세차유형코드", "서비스타입"]
    }
  ]
}
11 changes: 11 additions & 0 deletions tests/test_analysis.py
@@ -220,3 +220,14 @@ def test_normalize_analysis_input_rejects_unsupported_type():

    with pytest.raises(ValueError):
        normalize_analysis_input({"input_type": "json", "normalized_csv_text": "a\n1\n"})


def test_build_analysis_payload_normalizes_question_by_schema_semantics():
    payload = build_analysis_payload_from_csv_text(
        "sigungu_col,service_type_col\n강남,셀프\n",
        "시군구 별 세차유형 통계를 보여줘",
    )

    assert payload["original_question"] == "시군구 별 세차유형 통계를 보여줘"
    assert payload["question"] == "sigungu_col 별 service_type_col 통계를 보여줘"
    assert any(m["status"] == "success" for m in payload["schema_semantics_mappings"])
37 changes: 37 additions & 0 deletions tests/test_schema_semantics.py
@@ -0,0 +1,37 @@
from bitnet_tools.schema_semantics import (
    load_schema_semantics,
    match_alias_to_column,
    normalize_question_entities,
)


def test_schema_semantics_success_mapping():
    concepts = load_schema_semantics()
    match = match_alias_to_column('시군구', ['sigungu_col', 'service_type_col'], concepts)

    assert match.status == 'success'
    assert match.matched_column == 'sigungu_col'


def test_schema_semantics_failed_mapping():
    concepts = load_schema_semantics()
    match = match_alias_to_column('없는용어', ['sigungu_col', 'service_type_col'], concepts)

    assert match.status == 'failed'
    assert match.matched_column is None


def test_schema_semantics_ambiguous_mapping():
    concepts = load_schema_semantics()
    match = match_alias_to_column('세차유형', ['service_type_col', 'service_type'], concepts)

    assert match.status == 'ambiguous'
    assert sorted(match.candidates or []) == ['service_type', 'service_type_col']


def test_normalize_question_entities_replaces_with_column_name():
    concepts = load_schema_semantics()
    result = normalize_question_entities('시군구 별 세차유형 비율을 보여줘', ['sigungu_col', 'service_type_col'], concepts)

    assert result['normalized_question'] == 'sigungu_col 별 service_type_col 비율을 보여줘'
    assert len(result['mappings']) == 2