From 578717723c78c700bc2fa37d25403bab85a7e0e3 Mon Sep 17 00:00:00 2001 From: HONGDAE KIM Date: Sun, 15 Feb 2026 16:28:55 +0900 Subject: [PATCH] Add official-path asset collector with manifest and defer report --- ONLINE_RESOURCE_COLLECTION.md | 19 +- resources/online_sources.json | 41 +++-- scripts/collect_online_assets.py | 275 +++++++++++++++++++++++----- tests/test_collect_online_assets.py | 32 ++++ 4 files changed, 303 insertions(+), 64 deletions(-) create mode 100644 tests/test_collect_online_assets.py diff --git a/ONLINE_RESOURCE_COLLECTION.md b/ONLINE_RESOURCE_COLLECTION.md index a8fefe8..5460712 100644 --- a/ONLINE_RESOURCE_COLLECTION.md +++ b/ONLINE_RESOURCE_COLLECTION.md @@ -1,17 +1,24 @@ # Online tool/reference collection -인터넷이 열려 있는 턴에서 필요한 툴 패키지(wheel)와 UI/접근성 레퍼런스를 로컬로 저장하기 위한 작업 기록 문서. +인터넷이 열려 있는 턴에서 Python wheelhouse, 모델, 런타임 자산을 **공식 경로**에서만 수집하기 위한 작업 문서. ## Source catalog - `resources/online_sources.json` - - `tool_packages`: 다운로드 대상 pip 패키지 목록 - - `reference_urls`: 저장 대상 웹 레퍼런스 URL 목록 + - `wheelhouse`: `name` + `version`(고정)으로 수집할 Python 패키지 목록 + - `model_assets`: 모델 관련 자산 URL + `official_base` + - `runtime_assets`: 런타임 자산 URL + `official_base` ## Collector - `scripts/collect_online_assets.py` - - pip wheel 다운로드 시도 (`.online_assets/wheels`) - - 레퍼런스 HTML 다운로드 시도 (`.online_assets/references`) - - 결과 리포트 생성 (`.online_assets/meta/collection_report.json`) + - wheelhouse 다운로드 (`.online_assets/wheelhouse`) + - PyPI 공식 인덱스(`https://pypi.org/simple`)로만 다운로드 + - 모델 자산 다운로드 (`.online_assets/models`) + - 런타임 자산 다운로드 (`.online_assets/runtime`) + - SHA256 + 버전 고정 정보를 담은 manifest 생성 + - `.online_assets/meta/collection_manifest.json` + - 수집 리포트 생성 + - `.online_assets/meta/collection_report.json` + - 설치 가능(`installable`) / 불가(`blocked_or_failed`) / 보류(`defer`) 분리 ## Run ```bash diff --git a/resources/online_sources.json b/resources/online_sources.json index c9d1494..f2a5292 100644 --- a/resources/online_sources.json +++ b/resources/online_sources.json @@ -1,16 +1,33 @@ { - "tool_packages": [ - "matplotlib", - "pandas", - "jupyterlab", - "pytest", - "playwright" + "wheelhouse": [ + {"name": "matplotlib", "version": "3.9.2"}, + {"name": "pandas", "version": "2.2.3"}, + {"name": "jupyterlab", "version": "4.2.5"}, + {"name": "pytest", "version": "8.3.3"}, + {"name": "playwright", "version": "1.47.0"} ], - "reference_urls": [ - "https://www.w3.org/WAI/WCAG22/quickref/", - "https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA", - "https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API", - "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl", - "https://web.dev/articles/loading-patterns" + "model_assets": [ + { + "name": "ollama_install_script", + "url": "https://ollama.com/install.sh", + "official_base": "https://ollama.com/" + } + ], + "runtime_assets": [ + { + "name": "wcag_quickref", + "url": "https://www.w3.org/WAI/WCAG22/quickref/", + "official_base": "https://www.w3.org/WAI/" + }, + { + "name": "mdn_aria", + "url": "https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA", + "official_base": "https://developer.mozilla.org/en-US/docs/Web/" + }, + { + "name": "mdn_fetch_api", + "url": "https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API", + "official_base": "https://developer.mozilla.org/en-US/docs/Web/" + } ] } diff --git a/scripts/collect_online_assets.py b/scripts/collect_online_assets.py index 37e30a3..54f7753 100755 --- a/scripts/collect_online_assets.py +++ b/scripts/collect_online_assets.py @@ -1,10 +1,11 @@ from __future__ import annotations +import hashlib import json import re import subprocess import sys -from dataclasses import dataclass, asdict +from dataclasses import asdict, dataclass from datetime import datetime, timezone from pathlib import Path from urllib.error import HTTPError, URLError @@ -15,51 +16,198 @@ SOURCES_FILE = ROOT / "resources" / "online_sources.json" OUT_DIR = ROOT / ".online_assets" REF_DIR = OUT_DIR / "references" -WHEEL_DIR = OUT_DIR / "wheels" +WHEEL_DIR = OUT_DIR / "wheelhouse" +MODEL_DIR = OUT_DIR / "models" +RUNTIME_DIR = OUT_DIR / "runtime" META_DIR = OUT_DIR / "meta" +OFFICIAL_WHEEL_INDEX = "https://pypi.org/simple" + @dataclass -class DownloadResult: - target: str +class ManifestItem: category: str - ok: bool + name: str + version: str | None + source: str path: str | None + sha256: str | None + status: str detail: str +@dataclass +class CollectionReport: + created_at: str + source_file: str + installable: list[dict] + blocked_or_failed: list[dict] + defer: list[dict] + summary: dict[str, int] + + def _slug_from_url(url: str) -> str: parsed = urlparse(url) stem = (parsed.netloc + parsed.path).strip("/") or "index" stem = re.sub(r"[^a-zA-Z0-9._-]+", "_", stem) - return f"{stem}.html" + return stem + + +def _hash_sha256(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as f: + while True: + chunk = f.read(1024 * 1024) + if not chunk: + break + digest.update(chunk) + return digest.hexdigest() + + +def _is_official_asset_url(url: str, official_base: str) -> bool: + parsed = urlparse(url) + official = urlparse(official_base) + if parsed.scheme != "https": + return False + if parsed.netloc != official.netloc: + return False + return parsed.path.startswith(official.path) + + +def _extract_version_from_filename(filename: str) -> str | None: + # wheel: {dist}-{version}-...whl + if filename.endswith(".whl"): + parts = filename.split("-") + if len(parts) >= 2: + return parts[1] + # source dist: name-version.tar.gz | .zip + m = re.match(r".+-([0-9][A-Za-z0-9_.!-]*)\.(tar\.gz|zip)$", filename) + if m: + return m.group(1) + return None + + +def _download_wheel(pkg_name: str, version: str) -> list[ManifestItem]: + WHEEL_DIR.mkdir(parents=True, exist_ok=True) + spec = f"{pkg_name}=={version}" + before = {p.name for p in WHEEL_DIR.iterdir() if p.is_file()} + cmd = [ + sys.executable, + "-m", + "pip", + "download", + spec, + "-d", + str(WHEEL_DIR), + "--index-url", + OFFICIAL_WHEEL_INDEX, + ] + proc = subprocess.run(cmd, capture_output=True, text=True, check=False) + if proc.returncode != 0: + detail = proc.stderr.strip() or proc.stdout.strip() or "pip download failed" + return [ + ManifestItem( + category="wheelhouse", + name=pkg_name, + version=version, + source=OFFICIAL_WHEEL_INDEX, + path=None, + sha256=None, + status="defer", + detail=detail.splitlines()[-1][:280], + ) + ] + + added = [p for p in WHEEL_DIR.iterdir() if p.is_file() and p.name not in before] + if not added: + return [ + ManifestItem( + category="wheelhouse", + name=pkg_name, + version=version, + source=OFFICIAL_WHEEL_INDEX, + path=None, + sha256=None, + status="defer", + detail="download completed but no new files were added", + ) + ] + + results: list[ManifestItem] = [] + for file_path in sorted(added): + detected_version = _extract_version_from_filename(file_path.name) + results.append( + ManifestItem( + category="wheelhouse", + name=file_path.name, + version=detected_version or version, + source=OFFICIAL_WHEEL_INDEX, + path=str(file_path.relative_to(ROOT)), + sha256=_hash_sha256(file_path), + status="collected", + detail=f"requested={spec}", + ) + ) + return results + +def _download_asset(category: str, name: str, url: str, official_base: str, target_dir: Path) -> ManifestItem: + target_dir.mkdir(parents=True, exist_ok=True) + if not _is_official_asset_url(url, official_base): + return ManifestItem( + category=category, + name=name, + version=None, + source=url, + path=None, + sha256=None, + status="defer", + detail=f"blocked: non-official URL (official_base={official_base})", + ) + + suffix = Path(urlparse(url).path).suffix or ".bin" + out_path = target_dir / f"{_slug_from_url(url)}{suffix}" + req = Request(url, headers={"User-Agent": "bitnet-tools/official-collector"}) -def _fetch_reference(url: str) -> DownloadResult: - REF_DIR.mkdir(parents=True, exist_ok=True) - out_path = REF_DIR / _slug_from_url(url) - req = Request(url, headers={"User-Agent": "bitnet-tools/online-collector"}) try: - with urlopen(req, timeout=20) as resp: - body = resp.read() - out_path.write_bytes(body) - return DownloadResult(url, "reference", True, str(out_path.relative_to(ROOT)), "downloaded") + with urlopen(req, timeout=30) as resp: + final_url = resp.geturl() + if not _is_official_asset_url(final_url, official_base): + return ManifestItem( + category=category, + name=name, + version=None, + source=url, + path=None, + sha256=None, + status="defer", + detail=f"blocked: redirected to non-official URL ({final_url})", + ) + out_path.write_bytes(resp.read()) + return ManifestItem( + category=category, + name=name, + version=None, + source=url, + path=str(out_path.relative_to(ROOT)), + sha256=_hash_sha256(out_path), + status="collected", + detail="downloaded", + ) except HTTPError as exc: - return DownloadResult(url, "reference", False, None, f"http_error:{exc.code}") + return ManifestItem(category, name, None, url, None, None, "defer", f"http_error:{exc.code}") except URLError as exc: - return DownloadResult(url, "reference", False, None, f"url_error:{exc.reason}") + return ManifestItem(category, name, None, url, None, None, "defer", f"url_error:{exc.reason}") except Exception as exc: # pragma: no cover - return DownloadResult(url, "reference", False, None, f"error:{exc}") + return ManifestItem(category, name, None, url, None, None, "defer", f"error:{exc}") -def _download_wheel(pkg: str) -> DownloadResult: - WHEEL_DIR.mkdir(parents=True, exist_ok=True) - cmd = [sys.executable, "-m", "pip", "download", pkg, "-d", str(WHEEL_DIR)] - proc = subprocess.run(cmd, capture_output=True, text=True, check=False) - if proc.returncode == 0: - return DownloadResult(pkg, "tool_package", True, str(WHEEL_DIR.relative_to(ROOT)), "downloaded") - detail = proc.stderr.strip() or proc.stdout.strip() or "pip download failed" - return DownloadResult(pkg, "tool_package", False, None, detail.splitlines()[-1][:220]) +def _load_sources() -> dict: + data = json.loads(SOURCES_FILE.read_text(encoding="utf-8")) + data.setdefault("wheelhouse", []) + data.setdefault("model_assets", []) + data.setdefault("runtime_assets", []) + return data def main() -> int: @@ -67,38 +215,73 @@ def main() -> int: print(f"sources file not found: {SOURCES_FILE}", file=sys.stderr) return 1 - data = json.loads(SOURCES_FILE.read_text(encoding="utf-8")) - tool_packages: list[str] = list(data.get("tool_packages", [])) - reference_urls: list[str] = list(data.get("reference_urls", [])) + sources = _load_sources() OUT_DIR.mkdir(exist_ok=True) META_DIR.mkdir(parents=True, exist_ok=True) - results: list[DownloadResult] = [] + items: list[ManifestItem] = [] - for pkg in tool_packages: - print(f"[tool] {pkg}") - results.append(_download_wheel(pkg)) + for pkg in sources["wheelhouse"]: + name = pkg["name"] + version = pkg["version"] + print(f"[wheelhouse] {name}=={version}") + items.extend(_download_wheel(name, version)) - for url in reference_urls: - print(f"[ref] {url}") - results.append(_fetch_reference(url)) + for asset in sources["model_assets"]: + print(f"[model] {asset['name']}") + items.append( + _download_asset( + category="model_asset", + name=asset["name"], + url=asset["url"], + official_base=asset["official_base"], + target_dir=MODEL_DIR, + ) + ) - report = { + for asset in sources["runtime_assets"]: + print(f"[runtime] {asset['name']}") + items.append( + _download_asset( + category="runtime_asset", + name=asset["name"], + url=asset["url"], + official_base=asset["official_base"], + target_dir=RUNTIME_DIR, + ) + ) + + manifest_path = META_DIR / "collection_manifest.json" + manifest = { "created_at": datetime.now(timezone.utc).isoformat(), - "python": sys.version, "source_file": str(SOURCES_FILE.relative_to(ROOT)), - "results": [asdict(r) for r in results], - "summary": { - "total": len(results), - "success": sum(1 for r in results if r.ok), - "failed": sum(1 for r in results if not r.ok), - }, + "items": [asdict(item) for item in items], } + manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8") + + installable = [asdict(i) for i in items if i.status == "collected"] + blocked_or_failed = [asdict(i) for i in items if i.status != "collected"] + defer = [asdict(i) for i in items if i.status == "defer"] + + report = CollectionReport( + created_at=datetime.now(timezone.utc).isoformat(), + source_file=str(SOURCES_FILE.relative_to(ROOT)), + installable=installable, + blocked_or_failed=blocked_or_failed, + defer=defer, + summary={ + "total": len(items), + "installable": len(installable), + "blocked_or_failed": len(blocked_or_failed), + "defer": len(defer), + }, + ) + report_path = META_DIR / "collection_report.json" + report_path.write_text(json.dumps(asdict(report), ensure_ascii=False, indent=2), encoding="utf-8") - out = META_DIR / "collection_report.json" - out.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8") - print(f"report saved: {out.relative_to(ROOT)}") + print(f"manifest saved: {manifest_path.relative_to(ROOT)}") + print(f"report saved: {report_path.relative_to(ROOT)}") return 0 diff --git a/tests/test_collect_online_assets.py b/tests/test_collect_online_assets.py new file mode 100644 index 0000000..24473d1 --- /dev/null +++ b/tests/test_collect_online_assets.py @@ -0,0 +1,32 @@ +import sys +from importlib.util import module_from_spec, spec_from_file_location +from pathlib import Path + + +MODULE_PATH = Path(__file__).resolve().parent.parent / "scripts" / "collect_online_assets.py" +SPEC = spec_from_file_location("collect_online_assets", MODULE_PATH) +collector = module_from_spec(SPEC) +assert SPEC and SPEC.loader +sys.modules[SPEC.name] = collector +SPEC.loader.exec_module(collector) + + +def test_is_official_asset_url_allows_only_https_same_base(): + assert collector._is_official_asset_url( + "https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API", + "https://developer.mozilla.org/en-US/docs/Web/", + ) + assert not collector._is_official_asset_url( + "http://developer.mozilla.org/en-US/docs/Web/API/Fetch_API", + "https://developer.mozilla.org/en-US/docs/Web/", + ) + assert not collector._is_official_asset_url( + "https://example.com/en-US/docs/Web/API/Fetch_API", + "https://developer.mozilla.org/en-US/docs/Web/", + ) + + +def test_extract_version_from_filename(): + assert collector._extract_version_from_filename("pandas-2.2.3-cp311-cp311-manylinux.whl") == "2.2.3" + assert collector._extract_version_from_filename("pytest-8.3.3.tar.gz") == "8.3.3" + assert collector._extract_version_from_filename("unknown.txt") is None