From a2564ec5200154239a369341fe165c36b9b8039f Mon Sep 17 00:00:00 2001 From: FlacSy Date: Tue, 3 Mar 2026 04:50:56 +0100 Subject: [PATCH] Add ruff: Makefile targets (lint, format, format-fix, lint-fix), dev deps, apply format --- Makefile | 18 ++++++- ml/prepare_data.py | 68 ++++++++++++++++++------ ml/quantize_model.py | 4 +- ml/test_inference.py | 9 ++-- pyproject.toml | 2 +- python/badwords/ml/_paths.py | 12 +++-- python/pyproject.toml | 2 +- scripts/bench_compare.py | 100 ++++++++++++++++++++++++++--------- 8 files changed, 163 insertions(+), 52 deletions(-) diff --git a/Makefile b/Makefile index 7620481..dcc4d7d 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: develop build test test-rust test-python test-wasm bench bench-rust bench-python bench-compare wasm wasm-nodejs npm-publish lang-packages npm-publish-languages +.PHONY: develop build test test-rust test-python test-wasm bench bench-rust bench-python bench-compare lint lint-fix format format-fix wasm wasm-nodejs npm-publish lang-packages npm-publish-languages develop: cd python && maturin develop @@ -38,6 +38,22 @@ bench-python: @if [ -d .venv ]; then .venv/bin/python -m pytest tests/bench_filter.py -v --benchmark-only; \ else python3 -m pytest tests/bench_filter.py -v --benchmark-only; fi +# Ruff: lint (check only) +lint: + @if [ -d .venv ]; then .venv/bin/ruff check .; else ruff check .; fi + +# Ruff: format check (CI) +format: + @if [ -d .venv ]; then .venv/bin/ruff format --check .; else ruff format --check .; fi + +# Ruff: format fix (apply formatting) +format-fix: + @if [ -d .venv ]; then .venv/bin/ruff format .; else ruff format .; fi + +# Ruff: lint with auto-fix +lint-fix: + @if [ -d .venv ]; then .venv/bin/ruff check . --fix; else ruff check . --fix; fi + # WebAssembly build for browser wasm: cd rust/badwords-wasm && wasm-pack build --target web --out-dir pkg diff --git a/ml/prepare_data.py b/ml/prepare_data.py index 3444e7a..f5c511e 100644 --- a/ml/prepare_data.py +++ b/ml/prepare_data.py @@ -10,7 +10,14 @@ import pandas as pd from datasets import load_dataset -TOXIC_COLUMNS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"] +TOXIC_COLUMNS = [ + "toxic", + "severe_toxic", + "obscene", + "threat", + "insult", + "identity_hate", +] TEXT_COLUMN = "comment_text" OUTPUT_DIR = Path(__file__).parent / "data" / "processed" @@ -46,21 +53,39 @@ def load_single( if label_source == "paradetox": # toxic = 1, neutral/detox = 0 input_col = next( - (c for c in [ - "input", "source", "toxic", - "en_toxic_comment", "ru_toxic_comment", "toxic_sentence", - ] if c in df.columns), + ( + c + for c in [ + "input", + "source", + "toxic", + "en_toxic_comment", + "ru_toxic_comment", + "toxic_sentence", + ] + if c in df.columns + ), None, ) output_col = next( - (c for c in [ - "output", "target", "detox", - "en_neutral_comment", "ru_neutral_comment", "neutral_sentence", - ] if c in df.columns), + ( + c + for c in [ + "output", + "target", + "detox", + "en_neutral_comment", + "ru_neutral_comment", + "neutral_sentence", + ] + if c in df.columns + ), None, ) if not input_col or not output_col: - raise ValueError(f"ParaDetox format needs toxic/neutral columns. Columns: {list(df.columns)}") + raise ValueError( + f"ParaDetox format needs toxic/neutral columns. Columns: {list(df.columns)}" + ) toxic_df = df[[input_col]].rename(columns={input_col: TEXT_COLUMN}) toxic_df["label"] = 1 clean_df = df[[output_col]].rename(columns={output_col: TEXT_COLUMN}) @@ -68,7 +93,11 @@ def load_single( df = pd.concat([toxic_df, clean_df], ignore_index=True) else: text_col = text_col or next( - (c for c in ["comment_text", "text", "comment", "sentence", "content"] if c in df.columns), + ( + c + for c in ["comment_text", "text", "comment", "sentence", "content"] + if c in df.columns + ), None, ) if not text_col: @@ -81,7 +110,9 @@ def load_single( # civil_comments: toxicity 0-1, threshold 0.5 tox_col = next((c for c in ["toxicity", "toxic"] if c in df.columns), None) if not tox_col: - raise ValueError(f"Toxicity column not found. Columns: {list(df.columns)}") + raise ValueError( + f"Toxicity column not found. Columns: {list(df.columns)}" + ) df["label"] = (df[tox_col].fillna(0) >= 0.5).astype(int) elif label_source.startswith("toxic"): toxic_cols = [c for c in TOXIC_COLUMNS if c in df.columns] @@ -132,7 +163,10 @@ def load_multilingual(max_samples_per_dataset: int | None = None) -> pd.DataFram # English + Russian + multilingual paradetox for name, (ds, _, src) in DATASET_PRESETS.items(): - if name in ("paradetox", "ru_paradetox", "multilingual_paradetox") and src == "paradetox": + if ( + name in ("paradetox", "ru_paradetox", "multilingual_paradetox") + and src == "paradetox" + ): try: df = load_single(ds, src, None, max_samples_per_dataset, 3, 512) dfs.append(df) @@ -144,7 +178,9 @@ def load_multilingual(max_samples_per_dataset: int | None = None) -> pd.DataFram return pd.concat(dfs, ignore_index=True).drop_duplicates(subset=[TEXT_COLUMN]) -def balance(df: pd.DataFrame, ratio: float = 0.3, max_total: int | None = None) -> pd.DataFrame: +def balance( + df: pd.DataFrame, ratio: float = 0.3, max_total: int | None = None +) -> pd.DataFrame: """Balance classes. ratio = fraction of positive samples. max_total caps result size.""" pos = df[df["label"] == 1] neg = df[df["label"] == 0] @@ -213,7 +249,9 @@ def main() -> None: ds_name, text_col, label_src = DATASET_PRESETS[args.preset] df = load_single(ds_name, label_src, text_col, args.max_samples, 3, 512) - print(f"Total: {len(df)} samples, {df['label'].sum()} positive ({df['label'].mean():.2%})") + print( + f"Total: {len(df)} samples, {df['label'].sum()} positive ({df['label'].mean():.2%})" + ) df_balanced = balance(df, ratio=args.positive_ratio, max_total=args.max_total) print(f"Balanced: {len(df_balanced)} samples") diff --git a/ml/quantize_model.py b/ml/quantize_model.py index ff27704..3fa4505 100644 --- a/ml/quantize_model.py +++ b/ml/quantize_model.py @@ -58,7 +58,9 @@ def main() -> None: target.unlink() shutil.copy(quant_path, target) new_size = target.stat().st_size - print(f"Done: {orig_size / 1e6:.1f} MB -> {new_size / 1e6:.1f} MB ({100 * new_size / orig_size:.0f}%)") + print( + f"Done: {orig_size / 1e6:.1f} MB -> {new_size / 1e6:.1f} MB ({100 * new_size / orig_size:.0f}%)" + ) if __name__ == "__main__": diff --git a/ml/test_inference.py b/ml/test_inference.py index 4db7f96..cdc3f0c 100644 --- a/ml/test_inference.py +++ b/ml/test_inference.py @@ -13,7 +13,6 @@ # Expected: 1=toxic, 0=clean TEST_CASES = [ - ("Поздравзяю теперь ты не тупой", 1), ] @@ -29,9 +28,7 @@ def predict(model, tokenizer, text: str) -> float: def main() -> None: print("Loading model...") model = ORTModelForSequenceClassification.from_pretrained(str(MODELS_DIR)) - tokenizer = AutoTokenizer.from_pretrained( - str(MODELS_DIR), fix_mistral_regex=True - ) + tokenizer = AutoTokenizer.from_pretrained(str(MODELS_DIR), fix_mistral_regex=True) print("\n" + "=" * 70) print("Toxicity scores (1.0 = toxic, 0.5 threshold)") @@ -49,7 +46,9 @@ def main() -> None: print(f" {prob:.3f} [{label:5}] {ok} (exp: {exp_str}) {text!r}") print("=" * 70) - print(f"Accuracy: {correct}/{len(TEST_CASES)} ({100 * correct / len(TEST_CASES):.0f}%)") + print( + f"Accuracy: {correct}/{len(TEST_CASES)} ({100 * correct / len(TEST_CASES):.0f}%)" + ) print("Note: evasion (leetspeak, spacing), indirect RU insults often missed.") print("=" * 70) diff --git a/pyproject.toml b/pyproject.toml index 8f16142..f7f929d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ classifiers = [ ] [project.optional-dependencies] -dev = ["pytest>=7.0", "pytest-benchmark>=4.0"] +dev = ["pytest>=7.0", "pytest-benchmark>=4.0", "ruff>=0.4"] ml = ["onnxruntime>=1.16", "optimum[onnxruntime]>=1.14", "transformers>=4.36"] [project.urls] diff --git a/python/badwords/ml/_paths.py b/python/badwords/ml/_paths.py index 248cc65..464d18e 100644 --- a/python/badwords/ml/_paths.py +++ b/python/badwords/ml/_paths.py @@ -62,13 +62,19 @@ def _download_model(cache_dir: Path) -> None: zip_path = cache_dir / ASSET_NAME # Get latest release - api_url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/releases/latest" - req = urllib.request.Request(api_url, headers={"Accept": "application/vnd.github+json"}) + api_url = ( + f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/releases/latest" + ) + req = urllib.request.Request( + api_url, headers={"Accept": "application/vnd.github+json"} + ) with urllib.request.urlopen(req, timeout=30) as r: release = json.loads(r.read().decode()) # Find asset - asset = next((a for a in release.get("assets", []) if a["name"] == ASSET_NAME), None) + asset = next( + (a for a in release.get("assets", []) if a["name"] == ASSET_NAME), None + ) if not asset: raise FileNotFoundError( f"Asset {ASSET_NAME} not found in release {release.get('tag_name', '?')}. " diff --git a/python/pyproject.toml b/python/pyproject.toml index 96101e7..af596bf 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -32,7 +32,7 @@ classifiers = [ ] [project.optional-dependencies] -dev = ["pytest>=7.0", "pytest-benchmark>=4.0"] +dev = ["pytest>=7.0", "pytest-benchmark>=4.0", "ruff>=0.4"] ml = ["onnxruntime>=1.16", "optimum[onnxruntime]>=1.14", "transformers>=4.36"] [project.urls] diff --git a/scripts/bench_compare.py b/scripts/bench_compare.py index 8635bca..4251b38 100644 --- a/scripts/bench_compare.py +++ b/scripts/bench_compare.py @@ -9,6 +9,7 @@ try: from importlib.metadata import version except ImportError: + def version(pkg: str) -> str: return "unknown" @@ -69,18 +70,24 @@ def main() -> None: print(f"Python: {sys.version.split()[0]}") print(f"glin-profanity: {glin_ver}") print(f"Iterations: BadWords {n_bw:,} | glin {n_glin:,}") - print(f"Text lengths: clean {len(texts_clean)} chars | bad {len(texts_bad)} chars | batch {batch_total_len} chars (5 texts)") + print( + f"Text lengths: clean {len(texts_clean)} chars | bad {len(texts_bad)} chars | batch {batch_total_len} chars (5 texts)" + ) print("-" * 55) # Clean bw_clean = bench(bw.filter_text, texts_clean, iterations=n_bw) glin_clean = bench(glin.is_profane, texts_clean, iterations=n_glin) - print(f"Clean text: BadWords {bw_clean:>7.2f} µs ({_throughput(bw_clean)}) | glin {glin_clean:>7.2f} µs ({_throughput(glin_clean)})") + print( + f"Clean text: BadWords {bw_clean:>7.2f} µs ({_throughput(bw_clean)}) | glin {glin_clean:>7.2f} µs ({_throughput(glin_clean)})" + ) # Bad bw_bad = bench(bw.filter_text, texts_bad, iterations=n_bw) glin_bad = bench(glin.is_profane, texts_bad, iterations=n_glin) - print(f"Bad word: BadWords {bw_bad:>7.2f} µs ({_throughput(bw_bad)}) | glin {glin_bad:>7.2f} µs ({_throughput(glin_bad)})") + print( + f"Bad word: BadWords {bw_bad:>7.2f} µs ({_throughput(bw_bad)}) | glin {glin_bad:>7.2f} µs ({_throughput(glin_bad)})" + ) # Censor def bw_censor(): @@ -91,7 +98,9 @@ def glin_censor_fn(): bw_c = bench(bw_censor, iterations=n_bw) glin_c = bench(glin_censor_fn, iterations=n_glin) - print(f"Censor: BadWords {bw_c:>7.2f} µs ({_throughput(bw_c)}) | glin {glin_c:>7.2f} µs ({_throughput(glin_c)})") + print( + f"Censor: BadWords {bw_c:>7.2f} µs ({_throughput(bw_c)}) | glin {glin_c:>7.2f} µs ({_throughput(glin_c)})" + ) # Batch def bw_batch(): @@ -104,7 +113,9 @@ def glin_batch(): bw_b = bench(bw_batch, iterations=n_bw) glin_b = bench(glin_batch, iterations=n_glin) - print(f"5 texts batch: BadWords {bw_b:>7.2f} µs ({_throughput(bw_b, 5)}) | glin {glin_b:>7.2f} µs ({_throughput(glin_b, 5)})") + print( + f"5 texts batch: BadWords {bw_b:>7.2f} µs ({_throughput(bw_b, 5)}) | glin {glin_b:>7.2f} µs ({_throughput(glin_b, 5)})" + ) print("-" * 55) print() @@ -112,6 +123,7 @@ def glin_batch(): # --- ML benchmarks --- n_ml = 100 # ML is much slower import os + _prev_hf = os.environ.pop("HF_HUB_DISABLE_PROGRESS_BARS", None) os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1" @@ -130,12 +142,14 @@ def glin_batch(): try: from glin_profanity.ml import HybridFilter - glin_ml_light = HybridFilter({ - "languages": ["english", "russian"], - "enable_ml": True, - "ml_type": "lightweight", - "preload_ml": True, - }) + glin_ml_light = HybridFilter( + { + "languages": ["english", "russian"], + "enable_ml": True, + "ml_type": "lightweight", + "preload_ml": True, + } + ) if not glin_ml_light.is_ml_ready(): glin_ml_light = None except Exception: @@ -146,12 +160,14 @@ def glin_batch(): try: from glin_profanity.ml import HybridFilter - glin_ml_trans = HybridFilter({ - "languages": ["english", "russian"], - "enable_ml": True, - "ml_type": "transformer", - "preload_ml": True, - }) + glin_ml_trans = HybridFilter( + { + "languages": ["english", "russian"], + "enable_ml": True, + "ml_type": "transformer", + "preload_ml": True, + } + ) if not glin_ml_trans.is_ml_ready(): glin_ml_trans = None except Exception: @@ -166,8 +182,12 @@ def glin_batch(): from optimum.onnxruntime import ORTModelForSequenceClassification from transformers import AutoTokenizer - bw_ml_model = ORTModelForSequenceClassification.from_pretrained(str(ml_models_dir)) - bw_ml_tok = AutoTokenizer.from_pretrained(str(ml_models_dir), fix_mistral_regex=True) + bw_ml_model = ORTModelForSequenceClassification.from_pretrained( + str(ml_models_dir) + ) + bw_ml_tok = AutoTokenizer.from_pretrained( + str(ml_models_dir), fix_mistral_regex=True + ) except Exception: pass @@ -181,42 +201,72 @@ def _run_ml_bench(backend: str, fn, scenario: str) -> None: # Clean if glin_ml_light: - _run_ml_bench("glin_light", lambda: glin_ml_light.check_profanity_hybrid(texts_clean), "Clean text") + _run_ml_bench( + "glin_light", + lambda: glin_ml_light.check_profanity_hybrid(texts_clean), + "Clean text", + ) if glin_ml_trans: - _run_ml_bench("glin_trans", lambda: glin_ml_trans.check_profanity_hybrid(texts_clean), "Clean text") + _run_ml_bench( + "glin_trans", + lambda: glin_ml_trans.check_profanity_hybrid(texts_clean), + "Clean text", + ) if bw_ml_model is not None and bw_ml_tok is not None: + def _bw_clean(): - inp = bw_ml_tok(texts_clean, return_tensors="pt", truncation=True, max_length=128) + inp = bw_ml_tok( + texts_clean, return_tensors="pt", truncation=True, max_length=128 + ) bw_ml_model(**inp).logits.softmax(dim=-1)[0, 1].item() + _run_ml_bench("bw", _bw_clean, "Clean text") # Bad word if glin_ml_light: - _run_ml_bench("glin_light", lambda: glin_ml_light.check_profanity_hybrid(texts_bad), "Bad word") + _run_ml_bench( + "glin_light", + lambda: glin_ml_light.check_profanity_hybrid(texts_bad), + "Bad word", + ) if glin_ml_trans: - _run_ml_bench("glin_trans", lambda: glin_ml_trans.check_profanity_hybrid(texts_bad), "Bad word") + _run_ml_bench( + "glin_trans", + lambda: glin_ml_trans.check_profanity_hybrid(texts_bad), + "Bad word", + ) if bw_ml_model is not None and bw_ml_tok is not None: + def _bw_bad(): - inp = bw_ml_tok(texts_bad, return_tensors="pt", truncation=True, max_length=128) + inp = bw_ml_tok( + texts_bad, return_tensors="pt", truncation=True, max_length=128 + ) bw_ml_model(**inp).logits.softmax(dim=-1)[0, 1].item() + _run_ml_bench("bw", _bw_bad, "Bad word") # Batch (5 texts) if glin_ml_light: + def _glin_light_batch(): for t in texts_batch: glin_ml_light.check_profanity_hybrid(t) + _run_ml_bench("glin_light", _glin_light_batch, "5 texts batch") if glin_ml_trans: + def _glin_trans_batch(): for t in texts_batch: glin_ml_trans.check_profanity_hybrid(t) + _run_ml_bench("glin_trans", _glin_trans_batch, "5 texts batch") if bw_ml_model is not None and bw_ml_tok is not None: + def _bw_batch(): for t in texts_batch: inp = bw_ml_tok(t, return_tensors="pt", truncation=True, max_length=128) bw_ml_model(**inp).logits.softmax(dim=-1)[0, 1].item() + _run_ml_bench("bw", _bw_batch, "5 texts batch") # Print ML results table