From a2564ec5200154239a369341fe165c36b9b8039f Mon Sep 17 00:00:00 2001
From: FlacSy <flacsy.x@gmail.com>
Date: Tue, 3 Mar 2026 04:50:56 +0100
Subject: [PATCH] Add ruff: Makefile targets (lint, format, format-fix,
 lint-fix), dev deps, apply format

---
 Makefile                     |  18 ++++++-
 ml/prepare_data.py           |  68 ++++++++++++++++++------
 ml/quantize_model.py         |   4 +-
 ml/test_inference.py         |   9 ++--
 pyproject.toml               |   2 +-
 python/badwords/ml/_paths.py |  12 +++--
 python/pyproject.toml        |   2 +-
 scripts/bench_compare.py     | 100 ++++++++++++++++++++++++++---------
 8 files changed, 163 insertions(+), 52 deletions(-)

diff --git a/Makefile b/Makefile
index 7620481..dcc4d7d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: develop build test test-rust test-python test-wasm bench bench-rust bench-python bench-compare wasm wasm-nodejs npm-publish lang-packages npm-publish-languages
+.PHONY: develop build test test-rust test-python test-wasm bench bench-rust bench-python bench-compare lint lint-fix format format-fix wasm wasm-nodejs npm-publish lang-packages npm-publish-languages
 
 develop:
 	cd python && maturin develop
@@ -38,6 +38,22 @@ bench-python:
 	@if [ -d .venv ]; then .venv/bin/python -m pytest tests/bench_filter.py -v --benchmark-only; \
 	else python3 -m pytest tests/bench_filter.py -v --benchmark-only; fi
 
+# Ruff: lint (check only)
+lint:
+	@if [ -d .venv ]; then .venv/bin/ruff check .; else ruff check .; fi
+
+# Ruff: format check only -- reports files that would be reformatted, modifies nothing (CI)
+format:
+	@if [ -d .venv ]; then .venv/bin/ruff format --check .; else ruff format --check .; fi
+
+# Ruff: format fix (apply formatting)
+format-fix:
+	@if [ -d .venv ]; then .venv/bin/ruff format .; else ruff format .; fi
+
+# Ruff: lint with auto-fix
+lint-fix:
+	@if [ -d .venv ]; then .venv/bin/ruff check . --fix; else ruff check . --fix; fi
+
 # WebAssembly build for browser
 wasm:
 	cd rust/badwords-wasm && wasm-pack build --target web --out-dir pkg
diff --git a/ml/prepare_data.py b/ml/prepare_data.py
index 3444e7a..f5c511e 100644
--- a/ml/prepare_data.py
+++ b/ml/prepare_data.py
@@ -10,7 +10,14 @@
 import pandas as pd
 from datasets import load_dataset
 
-TOXIC_COLUMNS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
+TOXIC_COLUMNS = [
+    "toxic",
+    "severe_toxic",
+    "obscene",
+    "threat",
+    "insult",
+    "identity_hate",
+]
 TEXT_COLUMN = "comment_text"
 OUTPUT_DIR = Path(__file__).parent / "data" / "processed"
 
@@ -46,21 +53,39 @@ def load_single(
     if label_source == "paradetox":
         # toxic = 1, neutral/detox = 0
         input_col = next(
-            (c for c in [
-                "input", "source", "toxic",
-                "en_toxic_comment", "ru_toxic_comment", "toxic_sentence",
-            ] if c in df.columns),
+            (
+                c
+                for c in [
+                    "input",
+                    "source",
+                    "toxic",
+                    "en_toxic_comment",
+                    "ru_toxic_comment",
+                    "toxic_sentence",
+                ]
+                if c in df.columns
+            ),
             None,
         )
         output_col = next(
-            (c for c in [
-                "output", "target", "detox",
-                "en_neutral_comment", "ru_neutral_comment", "neutral_sentence",
-            ] if c in df.columns),
+            (
+                c
+                for c in [
+                    "output",
+                    "target",
+                    "detox",
+                    "en_neutral_comment",
+                    "ru_neutral_comment",
+                    "neutral_sentence",
+                ]
+                if c in df.columns
+            ),
             None,
         )
         if not input_col or not output_col:
-            raise ValueError(f"ParaDetox format needs toxic/neutral columns. Columns: {list(df.columns)}")
+            raise ValueError(
+                f"ParaDetox format needs toxic/neutral columns. Columns: {list(df.columns)}"
+            )
         toxic_df = df[[input_col]].rename(columns={input_col: TEXT_COLUMN})
         toxic_df["label"] = 1
         clean_df = df[[output_col]].rename(columns={output_col: TEXT_COLUMN})
@@ -68,7 +93,11 @@ def load_single(
         df = pd.concat([toxic_df, clean_df], ignore_index=True)
     else:
         text_col = text_col or next(
-            (c for c in ["comment_text", "text", "comment", "sentence", "content"] if c in df.columns),
+            (
+                c
+                for c in ["comment_text", "text", "comment", "sentence", "content"]
+                if c in df.columns
+            ),
             None,
         )
         if not text_col:
@@ -81,7 +110,9 @@ def load_single(
             # civil_comments: toxicity 0-1, threshold 0.5
             tox_col = next((c for c in ["toxicity", "toxic"] if c in df.columns), None)
             if not tox_col:
-                raise ValueError(f"Toxicity column not found. Columns: {list(df.columns)}")
+                raise ValueError(
+                    f"Toxicity column not found. Columns: {list(df.columns)}"
+                )
             df["label"] = (df[tox_col].fillna(0) >= 0.5).astype(int)
         elif label_source.startswith("toxic"):
             toxic_cols = [c for c in TOXIC_COLUMNS if c in df.columns]
@@ -132,7 +163,10 @@ def load_multilingual(max_samples_per_dataset: int | None = None) -> pd.DataFram
 
     # English + Russian + multilingual paradetox
     for name, (ds, _, src) in DATASET_PRESETS.items():
-        if name in ("paradetox", "ru_paradetox", "multilingual_paradetox") and src == "paradetox":
+        if (
+            name in ("paradetox", "ru_paradetox", "multilingual_paradetox")
+            and src == "paradetox"
+        ):
             try:
                 df = load_single(ds, src, None, max_samples_per_dataset, 3, 512)
                 dfs.append(df)
@@ -144,7 +178,9 @@ def load_multilingual(max_samples_per_dataset: int | None = None) -> pd.DataFram
     return pd.concat(dfs, ignore_index=True).drop_duplicates(subset=[TEXT_COLUMN])
 
 
-def balance(df: pd.DataFrame, ratio: float = 0.3, max_total: int | None = None) -> pd.DataFrame:
+def balance(
+    df: pd.DataFrame, ratio: float = 0.3, max_total: int | None = None
+) -> pd.DataFrame:
     """Balance classes. ratio = fraction of positive samples. max_total caps result size."""
     pos = df[df["label"] == 1]
     neg = df[df["label"] == 0]
@@ -213,7 +249,9 @@ def main() -> None:
         ds_name, text_col, label_src = DATASET_PRESETS[args.preset]
         df = load_single(ds_name, label_src, text_col, args.max_samples, 3, 512)
 
-    print(f"Total: {len(df)} samples, {df['label'].sum()} positive ({df['label'].mean():.2%})")
+    print(
+        f"Total: {len(df)} samples, {df['label'].sum()} positive ({df['label'].mean():.2%})"
+    )
 
     df_balanced = balance(df, ratio=args.positive_ratio, max_total=args.max_total)
     print(f"Balanced: {len(df_balanced)} samples")
diff --git a/ml/quantize_model.py b/ml/quantize_model.py
index ff27704..3fa4505 100644
--- a/ml/quantize_model.py
+++ b/ml/quantize_model.py
@@ -58,7 +58,9 @@ def main() -> None:
             target.unlink()
             shutil.copy(quant_path, target)
             new_size = target.stat().st_size
-            print(f"Done: {orig_size / 1e6:.1f} MB -> {new_size / 1e6:.1f} MB ({100 * new_size / orig_size:.0f}%)")
+            print(
+                f"Done: {orig_size / 1e6:.1f} MB -> {new_size / 1e6:.1f} MB ({100 * new_size / orig_size:.0f}%)"
+            )
 
 
 if __name__ == "__main__":
diff --git a/ml/test_inference.py b/ml/test_inference.py
index 4db7f96..cdc3f0c 100644
--- a/ml/test_inference.py
+++ b/ml/test_inference.py
@@ -13,7 +13,6 @@
 
 # Expected: 1=toxic, 0=clean
 TEST_CASES = [
-
     ("Поздравзяю теперь ты не тупой", 1),
 ]
 
@@ -29,9 +28,7 @@ def predict(model, tokenizer, text: str) -> float:
 def main() -> None:
     print("Loading model...")
     model = ORTModelForSequenceClassification.from_pretrained(str(MODELS_DIR))
-    tokenizer = AutoTokenizer.from_pretrained(
-        str(MODELS_DIR), fix_mistral_regex=True
-    )
+    tokenizer = AutoTokenizer.from_pretrained(str(MODELS_DIR), fix_mistral_regex=True)
 
     print("\n" + "=" * 70)
     print("Toxicity scores (1.0 = toxic, 0.5 threshold)")
@@ -49,7 +46,9 @@ def main() -> None:
         print(f"  {prob:.3f} [{label:5}] {ok} (exp: {exp_str})  {text!r}")
 
     print("=" * 70)
-    print(f"Accuracy: {correct}/{len(TEST_CASES)} ({100 * correct / len(TEST_CASES):.0f}%)")
+    print(
+        f"Accuracy: {correct}/{len(TEST_CASES)} ({100 * correct / len(TEST_CASES):.0f}%)"
+    )
     print("Note: evasion (leetspeak, spacing), indirect RU insults often missed.")
     print("=" * 70)
 
diff --git a/pyproject.toml b/pyproject.toml
index 8f16142..f7f929d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ classifiers = [
 ]
 
 [project.optional-dependencies]
-dev = ["pytest>=7.0", "pytest-benchmark>=4.0"]
+dev = ["pytest>=7.0", "pytest-benchmark>=4.0", "ruff>=0.4"]
 ml = ["onnxruntime>=1.16", "optimum[onnxruntime]>=1.14", "transformers>=4.36"]
 
 [project.urls]
diff --git a/python/badwords/ml/_paths.py b/python/badwords/ml/_paths.py
index 248cc65..464d18e 100644
--- a/python/badwords/ml/_paths.py
+++ b/python/badwords/ml/_paths.py
@@ -62,13 +62,19 @@ def _download_model(cache_dir: Path) -> None:
     zip_path = cache_dir / ASSET_NAME
 
     # Get latest release
-    api_url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/releases/latest"
-    req = urllib.request.Request(api_url, headers={"Accept": "application/vnd.github+json"})
+    api_url = (
+        f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/releases/latest"
+    )
+    req = urllib.request.Request(
+        api_url, headers={"Accept": "application/vnd.github+json"}
+    )
     with urllib.request.urlopen(req, timeout=30) as r:
         release = json.loads(r.read().decode())
 
     # Find asset
-    asset = next((a for a in release.get("assets", []) if a["name"] == ASSET_NAME), None)
+    asset = next(
+        (a for a in release.get("assets", []) if a["name"] == ASSET_NAME), None
+    )
     if not asset:
         raise FileNotFoundError(
             f"Asset {ASSET_NAME} not found in release {release.get('tag_name', '?')}. "
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 96101e7..af596bf 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -32,7 +32,7 @@ classifiers = [
 ]
 
 [project.optional-dependencies]
-dev = ["pytest>=7.0", "pytest-benchmark>=4.0"]
+dev = ["pytest>=7.0", "pytest-benchmark>=4.0", "ruff>=0.4"]
 ml = ["onnxruntime>=1.16", "optimum[onnxruntime]>=1.14", "transformers>=4.36"]
 
 [project.urls]
diff --git a/scripts/bench_compare.py b/scripts/bench_compare.py
index 8635bca..4251b38 100644
--- a/scripts/bench_compare.py
+++ b/scripts/bench_compare.py
@@ -9,6 +9,7 @@
 try:
     from importlib.metadata import version
 except ImportError:
+
     def version(pkg: str) -> str:
         return "unknown"
 
@@ -69,18 +70,24 @@ def main() -> None:
     print(f"Python:         {sys.version.split()[0]}")
     print(f"glin-profanity: {glin_ver}")
     print(f"Iterations:     BadWords {n_bw:,}  |  glin {n_glin:,}")
-    print(f"Text lengths:   clean {len(texts_clean)} chars  |  bad {len(texts_bad)} chars  |  batch {batch_total_len} chars (5 texts)")
+    print(
+        f"Text lengths:   clean {len(texts_clean)} chars  |  bad {len(texts_bad)} chars  |  batch {batch_total_len} chars (5 texts)"
+    )
     print("-" * 55)
 
     # Clean
     bw_clean = bench(bw.filter_text, texts_clean, iterations=n_bw)
     glin_clean = bench(glin.is_profane, texts_clean, iterations=n_glin)
-    print(f"Clean text:     BadWords {bw_clean:>7.2f} µs ({_throughput(bw_clean)})  |  glin {glin_clean:>7.2f} µs ({_throughput(glin_clean)})")
+    print(
+        f"Clean text:     BadWords {bw_clean:>7.2f} µs ({_throughput(bw_clean)})  |  glin {glin_clean:>7.2f} µs ({_throughput(glin_clean)})"
+    )
 
     # Bad
     bw_bad = bench(bw.filter_text, texts_bad, iterations=n_bw)
     glin_bad = bench(glin.is_profane, texts_bad, iterations=n_glin)
-    print(f"Bad word:       BadWords {bw_bad:>7.2f} µs ({_throughput(bw_bad)})  |  glin {glin_bad:>7.2f} µs ({_throughput(glin_bad)})")
+    print(
+        f"Bad word:       BadWords {bw_bad:>7.2f} µs ({_throughput(bw_bad)})  |  glin {glin_bad:>7.2f} µs ({_throughput(glin_bad)})"
+    )
 
     # Censor
     def bw_censor():
@@ -91,7 +98,9 @@ def glin_censor_fn():
 
     bw_c = bench(bw_censor, iterations=n_bw)
     glin_c = bench(glin_censor_fn, iterations=n_glin)
-    print(f"Censor:         BadWords {bw_c:>7.2f} µs ({_throughput(bw_c)})  |  glin {glin_c:>7.2f} µs ({_throughput(glin_c)})")
+    print(
+        f"Censor:         BadWords {bw_c:>7.2f} µs ({_throughput(bw_c)})  |  glin {glin_c:>7.2f} µs ({_throughput(glin_c)})"
+    )
 
     # Batch
     def bw_batch():
@@ -104,7 +113,9 @@ def glin_batch():
 
     bw_b = bench(bw_batch, iterations=n_bw)
     glin_b = bench(glin_batch, iterations=n_glin)
-    print(f"5 texts batch:  BadWords {bw_b:>7.2f} µs ({_throughput(bw_b, 5)})  |  glin {glin_b:>7.2f} µs ({_throughput(glin_b, 5)})")
+    print(
+        f"5 texts batch:  BadWords {bw_b:>7.2f} µs ({_throughput(bw_b, 5)})  |  glin {glin_b:>7.2f} µs ({_throughput(glin_b, 5)})"
+    )
 
     print("-" * 55)
     print()
@@ -112,6 +123,7 @@ def glin_batch():
     # --- ML benchmarks ---
     n_ml = 100  # ML is much slower
     import os
+
     _prev_hf = os.environ.pop("HF_HUB_DISABLE_PROGRESS_BARS", None)
     os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
 
@@ -130,12 +142,14 @@ def glin_batch():
     try:
         from glin_profanity.ml import HybridFilter
 
-        glin_ml_light = HybridFilter({
-            "languages": ["english", "russian"],
-            "enable_ml": True,
-            "ml_type": "lightweight",
-            "preload_ml": True,
-        })
+        glin_ml_light = HybridFilter(
+            {
+                "languages": ["english", "russian"],
+                "enable_ml": True,
+                "ml_type": "lightweight",
+                "preload_ml": True,
+            }
+        )
         if not glin_ml_light.is_ml_ready():
             glin_ml_light = None
     except Exception:
@@ -146,12 +160,14 @@ def glin_batch():
     try:
         from glin_profanity.ml import HybridFilter
 
-        glin_ml_trans = HybridFilter({
-            "languages": ["english", "russian"],
-            "enable_ml": True,
-            "ml_type": "transformer",
-            "preload_ml": True,
-        })
+        glin_ml_trans = HybridFilter(
+            {
+                "languages": ["english", "russian"],
+                "enable_ml": True,
+                "ml_type": "transformer",
+                "preload_ml": True,
+            }
+        )
         if not glin_ml_trans.is_ml_ready():
             glin_ml_trans = None
     except Exception:
@@ -166,8 +182,12 @@ def glin_batch():
             from optimum.onnxruntime import ORTModelForSequenceClassification
             from transformers import AutoTokenizer
 
-            bw_ml_model = ORTModelForSequenceClassification.from_pretrained(str(ml_models_dir))
-            bw_ml_tok = AutoTokenizer.from_pretrained(str(ml_models_dir), fix_mistral_regex=True)
+            bw_ml_model = ORTModelForSequenceClassification.from_pretrained(
+                str(ml_models_dir)
+            )
+            bw_ml_tok = AutoTokenizer.from_pretrained(
+                str(ml_models_dir), fix_mistral_regex=True
+            )
         except Exception:
             pass
 
@@ -181,42 +201,72 @@ def _run_ml_bench(backend: str, fn, scenario: str) -> None:
 
     # Clean
     if glin_ml_light:
-        _run_ml_bench("glin_light", lambda: glin_ml_light.check_profanity_hybrid(texts_clean), "Clean text")
+        _run_ml_bench(
+            "glin_light",
+            lambda: glin_ml_light.check_profanity_hybrid(texts_clean),
+            "Clean text",
+        )
     if glin_ml_trans:
-        _run_ml_bench("glin_trans", lambda: glin_ml_trans.check_profanity_hybrid(texts_clean), "Clean text")
+        _run_ml_bench(
+            "glin_trans",
+            lambda: glin_ml_trans.check_profanity_hybrid(texts_clean),
+            "Clean text",
+        )
     if bw_ml_model is not None and bw_ml_tok is not None:
+
         def _bw_clean():
-            inp = bw_ml_tok(texts_clean, return_tensors="pt", truncation=True, max_length=128)
+            inp = bw_ml_tok(
+                texts_clean, return_tensors="pt", truncation=True, max_length=128
+            )
             bw_ml_model(**inp).logits.softmax(dim=-1)[0, 1].item()
+
         _run_ml_bench("bw", _bw_clean, "Clean text")
 
     # Bad word
     if glin_ml_light:
-        _run_ml_bench("glin_light", lambda: glin_ml_light.check_profanity_hybrid(texts_bad), "Bad word")
+        _run_ml_bench(
+            "glin_light",
+            lambda: glin_ml_light.check_profanity_hybrid(texts_bad),
+            "Bad word",
+        )
     if glin_ml_trans:
-        _run_ml_bench("glin_trans", lambda: glin_ml_trans.check_profanity_hybrid(texts_bad), "Bad word")
+        _run_ml_bench(
+            "glin_trans",
+            lambda: glin_ml_trans.check_profanity_hybrid(texts_bad),
+            "Bad word",
+        )
     if bw_ml_model is not None and bw_ml_tok is not None:
+
         def _bw_bad():
-            inp = bw_ml_tok(texts_bad, return_tensors="pt", truncation=True, max_length=128)
+            inp = bw_ml_tok(
+                texts_bad, return_tensors="pt", truncation=True, max_length=128
+            )
             bw_ml_model(**inp).logits.softmax(dim=-1)[0, 1].item()
+
         _run_ml_bench("bw", _bw_bad, "Bad word")
 
     # Batch (5 texts)
     if glin_ml_light:
+
         def _glin_light_batch():
             for t in texts_batch:
                 glin_ml_light.check_profanity_hybrid(t)
+
         _run_ml_bench("glin_light", _glin_light_batch, "5 texts batch")
     if glin_ml_trans:
+
         def _glin_trans_batch():
             for t in texts_batch:
                 glin_ml_trans.check_profanity_hybrid(t)
+
         _run_ml_bench("glin_trans", _glin_trans_batch, "5 texts batch")
     if bw_ml_model is not None and bw_ml_tok is not None:
+
         def _bw_batch():
             for t in texts_batch:
                 inp = bw_ml_tok(t, return_tensors="pt", truncation=True, max_length=128)
                 bw_ml_model(**inp).logits.softmax(dim=-1)[0, 1].item()
+
         _run_ml_bench("bw", _bw_batch, "5 texts batch")
 
     # Print ML results table