From a210a0a005dedba2ebed47ab8e4b055832518412 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:45:23 +0000 Subject: [PATCH 1/2] Merge overlapping tools and update README Co-authored-by: ypriverol <52113+ypriverol@users.noreply.github.com> Agent-Logs-Url: https://github.com/OpenMS/agentomics/sessions/a6d63691-a178-4fa1-b118-46b4dbb887ca --- README.md | 13 +- .../isotope_pattern_analyzer/README.md | 83 +++ .../isotope_pattern_analyzer.py | 415 +++++++++++++++ .../requirements.txt | 1 - .../tests/conftest.py | 0 .../tests/test_isotope_pattern_analyzer.py | 218 ++++++++ .../isotope_pattern_fit_scorer/README.md | 16 - .../isotope_pattern_fit_scorer.py | 258 ---------- .../requirements.txt | 2 - .../tests/test_isotope_pattern_fit_scorer.py | 111 ----- .../isotope_pattern_matcher/README.md | 11 - .../isotope_pattern_matcher.py | 147 ------ .../tests/test_isotope_pattern_matcher.py | 50 -- .../isotope_pattern_scorer.py | 154 ------ .../isotope_pattern_scorer/tests/conftest.py | 15 - .../tests/test_isotope_pattern_scorer.py | 53 -- .../mgf_mzml_converter/README.md | 50 ++ .../mgf_mzml_converter/mgf_mzml_converter.py | 471 ++++++++++++++++++ .../mgf_mzml_converter}/requirements.txt | 1 - .../mgf_mzml_converter}/tests/conftest.py | 0 .../tests/test_mgf_mzml_converter.py | 300 +++++++++++ .../mgf_to_mzml_converter/README.md | 15 - .../mgf_to_mzml_converter.py | 110 ---- .../mgf_to_mzml_converter/requirements.txt | 2 - .../mgf_to_mzml_converter/tests/conftest.py | 15 - .../tests/test_mgf_to_mzml_converter.py | 101 ---- .../mzml_to_mgf_converter/README.md | 19 - .../mzml_to_mgf_converter.py | 214 -------- .../mzml_to_mgf_converter/requirements.txt | 2 - .../mzml_to_mgf_converter/tests/conftest.py | 15 - .../tests/test_mzml_to_mgf_converter.py | 186 ------- 31 files changed, 1542 insertions(+), 1506 deletions(-) create mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/README.md create mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/isotope_pattern_analyzer.py rename tools/metabolomics/spectral_analysis/{isotope_pattern_matcher => isotope_pattern_analyzer}/requirements.txt (60%) rename tools/metabolomics/spectral_analysis/{isotope_pattern_fit_scorer => isotope_pattern_analyzer}/tests/conftest.py (100%) create mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/tests/test_isotope_pattern_analyzer.py delete mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/README.md delete mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/isotope_pattern_fit_scorer.py delete mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/requirements.txt delete mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/tests/test_isotope_pattern_fit_scorer.py delete mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_matcher/README.md delete mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_matcher/isotope_pattern_matcher.py delete mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_matcher/tests/test_isotope_pattern_matcher.py delete mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_scorer/isotope_pattern_scorer.py delete mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_scorer/tests/conftest.py delete mode 100644 tools/metabolomics/spectral_analysis/isotope_pattern_scorer/tests/test_isotope_pattern_scorer.py create mode 100644 tools/proteomics/file_conversion/mgf_mzml_converter/README.md create mode 100644 tools/proteomics/file_conversion/mgf_mzml_converter/mgf_mzml_converter.py rename tools/{metabolomics/spectral_analysis/isotope_pattern_scorer => proteomics/file_conversion/mgf_mzml_converter}/requirements.txt (60%) rename tools/{metabolomics/spectral_analysis/isotope_pattern_matcher => proteomics/file_conversion/mgf_mzml_converter}/tests/conftest.py (100%) create mode 100644 tools/proteomics/file_conversion/mgf_mzml_converter/tests/test_mgf_mzml_converter.py delete mode 100644 tools/proteomics/file_conversion/mgf_to_mzml_converter/README.md delete mode 100644 tools/proteomics/file_conversion/mgf_to_mzml_converter/mgf_to_mzml_converter.py delete mode 100644 tools/proteomics/file_conversion/mgf_to_mzml_converter/requirements.txt delete mode 100644 tools/proteomics/file_conversion/mgf_to_mzml_converter/tests/conftest.py delete mode 100644 tools/proteomics/file_conversion/mgf_to_mzml_converter/tests/test_mgf_to_mzml_converter.py delete mode 100644 tools/proteomics/file_conversion/mzml_to_mgf_converter/README.md delete mode 100644 tools/proteomics/file_conversion/mzml_to_mgf_converter/mzml_to_mgf_converter.py delete mode 100644 tools/proteomics/file_conversion/mzml_to_mgf_converter/requirements.txt delete mode 100644 tools/proteomics/file_conversion/mzml_to_mgf_converter/tests/conftest.py delete mode 100644 tools/proteomics/file_conversion/mzml_to_mgf_converter/tests/test_mzml_to_mgf_converter.py diff --git a/README.md b/README.md index 0d0b861..6d4e05d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Agentomics -A growing collection of **123 standalone CLI tools** built with [pyopenms](https://pyopenms.readthedocs.io/) for proteomics and metabolomics workflows. Every tool in this repository fills a gap not covered by existing OpenMS TOPP tools — small, focused utilities that researchers need daily but typically write as throwaway scripts. +A growing collection of **118 standalone CLI tools** built with [pyopenms](https://pyopenms.readthedocs.io/) for proteomics and metabolomics workflows. Every tool in this repository fills a gap not covered by existing OpenMS TOPP tools — small, focused utilities that researchers need daily but typically write as throwaway scripts. ## Why This Exists @@ -155,12 +155,11 @@ Both `ruff` and `pytest` must pass with zero errors. | [`fasta_in_silico_digest_stats`](tools/proteomics/fasta_utils/fasta_in_silico_digest_stats/) | Digest a FASTA and report peptide-level statistics | | [`fasta_taxonomy_splitter`](tools/proteomics/fasta_utils/fasta_taxonomy_splitter/) | Split multi-organism FASTA by taxonomy from headers | -#### File Conversion (8 tools) +#### File Conversion (7 tools) | Tool | Description | |------|-------------| -| [`mzml_to_mgf_converter`](tools/proteomics/file_conversion/mzml_to_mgf_converter/) | Convert MS2 spectra from mzML to MGF format | -| [`mgf_to_mzml_converter`](tools/proteomics/file_conversion/mgf_to_mzml_converter/) | Convert MGF files to mzML format | +| [`mgf_mzml_converter`](tools/proteomics/file_conversion/mgf_mzml_converter/) | Bidirectional MGF ↔ mzML converter with spectrum filtering (merged from `mgf_to_mzml_converter` + `mzml_to_mgf_converter`) | | [`consensus_map_to_matrix`](tools/proteomics/file_conversion/consensus_map_to_matrix/) | Convert consensusXML to flat quantification matrix | | [`idxml_to_tsv_exporter`](tools/proteomics/file_conversion/idxml_to_tsv_exporter/) | Export idXML identification results to flat TSV | | [`ms_data_to_csv_exporter`](tools/proteomics/file_conversion/ms_data_to_csv_exporter/) | Export mzML/featureXML data to CSV with column selection | @@ -281,15 +280,13 @@ Both `ruff` and `pytest` must pass with zero errors. | [`mass_defect_filter`](tools/metabolomics/feature_processing/mass_defect_filter/) | Filter features by mass defect and Kendrick mass defect | | [`metabolite_feature_detection`](tools/metabolomics/feature_processing/metabolite_feature_detection/) | Metabolite feature detection from LC-MS data | -#### Spectral Analysis (6 tools) +#### Spectral Analysis (4 tools) | Tool | Description | |------|-------------| | [`spectral_entropy_scorer`](tools/metabolomics/spectral_analysis/spectral_entropy_scorer/) | Compute spectral entropy similarity (Li & Fiehn 2021) | | [`neutral_loss_scanner`](tools/metabolomics/spectral_analysis/neutral_loss_scanner/) | Scan MS2 spectra for characteristic neutral losses | -| [`isotope_pattern_scorer`](tools/metabolomics/spectral_analysis/isotope_pattern_scorer/) | Score observed vs. theoretical isotope patterns | -| [`isotope_pattern_matcher`](tools/metabolomics/spectral_analysis/isotope_pattern_matcher/) | Generate theoretical isotope distributions and cosine similarity scoring | -| [`isotope_pattern_fit_scorer`](tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/) | Score isotope pattern fit, detect Cl/Br from M+2 enhancement | +| [`isotope_pattern_analyzer`](tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/) | Generate theoretical isotope distributions, cosine similarity scoring, Da/ppm tolerance, Cl/Br halogen detection (merged from `isotope_pattern_matcher` + `isotope_pattern_scorer` + `isotope_pattern_fit_scorer`) | | [`massql_query_tool`](tools/metabolomics/spectral_analysis/massql_query_tool/) | Query mzML data using MassQL-like syntax | #### Compound Annotation (4 tools) diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/README.md b/tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/README.md new file mode 100644 index 0000000..c4b4f44 --- /dev/null +++ b/tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/README.md @@ -0,0 +1,83 @@ +# Isotope Pattern Analyzer + +Generate theoretical isotope distributions for molecular formulas, score observed +isotope patterns using cosine similarity, and detect halogenation (Cl/Br). + +This tool consolidates `isotope_pattern_matcher`, `isotope_pattern_scorer`, and +`isotope_pattern_fit_scorer` into a single, improved utility. + +## Features + +- Theoretical isotope pattern generation via pyopenms `CoarseIsotopePatternGenerator` +- Cosine similarity scoring between observed and theoretical patterns +- **Da or ppm m/z tolerance** — choose your preferred unit +- Halogen (Cl/Br) detection from M+2 peak enhancement +- JSON output with per-peak detail +- Terminal bar-chart preview of the theoretical distribution +- Optional numpy acceleration for cosine computation + +## Installation + +```bash +pip install pyopenms +``` + +## CLI Usage + +```bash +# Generate and display the isotope pattern for glucose +python isotope_pattern_analyzer.py --formula C6H12O6 + +# Score observed peaks against the formula (colon-separated format) +python isotope_pattern_analyzer.py --formula C6H12O6 \ + --observed "180.063:100,181.067:6.5,182.070:0.5" \ + --output result.json + +# Use legacy comma-separated format (one --peaks flag per peak) +python isotope_pattern_analyzer.py --formula C6H12O6 \ + --peaks 180.063,100.0 --peaks 181.067,6.5 \ + --output result.json + +# Use ppm tolerance +python isotope_pattern_analyzer.py --formula C6H12O6 \ + --observed "180.063:100,181.067:6.5" \ + --tolerance 10 --tolerance-unit ppm + +# Detect halogenation (chlorinated compound example) +python isotope_pattern_analyzer.py --formula C6H5Cl \ + --observed "112.007:100,113.011:5.5,114.004:33.0" \ + --output halogen_result.json +``` + +## Output JSON Structure + +```json +{ + "formula": "C6H12O6", + "cosine_similarity": 0.9987, + "n_peaks_compared": 3, + "tolerance": 0.05, + "tolerance_unit": "da", + "peaks": [ + {"peak_index": 0, "obs_mz": 180.063, "theo_mz": 180.0634, "obs_intensity": 100.0, "theo_intensity": 100.0}, + ... + ], + "theoretical_pattern": [...], + "halogen_detection": { + "m2_ratio_observed": 0.5, + "m2_ratio_theoretical": 0.42, + "m2_excess": 0.08, + "halogen_flag": false, + "possible_halogen": "none" + } +} +``` + +## Halogen Detection Thresholds + +| M+2 excess above theoretical | Interpretation | +|------------------------------|---------------------------------| +| < 10 % | No halogenation detected | +| 10–20 % | Cl (weak signal) | +| 20–70 % | Cl | +| > 70 % | Br | diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/isotope_pattern_analyzer.py b/tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/isotope_pattern_analyzer.py new file mode 100644 index 0000000..a29de8e --- /dev/null +++ b/tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/isotope_pattern_analyzer.py @@ -0,0 +1,415 @@ +""" +Isotope Pattern Analyzer +======================== +Generate theoretical isotope distributions for molecular formulas, score observed +isotope patterns against theoretical ones using cosine similarity, and detect +halogenation (Cl/Br) from M+2 peak enhancement. + +This tool consolidates the functionality of isotope_pattern_matcher, +isotope_pattern_scorer, and isotope_pattern_fit_scorer into a single, +more capable utility. + +Features +-------- +- Theoretical isotope pattern generation via pyopenms CoarseIsotopePatternGenerator +- Cosine similarity scoring between observed and theoretical patterns +- Da or ppm m/z tolerance modes +- Halogen (Cl/Br) detection from M+2 peak enhancement +- JSON output with per-peak detail +- Bar-chart preview in the terminal + +Usage +----- + # Generate isotope pattern for glucose + python isotope_pattern_analyzer.py --formula C6H12O6 + + # Score observed peaks (mz:intensity format) against formula + python isotope_pattern_analyzer.py --formula C6H12O6 \\ + --observed "180.063:100,181.067:6.5,182.070:0.5" \\ + --output result.json + + # Score using comma-separated mz,intensity pairs (legacy format) + python isotope_pattern_analyzer.py --formula C6H12O6 \\ + --peaks 180.063,100.0 --peaks 181.067,6.5 \\ + --output result.json + + # Use ppm tolerance + python isotope_pattern_analyzer.py --formula C6H12O6 \\ + --observed "180.063:100,181.067:6.5" --tolerance 10 --tolerance-unit ppm +""" + +import json +import math +import sys + +import click + +try: + import pyopenms as oms +except ImportError: + sys.exit("pyopenms is required. Install it with: pip install pyopenms") + +try: + import numpy as np + + _HAS_NUMPY = True +except ImportError: + _HAS_NUMPY = False + + +def get_theoretical_pattern(formula: str, max_isotopes: int = 6) -> list[tuple[float, float]]: + """Compute the theoretical isotope distribution for a molecular formula. + + Parameters + ---------- + formula: + Empirical formula string, e.g. ``"C6H12O6"``. + max_isotopes: + Maximum number of isotope peaks to return (default: 6). + + Returns + ------- + list of (mz, relative_abundance) + Sorted by m/z; relative abundances normalized so the maximum is 100. + """ + ef = oms.EmpiricalFormula(formula) + iso_dist = ef.getIsotopeDistribution(oms.CoarseIsotopePatternGenerator(max_isotopes)) + container = iso_dist.getContainer() + peaks = [(p.getMZ(), p.getIntensity()) for p in container] + if not peaks: + return [] + max_int = max(p[1] for p in peaks) + if max_int == 0: + return peaks + return [(mz, intensity / max_int * 100.0) for mz, intensity in peaks] + + +def _mz_tolerance_da(mz: float, tolerance: float, unit: str) -> float: + """Convert tolerance to Da, supporting 'da' and 'ppm' units. + + Parameters + ---------- + mz: + Reference m/z value (only used when unit is 'ppm'). + tolerance: + Tolerance value. + unit: + Either ``'da'`` or ``'ppm'``. + + Returns + ------- + float + Tolerance in Daltons. + """ + if unit == "ppm": + return mz * tolerance * 1e-6 + return tolerance + + +def cosine_similarity( + observed: list[tuple[float, float]], + theoretical: list[tuple[float, float]], + tolerance: float = 0.05, + tolerance_unit: str = "da", +) -> float: + """Compute cosine similarity between observed and theoretical isotope patterns. + + Each theoretical peak is matched to the closest observed peak within the + specified tolerance. Unmatched theoretical peaks contribute zero to the + dot product. + + Parameters + ---------- + observed: + List of (mz, intensity) for observed peaks. + theoretical: + List of (mz, intensity) for theoretical peaks. + tolerance: + m/z tolerance value. + tolerance_unit: + ``'da'`` for Daltons (default) or ``'ppm'`` for parts per million. + + Returns + ------- + float + Cosine similarity score in [0, 1]. + """ + if not observed or not theoretical: + return 0.0 + + if _HAS_NUMPY: + obs_mz = np.array([p[0] for p in observed]) + obs_int = np.array([p[1] for p in observed], dtype=float) + theo_int = np.array([p[1] for p in theoretical], dtype=float) + + matched_obs = np.zeros(len(theoretical)) + for i, (tmz, _) in enumerate(theoretical): + tol_da = _mz_tolerance_da(tmz, tolerance, tolerance_unit) + diffs = np.abs(obs_mz - tmz) + mask = diffs <= tol_da + if mask.any(): + matched_obs[i] = obs_int[np.argmin(diffs)] + + dot = float(np.dot(theo_int, matched_obs)) + norm_t = float(np.linalg.norm(theo_int)) + norm_o = float(np.linalg.norm(matched_obs)) + else: + matched_obs = [] + for tmz, _ in theoretical: + tol_da = _mz_tolerance_da(tmz, tolerance, tolerance_unit) + best = 0.0 + for omz, oint in observed: + if abs(omz - tmz) <= tol_da: + best = oint + break + matched_obs.append(best) + + dot = sum(t * o for (_, t), o in zip(theoretical, matched_obs)) + norm_t = math.sqrt(sum(t ** 2 for _, t in theoretical)) + norm_o = math.sqrt(sum(o ** 2 for o in matched_obs)) + + if norm_t == 0 or norm_o == 0: + return 0.0 + return dot / (norm_t * norm_o) + + +def detect_halogenation( + observed: list[tuple[float, float]], + theoretical: list[tuple[float, float]], +) -> dict: + """Detect Cl/Br halogenation from an enhanced M+2 isotope peak. + + Chlorine has a natural Cl-37 / Cl-35 ratio of ~32.5%, giving an M+2 + peak ~32.5% of M. Bromine has an almost 1:1 Br-79/Br-81 ratio, giving + an M+2 peak ~97% of M. If the observed M+2 significantly exceeds the + theoretical value, a halogen is flagged. + + Parameters + ---------- + observed: + Observed isotope pattern as (mz, intensity) list. + theoretical: + Theoretical isotope pattern (without explicit halogens) as (mz, + intensity) list. + + Returns + ------- + dict + Keys: m2_ratio_observed, m2_ratio_theoretical, m2_excess, + halogen_flag, possible_halogen. + """ + result: dict = { + "m2_ratio_observed": None, + "m2_ratio_theoretical": None, + "m2_excess": None, + "halogen_flag": False, + "possible_halogen": "none", + } + if len(observed) < 3 or len(theoretical) < 3: + return result + + obs_m0 = observed[0][1] + obs_m2 = observed[2][1] + theo_m0 = max(theoretical[0][1], 1e-10) + theo_m2 = theoretical[2][1] + + if obs_m0 == 0: + return result + + obs_ratio = obs_m2 / obs_m0 * 100.0 + theo_ratio = theo_m2 / theo_m0 * 100.0 + excess = obs_ratio - theo_ratio + + result["m2_ratio_observed"] = round(obs_ratio, 2) + result["m2_ratio_theoretical"] = round(theo_ratio, 2) + result["m2_excess"] = round(excess, 2) + + if excess > 10.0: + result["halogen_flag"] = True + if excess > 70.0: + result["possible_halogen"] = "Br" + elif excess > 20.0: + result["possible_halogen"] = "Cl" + else: + result["possible_halogen"] = "Cl (weak signal)" + + return result + + +def parse_observed(observed_str: str) -> list[tuple[float, float]]: + """Parse an observed peak string into (mz, intensity) pairs. + + Accepts both ``"mz:intensity"`` (colon-separated) and + ``"mz,intensity"`` (comma-separated, one pair per call) formats. + Multiple peaks are separated by commas in colon format. + + Parameters + ---------- + observed_str: + String of the form ``"180.063:100,181.067:6.5"`` (colon-separated + pairs) or a single ``"180.063,100.0"`` (comma-separated pair). + + Returns + ------- + list of (mz, intensity) + + Raises + ------ + ValueError + If a token cannot be parsed. + """ + peaks = [] + # If colons are present, assume "mz:intensity,mz:intensity,..." format + if ":" in observed_str: + for token in observed_str.split(","): + token = token.strip() + if not token: + continue + parts = token.split(":") + if len(parts) != 2: + raise ValueError(f"Expected 'mz:intensity', got: {token!r}") + peaks.append((float(parts[0].strip()), float(parts[1].strip()))) + else: + # Single "mz,intensity" pair (legacy --peaks flag style) + parts = observed_str.split(",") + if len(parts) != 2: + raise ValueError(f"Expected 'mz,intensity', got: {observed_str!r}") + peaks.append((float(parts[0].strip()), float(parts[1].strip()))) + return peaks + + +def score_pattern( + observed: list[tuple[float, float]], + formula: str, + max_isotopes: int = 6, + tolerance: float = 0.05, + tolerance_unit: str = "da", +) -> dict: + """Score an observed isotope pattern against a theoretical one. + + Parameters + ---------- + observed: + List of (mz, intensity) observed peaks. + formula: + Molecular formula string. + max_isotopes: + Maximum number of theoretical isotope peaks to generate. + tolerance: + m/z tolerance value. + tolerance_unit: + ``'da'`` or ``'ppm'``. + + Returns + ------- + dict + Contains formula, cosine_similarity, per-peak detail, and + halogen detection results. + """ + theoretical = get_theoretical_pattern(formula, max_isotopes) + cos_sim = cosine_similarity(observed, theoretical, tolerance, tolerance_unit) + halogen = detect_halogenation(observed, theoretical) + + n = min(len(observed), len(theoretical)) + peak_detail = [] + for i in range(n): + peak_detail.append({ + "peak_index": i, + "obs_mz": round(observed[i][0], 6), + "theo_mz": round(theoretical[i][0], 6), + "obs_intensity": round(observed[i][1], 4), + "theo_intensity": round(theoretical[i][1], 4), + }) + + return { + "formula": formula, + "cosine_similarity": round(cos_sim, 6), + "n_peaks_compared": n, + "tolerance": tolerance, + "tolerance_unit": tolerance_unit, + "peaks": peak_detail, + "theoretical_pattern": [{"mz": round(mz, 6), "intensity": round(ab, 4)} for mz, ab in theoretical], + "halogen_detection": halogen, + } + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +@click.command( + help="Generate theoretical isotope patterns and optionally score them against observed peaks." +) +@click.option("--formula", required=True, help="Molecular formula (e.g. C6H12O6)") +@click.option("--max-isotopes", type=int, default=6, help="Max isotope peaks to generate (default: 6)") +@click.option( + "--observed", default=None, + help="Observed peaks as 'mz:int,mz:int,...' (colon-separated pairs).", +) +@click.option( + "--peaks", multiple=True, + help="Observed peaks as 'mz,intensity' pairs (repeatable, legacy format).", +) +@click.option("--tolerance", type=float, default=0.05, help="m/z tolerance (default: 0.05)") +@click.option( + "--tolerance-unit", type=click.Choice(["da", "ppm"]), default="da", + help="Tolerance unit: 'da' (Daltons, default) or 'ppm'.", +) +@click.option("--output", default=None, help="Output JSON file (optional)") +def main(formula, max_isotopes, observed, peaks, tolerance, tolerance_unit, output): + """CLI entry point.""" + theoretical = get_theoretical_pattern(formula, max_isotopes) + if not theoretical: + print("Could not compute isotope distribution for the given formula.") + return + + # Print theoretical pattern + print(f"Theoretical isotope pattern for {formula}:") + print(f"\n{'Peak':>5} {'m/z':>12} {'Rel. Abundance (%)':>18}") + print("-" * 42) + for i, (mz, rel_ab) in enumerate(theoretical): + bar = "#" * int(rel_ab / 5) + print(f" M+{i} {mz:>12.4f} {rel_ab:>6.2f} % {bar}") + + # Collect observed peaks + obs_peaks: list[tuple[float, float]] = [] + if observed: + obs_peaks = parse_observed(observed) + elif peaks: + for p in peaks: + obs_peaks.extend(parse_observed(p)) + + if obs_peaks: + result = score_pattern(obs_peaks, formula, max_isotopes, tolerance, tolerance_unit) + cos_sim = result["cosine_similarity"] + print(f"\nCosine similarity ({tolerance} {tolerance_unit}): {cos_sim:.4f}") + if cos_sim >= 0.9: + print(" ✓ Excellent match (≥ 0.90)") + elif cos_sim >= 0.7: + print(" ~ Good match (≥ 0.70)") + else: + print(" ✗ Poor match (< 0.70)") + + hal = result["halogen_detection"] + if hal["halogen_flag"]: + print(f"\nHalogen detected: {hal['possible_halogen']}") + print(f" M+2 observed: {hal['m2_ratio_observed']}% | theoretical: {hal['m2_ratio_theoretical']}%") + + if output: + with open(output, "w") as fh: + json.dump(result, fh, indent=2) + print(f"\nDetailed results written to {output}") + elif output: + # No observed peaks but output requested — write theoretical pattern only + data = { + "formula": formula, + "theoretical_pattern": [{"mz": round(mz, 6), "intensity": round(ab, 4)} for mz, ab in theoretical], + } + with open(output, "w") as fh: + json.dump(data, fh, indent=2) + print(f"\nTheoretical pattern written to {output}") + + +if __name__ == "__main__": + main() diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_matcher/requirements.txt b/tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/requirements.txt similarity index 60% rename from tools/metabolomics/spectral_analysis/isotope_pattern_matcher/requirements.txt rename to tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/requirements.txt index 18d6bbb..7ce28ec 100644 --- a/tools/metabolomics/spectral_analysis/isotope_pattern_matcher/requirements.txt +++ b/tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/requirements.txt @@ -1,2 +1 @@ pyopenms -click diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/tests/conftest.py b/tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/tests/conftest.py similarity index 100% rename from tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/tests/conftest.py rename to tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/tests/conftest.py diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/tests/test_isotope_pattern_analyzer.py b/tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/tests/test_isotope_pattern_analyzer.py new file mode 100644 index 0000000..2b046d0 --- /dev/null +++ b/tools/metabolomics/spectral_analysis/isotope_pattern_analyzer/tests/test_isotope_pattern_analyzer.py @@ -0,0 +1,218 @@ +"""Tests for isotope_pattern_analyzer.""" + +import json +import os +import tempfile + +import pytest +from conftest import requires_pyopenms + + +@requires_pyopenms +class TestGetTheoreticalPattern: + def test_glucose_pattern(self): + from isotope_pattern_analyzer import get_theoretical_pattern + + pattern = get_theoretical_pattern("C6H12O6", max_isotopes=3) + assert len(pattern) == 3 + # First peak should be the most abundant (normalized to 100) + assert pattern[0][1] == pytest.approx(100.0) + # Subsequent peaks should be smaller + assert pattern[1][1] < 100.0 + + def test_sucrose_max_isotopes(self): + from isotope_pattern_analyzer import get_theoretical_pattern + + pattern = get_theoretical_pattern("C12H22O11", max_isotopes=4) + assert len(pattern) == 4 + + def test_mz_ordering(self): + from isotope_pattern_analyzer import get_theoretical_pattern + + pattern = get_theoretical_pattern("C12H22O11", max_isotopes=4) + mzs = [mz for mz, _ in pattern] + assert mzs == sorted(mzs) + + def test_empty_formula_raises(self): + from isotope_pattern_analyzer import get_theoretical_pattern + + # Empty formula should return empty list (pyopenms may raise or return empty) + try: + result = get_theoretical_pattern("", max_isotopes=3) + assert result == [] + except Exception: + pass # acceptable + + +@requires_pyopenms +class TestCosineSimilarity: + def test_perfect_match(self): + from isotope_pattern_analyzer import cosine_similarity + + peaks = [(180.0, 100.0), (181.0, 6.5), (182.0, 0.5)] + sim = cosine_similarity(peaks, peaks, tolerance=0.05) + assert abs(sim - 1.0) < 1e-6 + + def test_no_overlap(self): + from isotope_pattern_analyzer import cosine_similarity + + obs = [(180.0, 100.0), (181.0, 6.5)] + theo = [(300.0, 100.0), (301.0, 6.5)] + sim = cosine_similarity(obs, theo, tolerance=0.05) + assert sim == 0.0 + + def test_partial_overlap(self): + from isotope_pattern_analyzer import cosine_similarity + + obs = [(180.0, 100.0), (181.0, 6.5), (182.0, 50.0)] # Enhanced M+2 + theo = [(180.0, 100.0), (181.0, 6.5), (182.0, 0.5)] + sim = cosine_similarity(obs, theo, tolerance=0.05) + assert 0.0 < sim < 1.0 + + def test_empty_returns_zero(self): + from isotope_pattern_analyzer import cosine_similarity + + assert cosine_similarity([], [(180.0, 100.0)]) == 0.0 + assert cosine_similarity([(180.0, 100.0)], []) == 0.0 + + def test_ppm_tolerance(self): + from isotope_pattern_analyzer import cosine_similarity + + # At 180 Da, 10 ppm = 0.0018 Da + peaks = [(180.001, 100.0), (181.002, 6.5)] + theo = [(180.0, 100.0), (181.0, 6.5)] + sim_ppm = cosine_similarity(peaks, theo, tolerance=10.0, tolerance_unit="ppm") + sim_da = cosine_similarity(peaks, theo, tolerance=0.01, tolerance_unit="da") + # Both should give reasonably similar results since 10 ppm ≈ 0.0018 Da at 180 Da + assert sim_ppm > 0.9 + assert sim_da > 0.9 + + +@requires_pyopenms +class TestDetectHalogenation: + def test_no_halogen(self): + from isotope_pattern_analyzer import detect_halogenation + + obs = [(180.0, 100.0), (181.0, 6.5), (182.0, 0.5)] + theo = [(180.0, 100.0), (181.0, 6.5), (182.0, 0.5)] + result = detect_halogenation(obs, theo) + assert result["halogen_flag"] is False + assert result["possible_halogen"] == "none" + + def test_chlorine_detection(self): + from isotope_pattern_analyzer import detect_halogenation + + # Chlorine: M+2 ~32.5% of M+0 + obs = [(180.0, 100.0), (181.0, 6.5), (182.0, 35.0)] + theo = [(180.0, 100.0), (181.0, 6.5), (182.0, 0.5)] + result = detect_halogenation(obs, theo) + assert result["halogen_flag"] is True + assert "Cl" in result["possible_halogen"] + + def test_bromine_detection(self): + from isotope_pattern_analyzer import detect_halogenation + + # Bromine: M+2 ~97% of M+0 + obs = [(180.0, 100.0), (181.0, 6.5), (182.0, 98.0)] + theo = [(180.0, 100.0), (181.0, 6.5), (182.0, 0.5)] + result = detect_halogenation(obs, theo) + assert result["halogen_flag"] is True + assert result["possible_halogen"] == "Br" + + def test_insufficient_peaks(self): + from isotope_pattern_analyzer import detect_halogenation + + obs = [(180.0, 100.0), (181.0, 6.5)] # only 2 peaks + theo = [(180.0, 100.0), (181.0, 6.5)] + result = detect_halogenation(obs, theo) + assert result["halogen_flag"] is False + assert result["m2_ratio_observed"] is None + + +@requires_pyopenms +class TestParseObserved: + def test_colon_format(self): + from isotope_pattern_analyzer import parse_observed + + peaks = parse_observed("180.063:100,181.067:6.5,182.070:0.5") + assert len(peaks) == 3 + assert peaks[0] == (180.063, 100.0) + assert peaks[2] == (182.070, 0.5) + + def test_comma_format_single(self): + from isotope_pattern_analyzer import parse_observed + + peaks = parse_observed("181.0709,100.0") + assert len(peaks) == 1 + assert peaks[0] == (181.0709, 100.0) + + def test_whitespace_stripped(self): + from isotope_pattern_analyzer import parse_observed + + peaks = parse_observed("180.063 : 100 , 181.067 : 6.5") + assert len(peaks) == 2 + + def test_invalid_format_raises(self): + from isotope_pattern_analyzer import parse_observed + + with pytest.raises(ValueError): + parse_observed("notavalue") + + +@requires_pyopenms +class TestScorePattern: + def test_basic_scoring(self): + from isotope_pattern_analyzer import score_pattern + + obs = [(180.063, 100.0), (181.067, 6.5), (182.070, 0.5)] + result = score_pattern(obs, "C6H12O6") + assert "cosine_similarity" in result + assert result["cosine_similarity"] > 0.0 + assert "halogen_detection" in result + assert "theoretical_pattern" in result + + def test_score_range(self): + from isotope_pattern_analyzer import score_pattern + + obs = [(180.063, 100.0), (181.067, 6.5), (182.070, 0.5)] + result = score_pattern(obs, "C6H12O6") + assert 0.0 <= result["cosine_similarity"] <= 1.0 + + def test_ppm_tolerance_mode(self): + from isotope_pattern_analyzer import score_pattern + + obs = [(180.063, 100.0), (181.067, 6.5)] + result = score_pattern(obs, "C6H12O6", tolerance=10.0, tolerance_unit="ppm") + assert result["tolerance_unit"] == "ppm" + assert result["cosine_similarity"] > 0.0 + + def test_output_json(self): + from isotope_pattern_analyzer import score_pattern + + obs = [(180.063, 100.0), (181.067, 6.5)] + result = score_pattern(obs, "C6H12O6") + + with tempfile.TemporaryDirectory() as tmpdir: + out_path = os.path.join(tmpdir, "result.json") + with open(out_path, "w") as fh: + json.dump(result, fh, indent=2) + assert os.path.exists(out_path) + with open(out_path) as fh: + loaded = json.load(fh) + assert loaded["formula"] == "C6H12O6" + + def test_n_peaks_compared(self): + from isotope_pattern_analyzer import score_pattern + + # 2 observed peaks, 6 theoretical → n_peaks_compared = 2 + obs = [(180.063, 100.0), (181.067, 6.5)] + result = score_pattern(obs, "C6H12O6", max_isotopes=6) + assert result["n_peaks_compared"] == 2 + + def test_halogen_integration(self): + from isotope_pattern_analyzer import score_pattern + + # Chlorinated compound with enhanced M+2 + obs = [(180.0, 100.0), (181.0, 6.5), (182.0, 35.0)] + result = score_pattern(obs, "C6H12O6") + assert result["halogen_detection"]["halogen_flag"] is True diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/README.md b/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/README.md deleted file mode 100644 index 716d595..0000000 --- a/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# Isotope Pattern Fit Scorer - -Score observed vs theoretical isotope patterns using cosine similarity. Detect Cl/Br halogenation from enhanced M+2 peaks. - -## Usage - -```bash -python isotope_pattern_fit_scorer.py --observed "180.063:100,181.067:6.5,182.070:0.5" --formula C6H12O6 --output fit.json -``` - -### Output - -JSON file with: -- `cosine_similarity`: Score between 0 and 1 -- `observed_peaks` and `theoretical_peaks`: Peak lists -- `halogen_detection`: M+2 excess analysis with Cl/Br flagging diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/isotope_pattern_fit_scorer.py b/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/isotope_pattern_fit_scorer.py deleted file mode 100644 index d53d9ce..0000000 --- a/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/isotope_pattern_fit_scorer.py +++ /dev/null @@ -1,258 +0,0 @@ -""" -Isotope Pattern Fit Scorer -========================== -Score observed vs theoretical isotope patterns using cosine similarity. -Detect halogenation (Cl/Br) from enhanced M+2 peaks. - -Usage ------ - python isotope_pattern_fit_scorer.py \ - --observed "180.063:100,181.067:6.5,182.070:0.5" \ - --formula C6H12O6 --output fit.json -""" - -import json -import math -import sys - -import click - -try: - import pyopenms as oms -except ImportError: - sys.exit("pyopenms is required. Install it with: pip install pyopenms") - - -def get_theoretical_pattern( - formula: str, - max_isotopes: int = 6, -) -> list[tuple[float, float]]: - """Compute the theoretical isotope distribution for a molecular formula. - - Parameters - ---------- - formula: - Empirical formula string, e.g. ``"C6H12O6"``. - max_isotopes: - Maximum number of isotope peaks. - - Returns - ------- - list of (mz, relative_abundance) - Relative abundances normalized so the max is 100. - """ - ef = oms.EmpiricalFormula(formula) - iso_dist = ef.getIsotopeDistribution( - oms.CoarseIsotopePatternGenerator(max_isotopes) - ) - container = iso_dist.getContainer() - peaks = [(p.getMZ(), p.getIntensity()) for p in container] - if not peaks: - return [] - max_int = max(p[1] for p in peaks) - if max_int == 0: - return peaks - return [(mz, intensity / max_int * 100.0) for mz, intensity in peaks] - - -def parse_observed(observed_str: str) -> list[tuple[float, float]]: - """Parse observed peaks from a string. - - Format: ``"mz1:int1,mz2:int2,..."`` - - Parameters - ---------- - observed_str: - Comma-separated mz:intensity pairs. - - Returns - ------- - list of (mz, intensity) - """ - peaks = [] - for pair in observed_str.split(","): - pair = pair.strip() - if not pair: - continue - parts = pair.split(":") - if len(parts) != 2: - raise ValueError(f"Invalid peak format: {pair!r}. Expected mz:intensity") - mz = float(parts[0].strip()) - intensity = float(parts[1].strip()) - peaks.append((mz, intensity)) - return peaks - - -def cosine_similarity( - observed: list[tuple[float, float]], - theoretical: list[tuple[float, float]], - mz_tolerance: float = 0.05, -) -> float: - """Compute cosine similarity between observed and theoretical patterns. - - Parameters - ---------- - observed: - List of (mz, intensity) for observed peaks. - theoretical: - List of (mz, intensity) for theoretical peaks. - mz_tolerance: - Maximum m/z difference to consider a match. - - Returns - ------- - float - Cosine similarity score between 0 and 1. - """ - if not observed or not theoretical: - return 0.0 - - dot_product = 0.0 - norm_obs = 0.0 - norm_theo = 0.0 - - for obs_mz, obs_int in observed: - norm_obs += obs_int ** 2 - - for theo_mz, theo_int in theoretical: - norm_theo += theo_int ** 2 - - for obs_mz, obs_int in observed: - for theo_mz, theo_int in theoretical: - if abs(obs_mz - theo_mz) <= mz_tolerance: - dot_product += obs_int * theo_int - - if norm_obs == 0 or norm_theo == 0: - return 0.0 - return dot_product / (math.sqrt(norm_obs) * math.sqrt(norm_theo)) - - -def detect_halogenation( - observed: list[tuple[float, float]], - theoretical: list[tuple[float, float]], - mz_tolerance: float = 0.05, -) -> dict: - """Detect Cl/Br halogenation from enhanced M+2 peak. - - Cl has a natural isotope ratio of ~32.5% for M+2 relative to M+0. - Br has a natural isotope ratio of ~97% for M+2 relative to M+0. - If the observed M+2 is significantly higher than theoretical, flag - potential halogenation. - - Parameters - ---------- - observed: - Observed isotope pattern. - theoretical: - Theoretical isotope pattern (without halogens). - mz_tolerance: - m/z tolerance for peak matching. - - Returns - ------- - dict - Keys: m2_observed, m2_theoretical, m2_excess, halogen_flag, possible_halogen. - """ - result = { - "m2_observed": None, - "m2_theoretical": None, - "m2_excess": None, - "halogen_flag": False, - "possible_halogen": "none", - } - - if len(observed) < 3 or len(theoretical) < 3: - return result - - # M+0 is the first peak; M+2 is the third peak - obs_m0_int = observed[0][1] - obs_m2_int = observed[2][1] if len(observed) > 2 else 0.0 - theo_m2_int = theoretical[2][1] if len(theoretical) > 2 else 0.0 - - if obs_m0_int == 0: - return result - - obs_m2_ratio = obs_m2_int / obs_m0_int * 100.0 - theo_m2_ratio = theo_m2_int / max(theoretical[0][1], 1e-10) * 100.0 - - result["m2_observed"] = round(obs_m2_ratio, 2) - result["m2_theoretical"] = round(theo_m2_ratio, 2) - excess = obs_m2_ratio - theo_m2_ratio - result["m2_excess"] = round(excess, 2) - - if excess > 10.0: - result["halogen_flag"] = True - if excess > 70.0: - result["possible_halogen"] = "Br" - elif excess > 20.0: - result["possible_halogen"] = "Cl" - else: - result["possible_halogen"] = "Cl (weak signal)" - - return result - - -def score_pattern( - observed_str: str, - formula: str, - max_isotopes: int = 6, - mz_tolerance: float = 0.05, -) -> dict: - """Score an observed isotope pattern against a theoretical one. - - Parameters - ---------- - observed_str: - Comma-separated mz:intensity pairs. - formula: - Molecular formula. - max_isotopes: - Max isotope peaks. - mz_tolerance: - m/z tolerance. - - Returns - ------- - dict - Score results including cosine_similarity, halogen detection, and peak data. - """ - observed = parse_observed(observed_str) - theoretical = get_theoretical_pattern(formula, max_isotopes=max_isotopes) - cos_sim = cosine_similarity(observed, theoretical, mz_tolerance=mz_tolerance) - halogen = detect_halogenation(observed, theoretical, mz_tolerance=mz_tolerance) - - return { - "formula": formula, - "cosine_similarity": round(cos_sim, 6), - "observed_peaks": [{"mz": mz, "intensity": i} for mz, i in observed], - "theoretical_peaks": [{"mz": round(mz, 6), "intensity": round(i, 4)} for mz, i in theoretical], - "halogen_detection": halogen, - } - - -@click.command() -@click.option("--observed", required=True, - help="Observed peaks as 'mz1:int1,mz2:int2,...'") -@click.option("--formula", required=True, help="Molecular formula (e.g. C6H12O6)") -@click.option("--max-isotopes", type=int, default=6, help="Max isotope peaks (default: 6)") -@click.option("--mz-tolerance", type=float, default=0.05, help="m/z tolerance (default: 0.05)") -@click.option("--output", required=True, help="Output JSON file") -def main(observed, formula, max_isotopes, mz_tolerance, output) -> None: - """CLI entry point.""" - result = score_pattern( - observed, formula, - max_isotopes=max_isotopes, mz_tolerance=mz_tolerance, - ) - - with open(output, "w") as fh: - json.dump(result, fh, indent=2) - - print(f"Cosine similarity: {result['cosine_similarity']:.4f}") - halogen = result["halogen_detection"] - if halogen["halogen_flag"]: - print(f"Halogen detected: {halogen['possible_halogen']} (M+2 excess: {halogen['m2_excess']}%)") - print(f"Results written to {output}") - - -if __name__ == "__main__": - main() diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/requirements.txt b/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/requirements.txt deleted file mode 100644 index 18d6bbb..0000000 --- a/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pyopenms -click diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/tests/test_isotope_pattern_fit_scorer.py b/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/tests/test_isotope_pattern_fit_scorer.py deleted file mode 100644 index deb337a..0000000 --- a/tools/metabolomics/spectral_analysis/isotope_pattern_fit_scorer/tests/test_isotope_pattern_fit_scorer.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Tests for isotope_pattern_fit_scorer.""" - -import json -import os -import tempfile - -import pytest -from conftest import requires_pyopenms - - -@requires_pyopenms -class TestIsotopePatternFitScorer: - def test_get_theoretical_pattern(self): - from isotope_pattern_fit_scorer import get_theoretical_pattern - - pattern = get_theoretical_pattern("C6H12O6", max_isotopes=4) - assert len(pattern) == 4 - # First peak should be the most abundant (normalized to 100) - assert pattern[0][1] == pytest.approx(100.0) - # Subsequent peaks should be smaller - assert pattern[1][1] < 100.0 - - def test_parse_observed(self): - from isotope_pattern_fit_scorer import parse_observed - - peaks = parse_observed("180.063:100,181.067:6.5,182.070:0.5") - assert len(peaks) == 3 - assert peaks[0] == (180.063, 100.0) - assert peaks[1] == (181.067, 6.5) - - def test_parse_observed_with_spaces(self): - from isotope_pattern_fit_scorer import parse_observed - - peaks = parse_observed("180.063:100 , 181.067:6.5") - assert len(peaks) == 2 - - def test_cosine_similarity_perfect(self): - from isotope_pattern_fit_scorer import cosine_similarity - - peaks = [(180.0, 100.0), (181.0, 6.5), (182.0, 0.5)] - sim = cosine_similarity(peaks, peaks, mz_tolerance=0.1) - assert abs(sim - 1.0) < 1e-6 - - def test_cosine_similarity_no_overlap(self): - from isotope_pattern_fit_scorer import cosine_similarity - - obs = [(180.0, 100.0), (181.0, 6.5)] - theo = [(300.0, 100.0), (301.0, 6.5)] - sim = cosine_similarity(obs, theo, mz_tolerance=0.1) - assert sim == 0.0 - - def test_cosine_similarity_partial(self): - from isotope_pattern_fit_scorer import cosine_similarity - - obs = [(180.0, 100.0), (181.0, 6.5), (182.0, 50.0)] # Enhanced M+2 - theo = [(180.0, 100.0), (181.0, 6.5), (182.0, 0.5)] - sim = cosine_similarity(obs, theo, mz_tolerance=0.1) - assert 0.0 < sim < 1.0 - - def test_detect_halogenation_no_halogen(self): - from isotope_pattern_fit_scorer import detect_halogenation - - obs = [(180.0, 100.0), (181.0, 6.5), (182.0, 0.5)] - theo = [(180.0, 100.0), (181.0, 6.5), (182.0, 0.5)] - result = detect_halogenation(obs, theo) - assert result["halogen_flag"] is False - - def test_detect_halogenation_chlorine(self): - from isotope_pattern_fit_scorer import detect_halogenation - - obs = [(180.0, 100.0), (181.0, 6.5), (182.0, 35.0)] # Enhanced M+2 (~35%) - theo = [(180.0, 100.0), (181.0, 6.5), (182.0, 0.5)] # Expected ~0.5% - result = detect_halogenation(obs, theo) - assert result["halogen_flag"] is True - assert "Cl" in result["possible_halogen"] - - def test_detect_halogenation_bromine(self): - from isotope_pattern_fit_scorer import detect_halogenation - - obs = [(180.0, 100.0), (181.0, 6.5), (182.0, 98.0)] # Enhanced M+2 (~98%) - theo = [(180.0, 100.0), (181.0, 6.5), (182.0, 0.5)] - result = detect_halogenation(obs, theo) - assert result["halogen_flag"] is True - assert result["possible_halogen"] == "Br" - - def test_score_pattern(self): - from isotope_pattern_fit_scorer import score_pattern - - result = score_pattern( - "180.063:100,181.067:6.5,182.070:0.5", - "C6H12O6", - ) - assert "cosine_similarity" in result - assert result["cosine_similarity"] > 0.0 - assert "halogen_detection" in result - assert "observed_peaks" in result - assert "theoretical_peaks" in result - - def test_score_pattern_output_json(self): - from isotope_pattern_fit_scorer import score_pattern - - result = score_pattern("180.063:100,181.067:6.5", "C6H12O6") - - with tempfile.TemporaryDirectory() as tmpdir: - out_path = os.path.join(tmpdir, "fit.json") - with open(out_path, "w") as fh: - json.dump(result, fh, indent=2) - assert os.path.exists(out_path) - with open(out_path) as fh: - loaded = json.load(fh) - assert loaded["formula"] == "C6H12O6" diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_matcher/README.md b/tools/metabolomics/spectral_analysis/isotope_pattern_matcher/README.md deleted file mode 100644 index e56d2a7..0000000 --- a/tools/metabolomics/spectral_analysis/isotope_pattern_matcher/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Isotope Pattern Generator & Matcher - -Generate theoretical isotope distributions for any molecular formula and -optionally compute a cosine similarity score against observed peaks. - -## Usage - -```bash -python isotope_pattern_matcher.py --formula C6H12O6 -python isotope_pattern_matcher.py --formula C6H12O6 --peaks 181.0709,100.0 182.0742,6.7 183.0775,0.4 -``` diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_matcher/isotope_pattern_matcher.py b/tools/metabolomics/spectral_analysis/isotope_pattern_matcher/isotope_pattern_matcher.py deleted file mode 100644 index e974619..0000000 --- a/tools/metabolomics/spectral_analysis/isotope_pattern_matcher/isotope_pattern_matcher.py +++ /dev/null @@ -1,147 +0,0 @@ -""" -Isotope Pattern Generator & Matcher -===================================== -Generate the theoretical isotope distribution for a molecular formula -and optionally compare it against an observed spectrum to compute a -cosine similarity score using pyopenms. - -Usage ------ - # Generate isotope pattern for glucose - python isotope_pattern_matcher.py --formula C6H12O6 - - # Compare against observed peaks (m/z intensity pairs on stdin or --peaks) - python isotope_pattern_matcher.py --formula C6H12O6 \\ - --peaks 181.0709,100.0 --peaks 182.0742,6.7 --peaks 183.0775,0.4 -""" - -import math -import sys - -import click - -try: - import pyopenms as oms -except ImportError: - sys.exit( - "pyopenms is required. Install it with: pip install pyopenms" - ) - - -def get_isotope_distribution( - formula: str, - max_isotopes: int = 5, -) -> list: - """Compute the theoretical isotope distribution for a molecular formula. - - Parameters - ---------- - formula: - Empirical formula string, e.g. ``"C6H12O6"``. - max_isotopes: - Maximum number of isotope peaks to return (default: 5). - - Returns - ------- - list of (mz, relative_abundance) - Sorted by m/z; relative abundances sum to 100. - """ - ef = oms.EmpiricalFormula(formula) - isotope_dist = ef.getIsotopeDistribution( - oms.CoarseIsotopePatternGenerator(max_isotopes) - ) - peaks = [(p.getMZ(), p.getIntensity()) for p in isotope_dist.getContainer()] - if not peaks: - return [] - max_ab = max(ab for _, ab in peaks) - return [(mz, ab / max_ab * 100) for mz, ab in peaks] - - -def cosine_similarity( - theoretical: list, - observed: list, - mz_tolerance: float = 0.02, -) -> float: - """Compute cosine similarity between theoretical and observed peak lists. - - Parameters - ---------- - theoretical: - List of ``(mz, intensity)`` tuples for the theoretical pattern. - observed: - List of ``(mz, intensity)`` tuples for the observed spectrum. - mz_tolerance: - Maximum m/z difference to match a pair of peaks (default: 0.02 Da). - - Returns - ------- - float - Cosine similarity in [0, 1]. - """ - theo_vec = [] - obs_vec = [] - for tmz, tint in theoretical: - matched = 0.0 - for omz, oint in observed: - if abs(omz - tmz) <= mz_tolerance: - matched = oint - break - theo_vec.append(tint) - obs_vec.append(matched) - - dot = sum(t * o for t, o in zip(theo_vec, obs_vec)) - norm_t = math.sqrt(sum(t ** 2 for t in theo_vec)) - norm_o = math.sqrt(sum(o ** 2 for o in obs_vec)) - if norm_t == 0 or norm_o == 0: - return 0.0 - return dot / (norm_t * norm_o) - - -def parse_peaks(peak_strings: list) -> list: - """Parse ``"mz,intensity"`` strings into ``(float, float)`` tuples.""" - peaks = [] - for s in peak_strings: - parts = s.split(",") - if len(parts) != 2: - raise ValueError( - f"Expected 'mz,intensity' format, got: {s!r}" - ) - peaks.append((float(parts[0]), float(parts[1]))) - return peaks - - -@click.command() -@click.option("--formula", required=True, help="Molecular formula (e.g. C6H12O6)") -@click.option("--max-isotopes", type=int, default=5, - help="Maximum isotope peaks to compute (default: 5)") -@click.option("--peaks", multiple=True, - help="Observed peaks as 'mz,intensity' pairs for similarity scoring") -@click.option("--tolerance", type=float, default=0.02, - help="m/z tolerance in Da for peak matching (default: 0.02)") -def main(formula, max_isotopes, peaks, tolerance): - distribution = get_isotope_distribution(formula, max_isotopes) - if not distribution: - print("Could not compute isotope distribution for the given formula.") - return - - print(f"Isotope distribution for {formula}:") - print(f"\n{'Peak':>5} {'m/z':>12} {'Relative Abundance (%)':>22}") - print("-" * 44) - for i, (mz, rel_ab) in enumerate(distribution): - bar = "#" * int(rel_ab / 5) - print(f" M+{i} {mz:>12.4f} {rel_ab:>6.2f} % {bar}") - - if peaks: - observed = parse_peaks(list(peaks)) - sim = cosine_similarity(distribution, observed, tolerance) - print(f"\nCosine similarity vs. observed peaks: {sim:.4f}") - if sim >= 0.9: - print(" ✓ Excellent match (≥ 0.90)") - elif sim >= 0.7: - print(" ~ Good match (≥ 0.70)") - else: - print(" ✗ Poor match (< 0.70)") - - -if __name__ == "__main__": - main() diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_matcher/tests/test_isotope_pattern_matcher.py b/tools/metabolomics/spectral_analysis/isotope_pattern_matcher/tests/test_isotope_pattern_matcher.py deleted file mode 100644 index a0999cb..0000000 --- a/tools/metabolomics/spectral_analysis/isotope_pattern_matcher/tests/test_isotope_pattern_matcher.py +++ /dev/null @@ -1,50 +0,0 @@ -"""Tests for isotope_pattern_matcher.""" - -import pytest -from conftest import requires_pyopenms - - -@requires_pyopenms -class TestIsotopePatternMatcher: - def test_glucose_pattern(self): - from isotope_pattern_matcher import get_isotope_distribution - - dist = get_isotope_distribution("C6H12O6", max_isotopes=3) - assert len(dist) == 3 - assert dist[0][1] == pytest.approx(100.0) - assert dist[1][1] < dist[0][1] - - def test_pattern_mz_ordering(self): - from isotope_pattern_matcher import get_isotope_distribution - - dist = get_isotope_distribution("C12H22O11", max_isotopes=4) - mzs = [mz for mz, _ in dist] - assert mzs == sorted(mzs) - - def test_cosine_similarity_perfect(self): - from isotope_pattern_matcher import cosine_similarity - - peaks = [(100.0, 50.0), (101.0, 20.0), (102.0, 5.0)] - sim = cosine_similarity(peaks, peaks, mz_tolerance=0.01) - assert abs(sim - 1.0) < 1e-9 - - def test_cosine_similarity_no_overlap(self): - from isotope_pattern_matcher import cosine_similarity - - theoretical = [(100.0, 50.0), (101.0, 20.0)] - observed = [(200.0, 50.0), (201.0, 20.0)] - sim = cosine_similarity(theoretical, observed, mz_tolerance=0.01) - assert sim == 0.0 - - def test_parse_peaks(self): - from isotope_pattern_matcher import parse_peaks - - result = parse_peaks(["181.0709,100.0", "182.0742,6.7"]) - assert len(result) == 2 - assert result[0] == (181.0709, 100.0) - - def test_parse_peaks_invalid(self): - from isotope_pattern_matcher import parse_peaks - - with pytest.raises(ValueError): - parse_peaks(["181.0709"]) diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_scorer/isotope_pattern_scorer.py b/tools/metabolomics/spectral_analysis/isotope_pattern_scorer/isotope_pattern_scorer.py deleted file mode 100644 index 10e1a75..0000000 --- a/tools/metabolomics/spectral_analysis/isotope_pattern_scorer/isotope_pattern_scorer.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Isotope Pattern Scorer -======================= -Score observed isotope patterns against theoretical patterns computed -from a molecular formula using pyopenms. - -The score is a cosine similarity between observed and theoretical -isotope intensity ratios. - -Usage ------ - python isotope_pattern_scorer.py --observed "180.063:100,181.067:6.5" --formula C6H12O6 --output fit.json -""" - -import json -import sys - -import click - -try: - import pyopenms as oms -except ImportError: - sys.exit("pyopenms is required. Install it with: pip install pyopenms") - - -def parse_observed(observed_str: str) -> list[tuple[float, float]]: - """Parse observed isotope pattern from a string. - - Parameters - ---------- - observed_str: - Comma-separated ``mz:intensity`` pairs, e.g. - ``"180.063:100,181.067:6.5"``. - - Returns - ------- - list[tuple[float, float]] - List of (mz, intensity) tuples. - """ - peaks = [] - for pair in observed_str.split(","): - pair = pair.strip() - if ":" in pair: - mz_str, int_str = pair.split(":", 1) - peaks.append((float(mz_str), float(int_str))) - return peaks - - -def get_theoretical_pattern(formula: str, n_peaks: int = 5) -> list[tuple[float, float]]: - """Compute theoretical isotope pattern for a formula. - - Parameters - ---------- - formula: - Molecular formula string. - n_peaks: - Number of isotope peaks to generate. - - Returns - ------- - list[tuple[float, float]] - List of (mz, relative_intensity) tuples, normalized to max=100. - """ - ef = oms.EmpiricalFormula(formula) - gen = oms.CoarseIsotopePatternGenerator(n_peaks) - iso = ef.getIsotopeDistribution(gen) - container = iso.getContainer() - - peaks = [(peak.getMZ(), peak.getIntensity()) for peak in container] - if not peaks: - return [] - - max_int = max(p[1] for p in peaks) - if max_int > 0: - peaks = [(mz, intensity / max_int * 100.0) for mz, intensity in peaks] - - return peaks - - -def score_pattern( - observed: list[tuple[float, float]], - theoretical: list[tuple[float, float]], -) -> dict: - """Score observed vs theoretical isotope patterns. - - Parameters - ---------- - observed: - Observed (mz, intensity) pairs. - theoretical: - Theoretical (mz, intensity) pairs. - - Returns - ------- - dict - Contains cosine_score, per-peak comparisons. - """ - n = min(len(observed), len(theoretical)) - if n == 0: - return {"cosine_score": 0.0, "n_peaks_compared": 0, "peaks": []} - - obs_int = [observed[i][1] for i in range(n)] - theo_int = [theoretical[i][1] for i in range(n)] - - # Normalize observed to max=100 - obs_max = max(obs_int) if obs_int else 1.0 - if obs_max > 0: - obs_int = [v / obs_max * 100.0 for v in obs_int] - - # Cosine similarity - dot = sum(a * b for a, b in zip(obs_int, theo_int)) - mag_a = sum(a ** 2 for a in obs_int) ** 0.5 - mag_b = sum(b ** 2 for b in theo_int) ** 0.5 - - cosine = dot / (mag_a * mag_b) if (mag_a > 0 and mag_b > 0) else 0.0 - - peaks = [] - for i in range(n): - peaks.append({ - "peak_index": i, - "obs_mz": round(observed[i][0], 6), - "theo_mz": round(theoretical[i][0], 6), - "obs_intensity": round(obs_int[i], 4), - "theo_intensity": round(theo_int[i], 4), - }) - - return { - "cosine_score": round(cosine, 6), - "n_peaks_compared": n, - "peaks": peaks, - } - - -@click.command() -@click.option("--observed", required=True, - help='Observed pattern as "mz:int,mz:int,..." (e.g. "180.063:100,181.067:6.5")') -@click.option("--formula", required=True, help="Molecular formula (e.g. C6H12O6)") -@click.option("--output", required=True, help="Output JSON file") -def main(observed, formula, output): - observed_peaks = parse_observed(observed) - theoretical = get_theoretical_pattern(formula, n_peaks=len(observed_peaks)) - result = score_pattern(observed_peaks, theoretical) - result["formula"] = formula - - with open(output, "w") as fh: - json.dump(result, fh, indent=2) - - print(f"Isotope fit written to {output}") - print(f" Cosine score: {result['cosine_score']:.6f}") - print(f" Peaks compared: {result['n_peaks_compared']}") - - -if __name__ == "__main__": - main() diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_scorer/tests/conftest.py b/tools/metabolomics/spectral_analysis/isotope_pattern_scorer/tests/conftest.py deleted file mode 100644 index 1a21ede..0000000 --- a/tools/metabolomics/spectral_analysis/isotope_pattern_scorer/tests/conftest.py +++ /dev/null @@ -1,15 +0,0 @@ -import os -import sys - -import pytest - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) - -try: - import pyopenms # noqa: F401 - - HAS_PYOPENMS = True -except ImportError: - HAS_PYOPENMS = False - -requires_pyopenms = pytest.mark.skipif(not HAS_PYOPENMS, reason="pyopenms not installed") diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_scorer/tests/test_isotope_pattern_scorer.py b/tools/metabolomics/spectral_analysis/isotope_pattern_scorer/tests/test_isotope_pattern_scorer.py deleted file mode 100644 index 07a8d00..0000000 --- a/tools/metabolomics/spectral_analysis/isotope_pattern_scorer/tests/test_isotope_pattern_scorer.py +++ /dev/null @@ -1,53 +0,0 @@ -"""Tests for isotope_pattern_scorer.""" - -from conftest import requires_pyopenms - - -@requires_pyopenms -class TestIsotopePatternScorer: - def test_parse_observed(self): - from isotope_pattern_scorer import parse_observed - - peaks = parse_observed("180.063:100,181.067:6.5,182.070:0.5") - assert len(peaks) == 3 - assert peaks[0] == (180.063, 100.0) - - def test_get_theoretical_pattern(self): - from isotope_pattern_scorer import get_theoretical_pattern - - pattern = get_theoretical_pattern("C6H12O6", n_peaks=3) - assert len(pattern) == 3 - # First peak should be the most intense (normalized to 100) - assert pattern[0][1] == 100.0 - - def test_perfect_match_score(self): - from isotope_pattern_scorer import get_theoretical_pattern, score_pattern - - theo = get_theoretical_pattern("C6H12O6", n_peaks=3) - # Use theoretical as observed - result = score_pattern(theo, theo) - assert result["cosine_score"] > 0.999 - - def test_score_range(self): - from isotope_pattern_scorer import get_theoretical_pattern, score_pattern - - theo = get_theoretical_pattern("C6H12O6", n_peaks=3) - observed = [(180.063, 100.0), (181.067, 50.0), (182.070, 30.0)] - result = score_pattern(observed, theo) - assert 0.0 <= result["cosine_score"] <= 1.0 - - def test_empty_observed(self): - from isotope_pattern_scorer import score_pattern - - result = score_pattern([], []) - assert result["cosine_score"] == 0.0 - assert result["n_peaks_compared"] == 0 - - def test_peaks_detail(self): - from isotope_pattern_scorer import get_theoretical_pattern, score_pattern - - theo = get_theoretical_pattern("C6H12O6", n_peaks=2) - observed = [(180.063, 100.0), (181.067, 7.0)] - result = score_pattern(observed, theo) - assert result["n_peaks_compared"] == 2 - assert len(result["peaks"]) == 2 diff --git a/tools/proteomics/file_conversion/mgf_mzml_converter/README.md b/tools/proteomics/file_conversion/mgf_mzml_converter/README.md new file mode 100644 index 0000000..9e6d700 --- /dev/null +++ b/tools/proteomics/file_conversion/mgf_mzml_converter/README.md @@ -0,0 +1,50 @@ +# MGF ↔ mzML Converter + +Bidirectional conversion between MGF (Mascot Generic Format) and mzML. +Consolidates `mgf_to_mzml_converter` and `mzml_to_mgf_converter` into a single tool. + +## Features + +- **MGF → mzML**: preserves title, precursor m/z, charge, retention time, fragment peaks +- **mzML → MGF**: full filter suite (charge, RT range, m/z range, min peaks, + min base-peak intensity, MS level) +- **Auto-detection** of conversion direction from file extensions +- Conversion statistics printed to stdout + +## Installation + +```bash +pip install pyopenms +``` + +## CLI Usage + +```bash +# Auto-detect direction from file extensions +python mgf_mzml_converter.py --input spectra.mgf --output spectra.mzML +python mgf_mzml_converter.py --input run.mzML --output spectra.mgf + +# Explicit direction +python mgf_mzml_converter.py --direction mgf2mzml --input spectra.mgf --output out.mzML +python mgf_mzml_converter.py --direction mzml2mgf --input run.mzML --output out.mgf + +# mzML → MGF with filters +python mgf_mzml_converter.py --input run.mzML --output out.mgf \ + --charge 2 3 \ + --rt-min 600 --rt-max 1800 \ + --mz-min 400 --mz-max 1200 \ + --min-peaks 5 \ + --min-intensity 1000 +``` + +## Options + +| Option | Direction | Description | +|---|---|---| +| `--direction` | both | `mgf2mzml` or `mzml2mgf` (auto if omitted) | +| `--ms-level` | mzml2mgf | MS level to export (default: 2) | +| `--min-peaks` | mzml2mgf | Minimum fragment peaks (default: 1) | +| `--charge` | mzml2mgf | Keep only these precursor charges (repeatable) | +| `--rt-min` / `--rt-max` | mzml2mgf | Retention time window in seconds | +| `--mz-min` / `--mz-max` | mzml2mgf | Precursor m/z window | +| `--min-intensity` | mzml2mgf | Minimum base-peak intensity | diff --git a/tools/proteomics/file_conversion/mgf_mzml_converter/mgf_mzml_converter.py b/tools/proteomics/file_conversion/mgf_mzml_converter/mgf_mzml_converter.py new file mode 100644 index 0000000..4ece445 --- /dev/null +++ b/tools/proteomics/file_conversion/mgf_mzml_converter/mgf_mzml_converter.py @@ -0,0 +1,471 @@ +""" +MGF ↔ mzML Converter +===================== +Bidirectional conversion between MGF (Mascot Generic Format) and mzML. +Combines and improves upon the separate mgf_to_mzml_converter and +mzml_to_mgf_converter tools. + +Conversion direction is inferred automatically from the input file extension +when ``--direction`` is not supplied: + - ``.mgf`` → mzML + - ``.mzML`` / ``.mzml`` → MGF + +Features +-------- +- MGF → mzML: preserves title, precursor m/z, charge, retention time +- mzML → MGF: full filter suite (charge, RT range, m/z range, min peaks, + min base-peak intensity, MS level) +- Auto-detection of conversion direction from file extension +- Conversion statistics printed to stdout + +Usage +----- + # Auto-detect direction from extensions + python mgf_mzml_converter.py --input spectra.mgf --output spectra.mzML + python mgf_mzml_converter.py --input run.mzML --output spectra.mgf + + # Explicit direction + python mgf_mzml_converter.py --direction mgf2mzml --input spectra.mgf --output out.mzML + python mgf_mzml_converter.py --direction mzml2mgf --input run.mzML --output out.mgf + + # mzML → MGF with filters + python mgf_mzml_converter.py --input run.mzML --output out.mgf \\ + --charge 2 3 --rt-min 600 --rt-max 1800 --mz-min 400 --mz-max 1200 \\ + --min-peaks 5 --min-intensity 1000 +""" + +import os +import sys +from typing import List, Optional + +import click + +try: + import pyopenms as oms +except ImportError: + sys.exit("pyopenms is required. Install it with: pip install pyopenms") + +PROTON = 1.007276 + + +# --------------------------------------------------------------------------- +# MGF → mzML +# --------------------------------------------------------------------------- + + +def parse_mgf(input_path: str) -> List[dict]: + """Parse an MGF file and return a list of spectrum dicts. + + Each dict has keys: title, pepmass, charge, rt, peaks (list of + (mz, intensity) tuples). + + Parameters + ---------- + input_path: + Path to the MGF file. + + Returns + ------- + list of dict + """ + spectra = [] + current: Optional[dict] = None + + with open(input_path) as fh: + for line in fh: + line = line.strip() + if line == "BEGIN IONS": + current = {"title": "", "pepmass": 0.0, "charge": 0, "rt": 0.0, "peaks": []} + elif line == "END IONS": + if current is not None: + spectra.append(current) + current = None + elif current is not None: + if line.startswith("TITLE="): + current["title"] = line[6:] + elif line.startswith("PEPMASS="): + parts = line[8:].split() + current["pepmass"] = float(parts[0]) + elif line.startswith("CHARGE="): + charge_str = line[7:].replace("+", "").replace("-", "").strip() + try: + current["charge"] = int(charge_str) + except ValueError: + current["charge"] = 0 + elif line.startswith("RTINSECONDS="): + current["rt"] = float(line[12:]) + elif line and line[0].isdigit(): + parts = line.split() + if len(parts) >= 2: + try: + current["peaks"].append((float(parts[0]), float(parts[1]))) + except ValueError: + pass + + return spectra + + +def convert_mgf_to_mzml(input_path: str, output_path: str) -> dict: + """Convert an MGF file to mzML format. + + Parameters + ---------- + input_path: + Path to the input MGF file. + output_path: + Path to the output mzML file. + + Returns + ------- + dict + ``{"spectra_converted": int}`` + """ + mgf_spectra = parse_mgf(input_path) + exp = oms.MSExperiment() + + for i, spec_data in enumerate(mgf_spectra): + spectrum = oms.MSSpectrum() + spectrum.setMSLevel(2) + spectrum.setRT(spec_data["rt"]) + spectrum.setNativeID(spec_data["title"] if spec_data["title"] else f"index={i}") + + if spec_data["pepmass"] > 0: + prec = oms.Precursor() + prec.setMZ(spec_data["pepmass"]) + if spec_data["charge"] > 0: + prec.setCharge(spec_data["charge"]) + spectrum.setPrecursors([prec]) + + if spec_data["peaks"]: + mzs = [p[0] for p in spec_data["peaks"]] + intensities = [p[1] for p in spec_data["peaks"]] + spectrum.set_peaks((mzs, intensities)) + + exp.addSpectrum(spectrum) + + oms.MzMLFile().store(output_path, exp) + return {"spectra_converted": len(mgf_spectra)} + + +# --------------------------------------------------------------------------- +# mzML → MGF +# --------------------------------------------------------------------------- + + +def load_mzml(input_path: str) -> oms.MSExperiment: + """Load an mzML file into an MSExperiment. + + Parameters + ---------- + input_path: + Path to the mzML file. + + Returns + ------- + oms.MSExperiment + """ + exp = oms.MSExperiment() + oms.MzMLFile().load(input_path, exp) + return exp + + +def passes_filters( + spectrum: oms.MSSpectrum, + min_peaks: int = 1, + charges: Optional[tuple] = None, + rt_min: Optional[float] = None, + rt_max: Optional[float] = None, + mz_min: Optional[float] = None, + mz_max: Optional[float] = None, + min_intensity: Optional[float] = None, +) -> bool: + """Check whether a spectrum passes all active filters. + + Parameters + ---------- + spectrum: + The spectrum to check. + min_peaks: + Minimum number of fragment peaks required. + charges: + Allowed charge states (precursor). ``None`` means any charge. + rt_min: + Minimum retention time in seconds. + rt_max: + Maximum retention time in seconds. + mz_min: + Minimum precursor m/z. + mz_max: + Maximum precursor m/z. + min_intensity: + Minimum base-peak intensity. + + Returns + ------- + bool + """ + mz_array, intensity_array = spectrum.get_peaks() + if len(mz_array) < min_peaks: + return False + + if rt_min is not None and spectrum.getRT() < rt_min: + return False + if rt_max is not None and spectrum.getRT() > rt_max: + return False + + precursors = spectrum.getPrecursors() + if precursors: + prec = precursors[0] + if charges and prec.getCharge() not in charges: + return False + if mz_min is not None and prec.getMZ() < mz_min: + return False + if mz_max is not None and prec.getMZ() > mz_max: + return False + elif charges or mz_min is not None or mz_max is not None: + return False + + if min_intensity is not None: + if len(intensity_array) == 0 or max(intensity_array) < min_intensity: + return False + + return True + + +def spectrum_to_mgf_block(spectrum: oms.MSSpectrum, index: int) -> str: + """Convert a single spectrum to an MGF block string. + + Parameters + ---------- + spectrum: + The spectrum to convert. + index: + Fallback index used in the title when the native ID is empty. + + Returns + ------- + str + MGF block ending with a blank line. + """ + lines = ["BEGIN IONS"] + + native_id = spectrum.getNativeID() if spectrum.getNativeID() else f"index={index}" + lines.append(f"TITLE={native_id}") + lines.append(f"RTINSECONDS={spectrum.getRT():.4f}") + + precursors = spectrum.getPrecursors() + if precursors: + prec = precursors[0] + lines.append(f"PEPMASS={prec.getMZ():.6f}") + if prec.getCharge() > 0: + lines.append(f"CHARGE={prec.getCharge()}+") + + mz_array, intensity_array = spectrum.get_peaks() + for mz_val, intensity_val in zip(mz_array, intensity_array): + lines.append(f"{mz_val:.6f} {intensity_val:.4f}") + + lines.append("END IONS") + lines.append("") + return "\n".join(lines) + + +def convert_mzml_to_mgf( + input_path: str, + output_path: str, + ms_level: int = 2, + min_peaks: int = 1, + charges: Optional[tuple] = None, + rt_min: Optional[float] = None, + rt_max: Optional[float] = None, + mz_min: Optional[float] = None, + mz_max: Optional[float] = None, + min_intensity: Optional[float] = None, +) -> dict: + """Convert an mzML file to MGF format with optional filtering. + + Parameters + ---------- + input_path: + Path to the input mzML file. + output_path: + Path to the output MGF file. + ms_level: + MS level of spectra to export (default: 2). + min_peaks: + Minimum fragment peaks required to include a spectrum. + charges: + Allowed precursor charge states. + rt_min: + Minimum retention time in seconds. + rt_max: + Maximum retention time in seconds. + mz_min: + Minimum precursor m/z. + mz_max: + Maximum precursor m/z. + min_intensity: + Minimum base-peak intensity. + + Returns + ------- + dict + ``{"total_spectra", "ms_level_spectra", "converted", "filtered_out"}`` + """ + exp = load_mzml(input_path) + target_spectra = [s for s in exp if s.getMSLevel() == ms_level] + + converted = 0 + filtered_out = 0 + with open(output_path, "w") as fh: + for i, spectrum in enumerate(target_spectra): + if not passes_filters(spectrum, min_peaks, charges, rt_min, rt_max, mz_min, mz_max, min_intensity): + filtered_out += 1 + continue + fh.write(spectrum_to_mgf_block(spectrum, i) + "\n") + converted += 1 + + return { + "total_spectra": exp.size(), + "ms_level_spectra": len(target_spectra), + "converted": converted, + "filtered_out": filtered_out, + } + + +# --------------------------------------------------------------------------- +# Synthetic test data helper +# --------------------------------------------------------------------------- + + +def create_synthetic_mzml(output_path: str, n_spectra: int = 5) -> None: + """Create a synthetic mzML file with MS2 spectra for testing. + + Parameters + ---------- + output_path: + Path for the output mzML file. + n_spectra: + Number of MS2 spectra to generate (default: 5). + """ + exp = oms.MSExperiment() + + ms1 = oms.MSSpectrum() + ms1.setMSLevel(1) + ms1.setRT(10.0) + ms1.set_peaks(([100.0, 200.0, 300.0], [1000.0, 2000.0, 1500.0])) + exp.addSpectrum(ms1) + + for i in range(n_spectra): + ms2 = oms.MSSpectrum() + ms2.setMSLevel(2) + ms2.setRT(10.0 + i * 0.5) + ms2.setNativeID(f"scan={i + 1}") + + prec = oms.Precursor() + prec.setMZ(500.0 + i * 50.0) + prec.setCharge(2 if i % 2 == 0 else 3) + ms2.setPrecursors([prec]) + + mzs = [100.0 + j * 50.0 for j in range(10)] + intensities = [1000.0 * (10 - j) for j in range(10)] + ms2.set_peaks((mzs, intensities)) + exp.addSpectrum(ms2) + + oms.MzMLFile().store(output_path, exp) + + +# --------------------------------------------------------------------------- +# Direction auto-detection +# --------------------------------------------------------------------------- + + +def _detect_direction(input_path: str, output_path: str) -> str: + """Infer conversion direction from file extensions. + + Parameters + ---------- + input_path: + Path to the input file. + output_path: + Path to the output file. + + Returns + ------- + str + ``"mgf2mzml"`` or ``"mzml2mgf"``. + + Raises + ------ + click.UsageError + If the direction cannot be determined. + """ + in_ext = os.path.splitext(input_path)[1].lower() + if in_ext == ".mgf": + return "mgf2mzml" + if in_ext in (".mzml", ".mzxml"): + return "mzml2mgf" + out_ext = os.path.splitext(output_path)[1].lower() + if out_ext in (".mzml", ".mzxml"): + return "mgf2mzml" + if out_ext == ".mgf": + return "mzml2mgf" + raise click.UsageError( + "Cannot determine conversion direction from file extensions. " + "Use --direction mgf2mzml or --direction mzml2mgf explicitly." + ) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +@click.command(help="Bidirectional MGF ↔ mzML converter with optional spectrum filtering.") +@click.option("--input", "input_path", required=True, help="Input file (MGF or mzML)") +@click.option("--output", "output_path", required=True, help="Output file (mzML or MGF)") +@click.option( + "--direction", + type=click.Choice(["mgf2mzml", "mzml2mgf"]), + default=None, + help="Conversion direction (auto-detected from extension if omitted).", +) +@click.option("--ms-level", type=int, default=2, help="MS level to export (mzml2mgf, default: 2)") +@click.option("--min-peaks", type=int, default=1, help="Min fragment peaks to keep (mzml2mgf, default: 1)") +@click.option( + "--charge", multiple=True, type=int, + help="Keep only these precursor charge states (mzml2mgf, repeatable).", +) +@click.option("--rt-min", type=float, default=None, help="Min retention time in seconds (mzml2mgf)") +@click.option("--rt-max", type=float, default=None, help="Max retention time in seconds (mzml2mgf)") +@click.option("--mz-min", type=float, default=None, help="Min precursor m/z (mzml2mgf)") +@click.option("--mz-max", type=float, default=None, help="Max precursor m/z (mzml2mgf)") +@click.option( + "--min-intensity", type=float, default=None, + help="Min base-peak intensity to keep a spectrum (mzml2mgf).", +) +def main( + input_path, output_path, direction, ms_level, min_peaks, charge, + rt_min, rt_max, mz_min, mz_max, min_intensity, +) -> None: + """CLI entry point.""" + if direction is None: + direction = _detect_direction(input_path, output_path) + + if direction == "mgf2mzml": + stats = convert_mgf_to_mzml(input_path, output_path) + print(f"Converted {stats['spectra_converted']} spectra {input_path} → {output_path}") + else: + charges = tuple(charge) if charge else None + stats = convert_mzml_to_mgf( + input_path, output_path, ms_level, min_peaks, charges, + rt_min, rt_max, mz_min, mz_max, min_intensity, + ) + print( + f"Converted {stats['converted']} / {stats['ms_level_spectra']} " + f"MS{ms_level} spectra {input_path} → {output_path} " + f"({stats['filtered_out']} filtered out)" + ) + + +if __name__ == "__main__": + main() diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_scorer/requirements.txt b/tools/proteomics/file_conversion/mgf_mzml_converter/requirements.txt similarity index 60% rename from tools/metabolomics/spectral_analysis/isotope_pattern_scorer/requirements.txt rename to tools/proteomics/file_conversion/mgf_mzml_converter/requirements.txt index 18d6bbb..7ce28ec 100644 --- a/tools/metabolomics/spectral_analysis/isotope_pattern_scorer/requirements.txt +++ b/tools/proteomics/file_conversion/mgf_mzml_converter/requirements.txt @@ -1,2 +1 @@ pyopenms -click diff --git a/tools/metabolomics/spectral_analysis/isotope_pattern_matcher/tests/conftest.py b/tools/proteomics/file_conversion/mgf_mzml_converter/tests/conftest.py similarity index 100% rename from tools/metabolomics/spectral_analysis/isotope_pattern_matcher/tests/conftest.py rename to tools/proteomics/file_conversion/mgf_mzml_converter/tests/conftest.py diff --git a/tools/proteomics/file_conversion/mgf_mzml_converter/tests/test_mgf_mzml_converter.py b/tools/proteomics/file_conversion/mgf_mzml_converter/tests/test_mgf_mzml_converter.py new file mode 100644 index 0000000..11cc862 --- /dev/null +++ b/tools/proteomics/file_conversion/mgf_mzml_converter/tests/test_mgf_mzml_converter.py @@ -0,0 +1,300 @@ +"""Tests for mgf_mzml_converter.""" + +import os +import tempfile + +from conftest import requires_pyopenms + +SAMPLE_MGF = """\ +BEGIN IONS +TITLE=scan=1 +RTINSECONDS=120.5 +PEPMASS=500.250000 +CHARGE=2+ +100.100000 1000.0000 +200.200000 2000.0000 +300.300000 1500.0000 +END IONS + +BEGIN IONS +TITLE=scan=2 +RTINSECONDS=130.0 +PEPMASS=600.350000 +CHARGE=3+ +150.150000 800.0000 +250.250000 1200.0000 +END IONS +""" + + +# --------------------------------------------------------------------------- +# MGF → mzML tests +# --------------------------------------------------------------------------- + + +@requires_pyopenms +def test_parse_mgf(): + from mgf_mzml_converter import parse_mgf + + with tempfile.TemporaryDirectory() as tmp: + mgf_path = os.path.join(tmp, "test.mgf") + with open(mgf_path, "w") as fh: + fh.write(SAMPLE_MGF) + + spectra = parse_mgf(mgf_path) + assert len(spectra) == 2 + assert spectra[0]["title"] == "scan=1" + assert abs(spectra[0]["pepmass"] - 500.25) < 0.01 + assert spectra[0]["charge"] == 2 + assert len(spectra[0]["peaks"]) == 3 + assert len(spectra[1]["peaks"]) == 2 + + +@requires_pyopenms +def test_convert_mgf_to_mzml(): + import pyopenms as oms + from mgf_mzml_converter import convert_mgf_to_mzml + + with tempfile.TemporaryDirectory() as tmp: + mgf_path = os.path.join(tmp, "test.mgf") + mzml_path = os.path.join(tmp, "test.mzML") + + with open(mgf_path, "w") as fh: + fh.write(SAMPLE_MGF) + + stats = convert_mgf_to_mzml(mgf_path, mzml_path) + assert stats["spectra_converted"] == 2 + + exp = oms.MSExperiment() + oms.MzMLFile().load(mzml_path, exp) + assert exp.size() == 2 + + s = exp[0] + assert s.getMSLevel() == 2 + precursors = s.getPrecursors() + assert len(precursors) == 1 + assert abs(precursors[0].getMZ() - 500.25) < 0.01 + assert precursors[0].getCharge() == 2 + + mz_arr, _ = s.get_peaks() + assert len(mz_arr) == 3 + + +# --------------------------------------------------------------------------- +# mzML → MGF tests +# --------------------------------------------------------------------------- + + +@requires_pyopenms +def test_create_synthetic_mzml(): + import pyopenms as oms + from mgf_mzml_converter import create_synthetic_mzml + + with tempfile.TemporaryDirectory() as tmp: + mzml_path = os.path.join(tmp, "test.mzML") + create_synthetic_mzml(mzml_path, n_spectra=3) + + exp = oms.MSExperiment() + oms.MzMLFile().load(mzml_path, exp) + assert exp.size() == 4 # 1 MS1 + 3 MS2 + + +@requires_pyopenms +def test_convert_mzml_to_mgf(): + from mgf_mzml_converter import convert_mzml_to_mgf, create_synthetic_mzml + + with tempfile.TemporaryDirectory() as tmp: + mzml_path = os.path.join(tmp, "test.mzML") + mgf_path = os.path.join(tmp, "test.mgf") + + create_synthetic_mzml(mzml_path, n_spectra=3) + stats = convert_mzml_to_mgf(mzml_path, mgf_path) + + assert stats["converted"] == 3 + assert stats["ms_level_spectra"] == 3 + assert stats["filtered_out"] == 0 + + with open(mgf_path) as fh: + content = fh.read() + assert content.count("BEGIN IONS") == 3 + assert "PEPMASS=" in content + assert "CHARGE=" in content + + +@requires_pyopenms +def test_mgf_content_format(): + from mgf_mzml_converter import convert_mzml_to_mgf, create_synthetic_mzml + + with tempfile.TemporaryDirectory() as tmp: + mzml_path = os.path.join(tmp, "test.mzML") + mgf_path = os.path.join(tmp, "test.mgf") + + create_synthetic_mzml(mzml_path, n_spectra=1) + convert_mzml_to_mgf(mzml_path, mgf_path) + + with open(mgf_path) as fh: + lines = fh.readlines() + + assert lines[0].strip() == "BEGIN IONS" + assert any(line.startswith("TITLE=") for line in lines) + assert any(line.startswith("PEPMASS=") for line in lines) + + +@requires_pyopenms +def test_min_peaks_filter(): + from mgf_mzml_converter import convert_mzml_to_mgf, create_synthetic_mzml + + with tempfile.TemporaryDirectory() as tmp: + mzml_path = os.path.join(tmp, "test.mzML") + mgf_path = os.path.join(tmp, "test.mgf") + + create_synthetic_mzml(mzml_path, n_spectra=3) + stats = convert_mzml_to_mgf(mzml_path, mgf_path, min_peaks=100) + assert stats["converted"] == 0 + assert stats["filtered_out"] == 3 + + +@requires_pyopenms +def test_charge_filter(): + from mgf_mzml_converter import convert_mzml_to_mgf, create_synthetic_mzml + + with tempfile.TemporaryDirectory() as tmp: + mzml_path = os.path.join(tmp, "test.mzML") + mgf_path = os.path.join(tmp, "test.mgf") + + create_synthetic_mzml(mzml_path, n_spectra=6) + stats = convert_mzml_to_mgf(mzml_path, mgf_path, charges=(2,)) + assert stats["converted"] == 3 + assert stats["filtered_out"] == 3 + + with open(mgf_path) as fh: + content = fh.read() + assert "CHARGE=2+" in content + assert "CHARGE=3+" not in content + + +@requires_pyopenms +def test_rt_range_filter(): + from mgf_mzml_converter import convert_mzml_to_mgf, create_synthetic_mzml + + with tempfile.TemporaryDirectory() as tmp: + mzml_path = os.path.join(tmp, "test.mzML") + mgf_path = os.path.join(tmp, "test.mgf") + + create_synthetic_mzml(mzml_path, n_spectra=5) + stats = convert_mzml_to_mgf(mzml_path, mgf_path, rt_min=10.5, rt_max=11.5) + assert stats["converted"] == 3 + assert stats["filtered_out"] == 2 + + +@requires_pyopenms +def test_mz_range_filter(): + from mgf_mzml_converter import convert_mzml_to_mgf, create_synthetic_mzml + + with tempfile.TemporaryDirectory() as tmp: + mzml_path = os.path.join(tmp, "test.mzML") + mgf_path = os.path.join(tmp, "test.mgf") + + create_synthetic_mzml(mzml_path, n_spectra=5) + stats = convert_mzml_to_mgf(mzml_path, mgf_path, mz_min=550.0, mz_max=650.0) + assert stats["converted"] == 3 + assert stats["filtered_out"] == 2 + + +@requires_pyopenms +def test_min_intensity_filter(): + from mgf_mzml_converter import convert_mzml_to_mgf, create_synthetic_mzml + + with tempfile.TemporaryDirectory() as tmp: + mzml_path = os.path.join(tmp, "test.mzML") + mgf_path = os.path.join(tmp, "test.mgf") + + create_synthetic_mzml(mzml_path, n_spectra=3) + stats = convert_mzml_to_mgf(mzml_path, mgf_path, min_intensity=10001.0) + assert stats["converted"] == 0 + + stats = convert_mzml_to_mgf(mzml_path, mgf_path, min_intensity=5000.0) + assert stats["converted"] == 3 + + +@requires_pyopenms +def test_combined_filters(): + from mgf_mzml_converter import convert_mzml_to_mgf, create_synthetic_mzml + + with tempfile.TemporaryDirectory() as tmp: + mzml_path = os.path.join(tmp, "test.mzML") + mgf_path = os.path.join(tmp, "test.mgf") + + create_synthetic_mzml(mzml_path, n_spectra=6) + stats = convert_mzml_to_mgf( + mzml_path, mgf_path, + charges=(2,), rt_min=10.0, rt_max=11.5, mz_min=500.0, mz_max=700.0, + ) + assert stats["converted"] == 2 + + +# --------------------------------------------------------------------------- +# Direction auto-detection tests +# --------------------------------------------------------------------------- + + +@requires_pyopenms +def test_auto_detect_mgf_to_mzml(): + from mgf_mzml_converter import _detect_direction + + direction = _detect_direction("spectra.mgf", "spectra.mzML") + assert direction == "mgf2mzml" + + +@requires_pyopenms +def test_auto_detect_mzml_to_mgf(): + from mgf_mzml_converter import _detect_direction + + direction = _detect_direction("run.mzML", "out.mgf") + assert direction == "mzml2mgf" + + +@requires_pyopenms +def test_auto_detect_mzml_lowercase(): + from mgf_mzml_converter import _detect_direction + + direction = _detect_direction("run.mzml", "out.mgf") + assert direction == "mzml2mgf" + + +@requires_pyopenms +def test_auto_detect_from_output_extension(): + """When input has unknown extension, infer from output extension.""" + from mgf_mzml_converter import _detect_direction + + direction = _detect_direction("data.unknown", "out.mzML") + assert direction == "mgf2mzml" + + +@requires_pyopenms +def test_roundtrip_mgf_mzml_mgf(): + """MGF → mzML → MGF round-trip preserves spectrum count.""" + import pyopenms as oms + from mgf_mzml_converter import convert_mgf_to_mzml, convert_mzml_to_mgf + + with tempfile.TemporaryDirectory() as tmp: + mgf_in = os.path.join(tmp, "in.mgf") + mzml_path = os.path.join(tmp, "intermediate.mzML") + mgf_out = os.path.join(tmp, "out.mgf") + + with open(mgf_in, "w") as fh: + fh.write(SAMPLE_MGF) + + stats1 = convert_mgf_to_mzml(mgf_in, mzml_path) + assert stats1["spectra_converted"] == 2 + + exp = oms.MSExperiment() + oms.MzMLFile().load(mzml_path, exp) + assert exp.size() == 2 + + stats2 = convert_mzml_to_mgf(mzml_path, mgf_out) + assert stats2["converted"] == 2 + + with open(mgf_out) as fh: + content = fh.read() + assert content.count("BEGIN IONS") == 2 diff --git a/tools/proteomics/file_conversion/mgf_to_mzml_converter/README.md b/tools/proteomics/file_conversion/mgf_to_mzml_converter/README.md deleted file mode 100644 index 1950599..0000000 --- a/tools/proteomics/file_conversion/mgf_to_mzml_converter/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# MGF to mzML Converter - -Convert MGF (Mascot Generic Format) spectra to mzML format. - -## Installation - -```bash -pip install -r requirements.txt -``` - -## Usage - -```bash -python mgf_to_mzml_converter.py --input spectra.mgf --output spectra.mzML -``` diff --git a/tools/proteomics/file_conversion/mgf_to_mzml_converter/mgf_to_mzml_converter.py b/tools/proteomics/file_conversion/mgf_to_mzml_converter/mgf_to_mzml_converter.py deleted file mode 100644 index 51aa79e..0000000 --- a/tools/proteomics/file_conversion/mgf_to_mzml_converter/mgf_to_mzml_converter.py +++ /dev/null @@ -1,110 +0,0 @@ -""" -MGF to mzML Converter -===================== -Convert MGF (Mascot Generic Format) spectra to mzML format. - -Usage ------ - python mgf_to_mzml_converter.py --input spectra.mgf --output spectra.mzML -""" - -import sys -from typing import List - -import click - -try: - import pyopenms as oms -except ImportError: - sys.exit("pyopenms is required. Install it with: pip install pyopenms") - - -def parse_mgf(input_path: str) -> List[dict]: - """Parse an MGF file and return a list of spectrum dicts. - - Each dict has keys: title, pepmass, charge, rt, peaks (list of (mz, intensity) tuples). - """ - spectra = [] - current = None - - with open(input_path) as fh: - for line in fh: - line = line.strip() - if line == "BEGIN IONS": - current = {"title": "", "pepmass": 0.0, "charge": 0, "rt": 0.0, "peaks": []} - elif line == "END IONS": - if current is not None: - spectra.append(current) - current = None - elif current is not None: - if line.startswith("TITLE="): - current["title"] = line[6:] - elif line.startswith("PEPMASS="): - parts = line[8:].split() - current["pepmass"] = float(parts[0]) - elif line.startswith("CHARGE="): - charge_str = line[7:].replace("+", "").replace("-", "") - try: - current["charge"] = int(charge_str) - except ValueError: - current["charge"] = 0 - elif line.startswith("RTINSECONDS="): - current["rt"] = float(line[12:]) - elif line and line[0].isdigit(): - parts = line.split() - if len(parts) >= 2: - try: - mz = float(parts[0]) - intensity = float(parts[1]) - current["peaks"].append((mz, intensity)) - except ValueError: - pass - - return spectra - - -def convert_mgf_to_mzml(input_path: str, output_path: str) -> dict: - """Convert MGF to mzML format. - - Returns statistics about the conversion. - """ - mgf_spectra = parse_mgf(input_path) - exp = oms.MSExperiment() - - for i, spec_data in enumerate(mgf_spectra): - spectrum = oms.MSSpectrum() - spectrum.setMSLevel(2) - spectrum.setRT(spec_data["rt"]) - spectrum.setNativeID(spec_data["title"] if spec_data["title"] else f"index={i}") - - # Set precursor - if spec_data["pepmass"] > 0: - prec = oms.Precursor() - prec.setMZ(spec_data["pepmass"]) - if spec_data["charge"] > 0: - prec.setCharge(spec_data["charge"]) - spectrum.setPrecursors([prec]) - - # Set peaks - if spec_data["peaks"]: - mzs = [p[0] for p in spec_data["peaks"]] - intensities = [p[1] for p in spec_data["peaks"]] - spectrum.set_peaks((mzs, intensities)) - - exp.addSpectrum(spectrum) - - oms.MzMLFile().store(output_path, exp) - - return {"spectra_converted": len(mgf_spectra)} - - -@click.command(help="Convert MGF to mzML format.") -@click.option("--input", "input", required=True, help="Input MGF file") -@click.option("--output", required=True, help="Output mzML file") -def main(input, output) -> None: - stats = convert_mgf_to_mzml(input, output) - print(f"Converted {stats['spectra_converted']} spectra to {output}") - - -if __name__ == "__main__": - main() diff --git a/tools/proteomics/file_conversion/mgf_to_mzml_converter/requirements.txt b/tools/proteomics/file_conversion/mgf_to_mzml_converter/requirements.txt deleted file mode 100644 index 18d6bbb..0000000 --- a/tools/proteomics/file_conversion/mgf_to_mzml_converter/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pyopenms -click diff --git a/tools/proteomics/file_conversion/mgf_to_mzml_converter/tests/conftest.py b/tools/proteomics/file_conversion/mgf_to_mzml_converter/tests/conftest.py deleted file mode 100644 index 1a21ede..0000000 --- a/tools/proteomics/file_conversion/mgf_to_mzml_converter/tests/conftest.py +++ /dev/null @@ -1,15 +0,0 @@ -import os -import sys - -import pytest - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) - -try: - import pyopenms # noqa: F401 - - HAS_PYOPENMS = True -except ImportError: - HAS_PYOPENMS = False - -requires_pyopenms = pytest.mark.skipif(not HAS_PYOPENMS, reason="pyopenms not installed") diff --git a/tools/proteomics/file_conversion/mgf_to_mzml_converter/tests/test_mgf_to_mzml_converter.py b/tools/proteomics/file_conversion/mgf_to_mzml_converter/tests/test_mgf_to_mzml_converter.py deleted file mode 100644 index f51f243..0000000 --- a/tools/proteomics/file_conversion/mgf_to_mzml_converter/tests/test_mgf_to_mzml_converter.py +++ /dev/null @@ -1,101 +0,0 @@ -"""Tests for mgf_to_mzml_converter.""" - -import os -import tempfile - -from conftest import requires_pyopenms - -SAMPLE_MGF = """\ -BEGIN IONS -TITLE=scan=1 -RTINSECONDS=120.5 -PEPMASS=500.250000 -CHARGE=2+ -100.100000 1000.0000 -200.200000 2000.0000 -300.300000 1500.0000 -END IONS - -BEGIN IONS -TITLE=scan=2 -RTINSECONDS=130.0 -PEPMASS=600.350000 -CHARGE=3+ -150.150000 800.0000 -250.250000 1200.0000 -END IONS -""" - - -@requires_pyopenms -def test_parse_mgf(): - from mgf_to_mzml_converter import parse_mgf - - with tempfile.TemporaryDirectory() as tmp: - mgf_path = os.path.join(tmp, "test.mgf") - with open(mgf_path, "w") as fh: - fh.write(SAMPLE_MGF) - - spectra = parse_mgf(mgf_path) - assert len(spectra) == 2 - assert spectra[0]["title"] == "scan=1" - assert abs(spectra[0]["pepmass"] - 500.25) < 0.01 - assert spectra[0]["charge"] == 2 - assert len(spectra[0]["peaks"]) == 3 - assert len(spectra[1]["peaks"]) == 2 - - -@requires_pyopenms -def test_convert_mgf_to_mzml(): - import pyopenms as oms - from mgf_to_mzml_converter import convert_mgf_to_mzml - - with tempfile.TemporaryDirectory() as tmp: - mgf_path = os.path.join(tmp, "test.mgf") - mzml_path = os.path.join(tmp, "test.mzML") - - with open(mgf_path, "w") as fh: - fh.write(SAMPLE_MGF) - - stats = convert_mgf_to_mzml(mgf_path, mzml_path) - assert stats["spectra_converted"] == 2 - - exp = oms.MSExperiment() - oms.MzMLFile().load(mzml_path, exp) - assert exp.size() == 2 - - # Check first spectrum - s = exp[0] - assert s.getMSLevel() == 2 - precursors = s.getPrecursors() - assert len(precursors) == 1 - assert abs(precursors[0].getMZ() - 500.25) < 0.01 - assert precursors[0].getCharge() == 2 - - mz_arr, int_arr = s.get_peaks() - assert len(mz_arr) == 3 - - -@requires_pyopenms -def test_roundtrip_mgf_mzml_mgf(): - """Test MGF -> mzML -> MGF roundtrip.""" - import sys - - from mgf_to_mzml_converter import convert_mgf_to_mzml - sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "mzml_to_mgf_converter")) - - with tempfile.TemporaryDirectory() as tmp: - mgf_path = os.path.join(tmp, "test.mgf") - mzml_path = os.path.join(tmp, "test.mzML") - - with open(mgf_path, "w") as fh: - fh.write(SAMPLE_MGF) - - stats = convert_mgf_to_mzml(mgf_path, mzml_path) - assert stats["spectra_converted"] == 2 - - # Verify mzML is valid - import pyopenms as oms - exp = oms.MSExperiment() - oms.MzMLFile().load(mzml_path, exp) - assert exp.size() == 2 diff --git a/tools/proteomics/file_conversion/mzml_to_mgf_converter/README.md b/tools/proteomics/file_conversion/mzml_to_mgf_converter/README.md deleted file mode 100644 index 9dd8b98..0000000 --- a/tools/proteomics/file_conversion/mzml_to_mgf_converter/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# mzML to MGF Converter - -Convert MS2 spectra from mzML format to MGF (Mascot Generic Format). - -## Installation - -```bash -pip install -r requirements.txt -``` - -## Usage - -```bash -# Convert MS2 spectra -python mzml_to_mgf_converter.py --input run.mzML --ms-level 2 --output spectra.mgf - -# Filter by minimum number of peaks -python mzml_to_mgf_converter.py --input run.mzML --min-peaks 10 --output spectra.mgf -``` diff --git a/tools/proteomics/file_conversion/mzml_to_mgf_converter/mzml_to_mgf_converter.py b/tools/proteomics/file_conversion/mzml_to_mgf_converter/mzml_to_mgf_converter.py deleted file mode 100644 index 658d900..0000000 --- a/tools/proteomics/file_conversion/mzml_to_mgf_converter/mzml_to_mgf_converter.py +++ /dev/null @@ -1,214 +0,0 @@ -""" -mzML to MGF Converter -===================== -Convert MS2 spectra from mzML format to MGF (Mascot Generic Format) -with optional filtering by charge state, retention time, precursor m/z, -and minimum peak count. - -Usage ------ - python mzml_to_mgf_converter.py --input run.mzML --output spectra.mgf - python mzml_to_mgf_converter.py --input run.mzML --output spectra.mgf --charge 2 3 - python mzml_to_mgf_converter.py --input run.mzML --output spectra.mgf --rt-min 600 --rt-max 1800 - python mzml_to_mgf_converter.py --input run.mzML --output spectra.mgf --mz-min 400 --mz-max 1200 -""" - -import sys -from typing import List, Optional - -import click - -try: - import pyopenms as oms -except ImportError: - sys.exit("pyopenms is required. Install it with: pip install pyopenms") - -PROTON = 1.007276 - - -def load_mzml(input_path: str) -> oms.MSExperiment: - """Load an mzML file into an MSExperiment.""" - exp = oms.MSExperiment() - oms.MzMLFile().load(input_path, exp) - return exp - - -def get_spectra_by_level(exp: oms.MSExperiment, ms_level: int = 2) -> List[oms.MSSpectrum]: - """Extract spectra of a given MS level from an experiment.""" - return [s for s in exp if s.getMSLevel() == ms_level] - - -def passes_filters( - spectrum: oms.MSSpectrum, - min_peaks: int = 1, - charges: Optional[tuple] = None, - rt_min: Optional[float] = None, - rt_max: Optional[float] = None, - mz_min: Optional[float] = None, - mz_max: Optional[float] = None, - min_intensity: Optional[float] = None, -) -> bool: - """Check whether a spectrum passes all active filters.""" - mz_array, intensity_array = spectrum.get_peaks() - if len(mz_array) < min_peaks: - return False - - if rt_min is not None and spectrum.getRT() < rt_min: - return False - if rt_max is not None and spectrum.getRT() > rt_max: - return False - - precursors = spectrum.getPrecursors() - if precursors: - prec = precursors[0] - if charges and prec.getCharge() not in charges: - return False - if mz_min is not None and prec.getMZ() < mz_min: - return False - if mz_max is not None and prec.getMZ() > mz_max: - return False - elif charges or mz_min is not None or mz_max is not None: - return False - - if min_intensity is not None: - if len(intensity_array) == 0 or max(intensity_array) < min_intensity: - return False - - return True - - -def spectrum_to_mgf_block(spectrum: oms.MSSpectrum, index: int) -> str: - """Convert a single spectrum to an MGF block string.""" - lines = ["BEGIN IONS"] - - native_id = spectrum.getNativeID() if spectrum.getNativeID() else f"index={index}" - lines.append(f"TITLE={native_id}") - - rt = spectrum.getRT() - lines.append(f"RTINSECONDS={rt:.4f}") - - precursors = spectrum.getPrecursors() - if precursors: - prec = precursors[0] - mz = prec.getMZ() - charge = prec.getCharge() - lines.append(f"PEPMASS={mz:.6f}") - if charge > 0: - lines.append(f"CHARGE={charge}+") - - mz_array, intensity_array = spectrum.get_peaks() - for mz_val, intensity_val in zip(mz_array, intensity_array): - lines.append(f"{mz_val:.6f} {intensity_val:.4f}") - - lines.append("END IONS") - lines.append("") - return "\n".join(lines) - - -def convert_mzml_to_mgf( - input_path: str, - output_path: str, - ms_level: int = 2, - min_peaks: int = 1, - charges: Optional[tuple] = None, - rt_min: Optional[float] = None, - rt_max: Optional[float] = None, - mz_min: Optional[float] = None, - mz_max: Optional[float] = None, - min_intensity: Optional[float] = None, -) -> dict: - """Convert mzML to MGF format with optional filtering. - - Returns statistics about the conversion. - """ - exp = load_mzml(input_path) - spectra = get_spectra_by_level(exp, ms_level) - - converted = 0 - filtered_out = 0 - with open(output_path, "w") as fh: - for i, spectrum in enumerate(spectra): - if not passes_filters( - spectrum, min_peaks, charges, rt_min, rt_max, mz_min, mz_max, - min_intensity, - ): - filtered_out += 1 - continue - block = spectrum_to_mgf_block(spectrum, i) - fh.write(block + "\n") - converted += 1 - - return { - "total_spectra": exp.size(), - "ms_level_spectra": len(spectra), - "converted": converted, - "filtered_out": filtered_out, - } - - -def create_synthetic_mzml(output_path: str, n_spectra: int = 5) -> None: - """Create a synthetic mzML file with MS2 spectra for testing. - - Generates spectra with: - - RT: 10.0, 10.5, 11.0, ... (0.5s apart) - - precursor m/z: 500, 550, 600, ... - - charge: alternating 2, 3 - - base peak intensity: 10000, 9000, 8000, ... - """ - exp = oms.MSExperiment() - - ms1 = oms.MSSpectrum() - ms1.setMSLevel(1) - ms1.setRT(10.0) - ms1.set_peaks(([100.0, 200.0, 300.0], [1000.0, 2000.0, 1500.0])) - exp.addSpectrum(ms1) - - for i in range(n_spectra): - ms2 = oms.MSSpectrum() - ms2.setMSLevel(2) - ms2.setRT(10.0 + i * 0.5) - ms2.setNativeID(f"scan={i + 1}") - - prec = oms.Precursor() - prec.setMZ(500.0 + i * 50.0) - prec.setCharge(2 if i % 2 == 0 else 3) - ms2.setPrecursors([prec]) - - mzs = [100.0 + j * 50.0 for j in range(10)] - intensities = [1000.0 * (10 - j) for j in range(10)] - ms2.set_peaks((mzs, intensities)) - exp.addSpectrum(ms2) - - oms.MzMLFile().store(output_path, exp) - - -@click.command(help="Convert MS2 spectra from mzML to MGF format with optional filtering.") -@click.option("--input", "input", required=True, help="Input mzML file") -@click.option("--ms-level", type=int, default=2, help="MS level to extract (default: 2)") -@click.option("--min-peaks", type=int, default=1, help="Minimum peaks per spectrum (default: 1)") -@click.option("--charge", multiple=True, type=int, help="Keep only these charge states (repeatable)") -@click.option("--rt-min", type=float, default=None, help="Minimum retention time in seconds") -@click.option("--rt-max", type=float, default=None, help="Maximum retention time in seconds") -@click.option("--mz-min", type=float, default=None, help="Minimum precursor m/z") -@click.option("--mz-max", type=float, default=None, help="Maximum precursor m/z") -@click.option( - "--min-intensity", type=float, default=None, - help="Minimum base peak intensity to keep a spectrum", -) -@click.option("--output", required=True, help="Output MGF file") -def main(input, ms_level, min_peaks, charge, rt_min, rt_max, mz_min, mz_max, - min_intensity, output) -> None: - charges = tuple(charge) if charge else None - stats = convert_mzml_to_mgf( - input, output, ms_level, min_peaks, charges, rt_min, rt_max, - mz_min, mz_max, min_intensity, - ) - print( - f"Converted {stats['converted']} / {stats['ms_level_spectra']} " - f"MS{ms_level} spectra to {output} " - f"({stats['filtered_out']} filtered out)" - ) - - -if __name__ == "__main__": - main() diff --git a/tools/proteomics/file_conversion/mzml_to_mgf_converter/requirements.txt b/tools/proteomics/file_conversion/mzml_to_mgf_converter/requirements.txt deleted file mode 100644 index 18d6bbb..0000000 --- a/tools/proteomics/file_conversion/mzml_to_mgf_converter/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pyopenms -click diff --git a/tools/proteomics/file_conversion/mzml_to_mgf_converter/tests/conftest.py b/tools/proteomics/file_conversion/mzml_to_mgf_converter/tests/conftest.py deleted file mode 100644 index 1a21ede..0000000 --- a/tools/proteomics/file_conversion/mzml_to_mgf_converter/tests/conftest.py +++ /dev/null @@ -1,15 +0,0 @@ -import os -import sys - -import pytest - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) - -try: - import pyopenms # noqa: F401 - - HAS_PYOPENMS = True -except ImportError: - HAS_PYOPENMS = False - -requires_pyopenms = pytest.mark.skipif(not HAS_PYOPENMS, reason="pyopenms not installed") diff --git a/tools/proteomics/file_conversion/mzml_to_mgf_converter/tests/test_mzml_to_mgf_converter.py b/tools/proteomics/file_conversion/mzml_to_mgf_converter/tests/test_mzml_to_mgf_converter.py deleted file mode 100644 index 8902e5e..0000000 --- a/tools/proteomics/file_conversion/mzml_to_mgf_converter/tests/test_mzml_to_mgf_converter.py +++ /dev/null @@ -1,186 +0,0 @@ -"""Tests for mzml_to_mgf_converter.""" - -import os -import tempfile - -from conftest import requires_pyopenms - - -@requires_pyopenms -def test_create_synthetic_mzml(): - import pyopenms as oms - from mzml_to_mgf_converter import create_synthetic_mzml - - with tempfile.TemporaryDirectory() as tmp: - mzml_path = os.path.join(tmp, "test.mzML") - create_synthetic_mzml(mzml_path, n_spectra=3) - - exp = oms.MSExperiment() - oms.MzMLFile().load(mzml_path, exp) - assert exp.size() == 4 # 1 MS1 + 3 MS2 - - -@requires_pyopenms -def test_convert_mzml_to_mgf(): - from mzml_to_mgf_converter import convert_mzml_to_mgf, create_synthetic_mzml - - with tempfile.TemporaryDirectory() as tmp: - mzml_path = os.path.join(tmp, "test.mzML") - mgf_path = os.path.join(tmp, "test.mgf") - - create_synthetic_mzml(mzml_path, n_spectra=3) - stats = convert_mzml_to_mgf(mzml_path, mgf_path) - - assert stats["converted"] == 3 - assert stats["ms_level_spectra"] == 3 - assert stats["filtered_out"] == 0 - - with open(mgf_path) as fh: - content = fh.read() - assert content.count("BEGIN IONS") == 3 - assert "PEPMASS=" in content - assert "CHARGE=" in content - - -@requires_pyopenms -def test_mgf_content_format(): - from mzml_to_mgf_converter import convert_mzml_to_mgf, create_synthetic_mzml - - with tempfile.TemporaryDirectory() as tmp: - mzml_path = os.path.join(tmp, "test.mzML") - mgf_path = os.path.join(tmp, "test.mgf") - - create_synthetic_mzml(mzml_path, n_spectra=1) - convert_mzml_to_mgf(mzml_path, mgf_path) - - with open(mgf_path) as fh: - lines = fh.readlines() - - assert lines[0].strip() == "BEGIN IONS" - assert any(line.startswith("TITLE=") for line in lines) - assert any(line.startswith("PEPMASS=") for line in lines) - - -@requires_pyopenms -def test_min_peaks_filter(): - from mzml_to_mgf_converter import convert_mzml_to_mgf, create_synthetic_mzml - - with tempfile.TemporaryDirectory() as tmp: - mzml_path = os.path.join(tmp, "test.mzML") - mgf_path = os.path.join(tmp, "test.mgf") - - create_synthetic_mzml(mzml_path, n_spectra=3) - stats = convert_mzml_to_mgf(mzml_path, mgf_path, min_peaks=100) - assert stats["converted"] == 0 - assert stats["filtered_out"] == 3 - - -@requires_pyopenms -def test_charge_filter(): - """Synthetic spectra alternate charge 2, 3. Filter for charge 2 only.""" - from mzml_to_mgf_converter import convert_mzml_to_mgf, create_synthetic_mzml - - with tempfile.TemporaryDirectory() as tmp: - mzml_path = os.path.join(tmp, "test.mzML") - mgf_path = os.path.join(tmp, "test.mgf") - - create_synthetic_mzml(mzml_path, n_spectra=6) - stats = convert_mzml_to_mgf(mzml_path, mgf_path, charges=(2,)) - # Spectra 0, 2, 4 have charge 2; spectra 1, 3, 5 have charge 3 - assert stats["converted"] == 3 - assert stats["filtered_out"] == 3 - - with open(mgf_path) as fh: - content = fh.read() - assert "CHARGE=2+" in content - assert "CHARGE=3+" not in content - - -@requires_pyopenms -def test_charge_filter_multiple(): - """Filter for both charge 2 and 3 — should keep all.""" - from mzml_to_mgf_converter import convert_mzml_to_mgf, create_synthetic_mzml - - with tempfile.TemporaryDirectory() as tmp: - mzml_path = os.path.join(tmp, "test.mzML") - mgf_path = os.path.join(tmp, "test.mgf") - - create_synthetic_mzml(mzml_path, n_spectra=4) - stats = convert_mzml_to_mgf(mzml_path, mgf_path, charges=(2, 3)) - assert stats["converted"] == 4 - assert stats["filtered_out"] == 0 - - -@requires_pyopenms -def test_rt_range_filter(): - """Synthetic spectra at RT 10.0, 10.5, 11.0, 11.5, 12.0. Filter RT 10.5-11.5.""" - from mzml_to_mgf_converter import convert_mzml_to_mgf, create_synthetic_mzml - - with tempfile.TemporaryDirectory() as tmp: - mzml_path = os.path.join(tmp, "test.mzML") - mgf_path = os.path.join(tmp, "test.mgf") - - create_synthetic_mzml(mzml_path, n_spectra=5) - stats = convert_mzml_to_mgf( - mzml_path, mgf_path, rt_min=10.5, rt_max=11.5, - ) - # RT 10.5, 11.0, 11.5 pass; RT 10.0, 12.0 filtered out - assert stats["converted"] == 3 - assert stats["filtered_out"] == 2 - - -@requires_pyopenms -def test_mz_range_filter(): - """Synthetic precursor m/z: 500, 550, 600, 650, 700. Filter 550-650.""" - from mzml_to_mgf_converter import convert_mzml_to_mgf, create_synthetic_mzml - - with tempfile.TemporaryDirectory() as tmp: - mzml_path = os.path.join(tmp, "test.mzML") - mgf_path = os.path.join(tmp, "test.mgf") - - create_synthetic_mzml(mzml_path, n_spectra=5) - stats = convert_mzml_to_mgf( - mzml_path, mgf_path, mz_min=550.0, mz_max=650.0, - ) - # m/z 550, 600, 650 pass; 500, 700 filtered out - assert stats["converted"] == 3 - assert stats["filtered_out"] == 2 - - -@requires_pyopenms -def test_min_intensity_filter(): - """All synthetic spectra have base peak intensity 10000. Filter > 10000 removes all.""" - from mzml_to_mgf_converter import convert_mzml_to_mgf, create_synthetic_mzml - - with tempfile.TemporaryDirectory() as tmp: - mzml_path = os.path.join(tmp, "test.mzML") - mgf_path = os.path.join(tmp, "test.mgf") - - create_synthetic_mzml(mzml_path, n_spectra=3) - stats = convert_mzml_to_mgf(mzml_path, mgf_path, min_intensity=10001.0) - assert stats["converted"] == 0 - - stats = convert_mzml_to_mgf(mzml_path, mgf_path, min_intensity=5000.0) - assert stats["converted"] == 3 - - -@requires_pyopenms -def test_combined_filters(): - """Apply charge + RT + m/z filters together.""" - from mzml_to_mgf_converter import convert_mzml_to_mgf, create_synthetic_mzml - - with tempfile.TemporaryDirectory() as tmp: - mzml_path = os.path.join(tmp, "test.mzML") - mgf_path = os.path.join(tmp, "test.mgf") - - # 6 spectra: RT 10.0-12.5, m/z 500-750, charge alternating 2/3 - create_synthetic_mzml(mzml_path, n_spectra=6) - stats = convert_mzml_to_mgf( - mzml_path, mgf_path, - charges=(2,), rt_min=10.0, rt_max=11.5, mz_min=500.0, mz_max=700.0, - ) - # charge 2: indices 0, 2, 4 (RT 10.0/11.0/12.0, m/z 500/600/700) - # RT filter: removes index 4 (RT 12.0) - # m/z filter: all 500, 600, 700 pass (700 is at boundary, passes <=) - # Result: indices 0, 2 pass - assert stats["converted"] == 2 From 6233d206fe734af4f8ef775ed093e76701311f14 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:56:23 +0000 Subject: [PATCH 2/2] Fix CI: skip linting/testing deleted tool directories Co-authored-by: ypriverol <52113+ypriverol@users.noreply.github.com> Agent-Logs-Url: https://github.com/OpenMS/agentomics/sessions/ea65b391-3afc-47cb-9dee-24639c3de03a --- .github/workflows/validate.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index a209594..01bae5b 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -20,10 +20,12 @@ jobs: name: Detect changed tool directories run: | # Note: github.base_ref is only available on pull_request events - # Find all tool directories that changed in this PR + # Find all tool directories that changed in this PR, keeping only + # those that still exist (deleted tool dirs must not be linted/tested). CHANGED=$(git diff --name-only origin/${{ github.base_ref }}...HEAD -- 'tools/' \ | grep -oP 'tools/[^/]+/[^/]+/[^/]+/' \ | sort -u \ + | while read -r dir; do [ -d "$dir" ] && echo "$dir"; done \ | jq -R -s -c 'split("\n") | map(select(length > 0))') if [ "$CHANGED" = "[]" ] || [ -z "$CHANGED" ]; then @@ -60,9 +62,11 @@ jobs: run: | DIRS='${{ needs.detect-changes.outputs.matrix }}' echo "$DIRS" | jq -r '.[]' | while read -r dir; do - echo "::group::ruff $dir" - /tmp/validate_venv/bin/python -m ruff check "$dir" - echo "::endgroup::" + if [ -d "$dir" ]; then + echo "::group::ruff $dir" + /tmp/validate_venv/bin/python -m ruff check "$dir" + echo "::endgroup::" + fi done - name: Test changed tools