From f101864320783ba425e038ab2c5679fdc07dfcd7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 24 Mar 2026 17:43:55 +0000 Subject: [PATCH 1/2] Initial plan From 9e73afc87b25850d7a101ebe711cc6c0eb06dd5a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 24 Mar 2026 17:50:46 +0000 Subject: [PATCH 2/2] Add pyopenms scripts for proteomics and metabolomics with tests and docs Co-authored-by: ypriverol <52113+ypriverol@users.noreply.github.com> Agent-Logs-Url: https://github.com/OpenMS/agentomics/sessions/871c3e78-4575-48b8-beca-2f71e43e6ed0 --- README.md | 63 ++++++- requirements.txt | 1 + scripts/metabolomics/README.md | 46 +++++ .../metabolomics/isotope_pattern_matcher.py | 169 ++++++++++++++++++ .../metabolomics/mass_accuracy_calculator.py | 140 +++++++++++++++ .../metabolite_feature_detection.py | 145 +++++++++++++++ scripts/proteomics/README.md | 54 ++++++ .../feature_detection_proteomics.py | 101 +++++++++++ scripts/proteomics/peptide_mass_calculator.py | 133 ++++++++++++++ scripts/proteomics/protein_digest.py | 162 +++++++++++++++++ scripts/proteomics/spectrum_file_info.py | 134 ++++++++++++++ tests/test_metabolomics.py | 122 +++++++++++++ tests/test_proteomics.py | 111 ++++++++++++ 13 files changed, 1380 insertions(+), 1 deletion(-) create mode 100644 requirements.txt create mode 100644 scripts/metabolomics/README.md create mode 100644 scripts/metabolomics/isotope_pattern_matcher.py create mode 100644 scripts/metabolomics/mass_accuracy_calculator.py create mode 100644 scripts/metabolomics/metabolite_feature_detection.py create mode 100644 scripts/proteomics/README.md create mode 100644 scripts/proteomics/feature_detection_proteomics.py create mode 100644 scripts/proteomics/peptide_mass_calculator.py create mode 100644 scripts/proteomics/protein_digest.py create mode 100644 scripts/proteomics/spectrum_file_info.py create mode 100644 tests/test_metabolomics.py create mode 100644 tests/test_proteomics.py diff --git a/README.md b/README.md index fd929c9..3325b78 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,63 @@ # agentomics -A repository of agentic created tools in proteomics using pyopenms + +A repository of agentic-created scripts using [pyopenms](https://pyopenms.readthedocs.io/) +for proteomics and metabolomics users. + +## Requirements + +```bash +pip install pyopenms +``` + +Or install all dependencies at once: + +```bash +pip install -r requirements.txt +``` + +## Repository Structure + +``` +agentomics/ +├── scripts/ +│ ├── proteomics/ # Scripts for proteomics workflows +│ │ ├── peptide_mass_calculator.py +│ │ ├── protein_digest.py +│ │ ├── spectrum_file_info.py +│ │ └── feature_detection_proteomics.py +│ └── metabolomics/ # Scripts for metabolomics workflows +│ ├── metabolite_feature_detection.py +│ ├── mass_accuracy_calculator.py +│ └── isotope_pattern_matcher.py +└── tests/ + ├── test_proteomics.py + └── test_metabolomics.py +``` + +## Proteomics Scripts + +| Script | Description | +|--------|-------------| +| [`peptide_mass_calculator.py`](scripts/proteomics/peptide_mass_calculator.py) | Monoisotopic/average masses and b/y fragment ion series for peptide sequences | +| [`protein_digest.py`](scripts/proteomics/protein_digest.py) | In-silico enzymatic protein digestion (Trypsin, Lys-C, …) | +| [`spectrum_file_info.py`](scripts/proteomics/spectrum_file_info.py) | Summary statistics for mzML files (spectra, RT range, TIC) | +| [`feature_detection_proteomics.py`](scripts/proteomics/feature_detection_proteomics.py) | FeatureFinderCentroided-based peptide feature detection from LC-MS/MS data | + +See [`scripts/proteomics/README.md`](scripts/proteomics/README.md) for usage examples. + +## Metabolomics Scripts + +| Script | Description | +|--------|-------------| +| [`metabolite_feature_detection.py`](scripts/metabolomics/metabolite_feature_detection.py) | FeatureFinderMetabo-based metabolite feature detection from LC-MS data | +| [`mass_accuracy_calculator.py`](scripts/metabolomics/mass_accuracy_calculator.py) | Compute m/z mass accuracy (ppm error) for sequences or molecular formulas | +| [`isotope_pattern_matcher.py`](scripts/metabolomics/isotope_pattern_matcher.py) | Generate theoretical isotope distributions and score against observed peaks | + +See [`scripts/metabolomics/README.md`](scripts/metabolomics/README.md) for usage examples. + +## Running Tests + +```bash +pip install pytest pyopenms +python -m pytest tests/ -v +``` diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7ce28ec --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +pyopenms diff --git a/scripts/metabolomics/README.md b/scripts/metabolomics/README.md new file mode 100644 index 0000000..c57cff4 --- /dev/null +++ b/scripts/metabolomics/README.md @@ -0,0 +1,46 @@ +# Metabolomics Scripts + +A collection of agentic-created Python scripts for common metabolomics tasks +using [pyopenms](https://pyopenms.readthedocs.io/). + +## Scripts + +### `metabolite_feature_detection.py` +Detect small-molecule features (isotope envelopes) in centroided LC-MS data +using the `FeatureFinderMetabo` pipeline (mass tracing → elution peak +detection → feature finding). Output is written as a featureXML file. + +```bash +python metabolite_feature_detection.py --input sample.mzML +python metabolite_feature_detection.py --input sample.mzML --output features.featureXML --noise 1e5 +``` + +### `mass_accuracy_calculator.py` +Calculate the mass accuracy (ppm error) between a theoretical value derived +from a peptide sequence or molecular formula and one or more observed m/z +values. + +```bash +# Peptide sequence +python mass_accuracy_calculator.py --sequence PEPTIDEK --observed 803.4560 + +# Molecular formula +python mass_accuracy_calculator.py --formula C6H12O6 --observed 181.0709 + +# Multiple observed values at charge 2 +python mass_accuracy_calculator.py --sequence ACDEFGHIK --charge 2 \ + --observed 554.2478 554.2480 554.2482 +``` + +### `isotope_pattern_matcher.py` +Generate a theoretical isotope distribution for any molecular formula and +optionally compute a cosine similarity score against observed peaks. + +```bash +# Generate pattern for glucose +python isotope_pattern_matcher.py --formula C6H12O6 + +# Compare against observed peaks +python isotope_pattern_matcher.py --formula C6H12O6 \ + --peaks 181.0709,100.0 182.0742,6.7 183.0775,0.4 +``` diff --git a/scripts/metabolomics/isotope_pattern_matcher.py b/scripts/metabolomics/isotope_pattern_matcher.py new file mode 100644 index 0000000..f0510d4 --- /dev/null +++ b/scripts/metabolomics/isotope_pattern_matcher.py @@ -0,0 +1,169 @@ +""" +Isotope Pattern Generator & Matcher +===================================== +Generate the theoretical isotope distribution for a molecular formula +and optionally compare it against an observed spectrum to compute a +cosine similarity score using pyopenms. + +Usage +----- + # Generate isotope pattern for glucose + python isotope_pattern_matcher.py --formula C6H12O6 + + # Compare against observed peaks (m/z intensity pairs on stdin or --peaks) + python isotope_pattern_matcher.py --formula C6H12O6 \\ + --peaks 181.0709,100.0 182.0742,6.7 183.0775,0.4 +""" + +import argparse +import math +import sys + +try: + import pyopenms as oms +except ImportError: + sys.exit( + "pyopenms is required. Install it with: pip install pyopenms" + ) + + +def get_isotope_distribution( + formula: str, + max_isotopes: int = 5, +) -> list: + """Compute the theoretical isotope distribution for a molecular formula. + + Parameters + ---------- + formula: + Empirical formula string, e.g. ``"C6H12O6"``. + max_isotopes: + Maximum number of isotope peaks to return (default: 5). + + Returns + ------- + list of (mz, relative_abundance) + Sorted by m/z; relative abundances sum to 100. + """ + ef = oms.EmpiricalFormula(formula) + isotope_dist = ef.getIsotopeDistribution( + oms.CoarseIsotopePatternGenerator(max_isotopes) + ) + peaks = [(p.getMZ(), p.getIntensity()) for p in isotope_dist.getContainer()] + if not peaks: + return [] + max_ab = max(ab for _, ab in peaks) + return [(mz, ab / max_ab * 100) for mz, ab in peaks] + + +def cosine_similarity( + theoretical: list, + observed: list, + mz_tolerance: float = 0.02, +) -> float: + """Compute cosine similarity between theoretical and observed peak lists. + + Parameters + ---------- + theoretical: + List of ``(mz, intensity)`` tuples for the theoretical pattern. + observed: + List of ``(mz, intensity)`` tuples for the observed spectrum. + mz_tolerance: + Maximum m/z difference to match a pair of peaks (default: 0.02 Da). + + Returns + ------- + float + Cosine similarity in [0, 1]. + """ + theo_vec = [] + obs_vec = [] + for tmz, tint in theoretical: + matched = 0.0 + for omz, oint in observed: + if abs(omz - tmz) <= mz_tolerance: + matched = oint + break + theo_vec.append(tint) + obs_vec.append(matched) + + dot = sum(t * o for t, o in zip(theo_vec, obs_vec)) + norm_t = math.sqrt(sum(t ** 2 for t in theo_vec)) + norm_o = math.sqrt(sum(o ** 2 for o in obs_vec)) + if norm_t == 0 or norm_o == 0: + return 0.0 + return dot / (norm_t * norm_o) + + +def parse_peaks(peak_strings: list) -> list: + """Parse ``"mz,intensity"`` strings into ``(float, float)`` tuples.""" + peaks = [] + for s in peak_strings: + parts = s.split(",") + if len(parts) != 2: + raise ValueError( + f"Expected 'mz,intensity' format, got: {s!r}" + ) + peaks.append((float(parts[0]), float(parts[1]))) + return peaks + + +def main(): + parser = argparse.ArgumentParser( + description="Generate theoretical isotope patterns and optionally " + "compare them against observed peaks using pyopenms." + ) + parser.add_argument( + "--formula", + required=True, + help="Molecular formula (e.g. C6H12O6)", + ) + parser.add_argument( + "--max-isotopes", + type=int, + default=5, + dest="max_isotopes", + help="Maximum isotope peaks to compute (default: 5)", + ) + parser.add_argument( + "--peaks", + nargs="+", + metavar="MZ,INTENSITY", + help="Observed peaks as 'mz,intensity' pairs for similarity scoring", + ) + parser.add_argument( + "--tolerance", + type=float, + default=0.02, + metavar="DA", + help="m/z tolerance in Da for peak matching (default: 0.02)", + ) + args = parser.parse_args() + + distribution = get_isotope_distribution(args.formula, args.max_isotopes) + if not distribution: + print("Could not compute isotope distribution for the given formula.") + return + + print(f"Isotope distribution for {args.formula}:") + print(f"\n{'Peak':>5} {'m/z':>12} {'Relative Abundance (%)':>22}") + print("-" * 44) + for i, (mz, rel_ab) in enumerate(distribution): + bar = "#" * int(rel_ab / 5) + print(f" M+{i} {mz:>12.4f} {rel_ab:>6.2f} % {bar}") + + if args.peaks: + observed = parse_peaks(args.peaks) + sim = cosine_similarity(distribution, observed, args.tolerance) + print(f"\nCosine similarity vs. observed peaks: {sim:.4f}") + if sim >= 0.9: + print(" ✓ Excellent match (≥ 0.90)") + elif sim >= 0.7: + print(" ~ Good match (≥ 0.70)") + else: + print(" ✗ Poor match (< 0.70)") + + +if __name__ == "__main__": + main() diff --git a/scripts/metabolomics/mass_accuracy_calculator.py b/scripts/metabolomics/mass_accuracy_calculator.py new file mode 100644 index 0000000..b8c1b7a --- /dev/null +++ b/scripts/metabolomics/mass_accuracy_calculator.py @@ -0,0 +1,140 @@ +""" +Mass Accuracy Calculator +======================== +Calculate the mass accuracy (ppm error) between a theoretical mass +(or peptide/formula string) and one or more observed m/z values. + +Supports both: +- Amino acid sequence inputs (e.g. ``PEPTIDEK``) +- Empirical formula inputs (e.g. ``C6H12O6``) + +Usage +----- + # Peptide sequence + python mass_accuracy_calculator.py --sequence PEPTIDEK --observed 803.4560 + + # Molecular formula (charge 1 default) + python mass_accuracy_calculator.py --formula C6H12O6 --observed 181.0709 + + # Multiple observed values at charge 2 + python mass_accuracy_calculator.py --sequence ACDEFGHIK --charge 2 \\ + --observed 554.2478 554.2480 554.2482 +""" + +import argparse +import sys + +try: + import pyopenms as oms +except ImportError: + sys.exit( + "pyopenms is required. Install it with: pip install pyopenms" + ) + +PROTON = 1.007276 + + +def theoretical_mz_from_sequence(sequence: str, charge: int) -> float: + """Compute the theoretical m/z for a peptide sequence. + + Parameters + ---------- + sequence: + Amino acid sequence, optionally with bracket-enclosed modifications. + charge: + Charge state. + + Returns + ------- + float + Theoretical m/z value. + """ + aa_seq = oms.AASequence.fromString(sequence) + mass = aa_seq.getMonoWeight() + return (mass + charge * PROTON) / charge + + +def theoretical_mz_from_formula(formula: str, charge: int) -> float: + """Compute the theoretical m/z for a molecular formula. + + Parameters + ---------- + formula: + Empirical formula string, e.g. ``"C6H12O6"``. + charge: + Charge state (used for proton addition). + + Returns + ------- + float + Theoretical m/z value (monoisotopic). + """ + ef = oms.EmpiricalFormula(formula) + mass = ef.getMonoWeight() + return (mass + charge * PROTON) / charge + + +def ppm_error(theoretical: float, observed: float) -> float: + """Calculate the mass accuracy in parts-per-million (ppm). + + Parameters + ---------- + theoretical: + Theoretical m/z value. + observed: + Observed m/z value. + + Returns + ------- + float + PPM error (positive = observed > theoretical). + """ + return (observed - theoretical) / theoretical * 1e6 + + +def main(): + parser = argparse.ArgumentParser( + description="Compute m/z mass accuracy (ppm error) using pyopenms." + ) + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + "--sequence", + help="Peptide sequence (e.g. PEPTIDEK)", + ) + group.add_argument( + "--formula", + help="Molecular formula (e.g. C6H12O6)", + ) + parser.add_argument( + "--charge", + type=int, + default=1, + help="Charge state (default: 1)", + ) + parser.add_argument( + "--observed", + nargs="+", + type=float, + required=True, + metavar="MZ", + help="Observed m/z value(s)", + ) + args = parser.parse_args() + + if args.sequence: + theoretical = theoretical_mz_from_sequence(args.sequence, args.charge) + label = f"sequence={args.sequence}" + else: + theoretical = theoretical_mz_from_formula(args.formula, args.charge) + label = f"formula={args.formula}" + + print(f"Theoretical m/z ({label}, charge {args.charge}+): {theoretical:.6f}") + print(f"\n{'Observed m/z':>14} {'PPM error':>10}") + print("-" * 28) + for obs in args.observed: + ppm = ppm_error(theoretical, obs) + print(f"{obs:>14.6f} {ppm:>+10.4f}") + + +if __name__ == "__main__": + main() diff --git a/scripts/metabolomics/metabolite_feature_detection.py b/scripts/metabolomics/metabolite_feature_detection.py new file mode 100644 index 0000000..4a5a0f8 --- /dev/null +++ b/scripts/metabolomics/metabolite_feature_detection.py @@ -0,0 +1,145 @@ +""" +Metabolite Feature Detection +============================= +Detect small-molecule features (isotope envelopes) in an LC-MS mzML file +using the pyopenms FeatureFinderMetabo algorithm. Results are written to a +featureXML file which can be opened in TOPPView. + +Usage +----- + python metabolite_feature_detection.py --input sample.mzML + python metabolite_feature_detection.py --input sample.mzML --output features.featureXML --noise 1e5 +""" + +import argparse +import sys + +try: + import pyopenms as oms +except ImportError: + sys.exit( + "pyopenms is required. Install it with: pip install pyopenms" + ) + + +def detect_metabolite_features( + input_path: str, + output_path: str, + noise_threshold: float = 1e4, +) -> oms.FeatureMap: + """Run FeatureFinderMetabo on an mzML file. + + Parameters + ---------- + input_path: + Path to the centroided mzML file. + output_path: + Path for the output featureXML file. + noise_threshold: + Minimum peak intensity to consider during mass tracing (default 1e4). + + Returns + ------- + pyopenms.FeatureMap + Map of detected metabolite features. + """ + exp = oms.MSExperiment() + print(f"Loading {input_path} …") + oms.MzMLFile().load(input_path, exp) + exp.updateRanges() + + # --- Mass tracing --- + mass_traces = [] + mt_params = oms.MassTraceDetection().getDefaults() + mt_params.setValue("noise_threshold_int", noise_threshold) + mt_det = oms.MassTraceDetection() + mt_det.setParameters(mt_params) + mt_det.run(exp, mass_traces, 0) + print(f"Mass traces found: {len(mass_traces)}") + + # --- Elution peak detection --- + mass_traces_split = [] + mass_traces_final = [] + epd_params = oms.ElutionPeakDetection().getDefaults() + epd = oms.ElutionPeakDetection() + epd.setParameters(epd_params) + epd.detectPeaks(mass_traces, mass_traces_split) + if epd.getParameters().getValue("width_filtering") == "auto": + epd.filterByPeakWidth(mass_traces_split, mass_traces_final) + else: + mass_traces_final = mass_traces_split + + # --- Feature detection --- + feature_map = oms.FeatureMap() + chrom_fwhm = 0.0 + ffm_params = oms.FeatureFindingMetabo().getDefaults() + ffm = oms.FeatureFindingMetabo() + ffm.setParameters(ffm_params) + ffm.run(mass_traces_final, feature_map, chrom_fwhm) + + feature_map.setUniqueIds() + oms.FeatureXMLFile().store(output_path, feature_map) + print(f"Detected {feature_map.size()} metabolite features → {output_path}") + return feature_map + + +def print_feature_summary(feature_map: oms.FeatureMap, top_n: int = 20) -> None: + """Print a tabular summary of the top-N most intense features.""" + if feature_map.size() == 0: + print("No features detected.") + return + + features = list(feature_map) + features.sort(key=lambda f: f.getIntensity(), reverse=True) + + display = features[:top_n] + print( + f"\nTop {len(display)} features (by intensity):\n" + f"{'#':>5} {'RT (s)':>10} {'m/z':>12} {'Charge':>6} {'Intensity':>14}" + ) + print("-" * 56) + for i, feature in enumerate(display, 1): + print( + f"{i:>5} {feature.getRT():>10.2f} {feature.getMZ():>12.4f} " + f"{feature.getCharge():>6} {feature.getIntensity():>14.3e}" + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Detect metabolite features in an mzML file using pyopenms." + ) + parser.add_argument( + "--input", + required=True, + metavar="FILE", + help="Centroided mzML input file", + ) + parser.add_argument( + "--output", + metavar="FILE", + help="Output featureXML file (default: .featureXML)", + ) + parser.add_argument( + "--noise", + type=float, + default=1e4, + metavar="THRESHOLD", + help="Noise intensity threshold for mass tracing (default: 1e4)", + ) + parser.add_argument( + "--top", + type=int, + default=20, + metavar="N", + help="Number of top features to print (default: 20)", + ) + args = parser.parse_args() + + output_path = args.output or args.input.replace(".mzML", "_metabolites.featureXML") + feature_map = detect_metabolite_features(args.input, output_path, args.noise) + print_feature_summary(feature_map, args.top) + + +if __name__ == "__main__": + main() diff --git a/scripts/proteomics/README.md b/scripts/proteomics/README.md new file mode 100644 index 0000000..e4d4046 --- /dev/null +++ b/scripts/proteomics/README.md @@ -0,0 +1,54 @@ +# Proteomics Scripts + +A collection of agentic-created Python scripts for common proteomics tasks +using [pyopenms](https://pyopenms.readthedocs.io/). + +## Scripts + +### `peptide_mass_calculator.py` +Calculate monoisotopic and average masses for peptide sequences, and compute +b-ion / y-ion fragment series. + +```bash +# Plain sequence +python peptide_mass_calculator.py --sequence PEPTIDEK + +# Modified sequence (oxidised Met) at charge 2 +python peptide_mass_calculator.py --sequence PEPTM[147]IDEK --charge 2 + +# Also print fragment ions +python peptide_mass_calculator.py --sequence ACDEFGHIK --fragments +``` + +### `protein_digest.py` +Perform in-silico enzymatic digestion of a protein sequence and report +the resulting peptides with their masses. + +```bash +# Tryptic digest +python protein_digest.py --sequence MKVLWAALLVTFLAGCQAK... --enzyme Trypsin + +# Lys-C with up to 2 missed cleavages +python protein_digest.py --sequence MKVLWAALLVTFLAGCQAK... --enzyme Lys-C --missed-cleavages 2 + +# List all available enzymes +python protein_digest.py --list-enzymes +``` + +### `spectrum_file_info.py` +Summarise the contents of an mzML file: number of spectra by MS level, +retention time range, m/z range, and TIC statistics. + +```bash +python spectrum_file_info.py --input sample.mzML +python spectrum_file_info.py --input sample.mzML --tic # also print per-spectrum TIC +``` + +### `feature_detection_proteomics.py` +Detect peptide isotope features in centroided LC-MS/MS data using the +`FeatureFinderCentroided` algorithm. Output is written as a featureXML file. + +```bash +python feature_detection_proteomics.py --input sample.mzML +python feature_detection_proteomics.py --input sample.mzML --output features.featureXML +``` diff --git a/scripts/proteomics/feature_detection_proteomics.py b/scripts/proteomics/feature_detection_proteomics.py new file mode 100644 index 0000000..ab25e32 --- /dev/null +++ b/scripts/proteomics/feature_detection_proteomics.py @@ -0,0 +1,101 @@ +""" +Feature Detection for Proteomics LC-MS Data +============================================ +Detect peptide features (isotope envelopes) in an mzML file using the +pyopenms FeatureFinderCentroided algorithm. Results are written to a +featureXML file which can be opened in TOPPView. + +Usage +----- + python feature_detection_proteomics.py --input sample.mzML + python feature_detection_proteomics.py --input sample.mzML --output features.featureXML +""" + +import argparse +import sys + +try: + import pyopenms as oms +except ImportError: + sys.exit( + "pyopenms is required. Install it with: pip install pyopenms" + ) + + +def detect_features( + input_path: str, + output_path: str, +) -> oms.FeatureMap: + """Run FeatureFinderCentroided on an mzML file. + + Parameters + ---------- + input_path: + Path to the centroided mzML file. + output_path: + Path for the output featureXML file. + + Returns + ------- + pyopenms.FeatureMap + Map of detected features. + """ + exp = oms.MSExperiment() + print(f"Loading {input_path} …") + oms.MzMLFile().load(input_path, exp) + exp.updateRanges() + + feature_map = oms.FeatureMap() + seeds = oms.FeatureMap() + + ff = oms.FeatureFinder() + ff.setLogType(oms.LogType.CMD) + + params = oms.FeatureFinder().getParameters("centroided") + ff.run("centroided", exp, feature_map, params, seeds) + + feature_map.setUniqueIds() + oms.FeatureXMLFile().store(output_path, feature_map) + print(f"Detected {feature_map.size()} features → {output_path}") + return feature_map + + +def print_feature_summary(feature_map: oms.FeatureMap) -> None: + """Print a tabular summary of detected features.""" + if feature_map.size() == 0: + print("No features detected.") + return + + print(f"\n{'#':>5} {'RT (s)':>10} {'m/z':>12} {'Charge':>6} {'Intensity':>14}") + print("-" * 56) + for i, feature in enumerate(feature_map, 1): + print( + f"{i:>5} {feature.getRT():>10.2f} {feature.getMZ():>12.4f} " + f"{feature.getCharge():>6} {feature.getIntensity():>14.3e}" + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Detect peptide features in an mzML file using pyopenms." + ) + parser.add_argument( + "--input", + required=True, + metavar="FILE", + help="Centroided mzML input file", + ) + parser.add_argument( + "--output", + metavar="FILE", + help="Output featureXML file (default: .featureXML)", + ) + args = parser.parse_args() + + output_path = args.output or args.input.replace(".mzML", ".featureXML") + feature_map = detect_features(args.input, output_path) + print_feature_summary(feature_map) + + +if __name__ == "__main__": + main() diff --git a/scripts/proteomics/peptide_mass_calculator.py b/scripts/proteomics/peptide_mass_calculator.py new file mode 100644 index 0000000..8bc4893 --- /dev/null +++ b/scripts/proteomics/peptide_mass_calculator.py @@ -0,0 +1,133 @@ +""" +Peptide Mass Calculator +======================= +Calculate monoisotopic and average masses for peptide sequences using pyopenms. + +Supports: +- Plain amino acid sequences (e.g. "PEPTIDEK") +- Modified sequences in bracket notation (e.g. "PEPTM[147]IDEK") +- Fragment ion mass series (b-ions and y-ions) +- Multiple charge states + +Usage +----- + python peptide_mass_calculator.py --sequence PEPTIDEK + python peptide_mass_calculator.py --sequence PEPTM[147]IDEK --charge 2 + python peptide_mass_calculator.py --sequence ACDEFGHIK --fragments +""" + +import argparse +import sys + +try: + import pyopenms as oms +except ImportError: + sys.exit( + "pyopenms is required. Install it with: pip install pyopenms" + ) + +PROTON = 1.007276 + + +def peptide_masses(sequence: str, charge: int = 1) -> dict: + """Return monoisotopic and average masses for the given peptide sequence. + + Parameters + ---------- + sequence: + Amino acid sequence, optionally with bracket-enclosed modifications, + e.g. ``"PEPTM[147]IDEK"``. + charge: + Desired charge state for m/z calculation (default 1). + + Returns + ------- + dict + Dictionary with keys ``monoisotopic_mass``, ``average_mass``, + ``mz_monoisotopic``, and ``mz_average``. + """ + aa_seq = oms.AASequence.fromString(sequence) + mono = aa_seq.getMonoWeight() + avg = aa_seq.getAverageWeight() + return { + "sequence": sequence, + "charge": charge, + "monoisotopic_mass": mono, + "average_mass": avg, + "mz_monoisotopic": (mono + charge * PROTON) / charge, + "mz_average": (avg + charge * PROTON) / charge, + } + + +def fragment_ions(sequence: str) -> dict: + """Compute singly charged b-ion and y-ion series for a peptide. + + Parameters + ---------- + sequence: + Plain or modified amino acid sequence. + + Returns + ------- + dict + Dictionary with keys ``b_ions`` and ``y_ions``, each a list of + ``(index, mass)`` tuples. + """ + aa_seq = oms.AASequence.fromString(sequence) + n = aa_seq.size() + + b_ions = [] + for i in range(1, n): + prefix = aa_seq.getPrefix(i) + b_ions.append((i, prefix.getMonoWeight(oms.Residue.ResidueType.BIon, 1))) + + y_ions = [] + for i in range(1, n): + suffix = aa_seq.getSuffix(i) + y_ions.append((i, suffix.getMonoWeight(oms.Residue.ResidueType.YIon, 1))) + + return {"b_ions": b_ions, "y_ions": y_ions} + + +def main(): + parser = argparse.ArgumentParser( + description="Calculate peptide/fragment masses using pyopenms." + ) + parser.add_argument( + "--sequence", + required=True, + help="Amino acid sequence (e.g. PEPTIDEK or PEPTM[147]IDEK)", + ) + parser.add_argument( + "--charge", + type=int, + default=1, + help="Charge state for m/z calculation (default: 1)", + ) + parser.add_argument( + "--fragments", + action="store_true", + help="Also compute b-ion and y-ion series", + ) + args = parser.parse_args() + + info = peptide_masses(args.sequence, args.charge) + print(f"Sequence : {info['sequence']}") + print(f"Charge : {info['charge']}+") + print(f"Monoisotopic mass : {info['monoisotopic_mass']:.6f} Da") + print(f"Average mass : {info['average_mass']:.6f} Da") + print(f"m/z (mono) : {info['mz_monoisotopic']:.6f}") + print(f"m/z (avg) : {info['mz_average']:.6f}") + + if args.fragments: + ions = fragment_ions(args.sequence) + print("\n--- b-ions ---") + for idx, mass in ions["b_ions"]: + print(f" b{idx:>2} {mass:.6f} Da") + print("\n--- y-ions ---") + for idx, mass in ions["y_ions"]: + print(f" y{idx:>2} {mass:.6f} Da") + + +if __name__ == "__main__": + main() diff --git a/scripts/proteomics/protein_digest.py b/scripts/proteomics/protein_digest.py new file mode 100644 index 0000000..ce87be1 --- /dev/null +++ b/scripts/proteomics/protein_digest.py @@ -0,0 +1,162 @@ +""" +Protein In-Silico Digest +======================== +Digest a protein sequence with a chosen enzyme and report the resulting +peptides together with their masses using pyopenms. + +Supported enzymes (examples): Trypsin, Lys-C, Arg-C, Asp-N, Glu-C, Chymotrypsin +For the full list call: python protein_digest.py --list-enzymes + +Usage +----- + python protein_digest.py --sequence MKVLWAALLVTFLAGCQAKVEQAVETEPEPELRQQTEWQSGQRWELALGRFWDYLRWVQTLSEQVQEELLSSQVTQELRALMDETMKELKAYKSELEEQLTPVAEETRARLSKELQAAQARLGADVLASHGRLVQYRGEVQAMLGQSTEELRVRLASHLRKLRKRLLRDADDLQKRLAVYQAGAREGAERGLSAIRERLGPLVEQGRVRAATVGSLAGQPLQERAQAWGERLRARMEEMGSRTRDRLDEVKEQVAEVRAKLEEQAQQIRLQAEAFQARLKSWFEPLVEDMQRQWAGLVEKVQAAVGTSAAPVPSDNH --enzyme Trypsin + python protein_digest.py --sequence MKVLWAALLVTFLAGC --enzyme Lys-C --missed-cleavages 2 +""" + +import argparse +import sys + +try: + import pyopenms as oms +except ImportError: + sys.exit( + "pyopenms is required. Install it with: pip install pyopenms" + ) + + +def list_enzymes() -> list: + """Return all enzyme names registered in the pyopenms ProteaseDB.""" + db = oms.ProteaseDB() + raw_names = [] + db.getAllNames(raw_names) + return sorted( + n.decode() if isinstance(n, bytes) else n for n in raw_names + ) + + +def digest_protein( + sequence: str, + enzyme: str = "Trypsin", + missed_cleavages: int = 0, + min_length: int = 6, + max_length: int = 40, +) -> list: + """Digest a protein sequence in silico. + + Parameters + ---------- + sequence: + Single-letter amino acid sequence of the protein. + enzyme: + Enzyme name as known to pyopenms ProteaseDB (default: ``"Trypsin"``). + missed_cleavages: + Maximum number of missed cleavages allowed (default: 0). + min_length: + Minimum peptide length to include (default: 6). + max_length: + Maximum peptide length to include (default: 40). + + Returns + ------- + list of dict + Each entry contains ``sequence``, ``start``, ``end``, + ``monoisotopic_mass``, and ``missed_cleavages``. + """ + protein_seq = oms.AASequence.fromString(sequence) + digest = oms.ProteaseDigestion() + digest.setEnzyme(enzyme) + digest.setMissedCleavages(missed_cleavages) + + peptides = [] + digest.digest(protein_seq, peptides, min_length, max_length) + + results = [] + for pep in peptides: + pep_str = pep.toString() + mass = pep.getMonoWeight() + results.append( + { + "sequence": pep_str, + "length": pep.size(), + "monoisotopic_mass": mass, + } + ) + return results + + +def main(): + parser = argparse.ArgumentParser( + description="In-silico protein digestion using pyopenms." + ) + parser.add_argument( + "--sequence", + help="Single-letter amino acid sequence of the protein", + ) + parser.add_argument( + "--enzyme", + default="Trypsin", + help="Digestion enzyme name (default: Trypsin)", + ) + parser.add_argument( + "--missed-cleavages", + type=int, + default=0, + dest="missed_cleavages", + help="Maximum missed cleavages (default: 0)", + ) + parser.add_argument( + "--min-length", + type=int, + default=6, + dest="min_length", + help="Minimum peptide length (default: 6)", + ) + parser.add_argument( + "--max-length", + type=int, + default=40, + dest="max_length", + help="Maximum peptide length (default: 40)", + ) + parser.add_argument( + "--list-enzymes", + action="store_true", + dest="list_enzymes", + help="List all available enzyme names and exit", + ) + args = parser.parse_args() + + if args.list_enzymes: + enzymes = list_enzymes() + print("Available enzymes:") + for name in enzymes: + print(f" {name}") + return + + if not args.sequence: + parser.error("--sequence is required unless --list-enzymes is used.") + + peptides = digest_protein( + args.sequence, + enzyme=args.enzyme, + missed_cleavages=args.missed_cleavages, + min_length=args.min_length, + max_length=args.max_length, + ) + + print( + f"Enzyme: {args.enzyme} | Missed cleavages ≤ {args.missed_cleavages} " + f"| Length {args.min_length}–{args.max_length}" + ) + print(f"Total peptides: {len(peptides)}\n") + print(f"{'#':>4} {'Sequence':<40} {'Length':>6} {'Mono Mass (Da)':>14}") + print("-" * 72) + for i, pep in enumerate(peptides, 1): + print( + f"{i:>4} {pep['sequence']:<40} {pep['length']:>6} " + f"{pep['monoisotopic_mass']:>14.6f}" + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/proteomics/spectrum_file_info.py b/scripts/proteomics/spectrum_file_info.py new file mode 100644 index 0000000..514f64a --- /dev/null +++ b/scripts/proteomics/spectrum_file_info.py @@ -0,0 +1,134 @@ +""" +Mass Spectrum File Info +======================= +Read an mzML (or mzXML) file and print a summary of its contents: +number of spectra, MS levels, retention time range, m/z range, +and basic TIC statistics. + +Usage +----- + python spectrum_file_info.py --input sample.mzML + python spectrum_file_info.py --input sample.mzML --tic +""" + +import argparse +import sys + +try: + import pyopenms as oms +except ImportError: + sys.exit( + "pyopenms is required. Install it with: pip install pyopenms" + ) + + +def summarise_experiment(exp: oms.MSExperiment) -> dict: + """Summarise a loaded MSExperiment object. + + Parameters + ---------- + exp: + Loaded ``pyopenms.MSExperiment`` instance. + + Returns + ------- + dict + Summary statistics for the experiment. + """ + spectra = exp.getSpectra() + if not spectra: + return {"n_spectra": 0} + + ms_levels = {} + rt_min = float("inf") + rt_max = float("-inf") + mz_min = float("inf") + mz_max = float("-inf") + tic_values = [] + + for spec in spectra: + level = spec.getMSLevel() + ms_levels[level] = ms_levels.get(level, 0) + 1 + rt = spec.getRT() + rt_min = min(rt_min, rt) + rt_max = max(rt_max, rt) + + mzs, intensities = spec.get_peaks() + if len(mzs) > 0: + mz_min = min(mz_min, float(mzs.min())) + mz_max = max(mz_max, float(mzs.max())) + tic_values.append(float(intensities.sum())) + + return { + "n_spectra": len(spectra), + "ms_levels": ms_levels, + "rt_range_sec": (rt_min, rt_max), + "mz_range": (mz_min, mz_max), + "tic_total": sum(tic_values), + "tic_max": max(tic_values) if tic_values else 0.0, + "tic_per_spectrum": tic_values, + } + + +def load_file(path: str) -> oms.MSExperiment: + """Load an mzML file into an MSExperiment. + + Parameters + ---------- + path: + Path to the mzML file. + + Returns + ------- + pyopenms.MSExperiment + """ + exp = oms.MSExperiment() + oms.MzMLFile().load(path, exp) + return exp + + +def main(): + parser = argparse.ArgumentParser( + description="Summarise an mzML file using pyopenms." + ) + parser.add_argument( + "--input", + required=True, + metavar="FILE", + help="Path to an mzML file", + ) + parser.add_argument( + "--tic", + action="store_true", + help="Print per-spectrum TIC values", + ) + args = parser.parse_args() + + print(f"Loading {args.input} …") + exp = load_file(args.input) + summary = summarise_experiment(exp) + + if summary["n_spectra"] == 0: + print("No spectra found in file.") + return + + print(f"\n{'File':<22}: {args.input}") + print(f"{'Total spectra':<22}: {summary['n_spectra']}") + for level, count in sorted(summary["ms_levels"].items()): + print(f" {'MS' + str(level) + ' spectra':<20}: {count}") + rt_min, rt_max = summary["rt_range_sec"] + print(f"{'RT range':<22}: {rt_min:.2f} – {rt_max:.2f} s " + f"({rt_min/60:.2f} – {rt_max/60:.2f} min)") + mz_lo, mz_hi = summary["mz_range"] + print(f"{'m/z range':<22}: {mz_lo:.4f} – {mz_hi:.4f}") + print(f"{'Total TIC':<22}: {summary['tic_total']:.3e}") + print(f"{'Max spectrum TIC':<22}: {summary['tic_max']:.3e}") + + if args.tic: + print("\n--- Per-spectrum TIC ---") + for i, tic in enumerate(summary["tic_per_spectrum"], 1): + print(f" Spectrum {i:>5}: {tic:.3e}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_metabolomics.py b/tests/test_metabolomics.py new file mode 100644 index 0000000..b41cbab --- /dev/null +++ b/tests/test_metabolomics.py @@ -0,0 +1,122 @@ +""" +Tests for metabolomics scripts. +Run with: python -m pytest tests/ -v +""" + +import sys +import os +import math + +import pytest + +# Allow importing from the scripts directory +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "scripts", "metabolomics")) + +try: + import pyopenms as oms # noqa: F401 + + HAS_PYOPENMS = True +except ImportError: + HAS_PYOPENMS = False + +requires_pyopenms = pytest.mark.skipif( + not HAS_PYOPENMS, reason="pyopenms not installed" +) + + +@requires_pyopenms +class TestMassAccuracyCalculator: + def test_sequence_theoretical(self): + from mass_accuracy_calculator import theoretical_mz_from_sequence + + mz = theoretical_mz_from_sequence("PEPTIDEK", 1) + # The monoisotopic mass of PEPTIDEK is ~927.49 Da; +1 proton / 1 + assert 928.0 < mz < 929.0 + + def test_formula_theoretical(self): + from mass_accuracy_calculator import theoretical_mz_from_formula + + # Glucose C6H12O6 monoisotopic mass ~180.063 Da; +1 proton / 1 => ~181.070 + mz = theoretical_mz_from_formula("C6H12O6", 1) + assert 181.0 < mz < 182.0 + + def test_ppm_zero_error(self): + from mass_accuracy_calculator import ppm_error + + theoretical = 500.0 + observed = 500.0 + assert ppm_error(theoretical, observed) == 0.0 + + def test_ppm_positive_error(self): + from mass_accuracy_calculator import ppm_error + + theoretical = 500.0 + observed = 500.001 + ppm = ppm_error(theoretical, observed) + assert ppm > 0 + + def test_ppm_negative_error(self): + from mass_accuracy_calculator import ppm_error + + theoretical = 500.0 + observed = 499.999 + ppm = ppm_error(theoretical, observed) + assert ppm < 0 + + def test_ppm_known_value(self): + from mass_accuracy_calculator import ppm_error + + # 1 ppm at 1000 Da => 0.001 Da shift + theoretical = 1000.0 + observed = 1000.001 + ppm = ppm_error(theoretical, observed) + assert abs(ppm - 1.0) < 0.001 + + +@requires_pyopenms +class TestIsotopePatternMatcher: + def test_glucose_pattern(self): + from isotope_pattern_matcher import get_isotope_distribution + + dist = get_isotope_distribution("C6H12O6", max_isotopes=3) + assert len(dist) == 3 + # M+0 should be the base peak + assert dist[0][1] == pytest.approx(100.0) + # Each successive peak must be less intense than the previous for small molecules + assert dist[1][1] < dist[0][1] + + def test_pattern_mz_ordering(self): + from isotope_pattern_matcher import get_isotope_distribution + + dist = get_isotope_distribution("C12H22O11", max_isotopes=4) # sucrose + mzs = [mz for mz, _ in dist] + assert mzs == sorted(mzs) + + def test_cosine_similarity_perfect(self): + from isotope_pattern_matcher import cosine_similarity + + peaks = [(100.0, 50.0), (101.0, 20.0), (102.0, 5.0)] + sim = cosine_similarity(peaks, peaks, mz_tolerance=0.01) + assert abs(sim - 1.0) < 1e-9 + + def test_cosine_similarity_no_overlap(self): + from isotope_pattern_matcher import cosine_similarity + + theoretical = [(100.0, 50.0), (101.0, 20.0)] + observed = [(200.0, 50.0), (201.0, 20.0)] + sim = cosine_similarity(theoretical, observed, mz_tolerance=0.01) + assert sim == 0.0 + + def test_parse_peaks(self): + from isotope_pattern_matcher import parse_peaks + + result = parse_peaks(["181.0709,100.0", "182.0742,6.7"]) + assert len(result) == 2 + assert result[0] == (181.0709, 100.0) + assert result[1] == (182.0742, 6.7) + + def test_parse_peaks_invalid(self): + from isotope_pattern_matcher import parse_peaks + + with pytest.raises(ValueError): + parse_peaks(["181.0709"]) diff --git a/tests/test_proteomics.py b/tests/test_proteomics.py new file mode 100644 index 0000000..257b774 --- /dev/null +++ b/tests/test_proteomics.py @@ -0,0 +1,111 @@ +""" +Tests for proteomics scripts. +Run with: python -m pytest tests/ -v +""" + +import sys +import os + +import pytest + +# Allow importing from the scripts directory +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "scripts", "proteomics")) + +try: + import pyopenms as oms # noqa: F401 + + HAS_PYOPENMS = True +except ImportError: + HAS_PYOPENMS = False + +requires_pyopenms = pytest.mark.skipif( + not HAS_PYOPENMS, reason="pyopenms not installed" +) + + +@requires_pyopenms +class TestPeptideMassCalculator: + def test_basic_mass(self): + from peptide_mass_calculator import peptide_masses + + result = peptide_masses("PEPTIDEK") + assert result["sequence"] == "PEPTIDEK" + assert result["charge"] == 1 + assert 927.0 < result["monoisotopic_mass"] < 928.0 + assert result["mz_monoisotopic"] > result["monoisotopic_mass"] + + def test_charge_state(self): + from peptide_mass_calculator import peptide_masses + + r1 = peptide_masses("PEPTIDEK", charge=1) + r2 = peptide_masses("PEPTIDEK", charge=2) + # Higher charge → lower m/z + assert r2["mz_monoisotopic"] < r1["mz_monoisotopic"] + + def test_fragment_ions(self): + from peptide_mass_calculator import fragment_ions + + ions = fragment_ions("PEPTIDEK") + seq_len = len("PEPTIDEK") + # Expect seq_len-1 b-ions and seq_len-1 y-ions + assert len(ions["b_ions"]) == seq_len - 1 + assert len(ions["y_ions"]) == seq_len - 1 + + def test_modified_sequence(self): + from peptide_mass_calculator import peptide_masses + + # Oxidised methionine + result = peptide_masses("PEPTM[147]IDEK") + assert result["monoisotopic_mass"] > 0 + + def test_mz_formula(self): + from peptide_mass_calculator import peptide_masses, PROTON + + r = peptide_masses("PEPTIDEK", charge=2) + expected = (r["monoisotopic_mass"] + 2 * PROTON) / 2 + assert abs(r["mz_monoisotopic"] - expected) < 1e-6 + + +@requires_pyopenms +class TestProteinDigest: + # Short test protein containing known tryptic sites (K, R) + PROTEIN = "MKVLWAALLVTFLAGCQAKVEQAVETEPEPELRQQTEWQSGQRWELAL" + + def test_tryptic_digest_returns_peptides(self): + from protein_digest import digest_protein + + peptides = digest_protein(self.PROTEIN, enzyme="Trypsin", min_length=1) + assert len(peptides) > 0 + + def test_peptide_structure(self): + from protein_digest import digest_protein + + peptides = digest_protein(self.PROTEIN, enzyme="Trypsin", min_length=1) + for pep in peptides: + assert "sequence" in pep + assert "monoisotopic_mass" in pep + assert pep["monoisotopic_mass"] > 0 + + def test_missed_cleavages(self): + from protein_digest import digest_protein + + peps_0 = digest_protein(self.PROTEIN, enzyme="Trypsin", missed_cleavages=0, min_length=1) + peps_2 = digest_protein(self.PROTEIN, enzyme="Trypsin", missed_cleavages=2, min_length=1) + # More missed cleavages → at least as many peptides + assert len(peps_2) >= len(peps_0) + + def test_length_filter(self): + from protein_digest import digest_protein + + peptides = digest_protein( + self.PROTEIN, enzyme="Trypsin", min_length=5, max_length=20, missed_cleavages=2 + ) + for pep in peptides: + assert 5 <= pep["length"] <= 20 + + def test_list_enzymes(self): + from protein_digest import list_enzymes + + enzymes = list_enzymes() + assert "Trypsin" in enzymes + assert len(enzymes) > 5