From f101864320783ba425e038ab2c5679fdc07dfcd7 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 24 Mar 2026 17:43:55 +0000
Subject: [PATCH 1/2] Initial plan
From 9e73afc87b25850d7a101ebe711cc6c0eb06dd5a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 24 Mar 2026 17:50:46 +0000
Subject: [PATCH 2/2] Add pyopenms scripts for proteomics and metabolomics with
tests and docs
Co-authored-by: ypriverol <52113+ypriverol@users.noreply.github.com>
Agent-Logs-Url: https://github.com/OpenMS/agentomics/sessions/871c3e78-4575-48b8-beca-2f71e43e6ed0
---
README.md | 63 ++++++-
requirements.txt | 1 +
scripts/metabolomics/README.md | 46 +++++
.../metabolomics/isotope_pattern_matcher.py | 169 ++++++++++++++++++
.../metabolomics/mass_accuracy_calculator.py | 140 +++++++++++++++
.../metabolite_feature_detection.py | 145 +++++++++++++++
scripts/proteomics/README.md | 54 ++++++
.../feature_detection_proteomics.py | 101 +++++++++++
scripts/proteomics/peptide_mass_calculator.py | 133 ++++++++++++++
scripts/proteomics/protein_digest.py | 162 +++++++++++++++++
scripts/proteomics/spectrum_file_info.py | 134 ++++++++++++++
tests/test_metabolomics.py | 122 +++++++++++++
tests/test_proteomics.py | 111 ++++++++++++
13 files changed, 1380 insertions(+), 1 deletion(-)
create mode 100644 requirements.txt
create mode 100644 scripts/metabolomics/README.md
create mode 100644 scripts/metabolomics/isotope_pattern_matcher.py
create mode 100644 scripts/metabolomics/mass_accuracy_calculator.py
create mode 100644 scripts/metabolomics/metabolite_feature_detection.py
create mode 100644 scripts/proteomics/README.md
create mode 100644 scripts/proteomics/feature_detection_proteomics.py
create mode 100644 scripts/proteomics/peptide_mass_calculator.py
create mode 100644 scripts/proteomics/protein_digest.py
create mode 100644 scripts/proteomics/spectrum_file_info.py
create mode 100644 tests/test_metabolomics.py
create mode 100644 tests/test_proteomics.py
diff --git a/README.md b/README.md
index fd929c9..3325b78 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,63 @@
# agentomics
-A repository of agentic created tools in proteomics using pyopenms
+
+A repository of agentic-created scripts using [pyopenms](https://pyopenms.readthedocs.io/)
+for proteomics and metabolomics users.
+
+## Requirements
+
+```bash
+pip install pyopenms
+```
+
+Or install all dependencies at once:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Repository Structure
+
+```
+agentomics/
+├── scripts/
+│ ├── proteomics/ # Scripts for proteomics workflows
+│ │ ├── peptide_mass_calculator.py
+│ │ ├── protein_digest.py
+│ │ ├── spectrum_file_info.py
+│ │ └── feature_detection_proteomics.py
+│ └── metabolomics/ # Scripts for metabolomics workflows
+│ ├── metabolite_feature_detection.py
+│ ├── mass_accuracy_calculator.py
+│ └── isotope_pattern_matcher.py
+└── tests/
+ ├── test_proteomics.py
+ └── test_metabolomics.py
+```
+
+## Proteomics Scripts
+
+| Script | Description |
+|--------|-------------|
+| [`peptide_mass_calculator.py`](scripts/proteomics/peptide_mass_calculator.py) | Monoisotopic/average masses and b/y fragment ion series for peptide sequences |
+| [`protein_digest.py`](scripts/proteomics/protein_digest.py) | In-silico enzymatic protein digestion (Trypsin, Lys-C, …) |
+| [`spectrum_file_info.py`](scripts/proteomics/spectrum_file_info.py) | Summary statistics for mzML files (spectra, RT range, TIC) |
+| [`feature_detection_proteomics.py`](scripts/proteomics/feature_detection_proteomics.py) | FeatureFinderCentroided-based peptide feature detection from LC-MS/MS data |
+
+See [`scripts/proteomics/README.md`](scripts/proteomics/README.md) for usage examples.
+
+## Metabolomics Scripts
+
+| Script | Description |
+|--------|-------------|
+| [`metabolite_feature_detection.py`](scripts/metabolomics/metabolite_feature_detection.py) | FeatureFinderMetabo-based metabolite feature detection from LC-MS data |
+| [`mass_accuracy_calculator.py`](scripts/metabolomics/mass_accuracy_calculator.py) | Compute m/z mass accuracy (ppm error) for sequences or molecular formulas |
+| [`isotope_pattern_matcher.py`](scripts/metabolomics/isotope_pattern_matcher.py) | Generate theoretical isotope distributions and score against observed peaks |
+
+See [`scripts/metabolomics/README.md`](scripts/metabolomics/README.md) for usage examples.
+
+## Running Tests
+
+```bash
+pip install pytest pyopenms
+python -m pytest tests/ -v
+```
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7ce28ec
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+pyopenms
diff --git a/scripts/metabolomics/README.md b/scripts/metabolomics/README.md
new file mode 100644
index 0000000..c57cff4
--- /dev/null
+++ b/scripts/metabolomics/README.md
@@ -0,0 +1,46 @@
+# Metabolomics Scripts
+
+A collection of agentic-created Python scripts for common metabolomics tasks
+using [pyopenms](https://pyopenms.readthedocs.io/).
+
+## Scripts
+
+### `metabolite_feature_detection.py`
+Detect small-molecule features (isotope envelopes) in centroided LC-MS data
+using the `FeatureFinderMetabo` pipeline (mass tracing → elution peak
+detection → feature finding). Output is written as a featureXML file.
+
+```bash
+python metabolite_feature_detection.py --input sample.mzML
+python metabolite_feature_detection.py --input sample.mzML --output features.featureXML --noise 1e5
+```
+
+### `mass_accuracy_calculator.py`
+Calculate the mass accuracy (ppm error) between a theoretical value derived
+from a peptide sequence or molecular formula and one or more observed m/z
+values.
+
+```bash
+# Peptide sequence
+python mass_accuracy_calculator.py --sequence PEPTIDEK --observed 803.4560
+
+# Molecular formula
+python mass_accuracy_calculator.py --formula C6H12O6 --observed 181.0709
+
+# Multiple observed values at charge 2
+python mass_accuracy_calculator.py --sequence ACDEFGHIK --charge 2 \
+ --observed 554.2478 554.2480 554.2482
+```
+
+### `isotope_pattern_matcher.py`
+Generate a theoretical isotope distribution for any molecular formula and
+optionally compute a cosine similarity score against observed peaks.
+
+```bash
+# Generate pattern for glucose
+python isotope_pattern_matcher.py --formula C6H12O6
+
+# Compare against observed peaks
+python isotope_pattern_matcher.py --formula C6H12O6 \
+ --peaks 181.0709,100.0 182.0742,6.7 183.0775,0.4
+```
diff --git a/scripts/metabolomics/isotope_pattern_matcher.py b/scripts/metabolomics/isotope_pattern_matcher.py
new file mode 100644
index 0000000..f0510d4
--- /dev/null
+++ b/scripts/metabolomics/isotope_pattern_matcher.py
@@ -0,0 +1,169 @@
+"""
+Isotope Pattern Generator & Matcher
+=====================================
+Generate the theoretical isotope distribution for a molecular formula
+and optionally compare it against an observed spectrum to compute a
+cosine similarity score using pyopenms.
+
+Usage
+-----
+ # Generate isotope pattern for glucose
+ python isotope_pattern_matcher.py --formula C6H12O6
+
+ # Compare against observed peaks (m/z intensity pairs on stdin or --peaks)
+ python isotope_pattern_matcher.py --formula C6H12O6 \\
+ --peaks 181.0709,100.0 182.0742,6.7 183.0775,0.4
+"""
+
+import argparse
+import math
+import sys
+
+try:
+ import pyopenms as oms
+except ImportError:
+ sys.exit(
+ "pyopenms is required. Install it with: pip install pyopenms"
+ )
+
+
+def get_isotope_distribution(
+ formula: str,
+ max_isotopes: int = 5,
+) -> list:
+ """Compute the theoretical isotope distribution for a molecular formula.
+
+ Parameters
+ ----------
+ formula:
+ Empirical formula string, e.g. ``"C6H12O6"``.
+ max_isotopes:
+ Maximum number of isotope peaks to return (default: 5).
+
+ Returns
+ -------
+ list of (mz, relative_abundance)
+ Sorted by m/z; relative abundances sum to 100.
+ """
+ ef = oms.EmpiricalFormula(formula)
+ isotope_dist = ef.getIsotopeDistribution(
+ oms.CoarseIsotopePatternGenerator(max_isotopes)
+ )
+ peaks = [(p.getMZ(), p.getIntensity()) for p in isotope_dist.getContainer()]
+ if not peaks:
+ return []
+ max_ab = max(ab for _, ab in peaks)
+ return [(mz, ab / max_ab * 100) for mz, ab in peaks]
+
+
+def cosine_similarity(
+ theoretical: list,
+ observed: list,
+ mz_tolerance: float = 0.02,
+) -> float:
+ """Compute cosine similarity between theoretical and observed peak lists.
+
+ Parameters
+ ----------
+ theoretical:
+ List of ``(mz, intensity)`` tuples for the theoretical pattern.
+ observed:
+ List of ``(mz, intensity)`` tuples for the observed spectrum.
+ mz_tolerance:
+ Maximum m/z difference to match a pair of peaks (default: 0.02 Da).
+
+ Returns
+ -------
+ float
+ Cosine similarity in [0, 1].
+ """
+ theo_vec = []
+ obs_vec = []
+ for tmz, tint in theoretical:
+ matched = 0.0
+ for omz, oint in observed:
+ if abs(omz - tmz) <= mz_tolerance:
+ matched = oint
+ break
+ theo_vec.append(tint)
+ obs_vec.append(matched)
+
+ dot = sum(t * o for t, o in zip(theo_vec, obs_vec))
+ norm_t = math.sqrt(sum(t ** 2 for t in theo_vec))
+ norm_o = math.sqrt(sum(o ** 2 for o in obs_vec))
+ if norm_t == 0 or norm_o == 0:
+ return 0.0
+ return dot / (norm_t * norm_o)
+
+
+def parse_peaks(peak_strings: list) -> list:
+ """Parse ``"mz,intensity"`` strings into ``(float, float)`` tuples."""
+ peaks = []
+ for s in peak_strings:
+ parts = s.split(",")
+ if len(parts) != 2:
+ raise ValueError(
+ f"Expected 'mz,intensity' format, got: {s!r}"
+ )
+ peaks.append((float(parts[0]), float(parts[1])))
+ return peaks
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Generate theoretical isotope patterns and optionally "
+ "compare them against observed peaks using pyopenms."
+ )
+ parser.add_argument(
+ "--formula",
+ required=True,
+ help="Molecular formula (e.g. C6H12O6)",
+ )
+ parser.add_argument(
+ "--max-isotopes",
+ type=int,
+ default=5,
+ dest="max_isotopes",
+ help="Maximum isotope peaks to compute (default: 5)",
+ )
+ parser.add_argument(
+ "--peaks",
+ nargs="+",
+ metavar="MZ,INTENSITY",
+ help="Observed peaks as 'mz,intensity' pairs for similarity scoring",
+ )
+ parser.add_argument(
+ "--tolerance",
+ type=float,
+ default=0.02,
+ metavar="DA",
+ help="m/z tolerance in Da for peak matching (default: 0.02)",
+ )
+ args = parser.parse_args()
+
+ distribution = get_isotope_distribution(args.formula, args.max_isotopes)
+ if not distribution:
+ print("Could not compute isotope distribution for the given formula.")
+ return
+
+ print(f"Isotope distribution for {args.formula}:")
+ print(f"\n{'Peak':>5} {'m/z':>12} {'Relative Abundance (%)':>22}")
+ print("-" * 44)
+ for i, (mz, rel_ab) in enumerate(distribution):
+ bar = "#" * int(rel_ab / 5)
+ print(f" M+{i} {mz:>12.4f} {rel_ab:>6.2f} % {bar}")
+
+ if args.peaks:
+ observed = parse_peaks(args.peaks)
+ sim = cosine_similarity(distribution, observed, args.tolerance)
+ print(f"\nCosine similarity vs. observed peaks: {sim:.4f}")
+ if sim >= 0.9:
+ print(" ✓ Excellent match (≥ 0.90)")
+ elif sim >= 0.7:
+ print(" ~ Good match (≥ 0.70)")
+ else:
+ print(" ✗ Poor match (< 0.70)")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/metabolomics/mass_accuracy_calculator.py b/scripts/metabolomics/mass_accuracy_calculator.py
new file mode 100644
index 0000000..b8c1b7a
--- /dev/null
+++ b/scripts/metabolomics/mass_accuracy_calculator.py
@@ -0,0 +1,140 @@
+"""
+Mass Accuracy Calculator
+========================
+Calculate the mass accuracy (ppm error) between a theoretical mass
+(or peptide/formula string) and one or more observed m/z values.
+
+Supports both:
+- Amino acid sequence inputs (e.g. ``PEPTIDEK``)
+- Empirical formula inputs (e.g. ``C6H12O6``)
+
+Usage
+-----
+ # Peptide sequence
+ python mass_accuracy_calculator.py --sequence PEPTIDEK --observed 803.4560
+
+ # Molecular formula (charge 1 default)
+ python mass_accuracy_calculator.py --formula C6H12O6 --observed 181.0709
+
+ # Multiple observed values at charge 2
+ python mass_accuracy_calculator.py --sequence ACDEFGHIK --charge 2 \\
+ --observed 554.2478 554.2480 554.2482
+"""
+
+import argparse
+import sys
+
+try:
+ import pyopenms as oms
+except ImportError:
+ sys.exit(
+ "pyopenms is required. Install it with: pip install pyopenms"
+ )
+
+PROTON = 1.007276
+
+
+def theoretical_mz_from_sequence(sequence: str, charge: int) -> float:
+ """Compute the theoretical m/z for a peptide sequence.
+
+ Parameters
+ ----------
+ sequence:
+ Amino acid sequence, optionally with bracket-enclosed modifications.
+ charge:
+ Charge state.
+
+ Returns
+ -------
+ float
+ Theoretical m/z value.
+ """
+ aa_seq = oms.AASequence.fromString(sequence)
+ mass = aa_seq.getMonoWeight()
+ return (mass + charge * PROTON) / charge
+
+
+def theoretical_mz_from_formula(formula: str, charge: int) -> float:
+ """Compute the theoretical m/z for a molecular formula.
+
+ Parameters
+ ----------
+ formula:
+ Empirical formula string, e.g. ``"C6H12O6"``.
+ charge:
+ Charge state (used for proton addition).
+
+ Returns
+ -------
+ float
+ Theoretical m/z value (monoisotopic).
+ """
+ ef = oms.EmpiricalFormula(formula)
+ mass = ef.getMonoWeight()
+ return (mass + charge * PROTON) / charge
+
+
+def ppm_error(theoretical: float, observed: float) -> float:
+ """Calculate the mass accuracy in parts-per-million (ppm).
+
+ Parameters
+ ----------
+ theoretical:
+ Theoretical m/z value.
+ observed:
+ Observed m/z value.
+
+ Returns
+ -------
+ float
+ PPM error (positive = observed > theoretical).
+ """
+ return (observed - theoretical) / theoretical * 1e6
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Compute m/z mass accuracy (ppm error) using pyopenms."
+ )
+ group = parser.add_mutually_exclusive_group(required=True)
+ group.add_argument(
+ "--sequence",
+ help="Peptide sequence (e.g. PEPTIDEK)",
+ )
+ group.add_argument(
+ "--formula",
+ help="Molecular formula (e.g. C6H12O6)",
+ )
+ parser.add_argument(
+ "--charge",
+ type=int,
+ default=1,
+ help="Charge state (default: 1)",
+ )
+ parser.add_argument(
+ "--observed",
+ nargs="+",
+ type=float,
+ required=True,
+ metavar="MZ",
+ help="Observed m/z value(s)",
+ )
+ args = parser.parse_args()
+
+ if args.sequence:
+ theoretical = theoretical_mz_from_sequence(args.sequence, args.charge)
+ label = f"sequence={args.sequence}"
+ else:
+ theoretical = theoretical_mz_from_formula(args.formula, args.charge)
+ label = f"formula={args.formula}"
+
+ print(f"Theoretical m/z ({label}, charge {args.charge}+): {theoretical:.6f}")
+ print(f"\n{'Observed m/z':>14} {'PPM error':>10}")
+ print("-" * 28)
+ for obs in args.observed:
+ ppm = ppm_error(theoretical, obs)
+ print(f"{obs:>14.6f} {ppm:>+10.4f}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/metabolomics/metabolite_feature_detection.py b/scripts/metabolomics/metabolite_feature_detection.py
new file mode 100644
index 0000000..4a5a0f8
--- /dev/null
+++ b/scripts/metabolomics/metabolite_feature_detection.py
@@ -0,0 +1,145 @@
+"""
+Metabolite Feature Detection
+=============================
+Detect small-molecule features (isotope envelopes) in an LC-MS mzML file
+using the pyopenms FeatureFinderMetabo algorithm. Results are written to a
+featureXML file which can be opened in TOPPView.
+
+Usage
+-----
+ python metabolite_feature_detection.py --input sample.mzML
+ python metabolite_feature_detection.py --input sample.mzML --output features.featureXML --noise 1e5
+"""
+
+import argparse
+import sys
+
+try:
+ import pyopenms as oms
+except ImportError:
+ sys.exit(
+ "pyopenms is required. Install it with: pip install pyopenms"
+ )
+
+
+def detect_metabolite_features(
+ input_path: str,
+ output_path: str,
+ noise_threshold: float = 1e4,
+) -> oms.FeatureMap:
+ """Run FeatureFinderMetabo on an mzML file.
+
+ Parameters
+ ----------
+ input_path:
+ Path to the centroided mzML file.
+ output_path:
+ Path for the output featureXML file.
+ noise_threshold:
+ Minimum peak intensity to consider during mass tracing (default 1e4).
+
+ Returns
+ -------
+ pyopenms.FeatureMap
+ Map of detected metabolite features.
+ """
+ exp = oms.MSExperiment()
+ print(f"Loading {input_path} …")
+ oms.MzMLFile().load(input_path, exp)
+ exp.updateRanges()
+
+ # --- Mass tracing ---
+ mass_traces = []
+ mt_params = oms.MassTraceDetection().getDefaults()
+ mt_params.setValue("noise_threshold_int", noise_threshold)
+ mt_det = oms.MassTraceDetection()
+ mt_det.setParameters(mt_params)
+ mt_det.run(exp, mass_traces, 0)
+ print(f"Mass traces found: {len(mass_traces)}")
+
+ # --- Elution peak detection ---
+ mass_traces_split = []
+ mass_traces_final = []
+ epd_params = oms.ElutionPeakDetection().getDefaults()
+ epd = oms.ElutionPeakDetection()
+ epd.setParameters(epd_params)
+ epd.detectPeaks(mass_traces, mass_traces_split)
+ if epd.getParameters().getValue("width_filtering") == "auto":
+ epd.filterByPeakWidth(mass_traces_split, mass_traces_final)
+ else:
+ mass_traces_final = mass_traces_split
+
+ # --- Feature detection ---
+ feature_map = oms.FeatureMap()
+ chrom_fwhm = 0.0
+ ffm_params = oms.FeatureFindingMetabo().getDefaults()
+ ffm = oms.FeatureFindingMetabo()
+ ffm.setParameters(ffm_params)
+ ffm.run(mass_traces_final, feature_map, chrom_fwhm)
+
+ feature_map.setUniqueIds()
+ oms.FeatureXMLFile().store(output_path, feature_map)
+ print(f"Detected {feature_map.size()} metabolite features → {output_path}")
+ return feature_map
+
+
+def print_feature_summary(feature_map: oms.FeatureMap, top_n: int = 20) -> None:
+ """Print a tabular summary of the top-N most intense features."""
+ if feature_map.size() == 0:
+ print("No features detected.")
+ return
+
+ features = list(feature_map)
+ features.sort(key=lambda f: f.getIntensity(), reverse=True)
+
+ display = features[:top_n]
+ print(
+ f"\nTop {len(display)} features (by intensity):\n"
+ f"{'#':>5} {'RT (s)':>10} {'m/z':>12} {'Charge':>6} {'Intensity':>14}"
+ )
+ print("-" * 56)
+ for i, feature in enumerate(display, 1):
+ print(
+ f"{i:>5} {feature.getRT():>10.2f} {feature.getMZ():>12.4f} "
+ f"{feature.getCharge():>6} {feature.getIntensity():>14.3e}"
+ )
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Detect metabolite features in an mzML file using pyopenms."
+ )
+ parser.add_argument(
+ "--input",
+ required=True,
+ metavar="FILE",
+ help="Centroided mzML input file",
+ )
+ parser.add_argument(
+ "--output",
+ metavar="FILE",
+ help="Output featureXML file (default: .featureXML)",
+ )
+ parser.add_argument(
+ "--noise",
+ type=float,
+ default=1e4,
+ metavar="THRESHOLD",
+ help="Noise intensity threshold for mass tracing (default: 1e4)",
+ )
+ parser.add_argument(
+ "--top",
+ type=int,
+ default=20,
+ metavar="N",
+ help="Number of top features to print (default: 20)",
+ )
+ args = parser.parse_args()
+
+ output_path = args.output or args.input.replace(".mzML", "_metabolites.featureXML")
+ feature_map = detect_metabolite_features(args.input, output_path, args.noise)
+ print_feature_summary(feature_map, args.top)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/proteomics/README.md b/scripts/proteomics/README.md
new file mode 100644
index 0000000..e4d4046
--- /dev/null
+++ b/scripts/proteomics/README.md
@@ -0,0 +1,54 @@
+# Proteomics Scripts
+
+A collection of agentic-created Python scripts for common proteomics tasks
+using [pyopenms](https://pyopenms.readthedocs.io/).
+
+## Scripts
+
+### `peptide_mass_calculator.py`
+Calculate monoisotopic and average masses for peptide sequences, and compute
+b-ion / y-ion fragment series.
+
+```bash
+# Plain sequence
+python peptide_mass_calculator.py --sequence PEPTIDEK
+
+# Modified sequence (oxidised Met) at charge 2
+python peptide_mass_calculator.py --sequence PEPTM[147]IDEK --charge 2
+
+# Also print fragment ions
+python peptide_mass_calculator.py --sequence ACDEFGHIK --fragments
+```
+
+### `protein_digest.py`
+Perform in-silico enzymatic digestion of a protein sequence and report
+the resulting peptides with their masses.
+
+```bash
+# Tryptic digest
+python protein_digest.py --sequence MKVLWAALLVTFLAGCQAK... --enzyme Trypsin
+
+# Lys-C with up to 2 missed cleavages
+python protein_digest.py --sequence MKVLWAALLVTFLAGCQAK... --enzyme Lys-C --missed-cleavages 2
+
+# List all available enzymes
+python protein_digest.py --list-enzymes
+```
+
+### `spectrum_file_info.py`
+Summarise the contents of an mzML file: number of spectra by MS level,
+retention time range, m/z range, and TIC statistics.
+
+```bash
+python spectrum_file_info.py --input sample.mzML
+python spectrum_file_info.py --input sample.mzML --tic # also print per-spectrum TIC
+```
+
+### `feature_detection_proteomics.py`
+Detect peptide isotope features in centroided LC-MS/MS data using the
+`FeatureFinderCentroided` algorithm. Output is written as a featureXML file.
+
+```bash
+python feature_detection_proteomics.py --input sample.mzML
+python feature_detection_proteomics.py --input sample.mzML --output features.featureXML
+```
diff --git a/scripts/proteomics/feature_detection_proteomics.py b/scripts/proteomics/feature_detection_proteomics.py
new file mode 100644
index 0000000..ab25e32
--- /dev/null
+++ b/scripts/proteomics/feature_detection_proteomics.py
@@ -0,0 +1,101 @@
+"""
+Feature Detection for Proteomics LC-MS Data
+============================================
+Detect peptide features (isotope envelopes) in an mzML file using the
+pyopenms FeatureFinderCentroided algorithm. Results are written to a
+featureXML file which can be opened in TOPPView.
+
+Usage
+-----
+ python feature_detection_proteomics.py --input sample.mzML
+ python feature_detection_proteomics.py --input sample.mzML --output features.featureXML
+"""
+
+import argparse
+import sys
+
+try:
+ import pyopenms as oms
+except ImportError:
+ sys.exit(
+ "pyopenms is required. Install it with: pip install pyopenms"
+ )
+
+
+def detect_features(
+ input_path: str,
+ output_path: str,
+) -> oms.FeatureMap:
+ """Run FeatureFinderCentroided on an mzML file.
+
+ Parameters
+ ----------
+ input_path:
+ Path to the centroided mzML file.
+ output_path:
+ Path for the output featureXML file.
+
+ Returns
+ -------
+ pyopenms.FeatureMap
+ Map of detected features.
+ """
+ exp = oms.MSExperiment()
+ print(f"Loading {input_path} …")
+ oms.MzMLFile().load(input_path, exp)
+ exp.updateRanges()
+
+ feature_map = oms.FeatureMap()
+ seeds = oms.FeatureMap()
+
+ ff = oms.FeatureFinder()
+ ff.setLogType(oms.LogType.CMD)
+
+ params = oms.FeatureFinder().getParameters("centroided")
+ ff.run("centroided", exp, feature_map, params, seeds)
+
+ feature_map.setUniqueIds()
+ oms.FeatureXMLFile().store(output_path, feature_map)
+ print(f"Detected {feature_map.size()} features → {output_path}")
+ return feature_map
+
+
+def print_feature_summary(feature_map: oms.FeatureMap) -> None:
+ """Print a tabular summary of detected features."""
+ if feature_map.size() == 0:
+ print("No features detected.")
+ return
+
+ print(f"\n{'#':>5} {'RT (s)':>10} {'m/z':>12} {'Charge':>6} {'Intensity':>14}")
+ print("-" * 56)
+ for i, feature in enumerate(feature_map, 1):
+ print(
+ f"{i:>5} {feature.getRT():>10.2f} {feature.getMZ():>12.4f} "
+ f"{feature.getCharge():>6} {feature.getIntensity():>14.3e}"
+ )
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Detect peptide features in an mzML file using pyopenms."
+ )
+ parser.add_argument(
+ "--input",
+ required=True,
+ metavar="FILE",
+ help="Centroided mzML input file",
+ )
+ parser.add_argument(
+ "--output",
+ metavar="FILE",
+ help="Output featureXML file (default: .featureXML)",
+ )
+ args = parser.parse_args()
+
+ output_path = args.output or args.input.replace(".mzML", ".featureXML")
+ feature_map = detect_features(args.input, output_path)
+ print_feature_summary(feature_map)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/proteomics/peptide_mass_calculator.py b/scripts/proteomics/peptide_mass_calculator.py
new file mode 100644
index 0000000..8bc4893
--- /dev/null
+++ b/scripts/proteomics/peptide_mass_calculator.py
@@ -0,0 +1,133 @@
+"""
+Peptide Mass Calculator
+=======================
+Calculate monoisotopic and average masses for peptide sequences using pyopenms.
+
+Supports:
+- Plain amino acid sequences (e.g. "PEPTIDEK")
+- Modified sequences in bracket notation (e.g. "PEPTM[147]IDEK")
+- Fragment ion mass series (b-ions and y-ions)
+- Multiple charge states
+
+Usage
+-----
+ python peptide_mass_calculator.py --sequence PEPTIDEK
+ python peptide_mass_calculator.py --sequence PEPTM[147]IDEK --charge 2
+ python peptide_mass_calculator.py --sequence ACDEFGHIK --fragments
+"""
+
+import argparse
+import sys
+
+try:
+ import pyopenms as oms
+except ImportError:
+ sys.exit(
+ "pyopenms is required. Install it with: pip install pyopenms"
+ )
+
+PROTON = 1.007276
+
+
+def peptide_masses(sequence: str, charge: int = 1) -> dict:
+ """Return monoisotopic and average masses for the given peptide sequence.
+
+ Parameters
+ ----------
+ sequence:
+ Amino acid sequence, optionally with bracket-enclosed modifications,
+ e.g. ``"PEPTM[147]IDEK"``.
+ charge:
+ Desired charge state for m/z calculation (default 1).
+
+ Returns
+ -------
+ dict
+ Dictionary with keys ``monoisotopic_mass``, ``average_mass``,
+ ``mz_monoisotopic``, and ``mz_average``.
+ """
+ aa_seq = oms.AASequence.fromString(sequence)
+ mono = aa_seq.getMonoWeight()
+ avg = aa_seq.getAverageWeight()
+ return {
+ "sequence": sequence,
+ "charge": charge,
+ "monoisotopic_mass": mono,
+ "average_mass": avg,
+ "mz_monoisotopic": (mono + charge * PROTON) / charge,
+ "mz_average": (avg + charge * PROTON) / charge,
+ }
+
+
+def fragment_ions(sequence: str) -> dict:
+ """Compute singly charged b-ion and y-ion series for a peptide.
+
+ Parameters
+ ----------
+ sequence:
+ Plain or modified amino acid sequence.
+
+ Returns
+ -------
+ dict
+ Dictionary with keys ``b_ions`` and ``y_ions``, each a list of
+ ``(index, mass)`` tuples.
+ """
+ aa_seq = oms.AASequence.fromString(sequence)
+ n = aa_seq.size()
+
+ b_ions = []
+ for i in range(1, n):
+ prefix = aa_seq.getPrefix(i)
+ b_ions.append((i, prefix.getMonoWeight(oms.Residue.ResidueType.BIon, 1)))
+
+ y_ions = []
+ for i in range(1, n):
+ suffix = aa_seq.getSuffix(i)
+ y_ions.append((i, suffix.getMonoWeight(oms.Residue.ResidueType.YIon, 1)))
+
+ return {"b_ions": b_ions, "y_ions": y_ions}
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Calculate peptide/fragment masses using pyopenms."
+ )
+ parser.add_argument(
+ "--sequence",
+ required=True,
+ help="Amino acid sequence (e.g. PEPTIDEK or PEPTM[147]IDEK)",
+ )
+ parser.add_argument(
+ "--charge",
+ type=int,
+ default=1,
+ help="Charge state for m/z calculation (default: 1)",
+ )
+ parser.add_argument(
+ "--fragments",
+ action="store_true",
+ help="Also compute b-ion and y-ion series",
+ )
+ args = parser.parse_args()
+
+ info = peptide_masses(args.sequence, args.charge)
+ print(f"Sequence : {info['sequence']}")
+ print(f"Charge : {info['charge']}+")
+ print(f"Monoisotopic mass : {info['monoisotopic_mass']:.6f} Da")
+ print(f"Average mass : {info['average_mass']:.6f} Da")
+ print(f"m/z (mono) : {info['mz_monoisotopic']:.6f}")
+ print(f"m/z (avg) : {info['mz_average']:.6f}")
+
+ if args.fragments:
+ ions = fragment_ions(args.sequence)
+ print("\n--- b-ions ---")
+ for idx, mass in ions["b_ions"]:
+ print(f" b{idx:>2} {mass:.6f} Da")
+ print("\n--- y-ions ---")
+ for idx, mass in ions["y_ions"]:
+ print(f" y{idx:>2} {mass:.6f} Da")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/proteomics/protein_digest.py b/scripts/proteomics/protein_digest.py
new file mode 100644
index 0000000..ce87be1
--- /dev/null
+++ b/scripts/proteomics/protein_digest.py
@@ -0,0 +1,162 @@
+"""
+Protein In-Silico Digest
+========================
+Digest a protein sequence with a chosen enzyme and report the resulting
+peptides together with their masses using pyopenms.
+
+Supported enzymes (examples): Trypsin, Lys-C, Arg-C, Asp-N, Glu-C, Chymotrypsin
+For the full list call: python protein_digest.py --list-enzymes
+
+Usage
+-----
+ python protein_digest.py --sequence MKVLWAALLVTFLAGCQAKVEQAVETEPEPELRQQTEWQSGQRWELALGRFWDYLRWVQTLSEQVQEELLSSQVTQELRALMDETMKELKAYKSELEEQLTPVAEETRARLSKELQAAQARLGADVLASHGRLVQYRGEVQAMLGQSTEELRVRLASHLRKLRKRLLRDADDLQKRLAVYQAGAREGAERGLSAIRERLGPLVEQGRVRAATVGSLAGQPLQERAQAWGERLRARMEEMGSRTRDRLDEVKEQVAEVRAKLEEQAQQIRLQAEAFQARLKSWFEPLVEDMQRQWAGLVEKVQAAVGTSAAPVPSDNH --enzyme Trypsin
+ python protein_digest.py --sequence MKVLWAALLVTFLAGC --enzyme Lys-C --missed-cleavages 2
+"""
+
+import argparse
+import sys
+
+try:
+ import pyopenms as oms
+except ImportError:
+ sys.exit(
+ "pyopenms is required. Install it with: pip install pyopenms"
+ )
+
+
+def list_enzymes() -> list:
+ """Return all enzyme names registered in the pyopenms ProteaseDB."""
+ db = oms.ProteaseDB()
+ raw_names = []
+ db.getAllNames(raw_names)
+ return sorted(
+ n.decode() if isinstance(n, bytes) else n for n in raw_names
+ )
+
+
+def digest_protein(
+ sequence: str,
+ enzyme: str = "Trypsin",
+ missed_cleavages: int = 0,
+ min_length: int = 6,
+ max_length: int = 40,
+) -> list:
+ """Digest a protein sequence in silico.
+
+ Parameters
+ ----------
+ sequence:
+ Single-letter amino acid sequence of the protein.
+ enzyme:
+ Enzyme name as known to pyopenms ProteaseDB (default: ``"Trypsin"``).
+ missed_cleavages:
+ Maximum number of missed cleavages allowed (default: 0).
+ min_length:
+ Minimum peptide length to include (default: 6).
+ max_length:
+ Maximum peptide length to include (default: 40).
+
+ Returns
+ -------
+ list of dict
+ Each entry contains ``sequence``, ``start``, ``end``,
+ ``monoisotopic_mass``, and ``missed_cleavages``.
+ """
+ protein_seq = oms.AASequence.fromString(sequence)
+ digest = oms.ProteaseDigestion()
+ digest.setEnzyme(enzyme)
+ digest.setMissedCleavages(missed_cleavages)
+
+ peptides = []
+ digest.digest(protein_seq, peptides, min_length, max_length)
+
+ results = []
+ for pep in peptides:
+ pep_str = pep.toString()
+ mass = pep.getMonoWeight()
+ results.append(
+ {
+ "sequence": pep_str,
+ "length": pep.size(),
+ "monoisotopic_mass": mass,
+ }
+ )
+ return results
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="In-silico protein digestion using pyopenms."
+ )
+ parser.add_argument(
+ "--sequence",
+ help="Single-letter amino acid sequence of the protein",
+ )
+ parser.add_argument(
+ "--enzyme",
+ default="Trypsin",
+ help="Digestion enzyme name (default: Trypsin)",
+ )
+ parser.add_argument(
+ "--missed-cleavages",
+ type=int,
+ default=0,
+ dest="missed_cleavages",
+ help="Maximum missed cleavages (default: 0)",
+ )
+ parser.add_argument(
+ "--min-length",
+ type=int,
+ default=6,
+ dest="min_length",
+ help="Minimum peptide length (default: 6)",
+ )
+ parser.add_argument(
+ "--max-length",
+ type=int,
+ default=40,
+ dest="max_length",
+ help="Maximum peptide length (default: 40)",
+ )
+ parser.add_argument(
+ "--list-enzymes",
+ action="store_true",
+ dest="list_enzymes",
+ help="List all available enzyme names and exit",
+ )
+ args = parser.parse_args()
+
+ if args.list_enzymes:
+ enzymes = list_enzymes()
+ print("Available enzymes:")
+ for name in enzymes:
+ print(f" {name}")
+ return
+
+ if not args.sequence:
+ parser.error("--sequence is required unless --list-enzymes is used.")
+
+ peptides = digest_protein(
+ args.sequence,
+ enzyme=args.enzyme,
+ missed_cleavages=args.missed_cleavages,
+ min_length=args.min_length,
+ max_length=args.max_length,
+ )
+
+ print(
+ f"Enzyme: {args.enzyme} | Missed cleavages ≤ {args.missed_cleavages} "
+ f"| Length {args.min_length}–{args.max_length}"
+ )
+ print(f"Total peptides: {len(peptides)}\n")
+ print(f"{'#':>4} {'Sequence':<40} {'Length':>6} {'Mono Mass (Da)':>14}")
+ print("-" * 72)
+ for i, pep in enumerate(peptides, 1):
+ print(
+ f"{i:>4} {pep['sequence']:<40} {pep['length']:>6} "
+ f"{pep['monoisotopic_mass']:>14.6f}"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/proteomics/spectrum_file_info.py b/scripts/proteomics/spectrum_file_info.py
new file mode 100644
index 0000000..514f64a
--- /dev/null
+++ b/scripts/proteomics/spectrum_file_info.py
@@ -0,0 +1,134 @@
+"""
+Mass Spectrum File Info
+=======================
+Read an mzML (or mzXML) file and print a summary of its contents:
+number of spectra, MS levels, retention time range, m/z range,
+and basic TIC statistics.
+
+Usage
+-----
+ python spectrum_file_info.py --input sample.mzML
+ python spectrum_file_info.py --input sample.mzML --tic
+"""
+
+import argparse
+import sys
+
+try:
+ import pyopenms as oms
+except ImportError:
+ sys.exit(
+ "pyopenms is required. Install it with: pip install pyopenms"
+ )
+
+
+def summarise_experiment(exp: oms.MSExperiment) -> dict:
+ """Summarise a loaded MSExperiment object.
+
+ Parameters
+ ----------
+ exp:
+ Loaded ``pyopenms.MSExperiment`` instance.
+
+ Returns
+ -------
+ dict
+ Summary statistics for the experiment.
+ """
+ spectra = exp.getSpectra()
+ if not spectra:
+ return {"n_spectra": 0}
+
+ ms_levels = {}
+ rt_min = float("inf")
+ rt_max = float("-inf")
+ mz_min = float("inf")
+ mz_max = float("-inf")
+ tic_values = []
+
+ for spec in spectra:
+ level = spec.getMSLevel()
+ ms_levels[level] = ms_levels.get(level, 0) + 1
+ rt = spec.getRT()
+ rt_min = min(rt_min, rt)
+ rt_max = max(rt_max, rt)
+
+ mzs, intensities = spec.get_peaks()
+ if len(mzs) > 0:
+ mz_min = min(mz_min, float(mzs.min()))
+ mz_max = max(mz_max, float(mzs.max()))
+ tic_values.append(float(intensities.sum()))
+
+ return {
+ "n_spectra": len(spectra),
+ "ms_levels": ms_levels,
+ "rt_range_sec": (rt_min, rt_max),
+ "mz_range": (mz_min, mz_max),
+ "tic_total": sum(tic_values),
+ "tic_max": max(tic_values) if tic_values else 0.0,
+ "tic_per_spectrum": tic_values,
+ }
+
+
+def load_file(path: str) -> oms.MSExperiment:
+ """Load an mzML file into an MSExperiment.
+
+ Parameters
+ ----------
+ path:
+ Path to the mzML file.
+
+ Returns
+ -------
+ pyopenms.MSExperiment
+ """
+ exp = oms.MSExperiment()
+ oms.MzMLFile().load(path, exp)
+ return exp
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Summarise an mzML file using pyopenms."
+ )
+ parser.add_argument(
+ "--input",
+ required=True,
+ metavar="FILE",
+ help="Path to an mzML file",
+ )
+ parser.add_argument(
+ "--tic",
+ action="store_true",
+ help="Print per-spectrum TIC values",
+ )
+ args = parser.parse_args()
+
+ print(f"Loading {args.input} …")
+ exp = load_file(args.input)
+ summary = summarise_experiment(exp)
+
+ if summary["n_spectra"] == 0:
+ print("No spectra found in file.")
+ return
+
+ print(f"\n{'File':<22}: {args.input}")
+ print(f"{'Total spectra':<22}: {summary['n_spectra']}")
+ for level, count in sorted(summary["ms_levels"].items()):
+ print(f" {'MS' + str(level) + ' spectra':<20}: {count}")
+ rt_min, rt_max = summary["rt_range_sec"]
+ print(f"{'RT range':<22}: {rt_min:.2f} – {rt_max:.2f} s "
+ f"({rt_min/60:.2f} – {rt_max/60:.2f} min)")
+ mz_lo, mz_hi = summary["mz_range"]
+ print(f"{'m/z range':<22}: {mz_lo:.4f} – {mz_hi:.4f}")
+ print(f"{'Total TIC':<22}: {summary['tic_total']:.3e}")
+ print(f"{'Max spectrum TIC':<22}: {summary['tic_max']:.3e}")
+
+ if args.tic:
+ print("\n--- Per-spectrum TIC ---")
+ for i, tic in enumerate(summary["tic_per_spectrum"], 1):
+ print(f" Spectrum {i:>5}: {tic:.3e}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tests/test_metabolomics.py b/tests/test_metabolomics.py
new file mode 100644
index 0000000..b41cbab
--- /dev/null
+++ b/tests/test_metabolomics.py
@@ -0,0 +1,122 @@
+"""
+Tests for metabolomics scripts.
+Run with: python -m pytest tests/ -v
+"""
+
+import sys
+import os
+import math
+
+import pytest
+
+# Allow importing from the scripts directory
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "scripts", "metabolomics"))
+
+try:
+ import pyopenms as oms # noqa: F401
+
+ HAS_PYOPENMS = True
+except ImportError:
+ HAS_PYOPENMS = False
+
+requires_pyopenms = pytest.mark.skipif(
+ not HAS_PYOPENMS, reason="pyopenms not installed"
+)
+
+
+@requires_pyopenms
+class TestMassAccuracyCalculator:
+ def test_sequence_theoretical(self):
+ from mass_accuracy_calculator import theoretical_mz_from_sequence
+
+ mz = theoretical_mz_from_sequence("PEPTIDEK", 1)
+ # The monoisotopic mass of PEPTIDEK is ~927.49 Da; +1 proton / 1
+ assert 928.0 < mz < 929.0
+
+ def test_formula_theoretical(self):
+ from mass_accuracy_calculator import theoretical_mz_from_formula
+
+ # Glucose C6H12O6 monoisotopic mass ~180.063 Da; +1 proton / 1 => ~181.070
+ mz = theoretical_mz_from_formula("C6H12O6", 1)
+ assert 181.0 < mz < 182.0
+
+ def test_ppm_zero_error(self):
+ from mass_accuracy_calculator import ppm_error
+
+ theoretical = 500.0
+ observed = 500.0
+ assert ppm_error(theoretical, observed) == 0.0
+
+ def test_ppm_positive_error(self):
+ from mass_accuracy_calculator import ppm_error
+
+ theoretical = 500.0
+ observed = 500.001
+ ppm = ppm_error(theoretical, observed)
+ assert ppm > 0
+
+ def test_ppm_negative_error(self):
+ from mass_accuracy_calculator import ppm_error
+
+ theoretical = 500.0
+ observed = 499.999
+ ppm = ppm_error(theoretical, observed)
+ assert ppm < 0
+
+ def test_ppm_known_value(self):
+ from mass_accuracy_calculator import ppm_error
+
+ # 1 ppm at 1000 Da => 0.001 Da shift
+ theoretical = 1000.0
+ observed = 1000.001
+ ppm = ppm_error(theoretical, observed)
+ assert abs(ppm - 1.0) < 0.001
+
+
+@requires_pyopenms
+class TestIsotopePatternMatcher:
+ def test_glucose_pattern(self):
+ from isotope_pattern_matcher import get_isotope_distribution
+
+ dist = get_isotope_distribution("C6H12O6", max_isotopes=3)
+ assert len(dist) == 3
+ # M+0 should be the base peak
+ assert dist[0][1] == pytest.approx(100.0)
+ # Each successive peak must be less intense than the previous for small molecules
+ assert dist[1][1] < dist[0][1]
+
+ def test_pattern_mz_ordering(self):
+ from isotope_pattern_matcher import get_isotope_distribution
+
+ dist = get_isotope_distribution("C12H22O11", max_isotopes=4) # sucrose
+ mzs = [mz for mz, _ in dist]
+ assert mzs == sorted(mzs)
+
+ def test_cosine_similarity_perfect(self):
+ from isotope_pattern_matcher import cosine_similarity
+
+ peaks = [(100.0, 50.0), (101.0, 20.0), (102.0, 5.0)]
+ sim = cosine_similarity(peaks, peaks, mz_tolerance=0.01)
+ assert abs(sim - 1.0) < 1e-9
+
+ def test_cosine_similarity_no_overlap(self):
+ from isotope_pattern_matcher import cosine_similarity
+
+ theoretical = [(100.0, 50.0), (101.0, 20.0)]
+ observed = [(200.0, 50.0), (201.0, 20.0)]
+ sim = cosine_similarity(theoretical, observed, mz_tolerance=0.01)
+ assert sim == 0.0
+
+ def test_parse_peaks(self):
+ from isotope_pattern_matcher import parse_peaks
+
+ result = parse_peaks(["181.0709,100.0", "182.0742,6.7"])
+ assert len(result) == 2
+ assert result[0] == (181.0709, 100.0)
+ assert result[1] == (182.0742, 6.7)
+
+ def test_parse_peaks_invalid(self):
+ from isotope_pattern_matcher import parse_peaks
+
+ with pytest.raises(ValueError):
+ parse_peaks(["181.0709"])
diff --git a/tests/test_proteomics.py b/tests/test_proteomics.py
new file mode 100644
index 0000000..257b774
--- /dev/null
+++ b/tests/test_proteomics.py
@@ -0,0 +1,111 @@
+"""
+Tests for proteomics scripts.
+Run with: python -m pytest tests/ -v
+"""
+
+import sys
+import os
+
+import pytest
+
+# Allow importing from the scripts directory
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "scripts", "proteomics"))
+
+try:
+ import pyopenms as oms # noqa: F401
+
+ HAS_PYOPENMS = True
+except ImportError:
+ HAS_PYOPENMS = False
+
+requires_pyopenms = pytest.mark.skipif(
+ not HAS_PYOPENMS, reason="pyopenms not installed"
+)
+
+
+@requires_pyopenms
+class TestPeptideMassCalculator:
+ def test_basic_mass(self):
+ from peptide_mass_calculator import peptide_masses
+
+ result = peptide_masses("PEPTIDEK")
+ assert result["sequence"] == "PEPTIDEK"
+ assert result["charge"] == 1
+ assert 927.0 < result["monoisotopic_mass"] < 928.0
+ assert result["mz_monoisotopic"] > result["monoisotopic_mass"]
+
+ def test_charge_state(self):
+ from peptide_mass_calculator import peptide_masses
+
+ r1 = peptide_masses("PEPTIDEK", charge=1)
+ r2 = peptide_masses("PEPTIDEK", charge=2)
+ # Higher charge → lower m/z
+ assert r2["mz_monoisotopic"] < r1["mz_monoisotopic"]
+
+ def test_fragment_ions(self):
+ from peptide_mass_calculator import fragment_ions
+
+ ions = fragment_ions("PEPTIDEK")
+ seq_len = len("PEPTIDEK")
+ # Expect seq_len-1 b-ions and seq_len-1 y-ions
+ assert len(ions["b_ions"]) == seq_len - 1
+ assert len(ions["y_ions"]) == seq_len - 1
+
+ def test_modified_sequence(self):
+ from peptide_mass_calculator import peptide_masses
+
+ # Oxidised methionine
+ result = peptide_masses("PEPTM[147]IDEK")
+ assert result["monoisotopic_mass"] > 0
+
+ def test_mz_formula(self):
+ from peptide_mass_calculator import peptide_masses, PROTON
+
+ r = peptide_masses("PEPTIDEK", charge=2)
+ expected = (r["monoisotopic_mass"] + 2 * PROTON) / 2
+ assert abs(r["mz_monoisotopic"] - expected) < 1e-6
+
+
+@requires_pyopenms
+class TestProteinDigest:
+ # Short test protein containing known tryptic sites (K, R)
+ PROTEIN = "MKVLWAALLVTFLAGCQAKVEQAVETEPEPELRQQTEWQSGQRWELAL"
+
+ def test_tryptic_digest_returns_peptides(self):
+ from protein_digest import digest_protein
+
+ peptides = digest_protein(self.PROTEIN, enzyme="Trypsin", min_length=1)
+ assert len(peptides) > 0
+
+ def test_peptide_structure(self):
+ from protein_digest import digest_protein
+
+ peptides = digest_protein(self.PROTEIN, enzyme="Trypsin", min_length=1)
+ for pep in peptides:
+ assert "sequence" in pep
+ assert "monoisotopic_mass" in pep
+ assert pep["monoisotopic_mass"] > 0
+
+ def test_missed_cleavages(self):
+ from protein_digest import digest_protein
+
+ peps_0 = digest_protein(self.PROTEIN, enzyme="Trypsin", missed_cleavages=0, min_length=1)
+ peps_2 = digest_protein(self.PROTEIN, enzyme="Trypsin", missed_cleavages=2, min_length=1)
+ # More missed cleavages → at least as many peptides
+ assert len(peps_2) >= len(peps_0)
+
+ def test_length_filter(self):
+ from protein_digest import digest_protein
+
+ peptides = digest_protein(
+ self.PROTEIN, enzyme="Trypsin", min_length=5, max_length=20, missed_cleavages=2
+ )
+ for pep in peptides:
+ assert 5 <= pep["length"] <= 20
+
+ def test_list_enzymes(self):
+ from protein_digest import list_enzymes
+
+ enzymes = list_enzymes()
+ assert "Trypsin" in enzymes
+ assert len(enzymes) > 5