From c7296b26619c8d7beca415d44e10f4f56c66de1f Mon Sep 17 00:00:00 2001 From: Timothy Kassis Date: Mon, 20 Oct 2025 17:26:45 -0700 Subject: [PATCH] Add PyOpenms --- .claude-plugin/marketplace.json | 3 +- README.md | 4 +- scientific-packages/pyopenms/SKILL.md | 522 +++++++++++++ .../pyopenms/references/algorithms.md | 643 ++++++++++++++++ .../pyopenms/references/chemistry.md | 715 ++++++++++++++++++ .../pyopenms/references/data_structures.md | 560 ++++++++++++++ 6 files changed, 2445 insertions(+), 2 deletions(-) create mode 100644 scientific-packages/pyopenms/SKILL.md create mode 100644 scientific-packages/pyopenms/references/algorithms.md create mode 100644 scientific-packages/pyopenms/references/chemistry.md create mode 100644 scientific-packages/pyopenms/references/data_structures.md diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index d1bd958..d4db265 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -7,7 +7,7 @@ }, "metadata": { "description": "Claude scientific skills from K-Dense Inc", - "version": "1.18.3" + "version": "1.19.0" }, "plugins": [ { @@ -39,6 +39,7 @@ "./scientific-packages/pymatgen", "./scientific-packages/pymc", "./scientific-packages/pymoo", + "./scientific-packages/pyopenms", "./scientific-packages/pytdc", "./scientific-packages/pytorch-lightning", "./scientific-packages/rdkit", diff --git a/README.md b/README.md index 7059e22..21b841d 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,9 @@ After installing the plugin, you can use the skill by just mentioning it. Additi - **PyTDC** - Therapeutics Data Commons for drug discovery datasets and benchmarks - **RDKit** - Cheminformatics toolkit for molecular I/O, descriptors, fingerprints, and SMARTS +**Proteomics & Mass Spectrometry:** +- **pyOpenMS** - Comprehensive mass spectrometry data analysis for proteomics and metabolomics (LC-MS/MS processing, peptide identification, feature detection, quantification, chemical calculations, and integration with search engines like Comet, Mascot, MSGF+) + **Machine Learning & Deep Learning:** - **PyMC** - Bayesian statistical modeling and probabilistic programming - **PyMOO** - Multi-objective optimization with evolutionary algorithms @@ -157,7 +160,6 @@ After installing the plugin, you can use the skill by just mentioning it. Additi ### Proteomics & Mass Spectrometry - **pyteomics** - Mass spectrometry data analysis -- **pyOpenMS** - OpenMS Python bindings for proteomics - **matchms** - Processing and similarity matching of mass spectrometry data - **MSstats** - Statistical analysis of quantitative proteomics diff --git a/scientific-packages/pyopenms/SKILL.md b/scientific-packages/pyopenms/SKILL.md new file mode 100644 index 0000000..b7db7fd --- /dev/null +++ b/scientific-packages/pyopenms/SKILL.md @@ -0,0 +1,522 @@ +--- +name: pyopenms +description: Toolkit for mass spectrometry data analysis with pyOpenMS, supporting proteomics and metabolomics workflows including LC-MS/MS data processing, peptide identification, feature detection, quantification, and chemical calculations. 
Use this skill when: (1) Working with mass spectrometry file formats (mzML, mzXML, FASTA, mzTab, mzIdentML, TraML, pepXML/protXML) and need to read, write, or convert between formats; (2) Processing raw LC-MS/MS data including spectral smoothing, peak picking, noise filtering, and signal processing; (3) Performing proteomics workflows such as peptide digestion simulation, theoretical fragmentation, modification analysis, and protein identification post-processing; (4) Conducting metabolomics analysis including feature detection, adduct annotation, isotope pattern matching, and small molecule identification; (5) Implementing quantitative proteomics pipelines with feature detection, alignment across samples, and statistical analysis; (6) Calculating chemical properties including molecular formulas, isotopic distributions, amino acid properties, and peptide masses; (7) Integrating with search engines (Comet, Mascot, MSGF+) and post-processing tools (Percolator, MSstats); (8) Building custom MS data analysis workflows that require low-level access to spectra, chromatograms, and peak data; (9) Performing quality control on MS data including TIC/BPC calculation, retention time analysis, and data validation; (10) When you need Python-based alternatives to vendor software for MS data processing and analysis. +--- + +# pyOpenMS + +## Overview + +pyOpenMS is an open-source Python library providing comprehensive tools for mass spectrometry data analysis in proteomics and metabolomics research. It offers Python bindings to the OpenMS C++ library, enabling efficient processing of LC-MS/MS data, peptide identification, feature detection, quantification, and integration with common proteomics tools like Comet, Mascot, MSGF+, Percolator, and MSstats. + +Use this skill when working with mass spectrometry data analysis tasks, processing proteomics or metabolomics datasets, or implementing computational workflows for biomolecular identification and quantification. + +## Core Capabilities + +### 1. File I/O and Data Import/Export + +Handle diverse mass spectrometry file formats efficiently: + +**Supported Formats:** +- **mzML/mzXML**: Primary raw MS data formats (profile or centroid) +- **FASTA**: Protein/peptide sequence databases +- **mzTab**: Standardized reporting format for identification and quantification +- **mzIdentML**: Peptide and protein identification data +- **TraML**: Transition lists for targeted experiments +- **pepXML/protXML**: Search engine results + +**Reading mzML Files:** +```python +import pyopenms as oms + +# Load MS data +exp = oms.MSExperiment() +oms.MzMLFile().load("input_data.mzML", exp) + +# Access basic information +print(f"Number of spectra: {exp.getNrSpectra()}") +print(f"Number of chromatograms: {exp.getNrChromatograms()}") +``` + +**Writing mzML Files:** +```python +# Save processed data +oms.MzMLFile().store("output_data.mzML", exp) +``` + +**File Encoding:** pyOpenMS automatically handles Base64 encoding, zlib compression, and Numpress compression internally. + +### 2. MS Data Structures and Manipulation + +Work with core mass spectrometry data structures. See `references/data_structures.md` for comprehensive details. 
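+
+For orientation on how these containers fit together, here is a short, illustrative sketch (the input file name is a placeholder) that walks an experiment and groups its MS2 spectra by the precursor m/z recorded on each scan:
+
+```python
+import pyopenms as oms
+
+exp = oms.MSExperiment()
+oms.MzMLFile().load("example.mzML", exp)  # placeholder input file
+
+# Group MS2 scans by the precursor m/z that triggered them
+ms2_by_precursor = {}
+for spec in exp:
+    if spec.getMSLevel() == 2 and spec.getPrecursors():
+        prec_mz = round(spec.getPrecursors()[0].getMZ(), 2)
+        ms2_by_precursor.setdefault(prec_mz, []).append(spec.getRT())
+
+for prec_mz, rts in sorted(ms2_by_precursor.items())[:5]:
+    print(f"Precursor m/z {prec_mz}: {len(rts)} MS2 scan(s)")
+```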
+ +**MSSpectrum** - Individual mass spectrum: +```python +# Create spectrum with metadata +spectrum = oms.MSSpectrum() +spectrum.setRT(205.2) # Retention time in seconds +spectrum.setMSLevel(2) # MS2 spectrum + +# Set peak data (m/z, intensity arrays) +mz_array = [100.5, 200.3, 300.7, 400.2] +intensity_array = [1000, 5000, 3000, 2000] +spectrum.set_peaks((mz_array, intensity_array)) + +# Add precursor information for MS2 +precursor = oms.Precursor() +precursor.setMZ(450.5) +precursor.setCharge(2) +spectrum.setPrecursors([precursor]) +``` + +**MSExperiment** - Complete LC-MS/MS run: +```python +# Create experiment and add spectra +exp = oms.MSExperiment() +exp.addSpectrum(spectrum) + +# Access spectra +first_spectrum = exp.getSpectrum(0) +for spec in exp: + print(f"RT: {spec.getRT()}, MS Level: {spec.getMSLevel()}") +``` + +**MSChromatogram** - Extracted ion chromatogram: +```python +# Create chromatogram +chrom = oms.MSChromatogram() +chrom.set_peaks(([10.5, 11.2, 11.8], [1000, 5000, 3000])) # RT, intensity +exp.addChromatogram(chrom) +``` + +**Efficient Peak Access:** +```python +# Get peaks as numpy arrays for fast processing +mz_array, intensity_array = spectrum.get_peaks() + +# Modify and set back +intensity_array *= 2 # Double all intensities +spectrum.set_peaks((mz_array, intensity_array)) +``` + +### 3. Chemistry and Peptide Handling + +Perform chemical calculations for proteomics and metabolomics. See `references/chemistry.md` for detailed examples. + +**Molecular Formulas and Mass Calculations:** +```python +# Create empirical formula +formula = oms.EmpiricalFormula("C6H12O6") # Glucose +print(f"Monoisotopic mass: {formula.getMonoWeight()}") +print(f"Average mass: {formula.getAverageWeight()}") + +# Formula arithmetic +water = oms.EmpiricalFormula("H2O") +dehydrated = formula - water + +# Isotope-specific formulas +heavy_carbon = oms.EmpiricalFormula("(13)C6H12O6") +``` + +**Isotopic Distributions:** +```python +# Generate coarse isotope pattern (unit mass resolution) +coarse_gen = oms.CoarseIsotopePatternGenerator() +pattern = coarse_gen.run(formula) + +# Generate fine structure (high resolution) +fine_gen = oms.FineIsotopePatternGenerator(0.01) # 0.01 Da resolution +fine_pattern = fine_gen.run(formula) +``` + +**Amino Acids and Residues:** +```python +# Access residue information +res_db = oms.ResidueDB() +leucine = res_db.getResidue("Leucine") +print(f"L monoisotopic mass: {leucine.getMonoWeight()}") +print(f"L formula: {leucine.getFormula()}") +print(f"L pKa: {leucine.getPka()}") +``` + +**Peptide Sequences:** +```python +# Create peptide sequence +peptide = oms.AASequence.fromString("PEPTIDE") +print(f"Peptide mass: {peptide.getMonoWeight()}") +print(f"Formula: {peptide.getFormula()}") + +# Add modifications +modified = oms.AASequence.fromString("PEPTIDEM(Oxidation)") +print(f"Modified mass: {modified.getMonoWeight()}") + +# Theoretical fragmentation +ions = [] +for i in range(1, peptide.size()): + b_ion = peptide.getPrefix(i) + y_ion = peptide.getSuffix(i) + ions.append(('b', i, b_ion.getMonoWeight())) + ions.append(('y', i, y_ion.getMonoWeight())) +``` + +**Protein Digestion:** +```python +# Enzymatic digestion +dig = oms.ProteaseDigestion() +dig.setEnzyme("Trypsin") +dig.setMissedCleavages(2) + +protein_seq = oms.AASequence.fromString("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEK") +peptides = [] +dig.digest(protein_seq, peptides) + +for pep in peptides: + print(f"{pep.toString()}: {pep.getMonoWeight():.2f} Da") +``` + +**Modifications:** +```python +# Access 
modification database +mod_db = oms.ModificationsDB() +oxidation = mod_db.getModification("Oxidation") +print(f"Oxidation mass diff: {oxidation.getDiffMonoMass()}") +print(f"Residues: {oxidation.getResidues()}") +``` + +### 4. Signal Processing and Filtering + +Apply algorithms to process and filter MS data. See `references/algorithms.md` for comprehensive coverage. + +**Spectral Smoothing:** +```python +# Gaussian smoothing +gauss_filter = oms.GaussFilter() +params = gauss_filter.getParameters() +params.setValue("gaussian_width", 0.2) +gauss_filter.setParameters(params) +gauss_filter.filterExperiment(exp) + +# Savitzky-Golay filter +sg_filter = oms.SavitzkyGolayFilter() +sg_filter.filterExperiment(exp) +``` + +**Peak Filtering:** +```python +# Keep only N largest peaks per spectrum +n_largest = oms.NLargest() +params = n_largest.getParameters() +params.setValue("n", 100) # Keep top 100 peaks +n_largest.setParameters(params) +n_largest.filterExperiment(exp) + +# Threshold filtering +threshold_filter = oms.ThresholdMower() +params = threshold_filter.getParameters() +params.setValue("threshold", 1000.0) # Remove peaks below 1000 intensity +threshold_filter.setParameters(params) +threshold_filter.filterExperiment(exp) + +# Window-based filtering +window_filter = oms.WindowMower() +params = window_filter.getParameters() +params.setValue("windowsize", 50.0) # 50 m/z windows +params.setValue("peakcount", 10) # Keep 10 highest per window +window_filter.setParameters(params) +window_filter.filterExperiment(exp) +``` + +**Spectrum Normalization:** +```python +normalizer = oms.Normalizer() +normalizer.filterExperiment(exp) +``` + +**MS Level Filtering:** +```python +# Keep only MS2 spectra +exp.filterMSLevel(2) + +# Filter by retention time range +exp.filterRT(100.0, 500.0) # Keep RT between 100-500 seconds + +# Filter by m/z range +exp.filterMZ(400.0, 1500.0) # Keep m/z between 400-1500 +``` + +### 5. Feature Detection and Quantification + +Detect and quantify features in LC-MS data: + +**Peak Picking (Centroiding):** +```python +# Convert profile data to centroid +picker = oms.PeakPickerHiRes() +params = picker.getParameters() +params.setValue("signal_to_noise", 1.0) +picker.setParameters(params) + +exp_centroided = oms.MSExperiment() +picker.pickExperiment(exp, exp_centroided) +``` + +**Feature Detection:** +```python +# Detect features across LC-MS runs +feature_finder = oms.FeatureFinderMultiplex() + +features = oms.FeatureMap() +feature_finder.run(exp, features, params) + +print(f"Found {features.size()} features") +for feature in features: + print(f"m/z: {feature.getMZ():.4f}, RT: {feature.getRT():.2f}, " + f"Intensity: {feature.getIntensity():.0f}") +``` + +**Feature Linking (Map Alignment):** +```python +# Link features across multiple samples +feature_grouper = oms.FeatureGroupingAlgorithmQT() +consensus_map = oms.ConsensusMap() + +# Provide multiple feature maps from different samples +feature_maps = [features1, features2, features3] +feature_grouper.group(feature_maps, consensus_map) +``` + +### 6. 
Peptide Identification Workflows + +Integrate with search engines and process identification results: + +**Database Searching:** +```python +# Prepare parameters for search engine +params = oms.Param() +params.setValue("database", "uniprot_human.fasta") +params.setValue("precursor_mass_tolerance", 10.0) # ppm +params.setValue("fragment_mass_tolerance", 0.5) # Da +params.setValue("enzyme", "Trypsin") +params.setValue("missed_cleavages", 2) + +# Variable modifications +params.setValue("variable_modifications", ["Oxidation (M)", "Phospho (STY)"]) + +# Fixed modifications +params.setValue("fixed_modifications", ["Carbamidomethyl (C)"]) +``` + +**FDR Control:** +```python +# False discovery rate estimation +fdr = oms.FalseDiscoveryRate() +fdr_threshold = 0.01 # 1% FDR + +# Apply to peptide identifications +protein_ids = [] +peptide_ids = [] +oms.IdXMLFile().load("search_results.idXML", protein_ids, peptide_ids) + +fdr.apply(protein_ids, peptide_ids) +``` + +### 7. Metabolomics Workflows + +Analyze small molecule data: + +**Adduct Detection:** +```python +# Common metabolite adducts +adducts = ["[M+H]+", "[M+Na]+", "[M+K]+", "[M-H]-", "[M+Cl]-"] + +# Feature annotation with adducts +for feature in features: + mz = feature.getMZ() + # Calculate neutral mass for each adduct hypothesis + for adduct in adducts: + # Annotation logic + pass +``` + +**Isotope Pattern Matching:** +```python +# Compare experimental to theoretical isotope patterns +experimental_pattern = [] # Extract from feature +theoretical = coarse_gen.run(formula) + +# Calculate similarity score +similarity = compare_isotope_patterns(experimental_pattern, theoretical) +``` + +### 8. Quality Control and Visualization + +Monitor data quality and visualize results: + +**Basic Statistics:** +```python +# Calculate TIC (Total Ion Current) +tic_values = [] +rt_values = [] +for spectrum in exp: + if spectrum.getMSLevel() == 1: + tic = sum(spectrum.get_peaks()[1]) # Sum intensities + tic_values.append(tic) + rt_values.append(spectrum.getRT()) + +# Base peak chromatogram +bpc_values = [] +for spectrum in exp: + if spectrum.getMSLevel() == 1: + max_intensity = max(spectrum.get_peaks()[1]) if spectrum.size() > 0 else 0 + bpc_values.append(max_intensity) +``` + +**Plotting (with pyopenms.plotting or matplotlib):** +```python +import matplotlib.pyplot as plt + +# Plot TIC +plt.figure(figsize=(10, 4)) +plt.plot(rt_values, tic_values) +plt.xlabel('Retention Time (s)') +plt.ylabel('Total Ion Current') +plt.title('TIC') +plt.show() + +# Plot single spectrum +spectrum = exp.getSpectrum(0) +mz, intensity = spectrum.get_peaks() +plt.stem(mz, intensity, basefmt=' ') +plt.xlabel('m/z') +plt.ylabel('Intensity') +plt.title(f'Spectrum at RT {spectrum.getRT():.2f}s') +plt.show() +``` + +## Common Workflows + +### Complete LC-MS/MS Processing Pipeline + +```python +import pyopenms as oms + +# 1. Load data +exp = oms.MSExperiment() +oms.MzMLFile().load("raw_data.mzML", exp) + +# 2. Filter and smooth +exp.filterMSLevel(1) # Keep only MS1 for feature detection +gauss = oms.GaussFilter() +gauss.filterExperiment(exp) + +# 3. Peak picking +picker = oms.PeakPickerHiRes() +exp_centroid = oms.MSExperiment() +picker.pickExperiment(exp, exp_centroid) + +# 4. Feature detection +ff = oms.FeatureFinderMultiplex() +features = oms.FeatureMap() +ff.run(exp_centroid, features, oms.Param()) + +# 5. 
Export results +oms.FeatureXMLFile().store("features.featureXML", features) +print(f"Detected {features.size()} features") +``` + +### Theoretical Peptide Mass Calculation + +```python +# Calculate masses for peptide with modifications +peptide = oms.AASequence.fromString("PEPTIDEK") +print(f"Unmodified [M+H]+: {peptide.getMonoWeight() + 1.007276:.4f}") + +# With modification +modified = oms.AASequence.fromString("PEPTIDEM(Oxidation)K") +print(f"Oxidized [M+H]+: {modified.getMonoWeight() + 1.007276:.4f}") + +# Calculate for different charge states +for z in [1, 2, 3]: + mz = (peptide.getMonoWeight() + z * 1.007276) / z + print(f"[M+{z}H]^{z}+: {mz:.4f}") +``` + +## Installation + +Ensure pyOpenMS is installed before using this skill: + +```bash +# Via conda (recommended) +conda install -c bioconda pyopenms + +# Via pip +pip install pyopenms +``` + +## Integration with Other Tools + +pyOpenMS integrates seamlessly with: + +- **Search Engines**: Comet, Mascot, MSGF+, MSFragger, Sage, SpectraST +- **Post-processing**: Percolator, MSstats, Epiphany +- **Metabolomics**: SIRIUS, CSI:FingerID +- **Data Analysis**: Pandas, NumPy, SciPy for downstream analysis +- **Visualization**: Matplotlib, Seaborn for plotting + +## Resources + +### references/ + +Detailed documentation on core concepts: + +- **data_structures.md** - Comprehensive guide to MSExperiment, MSSpectrum, MSChromatogram, and peak data handling +- **algorithms.md** - Complete reference for signal processing, filtering, feature detection, and quantification algorithms +- **chemistry.md** - In-depth coverage of chemistry calculations, peptide handling, modifications, and isotope distributions + +Load these references when needing detailed information about specific pyOpenMS capabilities. + +## Best Practices + +1. **File Format**: Always use mzML for raw MS data (standardized, well-supported) +2. **Peak Access**: Use `get_peaks()` and `set_peaks()` with numpy arrays for efficient processing +3. **Parameters**: Always check and configure algorithm parameters via `getParameters()` and `setParameters()` +4. **Memory**: For large datasets, process spectra iteratively rather than loading entire experiments +5. **Validation**: Check data integrity (MS levels, RT ordering, precursor information) after loading +6. **Modifications**: Use standard modification names from UniMod database +7. **Units**: RT in seconds, m/z in Thomson (Da/charge), intensity in arbitrary units + +## Common Patterns + +**Algorithm Application Pattern:** +```python +# 1. Instantiate algorithm +algorithm = oms.SomeAlgorithm() + +# 2. Get and configure parameters +params = algorithm.getParameters() +params.setValue("parameter_name", value) +algorithm.setParameters(params) + +# 3. Apply to data +algorithm.filterExperiment(exp) # or .process(), .run(), depending on algorithm +``` + +**File I/O Pattern:** +```python +# Read +data_container = oms.DataContainer() # MSExperiment, FeatureMap, etc. +oms.FileHandler().load("input.format", data_container) + +# Process +# ... manipulate data_container ... 
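+# For example, if data_container were an MSExperiment loaded from an mzML file,
+# a processing step here might be data_container.filterMSLevel(1), or a smoothing
+# filter applied with filterExperiment() (illustrative only)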
+ +# Write +oms.FileHandler().store("output.format", data_container) +``` + +## Getting Help + +- **Documentation**: https://pyopenms.readthedocs.io/ +- **API Reference**: Browse class documentation for detailed method signatures +- **OpenMS Website**: https://www.openms.org/ +- **GitHub Issues**: https://github.com/OpenMS/OpenMS/issues diff --git a/scientific-packages/pyopenms/references/algorithms.md b/scientific-packages/pyopenms/references/algorithms.md new file mode 100644 index 0000000..c01e313 --- /dev/null +++ b/scientific-packages/pyopenms/references/algorithms.md @@ -0,0 +1,643 @@ +# pyOpenMS Algorithms Reference + +This document provides comprehensive coverage of algorithms available in pyOpenMS for signal processing, feature detection, and quantification. + +## Algorithm Usage Pattern + +Most pyOpenMS algorithms follow a consistent pattern: + +```python +import pyopenms as oms + +# 1. Instantiate algorithm +algorithm = oms.AlgorithmName() + +# 2. Get parameters +params = algorithm.getParameters() + +# 3. Modify parameters +params.setValue("parameter_name", value) + +# 4. Set parameters back +algorithm.setParameters(params) + +# 5. Apply to data +algorithm.filterExperiment(exp) # or .process(), .run(), etc. +``` + +## Signal Processing Algorithms + +### Smoothing Filters + +#### GaussFilter - Gaussian Smoothing + +Applies Gaussian smoothing to reduce noise. + +```python +gauss = oms.GaussFilter() + +# Configure parameters +params = gauss.getParameters() +params.setValue("gaussian_width", 0.2) # Gaussian width (larger = more smoothing) +params.setValue("ppm_tolerance", 10.0) # PPM tolerance for spacing +params.setValue("use_ppm_tolerance", "true") +gauss.setParameters(params) + +# Apply to experiment +gauss.filterExperiment(exp) + +# Or apply to single spectrum +spectrum_smoothed = oms.MSSpectrum() +gauss.filter(spectrum, spectrum_smoothed) +``` + +**Key Parameters:** +- `gaussian_width`: Width of Gaussian kernel (default: 0.2 Da) +- `ppm_tolerance`: Tolerance in ppm for spacing +- `use_ppm_tolerance`: Whether to use ppm instead of absolute spacing + +#### SavitzkyGolayFilter + +Applies Savitzky-Golay smoothing (polynomial fitting). + +```python +sg_filter = oms.SavitzkyGolayFilter() + +params = sg_filter.getParameters() +params.setValue("frame_length", 11) # Window size (must be odd) +params.setValue("polynomial_order", 3) # Polynomial degree +sg_filter.setParameters(params) + +sg_filter.filterExperiment(exp) +``` + +**Key Parameters:** +- `frame_length`: Size of smoothing window (must be odd) +- `polynomial_order`: Degree of polynomial (typically 2-4) + +### Peak Filtering + +#### NLargest - Keep Top N Peaks + +Retains only the N most intense peaks per spectrum. + +```python +n_largest = oms.NLargest() + +params = n_largest.getParameters() +params.setValue("n", 100) # Keep top 100 peaks +params.setValue("threshold", 0.0) # Optional minimum intensity +n_largest.setParameters(params) + +n_largest.filterExperiment(exp) +``` + +**Key Parameters:** +- `n`: Number of peaks to keep per spectrum +- `threshold`: Minimum absolute intensity threshold + +#### ThresholdMower - Intensity Threshold Filtering + +Removes peaks below a specified intensity threshold. 
+ +```python +threshold_filter = oms.ThresholdMower() + +params = threshold_filter.getParameters() +params.setValue("threshold", 1000.0) # Absolute intensity threshold +threshold_filter.setParameters(params) + +threshold_filter.filterExperiment(exp) +``` + +**Key Parameters:** +- `threshold`: Absolute intensity cutoff + +#### WindowMower - Window-Based Peak Selection + +Divides m/z range into windows and keeps top N peaks per window. + +```python +window_mower = oms.WindowMower() + +params = window_mower.getParameters() +params.setValue("windowsize", 50.0) # Window size in Da (or Thomson) +params.setValue("peakcount", 10) # Peaks to keep per window +params.setValue("movetype", "jump") # "jump" or "slide" +window_mower.setParameters(params) + +window_mower.filterExperiment(exp) +``` + +**Key Parameters:** +- `windowsize`: Size of m/z window (Da) +- `peakcount`: Number of peaks to retain per window +- `movetype`: "jump" (non-overlapping) or "slide" (overlapping windows) + +#### BernNorm - Bernoulli Normalization + +Statistical normalization based on Bernoulli distribution. + +```python +bern_norm = oms.BernNorm() + +params = bern_norm.getParameters() +params.setValue("threshold", 0.7) # Threshold for normalization +bern_norm.setParameters(params) + +bern_norm.filterExperiment(exp) +``` + +### Spectrum Normalization + +#### Normalizer + +Normalizes spectrum intensities to unit total intensity or maximum intensity. + +```python +normalizer = oms.Normalizer() + +params = normalizer.getParameters() +params.setValue("method", "to_one") # "to_one" or "to_TIC" +normalizer.setParameters(params) + +normalizer.filterExperiment(exp) +``` + +**Methods:** +- `to_one`: Normalize max peak to 1.0 +- `to_TIC`: Normalize to total ion current = 1.0 + +#### Scaler + +Scales intensities by a constant factor. + +```python +scaler = oms.Scaler() + +params = scaler.getParameters() +params.setValue("scaling", 1000.0) # Scaling factor +scaler.setParameters(params) + +scaler.filterExperiment(exp) +``` + +## Centroiding and Peak Picking + +### PeakPickerHiRes - High-Resolution Peak Picking + +Converts profile spectra to centroid mode for high-resolution data. + +```python +picker = oms.PeakPickerHiRes() + +params = picker.getParameters() +params.setValue("signal_to_noise", 1.0) # S/N threshold +params.setValue("spacing_difference", 1.5) # Peak spacing factor +params.setValue("sn_win_len", 20.0) # S/N window length +params.setValue("sn_bin_count", 30) # Bins for S/N estimation +params.setValue("ms1_only", "false") # Process only MS1 +params.setValue("ms_levels", [1, 2]) # MS levels to process +picker.setParameters(params) + +# Pick peaks +exp_centroided = oms.MSExperiment() +picker.pickExperiment(exp, exp_centroided) +``` + +**Key Parameters:** +- `signal_to_noise`: Minimum signal-to-noise ratio +- `spacing_difference`: Minimum spacing between peaks +- `ms_levels`: List of MS levels to process + +### PeakPickerWavelet - Wavelet-Based Peak Picking + +Uses continuous wavelet transform for peak detection. + +```python +wavelet_picker = oms.PeakPickerWavelet() + +params = wavelet_picker.getParameters() +params.setValue("signal_to_noise", 1.0) +params.setValue("peak_width", 0.15) # Expected peak width (Da) +wavelet_picker.setParameters(params) + +wavelet_picker.pickExperiment(exp, exp_centroided) +``` + +## Feature Detection + +### FeatureFinder Algorithms + +Feature finders detect 2D features (m/z and RT) in LC-MS data. + +#### FeatureFinderMultiplex + +For multiplex labeling experiments (SILAC, dimethyl labeling). 
+ +```python +ff = oms.FeatureFinderMultiplex() + +params = ff.getParameters() +params.setValue("algorithm:labels", "[]") # Empty for label-free +params.setValue("algorithm:charge", "2:4") # Charge range +params.setValue("algorithm:rt_typical", 40.0) # Expected feature RT width +params.setValue("algorithm:rt_min", 2.0) # Minimum RT width +params.setValue("algorithm:mz_tolerance", 10.0) # m/z tolerance (ppm) +params.setValue("algorithm:intensity_cutoff", 1000.0) # Minimum intensity +ff.setParameters(params) + +# Run feature detection +features = oms.FeatureMap() +ff.run(exp, features, oms.Param()) + +print(f"Found {features.size()} features") +``` + +**Key Parameters:** +- `algorithm:charge`: Charge state range to consider +- `algorithm:rt_typical`: Expected peak width in RT dimension +- `algorithm:mz_tolerance`: Mass tolerance in ppm +- `algorithm:intensity_cutoff`: Minimum intensity threshold + +#### FeatureFinderCentroided + +For centroided data, identifies isotope patterns and traces over RT. + +```python +ff_centroided = oms.FeatureFinderCentroided() + +params = ff_centroided.getParameters() +params.setValue("mass_trace:mz_tolerance", 10.0) # ppm +params.setValue("mass_trace:min_spectra", 5) # Min consecutive spectra +params.setValue("isotopic_pattern:charge_low", 1) +params.setValue("isotopic_pattern:charge_high", 4) +params.setValue("seed:min_score", 0.5) +ff_centroided.setParameters(params) + +features = oms.FeatureMap() +seeds = oms.FeatureMap() # Optional seed features +ff_centroided.run(exp, features, params, seeds) +``` + +#### FeatureFinderIdentification + +Uses peptide identifications to guide feature detection. + +```python +ff_id = oms.FeatureFinderIdentification() + +params = ff_id.getParameters() +params.setValue("extract:mz_window", 10.0) # ppm +params.setValue("extract:rt_window", 60.0) # seconds +params.setValue("detect:peak_width", 30.0) # Expected peak width +ff_id.setParameters(params) + +# Requires peptide identifications +protein_ids = [] +peptide_ids = [] +features = oms.FeatureMap() + +ff_id.run(exp, protein_ids, peptide_ids, features) +``` + +## Charge and Isotope Deconvolution + +### Decharging and Charge State Deconvolution + +#### FeatureDeconvolution + +Resolves charge states and combines features. + +```python +deconv = oms.FeatureDeconvolution() + +params = deconv.getParameters() +params.setValue("charge_min", 1) +params.setValue("charge_max", 4) +params.setValue("q_value", 0.01) # FDR threshold +deconv.setParameters(params) + +features_deconv = oms.FeatureMap() +consensus_map = oms.ConsensusMap() +deconv.compute(features, features_deconv, consensus_map) +``` + +## Map Alignment + +### MapAlignmentAlgorithm + +Aligns retention times across multiple LC-MS runs. + +#### MapAlignmentAlgorithmPoseClustering + +Pose clustering-based RT alignment. 
+ +```python +aligner = oms.MapAlignmentAlgorithmPoseClustering() + +params = aligner.getParameters() +params.setValue("max_num_peaks_considered", 1000) +params.setValue("pairfinder:distance_MZ:max_difference", 0.3) # Da +params.setValue("pairfinder:distance_RT:max_difference", 60.0) # seconds +aligner.setParameters(params) + +# Align multiple feature maps +feature_maps = [features1, features2, features3] +transformations = [] + +# Create reference (e.g., use first map) +reference = oms.FeatureMap(feature_maps[0]) + +# Align others to reference +for fm in feature_maps[1:]: + transformation = oms.TransformationDescription() + aligner.align(fm, reference, transformation) + transformations.append(transformation) + + # Apply transformation + transformer = oms.MapAlignmentTransformer() + transformer.transformRetentionTimes(fm, transformation) +``` + +## Feature Linking + +### FeatureGroupingAlgorithm + +Links features across samples to create consensus features. + +#### FeatureGroupingAlgorithmQT + +Quality threshold-based feature linking. + +```python +grouper = oms.FeatureGroupingAlgorithmQT() + +params = grouper.getParameters() +params.setValue("distance_RT:max_difference", 60.0) # seconds +params.setValue("distance_MZ:max_difference", 10.0) # ppm +params.setValue("distance_MZ:unit", "ppm") +grouper.setParameters(params) + +# Create consensus map +consensus_map = oms.ConsensusMap() + +# Group features from multiple samples +feature_maps = [features1, features2, features3] +grouper.group(feature_maps, consensus_map) + +print(f"Created {consensus_map.size()} consensus features") +``` + +#### FeatureGroupingAlgorithmKD + +KD-tree based linking (faster for large datasets). + +```python +grouper_kd = oms.FeatureGroupingAlgorithmKD() + +params = grouper_kd.getParameters() +params.setValue("mz_unit", "ppm") +params.setValue("mz_tolerance", 10.0) +params.setValue("rt_tolerance", 30.0) +grouper_kd.setParameters(params) + +consensus_map = oms.ConsensusMap() +grouper_kd.group(feature_maps, consensus_map) +``` + +## Chromatographic Analysis + +### ElutionPeakDetection + +Detects elution peaks in chromatograms. + +```python +epd = oms.ElutionPeakDetection() + +params = epd.getParameters() +params.setValue("chrom_peak_snr", 3.0) # Signal-to-noise threshold +params.setValue("chrom_fwhm", 5.0) # Expected FWHM (seconds) +epd.setParameters(params) + +# Apply to chromatograms +for chrom in exp.getChromatograms(): + peaks = epd.detectPeaks(chrom) +``` + +### MRMFeatureFinderScoring + +Scoring and peak picking for targeted (MRM/SRM) experiments. + +```python +mrm_finder = oms.MRMFeatureFinderScoring() + +params = mrm_finder.getParameters() +params.setValue("TransitionGroupPicker:min_peak_width", 2.0) +params.setValue("TransitionGroupPicker:recalculate_peaks", "true") +params.setValue("TransitionGroupPicker:PeakPickerMRM:signal_to_noise", 1.0) +mrm_finder.setParameters(params) + +# Requires chromatograms +features = oms.FeatureMap() +mrm_finder.pickExperiment(chrom_exp, features, targets, transformation, swath_maps) +``` + +## Quantification + +### ProteinInference + +Infers proteins from peptide identifications. + +```python +protein_inference = oms.BasicProteinInferenceAlgorithm() + +# Apply to identification results +protein_inference.run(peptide_ids, protein_ids) +``` + +### IsobaricQuantification + +Quantification for isobaric labeling (TMT, iTRAQ). 
+ +```python +# For TMT/iTRAQ quantification +iso_quant = oms.IsobaricQuantification() + +params = iso_quant.getParameters() +params.setValue("channel_116_description", "Sample1") +params.setValue("channel_117_description", "Sample2") +# ... configure all channels +iso_quant.setParameters(params) + +# Run quantification +quant_method = oms.IsobaricQuantitationMethod.TMT_10PLEX +quant_info = oms.IsobaricQuantifierStatistics() +iso_quant.quantify(exp, quant_info) +``` + +## Data Processing + +### BaselineFiltering + +Removes baseline from spectra. + +```python +baseline = oms.TopHatFilter() + +params = baseline.getParameters() +params.setValue("struc_elem_length", 3.0) # Structuring element size +params.setValue("struc_elem_unit", "Thomson") +baseline.setParameters(params) + +baseline.filterExperiment(exp) +``` + +### SpectraMerger + +Merges consecutive similar spectra. + +```python +merger = oms.SpectraMerger() + +params = merger.getParameters() +params.setValue("mz_binning_width", 0.05) # Binning width (Da) +params.setValue("sort_blocks", "RT_ascending") +merger.setParameters(params) + +merger.mergeSpectra(exp) +``` + +## Quality Control + +### MzMLFileQuality + +Analyzes mzML file quality. + +```python +# Calculate basic QC metrics +def calculate_qc_metrics(exp): + metrics = { + 'n_spectra': exp.getNrSpectra(), + 'n_ms1': sum(1 for s in exp if s.getMSLevel() == 1), + 'n_ms2': sum(1 for s in exp if s.getMSLevel() == 2), + 'rt_range': (exp.getMinRT(), exp.getMaxRT()), + 'mz_range': (exp.getMinMZ(), exp.getMaxMZ()), + } + + # Calculate TIC + tics = [] + for spectrum in exp: + if spectrum.getMSLevel() == 1: + mz, intensity = spectrum.get_peaks() + tics.append(sum(intensity)) + + metrics['median_tic'] = np.median(tics) + metrics['mean_tic'] = np.mean(tics) + + return metrics +``` + +## FDR Control + +### FalseDiscoveryRate + +Estimates and controls false discovery rate. + +```python +fdr = oms.FalseDiscoveryRate() + +params = fdr.getParameters() +params.setValue("add_decoy_peptides", "false") +params.setValue("add_decoy_proteins", "false") +fdr.setParameters(params) + +# Apply to identifications +fdr.apply(protein_ids, peptide_ids) + +# Filter by FDR threshold +fdr_threshold = 0.01 +filtered_peptides = [p for p in peptide_ids if p.getMetaValue("q-value") <= fdr_threshold] +``` + +## Algorithm Selection Guide + +### When to Use Which Algorithm + +**For Smoothing:** +- Use `GaussFilter` for general-purpose smoothing +- Use `SavitzkyGolayFilter` for preserving peak shapes + +**For Peak Picking:** +- Use `PeakPickerHiRes` for high-resolution Orbitrap/FT-ICR data +- Use `PeakPickerWavelet` for lower-resolution TOF data + +**For Feature Detection:** +- Use `FeatureFinderCentroided` for label-free proteomics (DDA) +- Use `FeatureFinderMultiplex` for SILAC/dimethyl labeling +- Use `FeatureFinderIdentification` when you have ID information +- Use `MRMFeatureFinderScoring` for targeted (MRM/SRM) experiments + +**For Feature Linking:** +- Use `FeatureGroupingAlgorithmQT` for small-medium datasets (<10 samples) +- Use `FeatureGroupingAlgorithmKD` for large datasets (>10 samples) + +## Parameter Tuning Tips + +1. **S/N Thresholds**: Start with 1-3 for clean data, increase for noisy data +2. **m/z Tolerance**: Use 5-10 ppm for high-resolution instruments, 0.5-1 Da for low-res +3. **RT Tolerance**: Typically 30-60 seconds depending on chromatographic stability +4. **Peak Width**: Measure from real data - varies by instrument and gradient length +5. 
**Charge States**: Set based on expected analytes (1-2 for metabolites, 2-4 for peptides) + +## Common Algorithm Workflows + +### Complete Proteomics Workflow + +```python +# 1. Load data +exp = oms.MSExperiment() +oms.MzMLFile().load("raw.mzML", exp) + +# 2. Smooth +gauss = oms.GaussFilter() +gauss.filterExperiment(exp) + +# 3. Peak picking +picker = oms.PeakPickerHiRes() +exp_centroid = oms.MSExperiment() +picker.pickExperiment(exp, exp_centroid) + +# 4. Feature detection +ff = oms.FeatureFinderCentroided() +features = oms.FeatureMap() +ff.run(exp_centroid, features, oms.Param(), oms.FeatureMap()) + +# 5. Save results +oms.FeatureXMLFile().store("features.featureXML", features) +``` + +### Multi-Sample Quantification + +```python +# Load multiple samples +feature_maps = [] +for filename in ["sample1.mzML", "sample2.mzML", "sample3.mzML"]: + exp = oms.MSExperiment() + oms.MzMLFile().load(filename, exp) + + # Process and detect features + features = detect_features(exp) # Your processing function + feature_maps.append(features) + +# Align retention times +align_feature_maps(feature_maps) # Implement alignment + +# Link features +grouper = oms.FeatureGroupingAlgorithmQT() +consensus_map = oms.ConsensusMap() +grouper.group(feature_maps, consensus_map) + +# Export quantification matrix +export_quant_matrix(consensus_map) +``` diff --git a/scientific-packages/pyopenms/references/chemistry.md b/scientific-packages/pyopenms/references/chemistry.md new file mode 100644 index 0000000..1a21340 --- /dev/null +++ b/scientific-packages/pyopenms/references/chemistry.md @@ -0,0 +1,715 @@ +# pyOpenMS Chemistry Reference + +This document provides comprehensive coverage of chemistry-related functionality in pyOpenMS, including elements, isotopes, molecular formulas, amino acids, peptides, proteins, and modifications. + +## Elements and Isotopes + +### ElementDB - Element Database + +Access atomic and isotopic data for all elements. + +```python +import pyopenms as oms + +# Get element database instance +element_db = oms.ElementDB() + +# Get element by symbol +carbon = element_db.getElement("C") +nitrogen = element_db.getElement("N") +oxygen = element_db.getElement("O") + +# Element properties +print(f"Carbon monoisotopic weight: {carbon.getMonoWeight()}") +print(f"Carbon average weight: {carbon.getAverageWeight()}") +print(f"Atomic number: {carbon.getAtomicNumber()}") +print(f"Symbol: {carbon.getSymbol()}") +print(f"Name: {carbon.getName()}") +``` + +### Isotope Information + +```python +# Get isotope distribution for an element +isotopes = carbon.getIsotopeDistribution() + +# Access specific isotope +c12 = element_db.getElement("C", 12) # Carbon-12 +c13 = element_db.getElement("C", 13) # Carbon-13 + +print(f"C-12 abundance: {isotopes.getContainer()[0].getIntensity()}") +print(f"C-13 abundance: {isotopes.getContainer()[1].getIntensity()}") + +# Isotope mass +print(f"C-12 mass: {c12.getMonoWeight()}") +print(f"C-13 mass: {c13.getMonoWeight()}") +``` + +### Constants + +```python +# Physical constants +avogadro = oms.Constants.AVOGADRO +electron_mass = oms.Constants.ELECTRON_MASS_U +proton_mass = oms.Constants.PROTON_MASS_U + +print(f"Avogadro's number: {avogadro}") +print(f"Electron mass: {electron_mass} u") +print(f"Proton mass: {proton_mass} u") +``` + +## Empirical Formulas + +### EmpiricalFormula - Molecular Formulas + +Represent and manipulate molecular formulas. 
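+
+As a quick, self-contained illustration before the detailed subsections, the sketch below turns a neutral formula into common positive-mode adduct m/z values (the adduct mass shifts are standard literature values, not taken from pyOpenMS):
+
+```python
+import pyopenms as oms
+
+caffeine = oms.EmpiricalFormula("C8H10N4O2")
+neutral_mass = caffeine.getMonoWeight()
+
+# Standard adduct mass shifts for H+, Na+ and K+ (cation mass minus electron)
+adducts = {"[M+H]+": 1.007276, "[M+Na]+": 22.989218, "[M+K]+": 38.963158}
+for name, shift in adducts.items():
+    print(f"{name}: {neutral_mass + shift:.4f}")
+```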
+ +#### Creating Formulas + +```python +# From string +glucose = oms.EmpiricalFormula("C6H12O6") +water = oms.EmpiricalFormula("H2O") +ammonia = oms.EmpiricalFormula("NH3") + +# From element composition +formula = oms.EmpiricalFormula() +formula.setCharge(1) # Set charge state +``` + +#### Formula Arithmetic + +```python +# Addition +sucrose = oms.EmpiricalFormula("C12H22O11") +hydrolyzed = sucrose + water # Hydrolysis adds water + +# Subtraction +dehydrated = glucose - water # Dehydration removes water + +# Multiplication +three_waters = water * 3 # 3 H2O = H6O3 + +# Division +formula_half = sucrose / 2 # Half the formula +``` + +#### Mass Calculations + +```python +# Monoisotopic mass +mono_mass = glucose.getMonoWeight() +print(f"Glucose monoisotopic mass: {mono_mass:.6f} Da") + +# Average mass +avg_mass = glucose.getAverageWeight() +print(f"Glucose average mass: {avg_mass:.6f} Da") + +# Mass difference +mass_diff = (glucose - water).getMonoWeight() +``` + +#### Elemental Composition + +```python +# Get element counts +formula = oms.EmpiricalFormula("C6H12O6") + +# Access individual elements +n_carbon = formula.getNumberOf(element_db.getElement("C")) +n_hydrogen = formula.getNumberOf(element_db.getElement("H")) +n_oxygen = formula.getNumberOf(element_db.getElement("O")) + +print(f"C: {n_carbon}, H: {n_hydrogen}, O: {n_oxygen}") + +# String representation +print(f"Formula: {formula.toString()}") +``` + +#### Isotope-Specific Formulas + +```python +# Specify specific isotopes using parentheses +labeled_glucose = oms.EmpiricalFormula("(13)C6H12O6") # All carbons are C-13 +partially_labeled = oms.EmpiricalFormula("C5(13)CH12O6") # One C-13 + +# Deuterium labeling +deuterated = oms.EmpiricalFormula("C6D12O6") # D2O instead of H2O +``` + +#### Charge States + +```python +# Set charge +formula = oms.EmpiricalFormula("C6H12O6") +formula.setCharge(1) # Positive charge + +# Get charge +charge = formula.getCharge() + +# Calculate m/z for charged molecule +mz = formula.getMonoWeight() / abs(charge) if charge != 0 else formula.getMonoWeight() +``` + +### Isotope Pattern Generation + +Generate theoretical isotope patterns for formulas. + +#### CoarseIsotopePatternGenerator + +For unit mass resolution (low-resolution instruments). + +```python +# Create generator +coarse_gen = oms.CoarseIsotopePatternGenerator() + +# Generate pattern +formula = oms.EmpiricalFormula("C6H12O6") +pattern = coarse_gen.run(formula) + +# Access isotope peaks +iso_dist = pattern.getContainer() +for peak in iso_dist: + mass = peak.getMZ() + abundance = peak.getIntensity() + print(f"m/z: {mass:.4f}, Abundance: {abundance:.4f}") +``` + +#### FineIsotopePatternGenerator + +For high-resolution instruments (hyperfine structure). 
+ +```python +# Create generator with resolution +fine_gen = oms.FineIsotopePatternGenerator(0.01) # 0.01 Da resolution + +# Generate fine pattern +fine_pattern = fine_gen.run(formula) + +# Access fine isotope structure +for peak in fine_pattern.getContainer(): + print(f"m/z: {peak.getMZ():.6f}, Abundance: {peak.getIntensity():.6f}") +``` + +#### Isotope Pattern Matching + +```python +# Compare experimental to theoretical +def compare_isotope_patterns(experimental_mz, experimental_int, formula): + # Generate theoretical + coarse_gen = oms.CoarseIsotopePatternGenerator() + theoretical = coarse_gen.run(formula) + + # Extract theoretical peaks + theo_peaks = theoretical.getContainer() + theo_mz = [p.getMZ() for p in theo_peaks] + theo_int = [p.getIntensity() for p in theo_peaks] + + # Normalize both patterns + exp_int_norm = [i / max(experimental_int) for i in experimental_int] + theo_int_norm = [i / max(theo_int) for i in theo_int] + + # Calculate similarity (e.g., cosine similarity) + # ... implement similarity calculation + return similarity_score +``` + +## Amino Acids and Residues + +### Residue - Amino Acid Representation + +Access properties of amino acids. + +```python +# Get residue database +res_db = oms.ResidueDB() + +# Get specific residue +leucine = res_db.getResidue("Leucine") +# Or by one-letter code +leu = res_db.getResidue("L") + +# Residue properties +print(f"Name: {leucine.getName()}") +print(f"Three-letter code: {leucine.getThreeLetterCode()}") +print(f"One-letter code: {leucine.getOneLetterCode()}") +print(f"Monoisotopic mass: {leucine.getMonoWeight():.6f}") +print(f"Average mass: {leucine.getAverageWeight():.6f}") + +# Chemical formula +formula = leucine.getFormula() +print(f"Formula: {formula.toString()}") + +# pKa values +print(f"pKa (N-term): {leucine.getPka()}") +print(f"pKa (C-term): {leucine.getPkb()}") +print(f"pKa (side chain): {leucine.getPkc()}") + +# Side chain basicity/acidity +print(f"Basicity: {leucine.getBasicity()}") +print(f"Hydrophobicity: {leucine.getHydrophobicity()}") +``` + +### All Standard Amino Acids + +```python +# Iterate over all residues +for residue_name in ["Alanine", "Cysteine", "Aspartic acid", "Glutamic acid", + "Phenylalanine", "Glycine", "Histidine", "Isoleucine", + "Lysine", "Leucine", "Methionine", "Asparagine", + "Proline", "Glutamine", "Arginine", "Serine", + "Threonine", "Valine", "Tryptophan", "Tyrosine"]: + res = res_db.getResidue(residue_name) + print(f"{res.getOneLetterCode()}: {res.getMonoWeight():.4f} Da") +``` + +### Internal Residues vs. Termini + +```python +# Get internal residue mass (no terminal groups) +internal_mass = leucine.getInternalToFull() + +# Get residue with N-terminal modification +n_terminal = res_db.getResidue("L[1]") # With NH2 + +# Get residue with C-terminal modification +c_terminal = res_db.getResidue("L[2]") # With COOH +``` + +## Peptide Sequences + +### AASequence - Amino Acid Sequences + +Represent and manipulate peptide sequences. 
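+
+Before the detailed subsections, a compact sketch showing how the modified and unmodified views of the same sequence relate (it assumes the `toUnmodifiedString()` and `isModified()` helpers exposed on `AASequence` in current pyOpenMS releases):
+
+```python
+import pyopenms as oms
+
+seq = oms.AASequence.fromString("PEPTIDEM(Oxidation)K")
+
+print(seq.toString())            # sequence string including modifications
+print(seq.toUnmodifiedString())  # plain one-letter sequence without modifications
+print(seq.isModified())          # True if any residue carries a modification
+print(f"{seq.getMonoWeight():.4f} Da")
+```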
+ +#### Creating Sequences + +```python +# From string +peptide = oms.AASequence.fromString("PEPTIDE") +longer = oms.AASequence.fromString("MKTAYIAKQRQISFVK") + +# Empty sequence +empty_seq = oms.AASequence() +``` + +#### Sequence Properties + +```python +peptide = oms.AASequence.fromString("PEPTIDE") + +# Length +length = peptide.size() +print(f"Length: {length} residues") + +# Mass +mono_mass = peptide.getMonoWeight() +avg_mass = peptide.getAverageWeight() +print(f"Monoisotopic mass: {mono_mass:.6f} Da") +print(f"Average mass: {avg_mass:.6f} Da") + +# Formula +formula = peptide.getFormula() +print(f"Formula: {formula.toString()}") + +# String representation +seq_str = peptide.toString() +print(f"Sequence: {seq_str}") +``` + +#### Accessing Individual Residues + +```python +peptide = oms.AASequence.fromString("PEPTIDE") + +# Access by index +first_aa = peptide[0] # Returns Residue object +print(f"First amino acid: {first_aa.getOneLetterCode()}") + +# Iterate +for i in range(peptide.size()): + residue = peptide[i] + print(f"Position {i}: {residue.getOneLetterCode()}") +``` + +#### Modifications + +Add post-translational modifications (PTMs) to sequences. + +```python +# Modifications in sequence string +# Format: AA(ModificationName) +oxidized_met = oms.AASequence.fromString("PEPTIDEM(Oxidation)") +phospho = oms.AASequence.fromString("PEPTIDES(Phospho)T(Phospho)") + +# Multiple modifications +multi_mod = oms.AASequence.fromString("M(Oxidation)PEPTIDEK(Acetyl)") + +# N-terminal modifications +n_term_acetyl = oms.AASequence.fromString("(Acetyl)PEPTIDE") + +# C-terminal modifications +c_term_amide = oms.AASequence.fromString("PEPTIDE(Amidated)") + +# Check mass change +unmodified = oms.AASequence.fromString("PEPTIDE") +modified = oms.AASequence.fromString("PEPTIDEM(Oxidation)") +mass_diff = modified.getMonoWeight() - unmodified.getMonoWeight() +print(f"Mass shift from oxidation: {mass_diff:.6f} Da") +``` + +#### Sequence Manipulation + +```python +# Prefix (N-terminal fragment) +prefix = peptide.getPrefix(3) # First 3 residues +print(f"Prefix: {prefix.toString()}") + +# Suffix (C-terminal fragment) +suffix = peptide.getSuffix(3) # Last 3 residues +print(f"Suffix: {suffix.toString()}") + +# Subsequence +subseq = peptide.getSubsequence(2, 4) # Residues 2-4 +print(f"Subsequence: {subseq.toString()}") +``` + +#### Theoretical Fragmentation + +Generate theoretical fragment ions for MS/MS. 
+ +```python +peptide = oms.AASequence.fromString("PEPTIDE") + +# b-ions (N-terminal fragments) +b_ions = [] +for i in range(1, peptide.size()): + b_fragment = peptide.getPrefix(i) + b_mass = b_fragment.getMonoWeight() + b_ions.append(('b', i, b_mass)) + print(f"b{i}: {b_mass:.4f}") + +# y-ions (C-terminal fragments) +y_ions = [] +for i in range(1, peptide.size()): + y_fragment = peptide.getSuffix(i) + y_mass = y_fragment.getMonoWeight() + y_ions.append(('y', i, y_mass)) + print(f"y{i}: {y_mass:.4f}") + +# a-ions (b - CO) +a_ions = [] +CO_mass = 27.994915 # CO loss +for ion_type, position, mass in b_ions: + a_mass = mass - CO_mass + a_ions.append(('a', position, a_mass)) + +# c-ions (b + NH3) +NH3_mass = 17.026549 # NH3 gain +c_ions = [] +for ion_type, position, mass in b_ions: + c_mass = mass + NH3_mass + c_ions.append(('c', position, c_mass)) + +# z-ions (y - NH3) +z_ions = [] +for ion_type, position, mass in y_ions: + z_mass = mass - NH3_mass + z_ions.append(('z', position, z_mass)) +``` + +#### Calculate m/z for Charge States + +```python +peptide = oms.AASequence.fromString("PEPTIDE") +proton_mass = 1.007276 + +# [M+H]+ +mz_1 = peptide.getMonoWeight() + proton_mass +print(f"[M+H]+: {mz_1:.4f}") + +# [M+2H]2+ +mz_2 = (peptide.getMonoWeight() + 2 * proton_mass) / 2 +print(f"[M+2H]2+: {mz_2:.4f}") + +# [M+3H]3+ +mz_3 = (peptide.getMonoWeight() + 3 * proton_mass) / 3 +print(f"[M+3H]3+: {mz_3:.4f}") + +# General formula for any charge +def calculate_mz(sequence, charge): + proton_mass = 1.007276 + return (sequence.getMonoWeight() + charge * proton_mass) / charge + +for z in range(1, 5): + print(f"[M+{z}H]{z}+: {calculate_mz(peptide, z):.4f}") +``` + +## Protein Digestion + +### ProteaseDigestion - Enzymatic Cleavage + +Simulate enzymatic protein digestion. 
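+
+In real workflows the proteins to digest usually come from a FASTA database. The sketch below combines `FASTAFile` with the digestion API described in the subsections that follow; the file name is a placeholder, and sequences containing non-standard residues (e.g. 'X') may need filtering before parsing with `AASequence`:
+
+```python
+import pyopenms as oms
+
+entries = []
+oms.FASTAFile().load("proteins.fasta", entries)  # placeholder database file
+
+dig = oms.ProteaseDigestion()
+dig.setEnzyme("Trypsin")
+dig.setMissedCleavages(1)
+
+for entry in entries[:3]:
+    peptides = []
+    dig.digest(oms.AASequence.fromString(entry.sequence), peptides)
+    print(f"{entry.identifier}: {len(peptides)} tryptic peptides")
+```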
+ +#### Basic Digestion + +```python +# Create digestion object +dig = oms.ProteaseDigestion() + +# Set enzyme +dig.setEnzyme("Trypsin") # Cleaves after K, R + +# Other common enzymes: +# - "Trypsin" (K, R) +# - "Lys-C" (K) +# - "Arg-C" (R) +# - "Asp-N" (D) +# - "Glu-C" (E, D) +# - "Chymotrypsin" (F, Y, W, L) + +# Set missed cleavages +dig.setMissedCleavages(0) # No missed cleavages +dig.setMissedCleavages(2) # Allow up to 2 missed cleavages + +# Perform digestion +protein = oms.AASequence.fromString("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEK") +peptides = [] +dig.digest(protein, peptides) + +# Print results +for pep in peptides: + print(f"{pep.toString()}: {pep.getMonoWeight():.2f} Da") +``` + +#### Advanced Digestion Options + +```python +# Get enzyme specificity +specificity = dig.getSpecificity() +# oms.EnzymaticDigestion.SPEC_FULL (both termini) +# oms.EnzymaticDigestion.SPEC_SEMI (one terminus) +# oms.EnzymaticDigestion.SPEC_NONE (no specificity) + +# Set specificity for semi-tryptic search +dig.setSpecificity(oms.EnzymaticDigestion.SPEC_SEMI) + +# Get cleavage sites +cleavage_residues = dig.getEnzyme().getCutAfterResidues() +restriction_residues = dig.getEnzyme().getRestriction() +``` + +#### Filter Peptides by Properties + +```python +# Filter by mass range +min_mass = 600.0 +max_mass = 4000.0 +filtered = [p for p in peptides if min_mass <= p.getMonoWeight() <= max_mass] + +# Filter by length +min_length = 6 +max_length = 30 +length_filtered = [p for p in peptides if min_length <= p.size() <= max_length] + +# Combine filters +valid_peptides = [p for p in peptides + if min_mass <= p.getMonoWeight() <= max_mass + and min_length <= p.size() <= max_length] +``` + +## Modifications + +### ModificationsDB - Modification Database + +Access and apply post-translational modifications. 
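+
+As a quick sanity check that ties the modification database to peptide masses, the sketch below verifies that adding the database's mass difference for oxidation to an unmodified peptide reproduces the mass of the explicitly modified sequence (illustrative only):
+
+```python
+import pyopenms as oms
+
+mod_db = oms.ModificationsDB()
+oxidation = mod_db.getModification("Oxidation")
+
+plain = oms.AASequence.fromString("PEPTIDEMK")
+oxidized = oms.AASequence.fromString("PEPTIDEM(Oxidation)K")
+
+expected = plain.getMonoWeight() + oxidation.getDiffMonoMass()
+print(f"Expected: {expected:.4f} Da")
+print(f"Actual:   {oxidized.getMonoWeight():.4f} Da")  # the two should agree closely
+```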
+ +#### Accessing Modifications + +```python +# Get modifications database +mod_db = oms.ModificationsDB() + +# Get specific modification +oxidation = mod_db.getModification("Oxidation") +phospho = mod_db.getModification("Phospho") +acetyl = mod_db.getModification("Acetyl") + +# Modification properties +print(f"Name: {oxidation.getFullName()}") +print(f"Mass difference: {oxidation.getDiffMonoMass():.6f} Da") +print(f"Formula: {oxidation.getDiffFormula().toString()}") + +# Affected residues +print(f"Residues: {oxidation.getResidues()}") # e.g., ['M'] + +# Specificity (N-term, C-term, anywhere) +print(f"Term specificity: {oxidation.getTermSpecificity()}") +``` + +#### Common Modifications + +```python +# Oxidation (M) +oxidation = mod_db.getModification("Oxidation") +print(f"Oxidation: +{oxidation.getDiffMonoMass():.4f} Da") + +# Phosphorylation (S, T, Y) +phospho = mod_db.getModification("Phospho") +print(f"Phospho: +{phospho.getDiffMonoMass():.4f} Da") + +# Carbamidomethylation (C) - common alkylation +carbamido = mod_db.getModification("Carbamidomethyl") +print(f"Carbamidomethyl: +{carbamido.getDiffMonoMass():.4f} Da") + +# Acetylation (K, N-term) +acetyl = mod_db.getModification("Acetyl") +print(f"Acetyl: +{acetyl.getDiffMonoMass():.4f} Da") + +# Deamidation (N, Q) +deamid = mod_db.getModification("Deamidated") +print(f"Deamidation: +{deamid.getDiffMonoMass():.4f} Da") +``` + +#### Searching Modifications + +```python +# Search modifications by mass +mass_tolerance = 0.01 # Da +target_mass = 15.9949 # Oxidation + +# Get all modifications +all_mods = [] +mod_db.getAllSearchModifications(all_mods) + +# Find matching modifications +matching = [] +for mod_name in all_mods: + mod = mod_db.getModification(mod_name) + if abs(mod.getDiffMonoMass() - target_mass) < mass_tolerance: + matching.append(mod) + print(f"Match: {mod.getFullName()} ({mod.getDiffMonoMass():.4f} Da)") +``` + +#### Variable vs. 
Fixed Modifications + +```python +# In search engines, specify: +# Fixed modifications: applied to all occurrences +fixed_mods = ["Carbamidomethyl (C)"] + +# Variable modifications: optionally present +variable_mods = ["Oxidation (M)", "Phospho (S)", "Phospho (T)", "Phospho (Y)"] +``` + +## Ribonucleotides (RNA) + +### Ribonucleotide - RNA Building Blocks + +```python +# Get ribonucleotide database +ribo_db = oms.RibonucleotideDB() + +# Get specific ribonucleotide +adenine = ribo_db.getRibonucleotide("A") +uracil = ribo_db.getRibonucleotide("U") +guanine = ribo_db.getRibonucleotide("G") +cytosine = ribo_db.getRibonucleotide("C") + +# Properties +print(f"Adenine mono mass: {adenine.getMonoWeight()}") +print(f"Formula: {adenine.getFormula().toString()}") + +# Modified ribonucleotides +modified_ribo = ribo_db.getRibonucleotide("m6A") # N6-methyladenosine +``` + +## Practical Examples + +### Calculate Peptide Mass with Modifications + +```python +def calculate_peptide_mz(sequence_str, charge): + """Calculate m/z for a peptide sequence string with modifications.""" + peptide = oms.AASequence.fromString(sequence_str) + proton_mass = 1.007276 + mz = (peptide.getMonoWeight() + charge * proton_mass) / charge + return mz + +# Examples +print(calculate_peptide_mz("PEPTIDE", 2)) # Unmodified [M+2H]2+ +print(calculate_peptide_mz("PEPTIDEM(Oxidation)", 2)) # With oxidation +print(calculate_peptide_mz("(Acetyl)PEPTIDEK(Acetyl)", 2)) # Acetylated +``` + +### Generate Complete Fragment Ion Series + +```python +def generate_fragment_ions(sequence_str, charge_states=[1, 2]): + """Generate comprehensive fragment ion list.""" + peptide = oms.AASequence.fromString(sequence_str) + proton_mass = 1.007276 + fragments = [] + + for i in range(1, peptide.size()): + # b and y ions + b_frag = peptide.getPrefix(i) + y_frag = peptide.getSuffix(i) + + for z in charge_states: + b_mz = (b_frag.getMonoWeight() + z * proton_mass) / z + y_mz = (y_frag.getMonoWeight() + z * proton_mass) / z + + fragments.append({ + 'type': 'b', + 'position': i, + 'charge': z, + 'mz': b_mz + }) + fragments.append({ + 'type': 'y', + 'position': i, + 'charge': z, + 'mz': y_mz + }) + + return fragments + +# Usage +ions = generate_fragment_ions("PEPTIDE", charge_states=[1, 2]) +for ion in ions: + print(f"{ion['type']}{ion['position']}^{ion['charge']}+: {ion['mz']:.4f}") +``` + +### Digest Protein and Calculate Peptide Masses + +```python +def digest_and_calculate(protein_seq_str, enzyme="Trypsin", missed_cleavages=2, + min_mass=600, max_mass=4000): + """Digest protein and return valid peptides with masses.""" + dig = oms.ProteaseDigestion() + dig.setEnzyme(enzyme) + dig.setMissedCleavages(missed_cleavages) + + protein = oms.AASequence.fromString(protein_seq_str) + peptides = [] + dig.digest(protein, peptides) + + results = [] + for pep in peptides: + mass = pep.getMonoWeight() + if min_mass <= mass <= max_mass: + results.append({ + 'sequence': pep.toString(), + 'mass': mass, + 'length': pep.size() + }) + + return results + +# Usage +protein = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEK" +peptides = digest_and_calculate(protein) +for pep in peptides: + print(f"{pep['sequence']}: {pep['mass']:.2f} Da ({pep['length']} aa)") +``` diff --git a/scientific-packages/pyopenms/references/data_structures.md b/scientific-packages/pyopenms/references/data_structures.md new file mode 100644 index 0000000..cb0f977 --- /dev/null +++ b/scientific-packages/pyopenms/references/data_structures.md @@ -0,0 +1,560 @@ +# pyOpenMS Data Structures Reference + 
+This document provides comprehensive coverage of core data structures in pyOpenMS for representing mass spectrometry data. + +## Core Hierarchy + +``` +MSExperiment # Top-level: Complete LC-MS/MS run +├── MSSpectrum[] # Collection of mass spectra +│ ├── Peak1D[] # Individual m/z, intensity pairs +│ └── SpectrumSettings # Metadata (RT, MS level, precursor) +└── MSChromatogram[] # Collection of chromatograms + ├── ChromatogramPeak[] # RT, intensity pairs + └── ChromatogramSettings # Metadata +``` + +## MSSpectrum + +Represents a single mass spectrum (1-dimensional peak data). + +### Creation and Basic Properties + +```python +import pyopenms as oms + +# Create empty spectrum +spectrum = oms.MSSpectrum() + +# Set metadata +spectrum.setRT(123.45) # Retention time in seconds +spectrum.setMSLevel(1) # MS level (1 for MS1, 2 for MS2, etc.) +spectrum.setNativeID("scan=1234") # Native ID from file + +# Additional metadata +spectrum.setDriftTime(15.2) # Ion mobility drift time +spectrum.setName("MyScan") # Optional name +``` + +### Peak Data Management + +**Setting Peaks (Method 1 - Lists):** +```python +mz_values = [100.5, 200.3, 300.7, 400.2, 500.1] +intensity_values = [1000, 5000, 3000, 2000, 1500] + +spectrum.set_peaks((mz_values, intensity_values)) +``` + +**Setting Peaks (Method 2 - NumPy arrays):** +```python +import numpy as np + +mz_array = np.array([100.5, 200.3, 300.7, 400.2, 500.1]) +intensity_array = np.array([1000, 5000, 3000, 2000, 1500]) + +spectrum.set_peaks((mz_array, intensity_array)) +``` + +**Retrieving Peaks:** +```python +# Get as numpy arrays (efficient) +mz_array, intensity_array = spectrum.get_peaks() + +# Check number of peaks +n_peaks = spectrum.size() + +# Get individual peak (slower) +for i in range(spectrum.size()): + peak = spectrum[i] + mz = peak.getMZ() + intensity = peak.getIntensity() +``` + +### Precursor Information (for MS2/MSn spectra) + +```python +# Create precursor +precursor = oms.Precursor() +precursor.setMZ(456.789) # Precursor m/z +precursor.setCharge(2) # Precursor charge +precursor.setIntensity(50000) # Precursor intensity +precursor.setIsolationWindowLowerOffset(1.5) # Lower isolation window +precursor.setIsolationWindowUpperOffset(1.5) # Upper isolation window + +# Set activation method +activation = oms.Activation() +activation.setActivationEnergy(35.0) # Collision energy +activation.setMethod(oms.Activation.ActivationMethod.CID) +precursor.setActivation(activation) + +# Assign to spectrum +spectrum.setPrecursors([precursor]) + +# Retrieve precursor information +precursors = spectrum.getPrecursors() +if len(precursors) > 0: + prec = precursors[0] + print(f"Precursor m/z: {prec.getMZ()}") + print(f"Precursor charge: {prec.getCharge()}") +``` + +### Spectrum Metadata Access + +```python +# Check if spectrum is sorted by m/z +is_sorted = spectrum.isSorted() + +# Sort spectrum by m/z +spectrum.sortByPosition() + +# Sort by intensity +spectrum.sortByIntensity() + +# Clear all peaks +spectrum.clear(False) # False = keep metadata, True = clear everything + +# Get retention time +rt = spectrum.getRT() + +# Get MS level +ms_level = spectrum.getMSLevel() +``` + +### Spectrum Types and Modes + +```python +# Set spectrum type +spectrum.setType(oms.SpectrumSettings.SpectrumType.CENTROID) # or PROFILE + +# Get spectrum type +spec_type = spectrum.getType() +if spec_type == oms.SpectrumSettings.SpectrumType.CENTROID: + print("Centroid spectrum") +elif spec_type == oms.SpectrumSettings.SpectrumType.PROFILE: + print("Profile spectrum") +``` + +### Data Processing 

```python
# Attach data processing information to a spectrum
processing = oms.DataProcessing()
processing.setMetaValue("smoothing", "gaussian")
spectrum.setDataProcessing([processing])
```

## MSExperiment

Represents a complete LC-MS/MS experiment containing multiple spectra and chromatograms.

### Creation and Population

```python
# Create empty experiment
exp = oms.MSExperiment()

# Add spectra
spectrum1 = oms.MSSpectrum()
spectrum1.setRT(100.0)
spectrum1.set_peaks(([100, 200], [1000, 2000]))

spectrum2 = oms.MSSpectrum()
spectrum2.setRT(200.0)
spectrum2.set_peaks(([100, 200], [1500, 2500]))

exp.addSpectrum(spectrum1)
exp.addSpectrum(spectrum2)

# Add chromatograms
chrom = oms.MSChromatogram()
chrom.set_peaks(([10.5, 11.0, 11.5], [1000, 5000, 3000]))
exp.addChromatogram(chrom)
```

### Accessing Spectra and Chromatograms

```python
# Get number of spectra and chromatograms
n_spectra = exp.getNrSpectra()
n_chroms = exp.getNrChromatograms()

# Access by index
first_spectrum = exp.getSpectrum(0)
last_spectrum = exp.getSpectrum(exp.getNrSpectra() - 1)

# Iterate over all spectra
for spectrum in exp:
    rt = spectrum.getRT()
    ms_level = spectrum.getMSLevel()
    n_peaks = spectrum.size()
    print(f"RT: {rt:.2f}s, MS{ms_level}, Peaks: {n_peaks}")

# Get all spectra as list
spectra = exp.getSpectra()

# Access chromatograms
chrom = exp.getChromatogram(0)
```

### Filtering Operations

Filtering is done by rebuilding the spectrum list with a loop or comprehension rather than through dedicated filter calls on MSExperiment:

```python
# Keep only MS1 spectra
ms1_exp = oms.MSExperiment()
for s in exp:
    if s.getMSLevel() == 1:
        ms1_exp.addSpectrum(s)

# Keep spectra within a retention time window (100-500 seconds)
rt_exp = oms.MSExperiment()
for s in exp:
    if 100.0 <= s.getRT() <= 500.0:
        rt_exp.addSpectrum(s)

# Restrict each spectrum to an m/z window (300-1500)
mz_exp = oms.MSExperiment()
for s in exp:
    mz, intensity = s.get_peaks()
    keep = (mz >= 300.0) & (mz <= 1500.0)
    s.set_peaks((mz[keep], intensity[keep]))
    mz_exp.addSpectrum(s)

# Keep spectra by scan index (e.g. scans 100-200)
scan_exp = oms.MSExperiment()
for i, s in enumerate(exp):
    if 100 <= i <= 200:
        scan_exp.addSpectrum(s)
```

### Metadata and Properties

```python
# Set experiment metadata
exp.setMetaValue("operator", "John Doe")
exp.setMetaValue("instrument", "Q Exactive HF")

# Get metadata
operator = exp.getMetaValue("operator")

# Get RT range (call exp.updateRanges() first so the cached ranges are current)
rt_range = exp.getMinRT(), exp.getMaxRT()

# Get m/z range
mz_range = exp.getMinMZ(), exp.getMaxMZ()

# Clear all data
exp.clear(False)  # False = keep metadata
```

### Sorting and Organization

```python
# Sort spectra by retention time
exp.sortSpectra()

# Update ranges (call after modifications)
exp.updateRanges()

# Check if experiment is empty
is_empty = exp.empty()

# Reset (clear everything)
exp.reset()
```

## MSChromatogram

Represents an extracted or reconstructed chromatogram (retention time vs. intensity).
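
For example, an extracted ion chromatogram (XIC) can be reconstructed from MS1 spectra by summing the intensity inside a narrow m/z window. The sketch below assumes `exp` is a loaded MSExperiment; the target m/z and tolerance are placeholders.

```python
import numpy as np
import pyopenms as oms

target_mz, tol = 456.789, 0.01  # placeholder target m/z and tolerance (Th)

rts, intensities = [], []
for spectrum in exp:
    if spectrum.getMSLevel() != 1:
        continue
    mz, intensity = spectrum.get_peaks()
    window = (mz >= target_mz - tol) & (mz <= target_mz + tol)
    rts.append(spectrum.getRT())
    intensities.append(float(np.sum(intensity[window])))

# Store the trace as an MSChromatogram
xic = oms.MSChromatogram()
xic.set_peaks((rts, intensities))
xic.setName(f"XIC m/z {target_mz}")
```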

### Creation and Basic Usage

```python
# Create chromatogram
chrom = oms.MSChromatogram()

# Set peaks (RT, intensity pairs)
rt_values = [10.0, 10.5, 11.0, 11.5, 12.0]
intensity_values = [1000, 5000, 8000, 6000, 2000]
chrom.set_peaks((rt_values, intensity_values))

# Get peaks
rt_array, int_array = chrom.get_peaks()

# Get size
n_points = chrom.size()
```

### Chromatogram Types

```python
# Set chromatogram type
chrom.setChromatogramType(oms.ChromatogramSettings.ChromatogramType.SELECTED_ION_CURRENT_CHROMATOGRAM)

# Other types:
# - TOTAL_ION_CURRENT_CHROMATOGRAM
# - BASEPEAK_CHROMATOGRAM
# - SELECTED_ION_CURRENT_CHROMATOGRAM
# - SELECTED_REACTION_MONITORING_CHROMATOGRAM
```

### Metadata

```python
# Set native ID
chrom.setNativeID("TIC")

# Set name
chrom.setName("Total Ion Current")

# Access
native_id = chrom.getNativeID()
name = chrom.getName()
```

### Precursor and Product Information (for SRM/MRM)

```python
# For targeted experiments
precursor = oms.Precursor()
precursor.setMZ(456.7)
chrom.setPrecursor(precursor)

product = oms.Product()
product.setMZ(789.4)
chrom.setProduct(product)
```

## Peak1D and ChromatogramPeak

Individual peak data points.

### Peak1D (for mass spectra)

```python
# Create individual peak
peak = oms.Peak1D()
peak.setMZ(456.789)
peak.setIntensity(10000)

# Access
mz = peak.getMZ()
intensity = peak.getIntensity()
```

### ChromatogramPeak (for chromatograms)

```python
# Create chromatogram peak
chrom_peak = oms.ChromatogramPeak()
chrom_peak.setRT(125.5)
chrom_peak.setIntensity(5000)

# Access
rt = chrom_peak.getRT()
intensity = chrom_peak.getIntensity()
```

## FeatureMap and Feature

For quantification results.

### Feature

Represents a detected LC-MS feature (peptide or metabolite signal).

```python
# Create feature
feature = oms.Feature()

# Set properties
feature.setMZ(456.789)
feature.setRT(123.45)
feature.setIntensity(1000000)
feature.setCharge(2)
feature.setWidth(15.0)  # RT width in seconds

# Set quality score
feature.setOverallQuality(0.95)

# Access
mz = feature.getMZ()
rt = feature.getRT()
intensity = feature.getIntensity()
charge = feature.getCharge()
```

### FeatureMap

Collection of features.

```python
# Create feature map
feature_map = oms.FeatureMap()

# Add features
feature1 = oms.Feature()
feature1.setMZ(456.789)
feature1.setRT(123.45)
feature1.setIntensity(1000000)

feature_map.push_back(feature1)

# Get size
n_features = feature_map.size()

# Iterate
for feature in feature_map:
    print(f"m/z: {feature.getMZ():.4f}, RT: {feature.getRT():.2f}")

# Access by index
first_feature = feature_map[0]

# Clear
feature_map.clear()
```

## PeptideIdentification and ProteinIdentification

For identification results.
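
In practice these objects are usually populated by loading search results from disk rather than built by hand; a minimal sketch using idXML (the file name is a placeholder):

```python
import pyopenms as oms

# Load protein and peptide identifications from an idXML file
protein_ids = []
peptide_ids = []
oms.IdXMLFile().load("search_results.idXML", protein_ids, peptide_ids)

for pep_id in peptide_ids:
    for hit in pep_id.getHits():
        print(hit.getSequence().toString(), hit.getScore())
```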

### PeptideIdentification

```python
# Create peptide identification
pep_id = oms.PeptideIdentification()
pep_id.setRT(123.45)
pep_id.setMZ(456.789)

# Create peptide hit
hit = oms.PeptideHit()
hit.setSequence(oms.AASequence.fromString("PEPTIDE"))
hit.setCharge(2)
hit.setScore(25.5)
hit.setRank(1)

# Add to identification
pep_id.setHits([hit])
pep_id.setHigherScoreBetter(True)
pep_id.setScoreType("XCorr")

# Access
hits = pep_id.getHits()
for hit in hits:
    seq = hit.getSequence().toString()
    score = hit.getScore()
    print(f"Sequence: {seq}, Score: {score}")
```

### ProteinIdentification

```python
# Create protein identification
prot_id = oms.ProteinIdentification()

# Create protein hit
prot_hit = oms.ProteinHit()
prot_hit.setAccession("P12345")
prot_hit.setSequence("MKTAYIAKQRQISFVK...")
prot_hit.setScore(100.5)

# Add to identification
prot_id.setHits([prot_hit])
prot_id.setScoreType("Mascot Score")
prot_id.setHigherScoreBetter(True)

# Search parameters (metadata describing the database search)
search_params = oms.SearchParameters()
search_params.db = "uniprot_human.fasta"
search_params.missed_cleavages = 2
# The enzyme is stored in search_params.digestion_enzyme as a
# DigestionEnzymeProtein object rather than a plain string
prot_id.setSearchParameters(search_params)
```

## ConsensusMap and ConsensusFeature

For linking features across multiple samples.

### ConsensusFeature

```python
# Create consensus feature
cons_feature = oms.ConsensusFeature()
cons_feature.setMZ(456.789)
cons_feature.setRT(123.45)
cons_feature.setIntensity(5000000)  # Combined intensity

# Access linked features
for handle in cons_feature.getFeatureList():
    map_index = handle.getMapIndex()
    unique_id = handle.getUniqueId()  # Unique ID of the linked feature
    intensity = handle.getIntensity()
```

### ConsensusMap

```python
# Create consensus map
consensus_map = oms.ConsensusMap()

# Add consensus features
consensus_map.push_back(cons_feature)

# Iterate
for cons_feat in consensus_map:
    mz = cons_feat.getMZ()
    rt = cons_feat.getRT()
    n_features = cons_feat.size()  # Number of linked features
```

## Best Practices

1. **Use numpy arrays** for peak data when possible - much faster than individual peak access
2. **Sort spectra** by position (m/z) before searching or filtering
3. **Update ranges** after modifying MSExperiment: `exp.updateRanges()`
4. **Check MS level** before processing - different algorithms for MS1 vs MS2
5. **Validate precursor info** for MS2 spectra - ensure charge and m/z are set
6. **Use appropriate containers** - MSExperiment for raw data, FeatureMap for quantification
7. 
**Clear metadata carefully** - use `clear(False)` to preserve metadata when clearing peaks + +## Common Patterns + +### Create MS2 Spectrum with Precursor + +```python +spectrum = oms.MSSpectrum() +spectrum.setRT(205.2) +spectrum.setMSLevel(2) +spectrum.set_peaks(([100, 200, 300], [1000, 5000, 3000])) + +precursor = oms.Precursor() +precursor.setMZ(450.5) +precursor.setCharge(2) +spectrum.setPrecursors([precursor]) +``` + +### Extract MS1 Spectra from Experiment + +```python +ms1_exp = oms.MSExperiment() +for spectrum in exp: + if spectrum.getMSLevel() == 1: + ms1_exp.addSpectrum(spectrum) +``` + +### Calculate Total Ion Current (TIC) + +```python +tic_values = [] +rt_values = [] +for spectrum in exp: + if spectrum.getMSLevel() == 1: + mz, intensity = spectrum.get_peaks() + tic = np.sum(intensity) + tic_values.append(tic) + rt_values.append(spectrum.getRT()) +``` + +### Find Spectrum Closest to RT + +```python +target_rt = 125.5 +closest_spectrum = None +min_diff = float('inf') + +for spectrum in exp: + diff = abs(spectrum.getRT() - target_rt) + if diff < min_diff: + min_diff = diff + closest_spectrum = spectrum +```
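
### Calculate Base Peak Chromatogram (BPC)

A sketch complementing the TIC pattern above: for each MS1 spectrum, record the most intense peak rather than the summed intensity (assumes `exp` is a loaded MSExperiment, as in the other patterns).

```python
import numpy as np

bpc_values = []
rt_values = []
for spectrum in exp:
    if spectrum.getMSLevel() == 1:
        mz, intensity = spectrum.get_peaks()
        if len(intensity) > 0:
            bpc_values.append(float(np.max(intensity)))
            rt_values.append(spectrum.getRT())
```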