From f124e28509bc14385cf2f7d0fd1fe046ecdc60b1 Mon Sep 17 00:00:00 2001 From: Timothy Kassis Date: Mon, 3 Nov 2025 16:55:01 -0800 Subject: [PATCH] Update the PyOpenMS skill --- scientific-packages/pyopenms/SKILL.md | 697 ++++---------- .../pyopenms/references/algorithms.md | 643 ------------- .../pyopenms/references/chemistry.md | 715 -------------- .../pyopenms/references/data_structures.md | 891 ++++++++---------- .../pyopenms/references/feature_detection.md | 410 ++++++++ .../pyopenms/references/file_io.md | 349 +++++++ .../pyopenms/references/identification.md | 422 +++++++++ .../pyopenms/references/metabolomics.md | 482 ++++++++++ .../pyopenms/references/signal_processing.md | 433 +++++++++ 9 files changed, 2699 insertions(+), 2343 deletions(-) delete mode 100644 scientific-packages/pyopenms/references/algorithms.md delete mode 100644 scientific-packages/pyopenms/references/chemistry.md create mode 100644 scientific-packages/pyopenms/references/feature_detection.md create mode 100644 scientific-packages/pyopenms/references/file_io.md create mode 100644 scientific-packages/pyopenms/references/identification.md create mode 100644 scientific-packages/pyopenms/references/metabolomics.md create mode 100644 scientific-packages/pyopenms/references/signal_processing.md diff --git a/scientific-packages/pyopenms/SKILL.md b/scientific-packages/pyopenms/SKILL.md index 65651fa..2806039 100644 --- a/scientific-packages/pyopenms/SKILL.md +++ b/scientific-packages/pyopenms/SKILL.md @@ -1,530 +1,211 @@ --- name: pyopenms -description: "Mass spectrometry toolkit (OpenMS Python). Process mzML/mzXML, peak picking, feature detection, peptide ID, proteomics/metabolomics workflows, for LC-MS/MS analysis." +description: Python interface to OpenMS for mass spectrometry data analysis. Use for LC-MS/MS proteomics and metabolomics workflows including file handling (mzML, mzXML, mzTab, FASTA, pepXML, protXML, mzIdentML), signal processing, feature detection, peptide identification, and quantitative analysis. Apply when working with mass spectrometry data, analyzing proteomics experiments, or processing metabolomics datasets. --- -# pyOpenMS +# PyOpenMS ## Overview -pyOpenMS is an open-source Python library for mass spectrometry data analysis in proteomics and metabolomics. Process LC-MS/MS data, perform peptide identification, detect and quantify features, and integrate with common proteomics tools (Comet, Mascot, MSGF+, Percolator, MSstats) using Python bindings to the OpenMS C++ library. - -## When to Use This Skill - -This skill should be used when: -- Processing mass spectrometry data (mzML, mzXML files) -- Performing peak picking and feature detection in LC-MS data -- Conducting peptide and protein identification workflows -- Quantifying metabolites or proteins -- Integrating proteomics or metabolomics tools into Python pipelines -- Working with OpenMS tools and file formats - -## Core Capabilities - -### 1. File I/O and Data Import/Export - -Handle diverse mass spectrometry file formats efficiently: - -**Supported Formats:** -- **mzML/mzXML**: Primary raw MS data formats (profile or centroid) -- **FASTA**: Protein/peptide sequence databases -- **mzTab**: Standardized reporting format for identification and quantification -- **mzIdentML**: Peptide and protein identification data -- **TraML**: Transition lists for targeted experiments -- **pepXML/protXML**: Search engine results - -**Reading mzML Files:** -```python -import pyopenms as oms - -# Load MS data -exp = oms.MSExperiment() -oms.MzMLFile().load("input_data.mzML", exp) - -# Access basic information -print(f"Number of spectra: {exp.getNrSpectra()}") -print(f"Number of chromatograms: {exp.getNrChromatograms()}") -``` - -**Writing mzML Files:** -```python -# Save processed data -oms.MzMLFile().store("output_data.mzML", exp) -``` - -**File Encoding:** pyOpenMS automatically handles Base64 encoding, zlib compression, and Numpress compression internally. - -### 2. MS Data Structures and Manipulation - -Work with core mass spectrometry data structures. See `references/data_structures.md` for comprehensive details. - -**MSSpectrum** - Individual mass spectrum: -```python -# Create spectrum with metadata -spectrum = oms.MSSpectrum() -spectrum.setRT(205.2) # Retention time in seconds -spectrum.setMSLevel(2) # MS2 spectrum - -# Set peak data (m/z, intensity arrays) -mz_array = [100.5, 200.3, 300.7, 400.2] -intensity_array = [1000, 5000, 3000, 2000] -spectrum.set_peaks((mz_array, intensity_array)) - -# Add precursor information for MS2 -precursor = oms.Precursor() -precursor.setMZ(450.5) -precursor.setCharge(2) -spectrum.setPrecursors([precursor]) -``` - -**MSExperiment** - Complete LC-MS/MS run: -```python -# Create experiment and add spectra -exp = oms.MSExperiment() -exp.addSpectrum(spectrum) - -# Access spectra -first_spectrum = exp.getSpectrum(0) -for spec in exp: - print(f"RT: {spec.getRT()}, MS Level: {spec.getMSLevel()}") -``` - -**MSChromatogram** - Extracted ion chromatogram: -```python -# Create chromatogram -chrom = oms.MSChromatogram() -chrom.set_peaks(([10.5, 11.2, 11.8], [1000, 5000, 3000])) # RT, intensity -exp.addChromatogram(chrom) -``` - -**Efficient Peak Access:** -```python -# Get peaks as numpy arrays for fast processing -mz_array, intensity_array = spectrum.get_peaks() - -# Modify and set back -intensity_array *= 2 # Double all intensities -spectrum.set_peaks((mz_array, intensity_array)) -``` - -### 3. Chemistry and Peptide Handling - -Perform chemical calculations for proteomics and metabolomics. See `references/chemistry.md` for detailed examples. - -**Molecular Formulas and Mass Calculations:** -```python -# Create empirical formula -formula = oms.EmpiricalFormula("C6H12O6") # Glucose -print(f"Monoisotopic mass: {formula.getMonoWeight()}") -print(f"Average mass: {formula.getAverageWeight()}") - -# Formula arithmetic -water = oms.EmpiricalFormula("H2O") -dehydrated = formula - water - -# Isotope-specific formulas -heavy_carbon = oms.EmpiricalFormula("(13)C6H12O6") -``` - -**Isotopic Distributions:** -```python -# Generate coarse isotope pattern (unit mass resolution) -coarse_gen = oms.CoarseIsotopePatternGenerator() -pattern = coarse_gen.run(formula) - -# Generate fine structure (high resolution) -fine_gen = oms.FineIsotopePatternGenerator(0.01) # 0.01 Da resolution -fine_pattern = fine_gen.run(formula) -``` - -**Amino Acids and Residues:** -```python -# Access residue information -res_db = oms.ResidueDB() -leucine = res_db.getResidue("Leucine") -print(f"L monoisotopic mass: {leucine.getMonoWeight()}") -print(f"L formula: {leucine.getFormula()}") -print(f"L pKa: {leucine.getPka()}") -``` - -**Peptide Sequences:** -```python -# Create peptide sequence -peptide = oms.AASequence.fromString("PEPTIDE") -print(f"Peptide mass: {peptide.getMonoWeight()}") -print(f"Formula: {peptide.getFormula()}") - -# Add modifications -modified = oms.AASequence.fromString("PEPTIDEM(Oxidation)") -print(f"Modified mass: {modified.getMonoWeight()}") - -# Theoretical fragmentation -ions = [] -for i in range(1, peptide.size()): - b_ion = peptide.getPrefix(i) - y_ion = peptide.getSuffix(i) - ions.append(('b', i, b_ion.getMonoWeight())) - ions.append(('y', i, y_ion.getMonoWeight())) -``` - -**Protein Digestion:** -```python -# Enzymatic digestion -dig = oms.ProteaseDigestion() -dig.setEnzyme("Trypsin") -dig.setMissedCleavages(2) - -protein_seq = oms.AASequence.fromString("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEK") -peptides = [] -dig.digest(protein_seq, peptides) - -for pep in peptides: - print(f"{pep.toString()}: {pep.getMonoWeight():.2f} Da") -``` - -**Modifications:** -```python -# Access modification database -mod_db = oms.ModificationsDB() -oxidation = mod_db.getModification("Oxidation") -print(f"Oxidation mass diff: {oxidation.getDiffMonoMass()}") -print(f"Residues: {oxidation.getResidues()}") -``` - -### 4. Signal Processing and Filtering - -Apply algorithms to process and filter MS data. See `references/algorithms.md` for comprehensive coverage. - -**Spectral Smoothing:** -```python -# Gaussian smoothing -gauss_filter = oms.GaussFilter() -params = gauss_filter.getParameters() -params.setValue("gaussian_width", 0.2) -gauss_filter.setParameters(params) -gauss_filter.filterExperiment(exp) - -# Savitzky-Golay filter -sg_filter = oms.SavitzkyGolayFilter() -sg_filter.filterExperiment(exp) -``` - -**Peak Filtering:** -```python -# Keep only N largest peaks per spectrum -n_largest = oms.NLargest() -params = n_largest.getParameters() -params.setValue("n", 100) # Keep top 100 peaks -n_largest.setParameters(params) -n_largest.filterExperiment(exp) - -# Threshold filtering -threshold_filter = oms.ThresholdMower() -params = threshold_filter.getParameters() -params.setValue("threshold", 1000.0) # Remove peaks below 1000 intensity -threshold_filter.setParameters(params) -threshold_filter.filterExperiment(exp) - -# Window-based filtering -window_filter = oms.WindowMower() -params = window_filter.getParameters() -params.setValue("windowsize", 50.0) # 50 m/z windows -params.setValue("peakcount", 10) # Keep 10 highest per window -window_filter.setParameters(params) -window_filter.filterExperiment(exp) -``` - -**Spectrum Normalization:** -```python -normalizer = oms.Normalizer() -normalizer.filterExperiment(exp) -``` - -**MS Level Filtering:** -```python -# Keep only MS2 spectra -exp.filterMSLevel(2) - -# Filter by retention time range -exp.filterRT(100.0, 500.0) # Keep RT between 100-500 seconds - -# Filter by m/z range -exp.filterMZ(400.0, 1500.0) # Keep m/z between 400-1500 -``` - -### 5. Feature Detection and Quantification - -Detect and quantify features in LC-MS data: - -**Peak Picking (Centroiding):** -```python -# Convert profile data to centroid -picker = oms.PeakPickerHiRes() -params = picker.getParameters() -params.setValue("signal_to_noise", 1.0) -picker.setParameters(params) - -exp_centroided = oms.MSExperiment() -picker.pickExperiment(exp, exp_centroided) -``` - -**Feature Detection:** -```python -# Detect features across LC-MS runs -feature_finder = oms.FeatureFinderMultiplex() - -features = oms.FeatureMap() -feature_finder.run(exp, features, params) - -print(f"Found {features.size()} features") -for feature in features: - print(f"m/z: {feature.getMZ():.4f}, RT: {feature.getRT():.2f}, " - f"Intensity: {feature.getIntensity():.0f}") -``` - -**Feature Linking (Map Alignment):** -```python -# Link features across multiple samples -feature_grouper = oms.FeatureGroupingAlgorithmQT() -consensus_map = oms.ConsensusMap() - -# Provide multiple feature maps from different samples -feature_maps = [features1, features2, features3] -feature_grouper.group(feature_maps, consensus_map) -``` - -### 6. Peptide Identification Workflows - -Integrate with search engines and process identification results: - -**Database Searching:** -```python -# Prepare parameters for search engine -params = oms.Param() -params.setValue("database", "uniprot_human.fasta") -params.setValue("precursor_mass_tolerance", 10.0) # ppm -params.setValue("fragment_mass_tolerance", 0.5) # Da -params.setValue("enzyme", "Trypsin") -params.setValue("missed_cleavages", 2) - -# Variable modifications -params.setValue("variable_modifications", ["Oxidation (M)", "Phospho (STY)"]) - -# Fixed modifications -params.setValue("fixed_modifications", ["Carbamidomethyl (C)"]) -``` - -**FDR Control:** -```python -# False discovery rate estimation -fdr = oms.FalseDiscoveryRate() -fdr_threshold = 0.01 # 1% FDR - -# Apply to peptide identifications -protein_ids = [] -peptide_ids = [] -oms.IdXMLFile().load("search_results.idXML", protein_ids, peptide_ids) - -fdr.apply(protein_ids, peptide_ids) -``` - -### 7. Metabolomics Workflows - -Analyze small molecule data: - -**Adduct Detection:** -```python -# Common metabolite adducts -adducts = ["[M+H]+", "[M+Na]+", "[M+K]+", "[M-H]-", "[M+Cl]-"] - -# Feature annotation with adducts -for feature in features: - mz = feature.getMZ() - # Calculate neutral mass for each adduct hypothesis - for adduct in adducts: - # Annotation logic - pass -``` - -**Isotope Pattern Matching:** -```python -# Compare experimental to theoretical isotope patterns -experimental_pattern = [] # Extract from feature -theoretical = coarse_gen.run(formula) - -# Calculate similarity score -similarity = compare_isotope_patterns(experimental_pattern, theoretical) -``` - -### 8. Quality Control and Visualization - -Monitor data quality and visualize results: - -**Basic Statistics:** -```python -# Calculate TIC (Total Ion Current) -tic_values = [] -rt_values = [] -for spectrum in exp: - if spectrum.getMSLevel() == 1: - tic = sum(spectrum.get_peaks()[1]) # Sum intensities - tic_values.append(tic) - rt_values.append(spectrum.getRT()) - -# Base peak chromatogram -bpc_values = [] -for spectrum in exp: - if spectrum.getMSLevel() == 1: - max_intensity = max(spectrum.get_peaks()[1]) if spectrum.size() > 0 else 0 - bpc_values.append(max_intensity) -``` - -**Plotting (with pyopenms.plotting or matplotlib):** -```python -import matplotlib.pyplot as plt - -# Plot TIC -plt.figure(figsize=(10, 4)) -plt.plot(rt_values, tic_values) -plt.xlabel('Retention Time (s)') -plt.ylabel('Total Ion Current') -plt.title('TIC') -plt.show() - -# Plot single spectrum -spectrum = exp.getSpectrum(0) -mz, intensity = spectrum.get_peaks() -plt.stem(mz, intensity, basefmt=' ') -plt.xlabel('m/z') -plt.ylabel('Intensity') -plt.title(f'Spectrum at RT {spectrum.getRT():.2f}s') -plt.show() -``` - -## Common Workflows - -### Complete LC-MS/MS Processing Pipeline - -```python -import pyopenms as oms - -# 1. Load data -exp = oms.MSExperiment() -oms.MzMLFile().load("raw_data.mzML", exp) - -# 2. Filter and smooth -exp.filterMSLevel(1) # Keep only MS1 for feature detection -gauss = oms.GaussFilter() -gauss.filterExperiment(exp) - -# 3. Peak picking -picker = oms.PeakPickerHiRes() -exp_centroid = oms.MSExperiment() -picker.pickExperiment(exp, exp_centroid) - -# 4. Feature detection -ff = oms.FeatureFinderMultiplex() -features = oms.FeatureMap() -ff.run(exp_centroid, features, oms.Param()) - -# 5. Export results -oms.FeatureXMLFile().store("features.featureXML", features) -print(f"Detected {features.size()} features") -``` - -### Theoretical Peptide Mass Calculation - -```python -# Calculate masses for peptide with modifications -peptide = oms.AASequence.fromString("PEPTIDEK") -print(f"Unmodified [M+H]+: {peptide.getMonoWeight() + 1.007276:.4f}") - -# With modification -modified = oms.AASequence.fromString("PEPTIDEM(Oxidation)K") -print(f"Oxidized [M+H]+: {modified.getMonoWeight() + 1.007276:.4f}") - -# Calculate for different charge states -for z in [1, 2, 3]: - mz = (peptide.getMonoWeight() + z * 1.007276) / z - print(f"[M+{z}H]^{z}+: {mz:.4f}") -``` +PyOpenMS provides Python bindings to the OpenMS library for computational mass spectrometry, enabling analysis of proteomics and metabolomics data. Use for handling mass spectrometry file formats, processing spectral data, detecting features, identifying peptides/proteins, and performing quantitative analysis. ## Installation -Ensure pyOpenMS is installed before using this skill: +Install using uv: ```bash -# Via conda (recommended) -conda install -c bioconda pyopenms +uv pip install pyopenms +``` -# Via pip -pip install pyopenms +Verify installation: + +```python +import pyopenms +print(pyopenms.__version__) +``` + +## Core Capabilities + +PyOpenMS organizes functionality into these domains: + +### 1. File I/O and Data Formats + +Handle mass spectrometry file formats and convert between representations. + +**Supported formats**: mzML, mzXML, TraML, mzTab, FASTA, pepXML, protXML, mzIdentML, featureXML, consensusXML, idXML + +Basic file reading: + +```python +import pyopenms as ms + +# Read mzML file +exp = ms.MSExperiment() +ms.MzMLFile().load("data.mzML", exp) + +# Access spectra +for spectrum in exp: + mz, intensity = spectrum.get_peaks() + print(f"Spectrum: {len(mz)} peaks") +``` + +**For detailed file handling**: See `references/file_io.md` + +### 2. Signal Processing + +Process raw spectral data with smoothing, filtering, centroiding, and normalization. + +Basic spectrum processing: + +```python +# Smooth spectrum with Gaussian filter +gaussian = ms.GaussFilter() +params = gaussian.getParameters() +params.setValue("gaussian_width", 0.1) +gaussian.setParameters(params) +gaussian.filterExperiment(exp) +``` + +**For algorithm details**: See `references/signal_processing.md` + +### 3. Feature Detection + +Detect and link features across spectra and samples for quantitative analysis. + +```python +# Detect features +ff = ms.FeatureFinder() +ff.run("centroided", exp, features, params, ms.FeatureMap()) +``` + +**For complete workflows**: See `references/feature_detection.md` + +### 4. Peptide and Protein Identification + +Integrate with search engines and process identification results. + +**Supported engines**: Comet, Mascot, MSGFPlus, XTandem, OMSSA, Myrimatch + +Basic identification workflow: + +```python +# Load identification data +protein_ids = [] +peptide_ids = [] +ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids) + +# Apply FDR filtering +fdr = ms.FalseDiscoveryRate() +fdr.apply(peptide_ids) +``` + +**For detailed workflows**: See `references/identification.md` + +### 5. Metabolomics Analysis + +Perform untargeted metabolomics preprocessing and analysis. + +Typical workflow: +1. Load and process raw data +2. Detect features +3. Align retention times across samples +4. Link features to consensus map +5. Annotate with compound databases + +**For complete metabolomics workflows**: See `references/metabolomics.md` + +## Data Structures + +PyOpenMS uses these primary objects: + +- **MSExperiment**: Collection of spectra and chromatograms +- **MSSpectrum**: Single mass spectrum with m/z and intensity pairs +- **MSChromatogram**: Chromatographic trace +- **Feature**: Detected chromatographic peak with quality metrics +- **FeatureMap**: Collection of features +- **PeptideIdentification**: Search results for peptides +- **ProteinIdentification**: Search results for proteins + +**For detailed documentation**: See `references/data_structures.md` + +## Common Workflows + +### Quick Start: Load and Explore Data + +```python +import pyopenms as ms + +# Load mzML file +exp = ms.MSExperiment() +ms.MzMLFile().load("sample.mzML", exp) + +# Get basic statistics +print(f"Number of spectra: {exp.getNrSpectra()}") +print(f"Number of chromatograms: {exp.getNrChromatograms()}") + +# Examine first spectrum +spec = exp.getSpectrum(0) +print(f"MS level: {spec.getMSLevel()}") +print(f"Retention time: {spec.getRT()}") +mz, intensity = spec.get_peaks() +print(f"Peaks: {len(mz)}") +``` + +### Parameter Management + +Most algorithms use a parameter system: + +```python +# Get algorithm parameters +algo = ms.GaussFilter() +params = algo.getParameters() + +# View available parameters +for param in params.keys(): + print(f"{param}: {params.getValue(param)}") + +# Modify parameters +params.setValue("gaussian_width", 0.2) +algo.setParameters(params) +``` + +### Export to Pandas + +Convert data to pandas DataFrames for analysis: + +```python +import pyopenms as ms +import pandas as pd + +# Load feature map +fm = ms.FeatureMap() +ms.FeatureXMLFile().load("features.featureXML", fm) + +# Convert to DataFrame +df = fm.get_df() +print(df.head()) ``` ## Integration with Other Tools -pyOpenMS integrates seamlessly with: - -- **Search Engines**: Comet, Mascot, MSGF+, MSFragger, Sage, SpectraST -- **Post-processing**: Percolator, MSstats, Epiphany -- **Metabolomics**: SIRIUS, CSI:FingerID -- **Data Analysis**: Pandas, NumPy, SciPy for downstream analysis -- **Visualization**: Matplotlib, Seaborn for plotting +PyOpenMS integrates with: +- **Pandas**: Export data to DataFrames +- **NumPy**: Work with peak arrays +- **Scikit-learn**: Machine learning on MS data +- **Matplotlib/Seaborn**: Visualization +- **R**: Via rpy2 bridge ## Resources -### references/ +- **Official documentation**: https://pyopenms.readthedocs.io +- **OpenMS documentation**: https://www.openms.org +- **GitHub**: https://github.com/OpenMS/OpenMS -Detailed documentation on core concepts: +## References -- **data_structures.md** - Comprehensive guide to MSExperiment, MSSpectrum, MSChromatogram, and peak data handling -- **algorithms.md** - Complete reference for signal processing, filtering, feature detection, and quantification algorithms -- **chemistry.md** - In-depth coverage of chemistry calculations, peptide handling, modifications, and isotope distributions - -Load these references when needing detailed information about specific pyOpenMS capabilities. - -## Best Practices - -1. **File Format**: Always use mzML for raw MS data (standardized, well-supported) -2. **Peak Access**: Use `get_peaks()` and `set_peaks()` with numpy arrays for efficient processing -3. **Parameters**: Always check and configure algorithm parameters via `getParameters()` and `setParameters()` -4. **Memory**: For large datasets, process spectra iteratively rather than loading entire experiments -5. **Validation**: Check data integrity (MS levels, RT ordering, precursor information) after loading -6. **Modifications**: Use standard modification names from UniMod database -7. **Units**: RT in seconds, m/z in Thomson (Da/charge), intensity in arbitrary units - -## Common Patterns - -**Algorithm Application Pattern:** -```python -# 1. Instantiate algorithm -algorithm = oms.SomeAlgorithm() - -# 2. Get and configure parameters -params = algorithm.getParameters() -params.setValue("parameter_name", value) -algorithm.setParameters(params) - -# 3. Apply to data -algorithm.filterExperiment(exp) # or .process(), .run(), depending on algorithm -``` - -**File I/O Pattern:** -```python -# Read -data_container = oms.DataContainer() # MSExperiment, FeatureMap, etc. -oms.FileHandler().load("input.format", data_container) - -# Process -# ... manipulate data_container ... - -# Write -oms.FileHandler().store("output.format", data_container) -``` - -## Getting Help - -- **Documentation**: https://pyopenms.readthedocs.io/ -- **API Reference**: Browse class documentation for detailed method signatures -- **OpenMS Website**: https://www.openms.org/ -- **GitHub Issues**: https://github.com/OpenMS/OpenMS/issues +- `references/file_io.md` - Comprehensive file format handling +- `references/signal_processing.md` - Signal processing algorithms +- `references/feature_detection.md` - Feature detection and linking +- `references/identification.md` - Peptide and protein identification +- `references/metabolomics.md` - Metabolomics-specific workflows +- `references/data_structures.md` - Core objects and data structures diff --git a/scientific-packages/pyopenms/references/algorithms.md b/scientific-packages/pyopenms/references/algorithms.md deleted file mode 100644 index c01e313..0000000 --- a/scientific-packages/pyopenms/references/algorithms.md +++ /dev/null @@ -1,643 +0,0 @@ -# pyOpenMS Algorithms Reference - -This document provides comprehensive coverage of algorithms available in pyOpenMS for signal processing, feature detection, and quantification. - -## Algorithm Usage Pattern - -Most pyOpenMS algorithms follow a consistent pattern: - -```python -import pyopenms as oms - -# 1. Instantiate algorithm -algorithm = oms.AlgorithmName() - -# 2. Get parameters -params = algorithm.getParameters() - -# 3. Modify parameters -params.setValue("parameter_name", value) - -# 4. Set parameters back -algorithm.setParameters(params) - -# 5. Apply to data -algorithm.filterExperiment(exp) # or .process(), .run(), etc. -``` - -## Signal Processing Algorithms - -### Smoothing Filters - -#### GaussFilter - Gaussian Smoothing - -Applies Gaussian smoothing to reduce noise. - -```python -gauss = oms.GaussFilter() - -# Configure parameters -params = gauss.getParameters() -params.setValue("gaussian_width", 0.2) # Gaussian width (larger = more smoothing) -params.setValue("ppm_tolerance", 10.0) # PPM tolerance for spacing -params.setValue("use_ppm_tolerance", "true") -gauss.setParameters(params) - -# Apply to experiment -gauss.filterExperiment(exp) - -# Or apply to single spectrum -spectrum_smoothed = oms.MSSpectrum() -gauss.filter(spectrum, spectrum_smoothed) -``` - -**Key Parameters:** -- `gaussian_width`: Width of Gaussian kernel (default: 0.2 Da) -- `ppm_tolerance`: Tolerance in ppm for spacing -- `use_ppm_tolerance`: Whether to use ppm instead of absolute spacing - -#### SavitzkyGolayFilter - -Applies Savitzky-Golay smoothing (polynomial fitting). - -```python -sg_filter = oms.SavitzkyGolayFilter() - -params = sg_filter.getParameters() -params.setValue("frame_length", 11) # Window size (must be odd) -params.setValue("polynomial_order", 3) # Polynomial degree -sg_filter.setParameters(params) - -sg_filter.filterExperiment(exp) -``` - -**Key Parameters:** -- `frame_length`: Size of smoothing window (must be odd) -- `polynomial_order`: Degree of polynomial (typically 2-4) - -### Peak Filtering - -#### NLargest - Keep Top N Peaks - -Retains only the N most intense peaks per spectrum. - -```python -n_largest = oms.NLargest() - -params = n_largest.getParameters() -params.setValue("n", 100) # Keep top 100 peaks -params.setValue("threshold", 0.0) # Optional minimum intensity -n_largest.setParameters(params) - -n_largest.filterExperiment(exp) -``` - -**Key Parameters:** -- `n`: Number of peaks to keep per spectrum -- `threshold`: Minimum absolute intensity threshold - -#### ThresholdMower - Intensity Threshold Filtering - -Removes peaks below a specified intensity threshold. - -```python -threshold_filter = oms.ThresholdMower() - -params = threshold_filter.getParameters() -params.setValue("threshold", 1000.0) # Absolute intensity threshold -threshold_filter.setParameters(params) - -threshold_filter.filterExperiment(exp) -``` - -**Key Parameters:** -- `threshold`: Absolute intensity cutoff - -#### WindowMower - Window-Based Peak Selection - -Divides m/z range into windows and keeps top N peaks per window. - -```python -window_mower = oms.WindowMower() - -params = window_mower.getParameters() -params.setValue("windowsize", 50.0) # Window size in Da (or Thomson) -params.setValue("peakcount", 10) # Peaks to keep per window -params.setValue("movetype", "jump") # "jump" or "slide" -window_mower.setParameters(params) - -window_mower.filterExperiment(exp) -``` - -**Key Parameters:** -- `windowsize`: Size of m/z window (Da) -- `peakcount`: Number of peaks to retain per window -- `movetype`: "jump" (non-overlapping) or "slide" (overlapping windows) - -#### BernNorm - Bernoulli Normalization - -Statistical normalization based on Bernoulli distribution. - -```python -bern_norm = oms.BernNorm() - -params = bern_norm.getParameters() -params.setValue("threshold", 0.7) # Threshold for normalization -bern_norm.setParameters(params) - -bern_norm.filterExperiment(exp) -``` - -### Spectrum Normalization - -#### Normalizer - -Normalizes spectrum intensities to unit total intensity or maximum intensity. - -```python -normalizer = oms.Normalizer() - -params = normalizer.getParameters() -params.setValue("method", "to_one") # "to_one" or "to_TIC" -normalizer.setParameters(params) - -normalizer.filterExperiment(exp) -``` - -**Methods:** -- `to_one`: Normalize max peak to 1.0 -- `to_TIC`: Normalize to total ion current = 1.0 - -#### Scaler - -Scales intensities by a constant factor. - -```python -scaler = oms.Scaler() - -params = scaler.getParameters() -params.setValue("scaling", 1000.0) # Scaling factor -scaler.setParameters(params) - -scaler.filterExperiment(exp) -``` - -## Centroiding and Peak Picking - -### PeakPickerHiRes - High-Resolution Peak Picking - -Converts profile spectra to centroid mode for high-resolution data. - -```python -picker = oms.PeakPickerHiRes() - -params = picker.getParameters() -params.setValue("signal_to_noise", 1.0) # S/N threshold -params.setValue("spacing_difference", 1.5) # Peak spacing factor -params.setValue("sn_win_len", 20.0) # S/N window length -params.setValue("sn_bin_count", 30) # Bins for S/N estimation -params.setValue("ms1_only", "false") # Process only MS1 -params.setValue("ms_levels", [1, 2]) # MS levels to process -picker.setParameters(params) - -# Pick peaks -exp_centroided = oms.MSExperiment() -picker.pickExperiment(exp, exp_centroided) -``` - -**Key Parameters:** -- `signal_to_noise`: Minimum signal-to-noise ratio -- `spacing_difference`: Minimum spacing between peaks -- `ms_levels`: List of MS levels to process - -### PeakPickerWavelet - Wavelet-Based Peak Picking - -Uses continuous wavelet transform for peak detection. - -```python -wavelet_picker = oms.PeakPickerWavelet() - -params = wavelet_picker.getParameters() -params.setValue("signal_to_noise", 1.0) -params.setValue("peak_width", 0.15) # Expected peak width (Da) -wavelet_picker.setParameters(params) - -wavelet_picker.pickExperiment(exp, exp_centroided) -``` - -## Feature Detection - -### FeatureFinder Algorithms - -Feature finders detect 2D features (m/z and RT) in LC-MS data. - -#### FeatureFinderMultiplex - -For multiplex labeling experiments (SILAC, dimethyl labeling). - -```python -ff = oms.FeatureFinderMultiplex() - -params = ff.getParameters() -params.setValue("algorithm:labels", "[]") # Empty for label-free -params.setValue("algorithm:charge", "2:4") # Charge range -params.setValue("algorithm:rt_typical", 40.0) # Expected feature RT width -params.setValue("algorithm:rt_min", 2.0) # Minimum RT width -params.setValue("algorithm:mz_tolerance", 10.0) # m/z tolerance (ppm) -params.setValue("algorithm:intensity_cutoff", 1000.0) # Minimum intensity -ff.setParameters(params) - -# Run feature detection -features = oms.FeatureMap() -ff.run(exp, features, oms.Param()) - -print(f"Found {features.size()} features") -``` - -**Key Parameters:** -- `algorithm:charge`: Charge state range to consider -- `algorithm:rt_typical`: Expected peak width in RT dimension -- `algorithm:mz_tolerance`: Mass tolerance in ppm -- `algorithm:intensity_cutoff`: Minimum intensity threshold - -#### FeatureFinderCentroided - -For centroided data, identifies isotope patterns and traces over RT. - -```python -ff_centroided = oms.FeatureFinderCentroided() - -params = ff_centroided.getParameters() -params.setValue("mass_trace:mz_tolerance", 10.0) # ppm -params.setValue("mass_trace:min_spectra", 5) # Min consecutive spectra -params.setValue("isotopic_pattern:charge_low", 1) -params.setValue("isotopic_pattern:charge_high", 4) -params.setValue("seed:min_score", 0.5) -ff_centroided.setParameters(params) - -features = oms.FeatureMap() -seeds = oms.FeatureMap() # Optional seed features -ff_centroided.run(exp, features, params, seeds) -``` - -#### FeatureFinderIdentification - -Uses peptide identifications to guide feature detection. - -```python -ff_id = oms.FeatureFinderIdentification() - -params = ff_id.getParameters() -params.setValue("extract:mz_window", 10.0) # ppm -params.setValue("extract:rt_window", 60.0) # seconds -params.setValue("detect:peak_width", 30.0) # Expected peak width -ff_id.setParameters(params) - -# Requires peptide identifications -protein_ids = [] -peptide_ids = [] -features = oms.FeatureMap() - -ff_id.run(exp, protein_ids, peptide_ids, features) -``` - -## Charge and Isotope Deconvolution - -### Decharging and Charge State Deconvolution - -#### FeatureDeconvolution - -Resolves charge states and combines features. - -```python -deconv = oms.FeatureDeconvolution() - -params = deconv.getParameters() -params.setValue("charge_min", 1) -params.setValue("charge_max", 4) -params.setValue("q_value", 0.01) # FDR threshold -deconv.setParameters(params) - -features_deconv = oms.FeatureMap() -consensus_map = oms.ConsensusMap() -deconv.compute(features, features_deconv, consensus_map) -``` - -## Map Alignment - -### MapAlignmentAlgorithm - -Aligns retention times across multiple LC-MS runs. - -#### MapAlignmentAlgorithmPoseClustering - -Pose clustering-based RT alignment. - -```python -aligner = oms.MapAlignmentAlgorithmPoseClustering() - -params = aligner.getParameters() -params.setValue("max_num_peaks_considered", 1000) -params.setValue("pairfinder:distance_MZ:max_difference", 0.3) # Da -params.setValue("pairfinder:distance_RT:max_difference", 60.0) # seconds -aligner.setParameters(params) - -# Align multiple feature maps -feature_maps = [features1, features2, features3] -transformations = [] - -# Create reference (e.g., use first map) -reference = oms.FeatureMap(feature_maps[0]) - -# Align others to reference -for fm in feature_maps[1:]: - transformation = oms.TransformationDescription() - aligner.align(fm, reference, transformation) - transformations.append(transformation) - - # Apply transformation - transformer = oms.MapAlignmentTransformer() - transformer.transformRetentionTimes(fm, transformation) -``` - -## Feature Linking - -### FeatureGroupingAlgorithm - -Links features across samples to create consensus features. - -#### FeatureGroupingAlgorithmQT - -Quality threshold-based feature linking. - -```python -grouper = oms.FeatureGroupingAlgorithmQT() - -params = grouper.getParameters() -params.setValue("distance_RT:max_difference", 60.0) # seconds -params.setValue("distance_MZ:max_difference", 10.0) # ppm -params.setValue("distance_MZ:unit", "ppm") -grouper.setParameters(params) - -# Create consensus map -consensus_map = oms.ConsensusMap() - -# Group features from multiple samples -feature_maps = [features1, features2, features3] -grouper.group(feature_maps, consensus_map) - -print(f"Created {consensus_map.size()} consensus features") -``` - -#### FeatureGroupingAlgorithmKD - -KD-tree based linking (faster for large datasets). - -```python -grouper_kd = oms.FeatureGroupingAlgorithmKD() - -params = grouper_kd.getParameters() -params.setValue("mz_unit", "ppm") -params.setValue("mz_tolerance", 10.0) -params.setValue("rt_tolerance", 30.0) -grouper_kd.setParameters(params) - -consensus_map = oms.ConsensusMap() -grouper_kd.group(feature_maps, consensus_map) -``` - -## Chromatographic Analysis - -### ElutionPeakDetection - -Detects elution peaks in chromatograms. - -```python -epd = oms.ElutionPeakDetection() - -params = epd.getParameters() -params.setValue("chrom_peak_snr", 3.0) # Signal-to-noise threshold -params.setValue("chrom_fwhm", 5.0) # Expected FWHM (seconds) -epd.setParameters(params) - -# Apply to chromatograms -for chrom in exp.getChromatograms(): - peaks = epd.detectPeaks(chrom) -``` - -### MRMFeatureFinderScoring - -Scoring and peak picking for targeted (MRM/SRM) experiments. - -```python -mrm_finder = oms.MRMFeatureFinderScoring() - -params = mrm_finder.getParameters() -params.setValue("TransitionGroupPicker:min_peak_width", 2.0) -params.setValue("TransitionGroupPicker:recalculate_peaks", "true") -params.setValue("TransitionGroupPicker:PeakPickerMRM:signal_to_noise", 1.0) -mrm_finder.setParameters(params) - -# Requires chromatograms -features = oms.FeatureMap() -mrm_finder.pickExperiment(chrom_exp, features, targets, transformation, swath_maps) -``` - -## Quantification - -### ProteinInference - -Infers proteins from peptide identifications. - -```python -protein_inference = oms.BasicProteinInferenceAlgorithm() - -# Apply to identification results -protein_inference.run(peptide_ids, protein_ids) -``` - -### IsobaricQuantification - -Quantification for isobaric labeling (TMT, iTRAQ). - -```python -# For TMT/iTRAQ quantification -iso_quant = oms.IsobaricQuantification() - -params = iso_quant.getParameters() -params.setValue("channel_116_description", "Sample1") -params.setValue("channel_117_description", "Sample2") -# ... configure all channels -iso_quant.setParameters(params) - -# Run quantification -quant_method = oms.IsobaricQuantitationMethod.TMT_10PLEX -quant_info = oms.IsobaricQuantifierStatistics() -iso_quant.quantify(exp, quant_info) -``` - -## Data Processing - -### BaselineFiltering - -Removes baseline from spectra. - -```python -baseline = oms.TopHatFilter() - -params = baseline.getParameters() -params.setValue("struc_elem_length", 3.0) # Structuring element size -params.setValue("struc_elem_unit", "Thomson") -baseline.setParameters(params) - -baseline.filterExperiment(exp) -``` - -### SpectraMerger - -Merges consecutive similar spectra. - -```python -merger = oms.SpectraMerger() - -params = merger.getParameters() -params.setValue("mz_binning_width", 0.05) # Binning width (Da) -params.setValue("sort_blocks", "RT_ascending") -merger.setParameters(params) - -merger.mergeSpectra(exp) -``` - -## Quality Control - -### MzMLFileQuality - -Analyzes mzML file quality. - -```python -# Calculate basic QC metrics -def calculate_qc_metrics(exp): - metrics = { - 'n_spectra': exp.getNrSpectra(), - 'n_ms1': sum(1 for s in exp if s.getMSLevel() == 1), - 'n_ms2': sum(1 for s in exp if s.getMSLevel() == 2), - 'rt_range': (exp.getMinRT(), exp.getMaxRT()), - 'mz_range': (exp.getMinMZ(), exp.getMaxMZ()), - } - - # Calculate TIC - tics = [] - for spectrum in exp: - if spectrum.getMSLevel() == 1: - mz, intensity = spectrum.get_peaks() - tics.append(sum(intensity)) - - metrics['median_tic'] = np.median(tics) - metrics['mean_tic'] = np.mean(tics) - - return metrics -``` - -## FDR Control - -### FalseDiscoveryRate - -Estimates and controls false discovery rate. - -```python -fdr = oms.FalseDiscoveryRate() - -params = fdr.getParameters() -params.setValue("add_decoy_peptides", "false") -params.setValue("add_decoy_proteins", "false") -fdr.setParameters(params) - -# Apply to identifications -fdr.apply(protein_ids, peptide_ids) - -# Filter by FDR threshold -fdr_threshold = 0.01 -filtered_peptides = [p for p in peptide_ids if p.getMetaValue("q-value") <= fdr_threshold] -``` - -## Algorithm Selection Guide - -### When to Use Which Algorithm - -**For Smoothing:** -- Use `GaussFilter` for general-purpose smoothing -- Use `SavitzkyGolayFilter` for preserving peak shapes - -**For Peak Picking:** -- Use `PeakPickerHiRes` for high-resolution Orbitrap/FT-ICR data -- Use `PeakPickerWavelet` for lower-resolution TOF data - -**For Feature Detection:** -- Use `FeatureFinderCentroided` for label-free proteomics (DDA) -- Use `FeatureFinderMultiplex` for SILAC/dimethyl labeling -- Use `FeatureFinderIdentification` when you have ID information -- Use `MRMFeatureFinderScoring` for targeted (MRM/SRM) experiments - -**For Feature Linking:** -- Use `FeatureGroupingAlgorithmQT` for small-medium datasets (<10 samples) -- Use `FeatureGroupingAlgorithmKD` for large datasets (>10 samples) - -## Parameter Tuning Tips - -1. **S/N Thresholds**: Start with 1-3 for clean data, increase for noisy data -2. **m/z Tolerance**: Use 5-10 ppm for high-resolution instruments, 0.5-1 Da for low-res -3. **RT Tolerance**: Typically 30-60 seconds depending on chromatographic stability -4. **Peak Width**: Measure from real data - varies by instrument and gradient length -5. **Charge States**: Set based on expected analytes (1-2 for metabolites, 2-4 for peptides) - -## Common Algorithm Workflows - -### Complete Proteomics Workflow - -```python -# 1. Load data -exp = oms.MSExperiment() -oms.MzMLFile().load("raw.mzML", exp) - -# 2. Smooth -gauss = oms.GaussFilter() -gauss.filterExperiment(exp) - -# 3. Peak picking -picker = oms.PeakPickerHiRes() -exp_centroid = oms.MSExperiment() -picker.pickExperiment(exp, exp_centroid) - -# 4. Feature detection -ff = oms.FeatureFinderCentroided() -features = oms.FeatureMap() -ff.run(exp_centroid, features, oms.Param(), oms.FeatureMap()) - -# 5. Save results -oms.FeatureXMLFile().store("features.featureXML", features) -``` - -### Multi-Sample Quantification - -```python -# Load multiple samples -feature_maps = [] -for filename in ["sample1.mzML", "sample2.mzML", "sample3.mzML"]: - exp = oms.MSExperiment() - oms.MzMLFile().load(filename, exp) - - # Process and detect features - features = detect_features(exp) # Your processing function - feature_maps.append(features) - -# Align retention times -align_feature_maps(feature_maps) # Implement alignment - -# Link features -grouper = oms.FeatureGroupingAlgorithmQT() -consensus_map = oms.ConsensusMap() -grouper.group(feature_maps, consensus_map) - -# Export quantification matrix -export_quant_matrix(consensus_map) -``` diff --git a/scientific-packages/pyopenms/references/chemistry.md b/scientific-packages/pyopenms/references/chemistry.md deleted file mode 100644 index 1a21340..0000000 --- a/scientific-packages/pyopenms/references/chemistry.md +++ /dev/null @@ -1,715 +0,0 @@ -# pyOpenMS Chemistry Reference - -This document provides comprehensive coverage of chemistry-related functionality in pyOpenMS, including elements, isotopes, molecular formulas, amino acids, peptides, proteins, and modifications. - -## Elements and Isotopes - -### ElementDB - Element Database - -Access atomic and isotopic data for all elements. - -```python -import pyopenms as oms - -# Get element database instance -element_db = oms.ElementDB() - -# Get element by symbol -carbon = element_db.getElement("C") -nitrogen = element_db.getElement("N") -oxygen = element_db.getElement("O") - -# Element properties -print(f"Carbon monoisotopic weight: {carbon.getMonoWeight()}") -print(f"Carbon average weight: {carbon.getAverageWeight()}") -print(f"Atomic number: {carbon.getAtomicNumber()}") -print(f"Symbol: {carbon.getSymbol()}") -print(f"Name: {carbon.getName()}") -``` - -### Isotope Information - -```python -# Get isotope distribution for an element -isotopes = carbon.getIsotopeDistribution() - -# Access specific isotope -c12 = element_db.getElement("C", 12) # Carbon-12 -c13 = element_db.getElement("C", 13) # Carbon-13 - -print(f"C-12 abundance: {isotopes.getContainer()[0].getIntensity()}") -print(f"C-13 abundance: {isotopes.getContainer()[1].getIntensity()}") - -# Isotope mass -print(f"C-12 mass: {c12.getMonoWeight()}") -print(f"C-13 mass: {c13.getMonoWeight()}") -``` - -### Constants - -```python -# Physical constants -avogadro = oms.Constants.AVOGADRO -electron_mass = oms.Constants.ELECTRON_MASS_U -proton_mass = oms.Constants.PROTON_MASS_U - -print(f"Avogadro's number: {avogadro}") -print(f"Electron mass: {electron_mass} u") -print(f"Proton mass: {proton_mass} u") -``` - -## Empirical Formulas - -### EmpiricalFormula - Molecular Formulas - -Represent and manipulate molecular formulas. - -#### Creating Formulas - -```python -# From string -glucose = oms.EmpiricalFormula("C6H12O6") -water = oms.EmpiricalFormula("H2O") -ammonia = oms.EmpiricalFormula("NH3") - -# From element composition -formula = oms.EmpiricalFormula() -formula.setCharge(1) # Set charge state -``` - -#### Formula Arithmetic - -```python -# Addition -sucrose = oms.EmpiricalFormula("C12H22O11") -hydrolyzed = sucrose + water # Hydrolysis adds water - -# Subtraction -dehydrated = glucose - water # Dehydration removes water - -# Multiplication -three_waters = water * 3 # 3 H2O = H6O3 - -# Division -formula_half = sucrose / 2 # Half the formula -``` - -#### Mass Calculations - -```python -# Monoisotopic mass -mono_mass = glucose.getMonoWeight() -print(f"Glucose monoisotopic mass: {mono_mass:.6f} Da") - -# Average mass -avg_mass = glucose.getAverageWeight() -print(f"Glucose average mass: {avg_mass:.6f} Da") - -# Mass difference -mass_diff = (glucose - water).getMonoWeight() -``` - -#### Elemental Composition - -```python -# Get element counts -formula = oms.EmpiricalFormula("C6H12O6") - -# Access individual elements -n_carbon = formula.getNumberOf(element_db.getElement("C")) -n_hydrogen = formula.getNumberOf(element_db.getElement("H")) -n_oxygen = formula.getNumberOf(element_db.getElement("O")) - -print(f"C: {n_carbon}, H: {n_hydrogen}, O: {n_oxygen}") - -# String representation -print(f"Formula: {formula.toString()}") -``` - -#### Isotope-Specific Formulas - -```python -# Specify specific isotopes using parentheses -labeled_glucose = oms.EmpiricalFormula("(13)C6H12O6") # All carbons are C-13 -partially_labeled = oms.EmpiricalFormula("C5(13)CH12O6") # One C-13 - -# Deuterium labeling -deuterated = oms.EmpiricalFormula("C6D12O6") # D2O instead of H2O -``` - -#### Charge States - -```python -# Set charge -formula = oms.EmpiricalFormula("C6H12O6") -formula.setCharge(1) # Positive charge - -# Get charge -charge = formula.getCharge() - -# Calculate m/z for charged molecule -mz = formula.getMonoWeight() / abs(charge) if charge != 0 else formula.getMonoWeight() -``` - -### Isotope Pattern Generation - -Generate theoretical isotope patterns for formulas. - -#### CoarseIsotopePatternGenerator - -For unit mass resolution (low-resolution instruments). - -```python -# Create generator -coarse_gen = oms.CoarseIsotopePatternGenerator() - -# Generate pattern -formula = oms.EmpiricalFormula("C6H12O6") -pattern = coarse_gen.run(formula) - -# Access isotope peaks -iso_dist = pattern.getContainer() -for peak in iso_dist: - mass = peak.getMZ() - abundance = peak.getIntensity() - print(f"m/z: {mass:.4f}, Abundance: {abundance:.4f}") -``` - -#### FineIsotopePatternGenerator - -For high-resolution instruments (hyperfine structure). - -```python -# Create generator with resolution -fine_gen = oms.FineIsotopePatternGenerator(0.01) # 0.01 Da resolution - -# Generate fine pattern -fine_pattern = fine_gen.run(formula) - -# Access fine isotope structure -for peak in fine_pattern.getContainer(): - print(f"m/z: {peak.getMZ():.6f}, Abundance: {peak.getIntensity():.6f}") -``` - -#### Isotope Pattern Matching - -```python -# Compare experimental to theoretical -def compare_isotope_patterns(experimental_mz, experimental_int, formula): - # Generate theoretical - coarse_gen = oms.CoarseIsotopePatternGenerator() - theoretical = coarse_gen.run(formula) - - # Extract theoretical peaks - theo_peaks = theoretical.getContainer() - theo_mz = [p.getMZ() for p in theo_peaks] - theo_int = [p.getIntensity() for p in theo_peaks] - - # Normalize both patterns - exp_int_norm = [i / max(experimental_int) for i in experimental_int] - theo_int_norm = [i / max(theo_int) for i in theo_int] - - # Calculate similarity (e.g., cosine similarity) - # ... implement similarity calculation - return similarity_score -``` - -## Amino Acids and Residues - -### Residue - Amino Acid Representation - -Access properties of amino acids. - -```python -# Get residue database -res_db = oms.ResidueDB() - -# Get specific residue -leucine = res_db.getResidue("Leucine") -# Or by one-letter code -leu = res_db.getResidue("L") - -# Residue properties -print(f"Name: {leucine.getName()}") -print(f"Three-letter code: {leucine.getThreeLetterCode()}") -print(f"One-letter code: {leucine.getOneLetterCode()}") -print(f"Monoisotopic mass: {leucine.getMonoWeight():.6f}") -print(f"Average mass: {leucine.getAverageWeight():.6f}") - -# Chemical formula -formula = leucine.getFormula() -print(f"Formula: {formula.toString()}") - -# pKa values -print(f"pKa (N-term): {leucine.getPka()}") -print(f"pKa (C-term): {leucine.getPkb()}") -print(f"pKa (side chain): {leucine.getPkc()}") - -# Side chain basicity/acidity -print(f"Basicity: {leucine.getBasicity()}") -print(f"Hydrophobicity: {leucine.getHydrophobicity()}") -``` - -### All Standard Amino Acids - -```python -# Iterate over all residues -for residue_name in ["Alanine", "Cysteine", "Aspartic acid", "Glutamic acid", - "Phenylalanine", "Glycine", "Histidine", "Isoleucine", - "Lysine", "Leucine", "Methionine", "Asparagine", - "Proline", "Glutamine", "Arginine", "Serine", - "Threonine", "Valine", "Tryptophan", "Tyrosine"]: - res = res_db.getResidue(residue_name) - print(f"{res.getOneLetterCode()}: {res.getMonoWeight():.4f} Da") -``` - -### Internal Residues vs. Termini - -```python -# Get internal residue mass (no terminal groups) -internal_mass = leucine.getInternalToFull() - -# Get residue with N-terminal modification -n_terminal = res_db.getResidue("L[1]") # With NH2 - -# Get residue with C-terminal modification -c_terminal = res_db.getResidue("L[2]") # With COOH -``` - -## Peptide Sequences - -### AASequence - Amino Acid Sequences - -Represent and manipulate peptide sequences. - -#### Creating Sequences - -```python -# From string -peptide = oms.AASequence.fromString("PEPTIDE") -longer = oms.AASequence.fromString("MKTAYIAKQRQISFVK") - -# Empty sequence -empty_seq = oms.AASequence() -``` - -#### Sequence Properties - -```python -peptide = oms.AASequence.fromString("PEPTIDE") - -# Length -length = peptide.size() -print(f"Length: {length} residues") - -# Mass -mono_mass = peptide.getMonoWeight() -avg_mass = peptide.getAverageWeight() -print(f"Monoisotopic mass: {mono_mass:.6f} Da") -print(f"Average mass: {avg_mass:.6f} Da") - -# Formula -formula = peptide.getFormula() -print(f"Formula: {formula.toString()}") - -# String representation -seq_str = peptide.toString() -print(f"Sequence: {seq_str}") -``` - -#### Accessing Individual Residues - -```python -peptide = oms.AASequence.fromString("PEPTIDE") - -# Access by index -first_aa = peptide[0] # Returns Residue object -print(f"First amino acid: {first_aa.getOneLetterCode()}") - -# Iterate -for i in range(peptide.size()): - residue = peptide[i] - print(f"Position {i}: {residue.getOneLetterCode()}") -``` - -#### Modifications - -Add post-translational modifications (PTMs) to sequences. - -```python -# Modifications in sequence string -# Format: AA(ModificationName) -oxidized_met = oms.AASequence.fromString("PEPTIDEM(Oxidation)") -phospho = oms.AASequence.fromString("PEPTIDES(Phospho)T(Phospho)") - -# Multiple modifications -multi_mod = oms.AASequence.fromString("M(Oxidation)PEPTIDEK(Acetyl)") - -# N-terminal modifications -n_term_acetyl = oms.AASequence.fromString("(Acetyl)PEPTIDE") - -# C-terminal modifications -c_term_amide = oms.AASequence.fromString("PEPTIDE(Amidated)") - -# Check mass change -unmodified = oms.AASequence.fromString("PEPTIDE") -modified = oms.AASequence.fromString("PEPTIDEM(Oxidation)") -mass_diff = modified.getMonoWeight() - unmodified.getMonoWeight() -print(f"Mass shift from oxidation: {mass_diff:.6f} Da") -``` - -#### Sequence Manipulation - -```python -# Prefix (N-terminal fragment) -prefix = peptide.getPrefix(3) # First 3 residues -print(f"Prefix: {prefix.toString()}") - -# Suffix (C-terminal fragment) -suffix = peptide.getSuffix(3) # Last 3 residues -print(f"Suffix: {suffix.toString()}") - -# Subsequence -subseq = peptide.getSubsequence(2, 4) # Residues 2-4 -print(f"Subsequence: {subseq.toString()}") -``` - -#### Theoretical Fragmentation - -Generate theoretical fragment ions for MS/MS. - -```python -peptide = oms.AASequence.fromString("PEPTIDE") - -# b-ions (N-terminal fragments) -b_ions = [] -for i in range(1, peptide.size()): - b_fragment = peptide.getPrefix(i) - b_mass = b_fragment.getMonoWeight() - b_ions.append(('b', i, b_mass)) - print(f"b{i}: {b_mass:.4f}") - -# y-ions (C-terminal fragments) -y_ions = [] -for i in range(1, peptide.size()): - y_fragment = peptide.getSuffix(i) - y_mass = y_fragment.getMonoWeight() - y_ions.append(('y', i, y_mass)) - print(f"y{i}: {y_mass:.4f}") - -# a-ions (b - CO) -a_ions = [] -CO_mass = 27.994915 # CO loss -for ion_type, position, mass in b_ions: - a_mass = mass - CO_mass - a_ions.append(('a', position, a_mass)) - -# c-ions (b + NH3) -NH3_mass = 17.026549 # NH3 gain -c_ions = [] -for ion_type, position, mass in b_ions: - c_mass = mass + NH3_mass - c_ions.append(('c', position, c_mass)) - -# z-ions (y - NH3) -z_ions = [] -for ion_type, position, mass in y_ions: - z_mass = mass - NH3_mass - z_ions.append(('z', position, z_mass)) -``` - -#### Calculate m/z for Charge States - -```python -peptide = oms.AASequence.fromString("PEPTIDE") -proton_mass = 1.007276 - -# [M+H]+ -mz_1 = peptide.getMonoWeight() + proton_mass -print(f"[M+H]+: {mz_1:.4f}") - -# [M+2H]2+ -mz_2 = (peptide.getMonoWeight() + 2 * proton_mass) / 2 -print(f"[M+2H]2+: {mz_2:.4f}") - -# [M+3H]3+ -mz_3 = (peptide.getMonoWeight() + 3 * proton_mass) / 3 -print(f"[M+3H]3+: {mz_3:.4f}") - -# General formula for any charge -def calculate_mz(sequence, charge): - proton_mass = 1.007276 - return (sequence.getMonoWeight() + charge * proton_mass) / charge - -for z in range(1, 5): - print(f"[M+{z}H]{z}+: {calculate_mz(peptide, z):.4f}") -``` - -## Protein Digestion - -### ProteaseDigestion - Enzymatic Cleavage - -Simulate enzymatic protein digestion. - -#### Basic Digestion - -```python -# Create digestion object -dig = oms.ProteaseDigestion() - -# Set enzyme -dig.setEnzyme("Trypsin") # Cleaves after K, R - -# Other common enzymes: -# - "Trypsin" (K, R) -# - "Lys-C" (K) -# - "Arg-C" (R) -# - "Asp-N" (D) -# - "Glu-C" (E, D) -# - "Chymotrypsin" (F, Y, W, L) - -# Set missed cleavages -dig.setMissedCleavages(0) # No missed cleavages -dig.setMissedCleavages(2) # Allow up to 2 missed cleavages - -# Perform digestion -protein = oms.AASequence.fromString("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEK") -peptides = [] -dig.digest(protein, peptides) - -# Print results -for pep in peptides: - print(f"{pep.toString()}: {pep.getMonoWeight():.2f} Da") -``` - -#### Advanced Digestion Options - -```python -# Get enzyme specificity -specificity = dig.getSpecificity() -# oms.EnzymaticDigestion.SPEC_FULL (both termini) -# oms.EnzymaticDigestion.SPEC_SEMI (one terminus) -# oms.EnzymaticDigestion.SPEC_NONE (no specificity) - -# Set specificity for semi-tryptic search -dig.setSpecificity(oms.EnzymaticDigestion.SPEC_SEMI) - -# Get cleavage sites -cleavage_residues = dig.getEnzyme().getCutAfterResidues() -restriction_residues = dig.getEnzyme().getRestriction() -``` - -#### Filter Peptides by Properties - -```python -# Filter by mass range -min_mass = 600.0 -max_mass = 4000.0 -filtered = [p for p in peptides if min_mass <= p.getMonoWeight() <= max_mass] - -# Filter by length -min_length = 6 -max_length = 30 -length_filtered = [p for p in peptides if min_length <= p.size() <= max_length] - -# Combine filters -valid_peptides = [p for p in peptides - if min_mass <= p.getMonoWeight() <= max_mass - and min_length <= p.size() <= max_length] -``` - -## Modifications - -### ModificationsDB - Modification Database - -Access and apply post-translational modifications. - -#### Accessing Modifications - -```python -# Get modifications database -mod_db = oms.ModificationsDB() - -# Get specific modification -oxidation = mod_db.getModification("Oxidation") -phospho = mod_db.getModification("Phospho") -acetyl = mod_db.getModification("Acetyl") - -# Modification properties -print(f"Name: {oxidation.getFullName()}") -print(f"Mass difference: {oxidation.getDiffMonoMass():.6f} Da") -print(f"Formula: {oxidation.getDiffFormula().toString()}") - -# Affected residues -print(f"Residues: {oxidation.getResidues()}") # e.g., ['M'] - -# Specificity (N-term, C-term, anywhere) -print(f"Term specificity: {oxidation.getTermSpecificity()}") -``` - -#### Common Modifications - -```python -# Oxidation (M) -oxidation = mod_db.getModification("Oxidation") -print(f"Oxidation: +{oxidation.getDiffMonoMass():.4f} Da") - -# Phosphorylation (S, T, Y) -phospho = mod_db.getModification("Phospho") -print(f"Phospho: +{phospho.getDiffMonoMass():.4f} Da") - -# Carbamidomethylation (C) - common alkylation -carbamido = mod_db.getModification("Carbamidomethyl") -print(f"Carbamidomethyl: +{carbamido.getDiffMonoMass():.4f} Da") - -# Acetylation (K, N-term) -acetyl = mod_db.getModification("Acetyl") -print(f"Acetyl: +{acetyl.getDiffMonoMass():.4f} Da") - -# Deamidation (N, Q) -deamid = mod_db.getModification("Deamidated") -print(f"Deamidation: +{deamid.getDiffMonoMass():.4f} Da") -``` - -#### Searching Modifications - -```python -# Search modifications by mass -mass_tolerance = 0.01 # Da -target_mass = 15.9949 # Oxidation - -# Get all modifications -all_mods = [] -mod_db.getAllSearchModifications(all_mods) - -# Find matching modifications -matching = [] -for mod_name in all_mods: - mod = mod_db.getModification(mod_name) - if abs(mod.getDiffMonoMass() - target_mass) < mass_tolerance: - matching.append(mod) - print(f"Match: {mod.getFullName()} ({mod.getDiffMonoMass():.4f} Da)") -``` - -#### Variable vs. Fixed Modifications - -```python -# In search engines, specify: -# Fixed modifications: applied to all occurrences -fixed_mods = ["Carbamidomethyl (C)"] - -# Variable modifications: optionally present -variable_mods = ["Oxidation (M)", "Phospho (S)", "Phospho (T)", "Phospho (Y)"] -``` - -## Ribonucleotides (RNA) - -### Ribonucleotide - RNA Building Blocks - -```python -# Get ribonucleotide database -ribo_db = oms.RibonucleotideDB() - -# Get specific ribonucleotide -adenine = ribo_db.getRibonucleotide("A") -uracil = ribo_db.getRibonucleotide("U") -guanine = ribo_db.getRibonucleotide("G") -cytosine = ribo_db.getRibonucleotide("C") - -# Properties -print(f"Adenine mono mass: {adenine.getMonoWeight()}") -print(f"Formula: {adenine.getFormula().toString()}") - -# Modified ribonucleotides -modified_ribo = ribo_db.getRibonucleotide("m6A") # N6-methyladenosine -``` - -## Practical Examples - -### Calculate Peptide Mass with Modifications - -```python -def calculate_peptide_mz(sequence_str, charge): - """Calculate m/z for a peptide sequence string with modifications.""" - peptide = oms.AASequence.fromString(sequence_str) - proton_mass = 1.007276 - mz = (peptide.getMonoWeight() + charge * proton_mass) / charge - return mz - -# Examples -print(calculate_peptide_mz("PEPTIDE", 2)) # Unmodified [M+2H]2+ -print(calculate_peptide_mz("PEPTIDEM(Oxidation)", 2)) # With oxidation -print(calculate_peptide_mz("(Acetyl)PEPTIDEK(Acetyl)", 2)) # Acetylated -``` - -### Generate Complete Fragment Ion Series - -```python -def generate_fragment_ions(sequence_str, charge_states=[1, 2]): - """Generate comprehensive fragment ion list.""" - peptide = oms.AASequence.fromString(sequence_str) - proton_mass = 1.007276 - fragments = [] - - for i in range(1, peptide.size()): - # b and y ions - b_frag = peptide.getPrefix(i) - y_frag = peptide.getSuffix(i) - - for z in charge_states: - b_mz = (b_frag.getMonoWeight() + z * proton_mass) / z - y_mz = (y_frag.getMonoWeight() + z * proton_mass) / z - - fragments.append({ - 'type': 'b', - 'position': i, - 'charge': z, - 'mz': b_mz - }) - fragments.append({ - 'type': 'y', - 'position': i, - 'charge': z, - 'mz': y_mz - }) - - return fragments - -# Usage -ions = generate_fragment_ions("PEPTIDE", charge_states=[1, 2]) -for ion in ions: - print(f"{ion['type']}{ion['position']}^{ion['charge']}+: {ion['mz']:.4f}") -``` - -### Digest Protein and Calculate Peptide Masses - -```python -def digest_and_calculate(protein_seq_str, enzyme="Trypsin", missed_cleavages=2, - min_mass=600, max_mass=4000): - """Digest protein and return valid peptides with masses.""" - dig = oms.ProteaseDigestion() - dig.setEnzyme(enzyme) - dig.setMissedCleavages(missed_cleavages) - - protein = oms.AASequence.fromString(protein_seq_str) - peptides = [] - dig.digest(protein, peptides) - - results = [] - for pep in peptides: - mass = pep.getMonoWeight() - if min_mass <= mass <= max_mass: - results.append({ - 'sequence': pep.toString(), - 'mass': mass, - 'length': pep.size() - }) - - return results - -# Usage -protein = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEK" -peptides = digest_and_calculate(protein) -for pep in peptides: - print(f"{pep['sequence']}: {pep['mass']:.2f} Da ({pep['length']} aa)") -``` diff --git a/scientific-packages/pyopenms/references/data_structures.md b/scientific-packages/pyopenms/references/data_structures.md index cb0f977..9f599b6 100644 --- a/scientific-packages/pyopenms/references/data_structures.md +++ b/scientific-packages/pyopenms/references/data_structures.md @@ -1,560 +1,497 @@ -# pyOpenMS Data Structures Reference +# Core Data Structures -This document provides comprehensive coverage of core data structures in pyOpenMS for representing mass spectrometry data. +## Overview -## Core Hierarchy +PyOpenMS uses C++ objects with Python bindings. Understanding these core data structures is essential for effective data manipulation. -``` -MSExperiment # Top-level: Complete LC-MS/MS run -├── MSSpectrum[] # Collection of mass spectra -│ ├── Peak1D[] # Individual m/z, intensity pairs -│ └── SpectrumSettings # Metadata (RT, MS level, precursor) -└── MSChromatogram[] # Collection of chromatograms - ├── ChromatogramPeak[] # RT, intensity pairs - └── ChromatogramSettings # Metadata -``` +## Spectrum and Experiment Objects -## MSSpectrum +### MSExperiment -Represents a single mass spectrum (1-dimensional peak data). - -### Creation and Basic Properties +Container for complete LC-MS experiment data (spectra and chromatograms). ```python -import pyopenms as oms +import pyopenms as ms -# Create empty spectrum -spectrum = oms.MSSpectrum() +# Create experiment +exp = ms.MSExperiment() -# Set metadata -spectrum.setRT(123.45) # Retention time in seconds -spectrum.setMSLevel(1) # MS level (1 for MS1, 2 for MS2, etc.) -spectrum.setNativeID("scan=1234") # Native ID from file +# Load from file +ms.MzMLFile().load("data.mzML", exp) -# Additional metadata -spectrum.setDriftTime(15.2) # Ion mobility drift time -spectrum.setName("MyScan") # Optional name -``` - -### Peak Data Management - -**Setting Peaks (Method 1 - Lists):** -```python -mz_values = [100.5, 200.3, 300.7, 400.2, 500.1] -intensity_values = [1000, 5000, 3000, 2000, 1500] - -spectrum.set_peaks((mz_values, intensity_values)) -``` - -**Setting Peaks (Method 2 - NumPy arrays):** -```python -import numpy as np - -mz_array = np.array([100.5, 200.3, 300.7, 400.2, 500.1]) -intensity_array = np.array([1000, 5000, 3000, 2000, 1500]) - -spectrum.set_peaks((mz_array, intensity_array)) -``` - -**Retrieving Peaks:** -```python -# Get as numpy arrays (efficient) -mz_array, intensity_array = spectrum.get_peaks() - -# Check number of peaks -n_peaks = spectrum.size() - -# Get individual peak (slower) -for i in range(spectrum.size()): - peak = spectrum[i] - mz = peak.getMZ() - intensity = peak.getIntensity() -``` - -### Precursor Information (for MS2/MSn spectra) - -```python -# Create precursor -precursor = oms.Precursor() -precursor.setMZ(456.789) # Precursor m/z -precursor.setCharge(2) # Precursor charge -precursor.setIntensity(50000) # Precursor intensity -precursor.setIsolationWindowLowerOffset(1.5) # Lower isolation window -precursor.setIsolationWindowUpperOffset(1.5) # Upper isolation window - -# Set activation method -activation = oms.Activation() -activation.setActivationEnergy(35.0) # Collision energy -activation.setMethod(oms.Activation.ActivationMethod.CID) -precursor.setActivation(activation) - -# Assign to spectrum -spectrum.setPrecursors([precursor]) - -# Retrieve precursor information -precursors = spectrum.getPrecursors() -if len(precursors) > 0: - prec = precursors[0] - print(f"Precursor m/z: {prec.getMZ()}") - print(f"Precursor charge: {prec.getCharge()}") -``` - -### Spectrum Metadata Access - -```python -# Check if spectrum is sorted by m/z -is_sorted = spectrum.isSorted() - -# Sort spectrum by m/z -spectrum.sortByPosition() - -# Sort by intensity -spectrum.sortByIntensity() - -# Clear all peaks -spectrum.clear(False) # False = keep metadata, True = clear everything - -# Get retention time -rt = spectrum.getRT() - -# Get MS level -ms_level = spectrum.getMSLevel() -``` - -### Spectrum Types and Modes - -```python -# Set spectrum type -spectrum.setType(oms.SpectrumSettings.SpectrumType.CENTROID) # or PROFILE - -# Get spectrum type -spec_type = spectrum.getType() -if spec_type == oms.SpectrumSettings.SpectrumType.CENTROID: - print("Centroid spectrum") -elif spec_type == oms.SpectrumSettings.SpectrumType.PROFILE: - print("Profile spectrum") -``` - -### Data Processing Annotations - -```python -# Add processing information -processing = oms.DataProcessing() -processing.setMetaValue("smoothing", "gaussian") -spectrum.setDataProcessing([processing]) -``` - -## MSExperiment - -Represents a complete LC-MS/MS experiment containing multiple spectra and chromatograms. - -### Creation and Population - -```python -# Create empty experiment -exp = oms.MSExperiment() - -# Add spectra -spectrum1 = oms.MSSpectrum() -spectrum1.setRT(100.0) -spectrum1.set_peaks(([100, 200], [1000, 2000])) - -spectrum2 = oms.MSSpectrum() -spectrum2.setRT(200.0) -spectrum2.set_peaks(([100, 200], [1500, 2500])) - -exp.addSpectrum(spectrum1) -exp.addSpectrum(spectrum2) - -# Add chromatograms -chrom = oms.MSChromatogram() -chrom.set_peaks(([10.5, 11.0, 11.5], [1000, 5000, 3000])) -exp.addChromatogram(chrom) -``` - -### Accessing Spectra and Chromatograms - -```python -# Get number of spectra and chromatograms -n_spectra = exp.getNrSpectra() -n_chroms = exp.getNrChromatograms() - -# Access by index -first_spectrum = exp.getSpectrum(0) -last_spectrum = exp.getSpectrum(exp.getNrSpectra() - 1) - -# Iterate over all spectra -for spectrum in exp: - rt = spectrum.getRT() - ms_level = spectrum.getMSLevel() - n_peaks = spectrum.size() - print(f"RT: {rt:.2f}s, MS{ms_level}, Peaks: {n_peaks}") - -# Get all spectra as list -spectra = exp.getSpectra() - -# Access chromatograms -chrom = exp.getChromatogram(0) -``` - -### Filtering Operations - -```python -# Filter by MS level -exp.filterMSLevel(1) # Keep only MS1 spectra -exp.filterMSLevel(2) # Keep only MS2 spectra - -# Filter by retention time range -exp.filterRT(100.0, 500.0) # Keep RT between 100-500 seconds - -# Filter by m/z range (all spectra) -exp.filterMZ(300.0, 1500.0) # Keep m/z between 300-1500 - -# Filter by scan number -exp.filterScanNumber(100, 200) # Keep scans 100-200 -``` - -### Metadata and Properties - -```python -# Set experiment metadata -exp.setMetaValue("operator", "John Doe") -exp.setMetaValue("instrument", "Q Exactive HF") - -# Get metadata -operator = exp.getMetaValue("operator") +# Access properties +print(f"Number of spectra: {exp.getNrSpectra()}") +print(f"Number of chromatograms: {exp.getNrChromatograms()}") # Get RT range -rt_range = exp.getMinRT(), exp.getMaxRT() +rts = [spec.getRT() for spec in exp] +print(f"RT range: {min(rts):.1f} - {max(rts):.1f} seconds") -# Get m/z range -mz_range = exp.getMinMZ(), exp.getMaxMZ() +# Access individual spectrum +spec = exp.getSpectrum(0) -# Clear all data -exp.clear(False) # False = keep metadata +# Iterate through spectra +for spec in exp: + if spec.getMSLevel() == 2: + print(f"MS2 spectrum at RT {spec.getRT():.2f}") + +# Get metadata +exp_settings = exp.getExperimentalSettings() +instrument = exp_settings.getInstrument() +print(f"Instrument: {instrument.getName()}") ``` -### Sorting and Organization +### MSSpectrum + +Individual mass spectrum with m/z and intensity arrays. ```python -# Sort spectra by retention time -exp.sortSpectra() +# Create empty spectrum +spec = ms.MSSpectrum() -# Update ranges (call after modifications) -exp.updateRanges() +# Get from experiment +exp = ms.MSExperiment() +ms.MzMLFile().load("data.mzML", exp) +spec = exp.getSpectrum(0) -# Check if experiment is empty -is_empty = exp.empty() +# Basic properties +print(f"MS level: {spec.getMSLevel()}") +print(f"Retention time: {spec.getRT():.2f} seconds") +print(f"Number of peaks: {spec.size()}") -# Reset (clear everything) -exp.reset() +# Get peak data as numpy arrays +mz, intensity = spec.get_peaks() +print(f"m/z range: {mz.min():.2f} - {mz.max():.2f}") +print(f"Max intensity: {intensity.max():.0f}") + +# Access individual peaks +for i in range(min(5, spec.size())): # First 5 peaks + print(f"Peak {i}: m/z={mz[i]:.4f}, intensity={intensity[i]:.0f}") + +# Precursor information (for MS2) +if spec.getMSLevel() == 2: + precursors = spec.getPrecursors() + if precursors: + precursor = precursors[0] + print(f"Precursor m/z: {precursor.getMZ():.4f}") + print(f"Precursor charge: {precursor.getCharge()}") + print(f"Precursor intensity: {precursor.getIntensity():.0f}") + +# Set peak data +new_mz = [100.0, 200.0, 300.0] +new_intensity = [1000.0, 2000.0, 1500.0] +spec.set_peaks((new_mz, new_intensity)) ``` -## MSChromatogram +### MSChromatogram -Represents an extracted or reconstructed chromatogram (retention time vs. intensity). - -### Creation and Basic Usage +Chromatographic trace (TIC, XIC, or SRM transition). ```python -# Create chromatogram -chrom = oms.MSChromatogram() +# Access chromatogram from experiment +for chrom in exp.getChromatograms(): + print(f"Chromatogram ID: {chrom.getNativeID()}") -# Set peaks (RT, intensity pairs) -rt_values = [10.0, 10.5, 11.0, 11.5, 12.0] -intensity_values = [1000, 5000, 8000, 6000, 2000] -chrom.set_peaks((rt_values, intensity_values)) + # Get data + rt, intensity = chrom.get_peaks() -# Get peaks -rt_array, int_array = chrom.get_peaks() + print(f" RT points: {len(rt)}") + print(f" Max intensity: {intensity.max():.0f}") -# Get size -n_points = chrom.size() + # Precursor info (for XIC) + precursor = chrom.getPrecursor() + print(f" Precursor m/z: {precursor.getMZ():.4f}") ``` -### Chromatogram Types - -```python -# Set chromatogram type -chrom.setChromatogramType(oms.ChromatogramSettings.ChromatogramType.SELECTED_ION_CURRENT_CHROMATOGRAM) - -# Other types: -# - TOTAL_ION_CURRENT_CHROMATOGRAM -# - BASEPEAK_CHROMATOGRAM -# - SELECTED_ION_CURRENT_CHROMATOGRAM -# - SELECTED_REACTION_MONITORING_CHROMATOGRAM -``` - -### Metadata - -```python -# Set native ID -chrom.setNativeID("TIC") - -# Set name -chrom.setName("Total Ion Current") - -# Access -native_id = chrom.getNativeID() -name = chrom.getName() -``` - -### Precursor and Product Information (for SRM/MRM) - -```python -# For targeted experiments -precursor = oms.Precursor() -precursor.setMZ(456.7) -chrom.setPrecursor(precursor) - -product = oms.Product() -product.setMZ(789.4) -chrom.setProduct(product) -``` - -## Peak1D and ChromatogramPeak - -Individual peak data points. - -### Peak1D (for mass spectra) - -```python -# Create individual peak -peak = oms.Peak1D() -peak.setMZ(456.789) -peak.setIntensity(10000) - -# Access -mz = peak.getMZ() -intensity = peak.getIntensity() - -# Set position and intensity -peak.setPosition([456.789]) -peak.setIntensity(10000) -``` - -### ChromatogramPeak (for chromatograms) - -```python -# Create chromatogram peak -chrom_peak = oms.ChromatogramPeak() -chrom_peak.setRT(125.5) -chrom_peak.setIntensity(5000) - -# Access -rt = chrom_peak.getRT() -intensity = chrom_peak.getIntensity() -``` - -## FeatureMap and Feature - -For quantification results. +## Feature Objects ### Feature -Represents a detected LC-MS feature (peptide or metabolite signal). +Detected chromatographic peak with 2D spatial extent (RT-m/z). ```python -# Create feature -feature = oms.Feature() +# Load features +feature_map = ms.FeatureMap() +ms.FeatureXMLFile().load("features.featureXML", feature_map) -# Set properties -feature.setMZ(456.789) -feature.setRT(123.45) -feature.setIntensity(1000000) -feature.setCharge(2) -feature.setWidth(15.0) # RT width in seconds +# Access individual feature +feature = feature_map[0] -# Set quality score -feature.setOverallQuality(0.95) +# Core properties +print(f"m/z: {feature.getMZ():.4f}") +print(f"RT: {feature.getRT():.2f} seconds") +print(f"Intensity: {feature.getIntensity():.0f}") +print(f"Charge: {feature.getCharge()}") -# Access -mz = feature.getMZ() -rt = feature.getRT() -intensity = feature.getIntensity() -charge = feature.getCharge() +# Quality metrics +print(f"Overall quality: {feature.getOverallQuality():.3f}") +print(f"Width (RT): {feature.getWidth():.2f}") + +# Convex hull (spatial extent) +hull = feature.getConvexHull() +print(f"Hull points: {hull.getHullPoints().size()}") + +# Bounding box +bbox = hull.getBoundingBox() +print(f"RT range: {bbox.minPosition()[0]:.2f} - {bbox.maxPosition()[0]:.2f}") +print(f"m/z range: {bbox.minPosition()[1]:.4f} - {bbox.maxPosition()[1]:.4f}") + +# Subordinate features (isotopes) +subordinates = feature.getSubordinates() +if subordinates: + print(f"Isotopic features: {len(subordinates)}") + for sub in subordinates: + print(f" m/z: {sub.getMZ():.4f}, intensity: {sub.getIntensity():.0f}") + +# Metadata values +if feature.metaValueExists("label"): + label = feature.getMetaValue("label") + print(f"Label: {label}") ``` ### FeatureMap -Collection of features. +Collection of features from a single LC-MS run. ```python # Create feature map -feature_map = oms.FeatureMap() +feature_map = ms.FeatureMap() -# Add features -feature1 = oms.Feature() -feature1.setMZ(456.789) -feature1.setRT(123.45) -feature1.setIntensity(1000000) +# Load from file +ms.FeatureXMLFile().load("features.featureXML", feature_map) -feature_map.push_back(feature1) +# Access properties +print(f"Number of features: {feature_map.size()}") -# Get size -n_features = feature_map.size() +# Get unique features +print(f"Unique features: {feature_map.getUniqueId()}") -# Iterate +# Metadata +primary_path = feature_map.getPrimaryMSRunPath() +if primary_path: + print(f"Source file: {primary_path[0].decode()}") + +# Iterate through features for feature in feature_map: - print(f"m/z: {feature.getMZ():.4f}, RT: {feature.getRT():.2f}") + print(f"Feature: m/z={feature.getMZ():.4f}, RT={feature.getRT():.2f}") -# Access by index -first_feature = feature_map[0] +# Add new feature +new_feature = ms.Feature() +new_feature.setMZ(500.0) +new_feature.setRT(300.0) +new_feature.setIntensity(10000.0) +feature_map.push_back(new_feature) -# Clear -feature_map.clear() +# Sort features +feature_map.sortByRT() # or sortByMZ(), sortByIntensity() + +# Export to pandas +df = feature_map.get_df() +print(df.head()) ``` -## PeptideIdentification and ProteinIdentification - -For identification results. - -### PeptideIdentification - -```python -# Create peptide identification -pep_id = oms.PeptideIdentification() -pep_id.setRT(123.45) -pep_id.setMZ(456.789) - -# Create peptide hit -hit = oms.PeptideHit() -hit.setSequence(oms.AASequence.fromString("PEPTIDE")) -hit.setCharge(2) -hit.setScore(25.5) -hit.setRank(1) - -# Add to identification -pep_id.setHits([hit]) -pep_id.setHigherScoreBetter(True) -pep_id.setScoreType("XCorr") - -# Access -hits = pep_id.getHits() -for hit in hits: - seq = hit.getSequence().toString() - score = hit.getScore() - print(f"Sequence: {seq}, Score: {score}") -``` - -### ProteinIdentification - -```python -# Create protein identification -prot_id = oms.ProteinIdentification() - -# Create protein hit -prot_hit = oms.ProteinHit() -prot_hit.setAccession("P12345") -prot_hit.setSequence("MKTAYIAKQRQISFVK...") -prot_hit.setScore(100.5) - -# Add to identification -prot_id.setHits([prot_hit]) -prot_id.setScoreType("Mascot Score") -prot_id.setHigherScoreBetter(True) - -# Search parameters -search_params = oms.ProteinIdentification.SearchParameters() -search_params.db = "uniprot_human.fasta" -search_params.enzyme = "Trypsin" -prot_id.setSearchParameters(search_params) -``` - -## ConsensusMap and ConsensusFeature - -For linking features across multiple samples. - ### ConsensusFeature -```python -# Create consensus feature -cons_feature = oms.ConsensusFeature() -cons_feature.setMZ(456.789) -cons_feature.setRT(123.45) -cons_feature.setIntensity(5000000) # Combined intensity +Feature linked across multiple samples. -# Access linked features -for handle in cons_feature.getFeatureList(): - map_index = handle.getMapIndex() - feature_index = handle.getIndex() +```python +# Load consensus map +consensus_map = ms.ConsensusMap() +ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map) + +# Access consensus feature +cons_feature = consensus_map[0] + +# Consensus properties +print(f"Consensus m/z: {cons_feature.getMZ():.4f}") +print(f"Consensus RT: {cons_feature.getRT():.2f}") +print(f"Consensus intensity: {cons_feature.getIntensity():.0f}") + +# Get feature handles (individual map features) +feature_list = cons_feature.getFeatureList() +print(f"Present in {len(feature_list)} maps") + +for handle in feature_list: + map_idx = handle.getMapIndex() intensity = handle.getIntensity() + mz = handle.getMZ() + rt = handle.getRT() + + print(f" Map {map_idx}: m/z={mz:.4f}, RT={rt:.2f}, intensity={intensity:.0f}") + +# Get unique ID in originating map +for handle in feature_list: + unique_id = handle.getUniqueId() + print(f"Unique ID: {unique_id}") ``` ### ConsensusMap +Collection of consensus features across samples. + ```python # Create consensus map -consensus_map = oms.ConsensusMap() +consensus_map = ms.ConsensusMap() -# Add consensus features -consensus_map.push_back(cons_feature) +# Load from file +ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map) -# Iterate -for cons_feat in consensus_map: - mz = cons_feat.getMZ() - rt = cons_feat.getRT() - n_features = cons_feat.size() # Number of linked features +# Access properties +print(f"Consensus features: {consensus_map.size()}") + +# Column headers (file descriptions) +headers = consensus_map.getColumnHeaders() +print(f"Number of files: {len(headers)}") + +for map_idx, description in headers.items(): + print(f"Map {map_idx}:") + print(f" Filename: {description.filename}") + print(f" Label: {description.label}") + print(f" Size: {description.size}") + +# Iterate through consensus features +for cons_feature in consensus_map: + print(f"Consensus feature: m/z={cons_feature.getMZ():.4f}") + +# Export to DataFrame +df = consensus_map.get_df() +``` + +## Identification Objects + +### PeptideIdentification + +Identification results for a single spectrum. + +```python +# Load identifications +protein_ids = [] +peptide_ids = [] +ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids) + +# Access peptide identification +peptide_id = peptide_ids[0] + +# Spectrum metadata +print(f"RT: {peptide_id.getRT():.2f}") +print(f"m/z: {peptide_id.getMZ():.4f}") + +# Identification metadata +print(f"Identifier: {peptide_id.getIdentifier()}") +print(f"Score type: {peptide_id.getScoreType()}") +print(f"Higher score better: {peptide_id.isHigherScoreBetter()}") + +# Get peptide hits +hits = peptide_id.getHits() +print(f"Number of hits: {len(hits)}") + +for hit in hits: + print(f" Sequence: {hit.getSequence().toString()}") + print(f" Score: {hit.getScore()}") + print(f" Charge: {hit.getCharge()}") +``` + +### PeptideHit + +Individual peptide match to a spectrum. + +```python +# Access hit +hit = peptide_id.getHits()[0] + +# Sequence information +sequence = hit.getSequence() +print(f"Sequence: {sequence.toString()}") +print(f"Mass: {sequence.getMonoWeight():.4f}") + +# Score and rank +print(f"Score: {hit.getScore()}") +print(f"Rank: {hit.getRank()}") + +# Charge state +print(f"Charge: {hit.getCharge()}") + +# Protein accessions +accessions = hit.extractProteinAccessionsSet() +for acc in accessions: + print(f"Protein: {acc.decode()}") + +# Meta values (additional scores, errors) +if hit.metaValueExists("MS:1002252"): # mass error + mass_error = hit.getMetaValue("MS:1002252") + print(f"Mass error: {mass_error:.4f} ppm") +``` + +### ProteinIdentification + +Protein-level identification information. + +```python +# Access protein identification +protein_id = protein_ids[0] + +# Search engine info +print(f"Search engine: {protein_id.getSearchEngine()}") +print(f"Search engine version: {protein_id.getSearchEngineVersion()}") + +# Search parameters +search_params = protein_id.getSearchParameters() +print(f"Database: {search_params.db}") +print(f"Enzyme: {search_params.digestion_enzyme.getName()}") +print(f"Missed cleavages: {search_params.missed_cleavages}") +print(f"Precursor tolerance: {search_params.precursor_mass_tolerance}") + +# Protein hits +hits = protein_id.getHits() +for hit in hits: + print(f"Accession: {hit.getAccession()}") + print(f"Score: {hit.getScore()}") + print(f"Coverage: {hit.getCoverage():.1f}%") +``` + +### ProteinHit + +Individual protein identification. + +```python +# Access protein hit +protein_hit = protein_id.getHits()[0] + +# Protein information +print(f"Accession: {protein_hit.getAccession()}") +print(f"Description: {protein_hit.getDescription()}") +print(f"Sequence: {protein_hit.getSequence()}") + +# Scoring +print(f"Score: {protein_hit.getScore()}") +print(f"Coverage: {protein_hit.getCoverage():.1f}%") + +# Rank +print(f"Rank: {protein_hit.getRank()}") +``` + +## Sequence Objects + +### AASequence + +Amino acid sequence with modifications. + +```python +# Create sequence from string +seq = ms.AASequence.fromString("PEPTIDE") + +# Basic properties +print(f"Sequence: {seq.toString()}") +print(f"Length: {seq.size()}") +print(f"Monoisotopic mass: {seq.getMonoWeight():.4f}") +print(f"Average mass: {seq.getAverageWeight():.4f}") + +# Individual residues +for i in range(seq.size()): + residue = seq.getResidue(i) + print(f"Position {i}: {residue.getOneLetterCode()}") + print(f" Mass: {residue.getMonoWeight():.4f}") + print(f" Formula: {residue.getFormula().toString()}") + +# Modified sequence +mod_seq = ms.AASequence.fromString("PEPTIDEM(Oxidation)K") +print(f"Modified: {mod_seq.isModified()}") + +# Check modifications +for i in range(mod_seq.size()): + residue = mod_seq.getResidue(i) + if residue.isModified(): + print(f"Modification at {i}: {residue.getModificationName()}") + +# N-terminal and C-terminal modifications +term_mod_seq = ms.AASequence.fromString("(Acetyl)PEPTIDE(Amidated)") +``` + +### EmpiricalFormula + +Molecular formula representation. + +```python +# Create formula +formula = ms.EmpiricalFormula("C6H12O6") # Glucose + +# Properties +print(f"Formula: {formula.toString()}") +print(f"Monoisotopic mass: {formula.getMonoWeight():.4f}") +print(f"Average mass: {formula.getAverageWeight():.4f}") + +# Element composition +print(f"Carbon atoms: {formula.getNumberOf(b'C')}") +print(f"Hydrogen atoms: {formula.getNumberOf(b'H')}") +print(f"Oxygen atoms: {formula.getNumberOf(b'O')}") + +# Arithmetic operations +formula2 = ms.EmpiricalFormula("H2O") +combined = formula + formula2 # Add water +print(f"Combined: {combined.toString()}") +``` + +## Parameter Objects + +### Param + +Generic parameter container used by algorithms. + +```python +# Get algorithm parameters +algo = ms.GaussFilter() +params = algo.getParameters() + +# List all parameters +for key in params.keys(): + value = params.getValue(key) + print(f"{key}: {value}") + +# Get specific parameter +gaussian_width = params.getValue("gaussian_width") +print(f"Gaussian width: {gaussian_width}") + +# Set parameter +params.setValue("gaussian_width", 0.2) + +# Apply modified parameters +algo.setParameters(params) + +# Copy parameters +params_copy = ms.Param(params) ``` ## Best Practices -1. **Use numpy arrays** for peak data when possible - much faster than individual peak access -2. **Sort spectra** by position (m/z) before searching or filtering -3. **Update ranges** after modifying MSExperiment: `exp.updateRanges()` -4. **Check MS level** before processing - different algorithms for MS1 vs MS2 -5. **Validate precursor info** for MS2 spectra - ensure charge and m/z are set -6. **Use appropriate containers** - MSExperiment for raw data, FeatureMap for quantification -7. **Clear metadata carefully** - use `clear(False)` to preserve metadata when clearing peaks - -## Common Patterns - -### Create MS2 Spectrum with Precursor +### Memory Management ```python -spectrum = oms.MSSpectrum() -spectrum.setRT(205.2) -spectrum.setMSLevel(2) -spectrum.set_peaks(([100, 200, 300], [1000, 5000, 3000])) +# For large files, use indexed access instead of full loading +indexed_mzml = ms.IndexedMzMLFileLoader() +indexed_mzml.load("large_file.mzML") -precursor = oms.Precursor() -precursor.setMZ(450.5) -precursor.setCharge(2) -spectrum.setPrecursors([precursor]) +# Access specific spectrum without loading entire file +spec = indexed_mzml.getSpectrumById(100) ``` -### Extract MS1 Spectra from Experiment +### Type Conversion ```python -ms1_exp = oms.MSExperiment() -for spectrum in exp: - if spectrum.getMSLevel() == 1: - ms1_exp.addSpectrum(spectrum) +# Convert peak arrays to numpy +import numpy as np + +mz, intensity = spec.get_peaks() +# These are already numpy arrays + +# Can perform numpy operations +filtered_mz = mz[intensity > 1000] ``` -### Calculate Total Ion Current (TIC) +### Object Copying ```python -tic_values = [] -rt_values = [] -for spectrum in exp: - if spectrum.getMSLevel() == 1: - mz, intensity = spectrum.get_peaks() - tic = np.sum(intensity) - tic_values.append(tic) - rt_values.append(spectrum.getRT()) -``` - -### Find Spectrum Closest to RT - -```python -target_rt = 125.5 -closest_spectrum = None -min_diff = float('inf') - -for spectrum in exp: - diff = abs(spectrum.getRT() - target_rt) - if diff < min_diff: - min_diff = diff - closest_spectrum = spectrum +# Create deep copy +exp_copy = ms.MSExperiment(exp) + +# Modifications to copy don't affect original ``` diff --git a/scientific-packages/pyopenms/references/feature_detection.md b/scientific-packages/pyopenms/references/feature_detection.md new file mode 100644 index 0000000..b3cb1ff --- /dev/null +++ b/scientific-packages/pyopenms/references/feature_detection.md @@ -0,0 +1,410 @@ +# Feature Detection and Linking + +## Overview + +Feature detection identifies persistent signals (chromatographic peaks) in LC-MS data. Feature linking combines features across multiple samples for quantitative comparison. + +## Feature Detection Basics + +A feature represents a chromatographic peak characterized by: +- m/z value (mass-to-charge ratio) +- Retention time (RT) +- Intensity +- Quality score +- Convex hull (spatial extent in RT-m/z space) + +## Feature Finding + +### Feature Finder Multiples (FFM) + +Standard algorithm for feature detection in centroided data: + +```python +import pyopenms as ms + +# Load centroided data +exp = ms.MSExperiment() +ms.MzMLFile().load("centroided.mzML", exp) + +# Create feature finder +ff = ms.FeatureFinder() + +# Get default parameters +params = ff.getParameters("centroided") + +# Modify key parameters +params.setValue("mass_trace:mz_tolerance", 10.0) # ppm +params.setValue("mass_trace:min_spectra", 7) # Min scans per feature +params.setValue("isotopic_pattern:charge_low", 1) +params.setValue("isotopic_pattern:charge_high", 4) + +# Run feature detection +features = ms.FeatureMap() +ff.run("centroided", exp, features, params, ms.FeatureMap()) + +print(f"Detected {features.size()} features") + +# Save features +ms.FeatureXMLFile().store("features.featureXML", features) +``` + +### Feature Finder for Metabolomics + +Optimized for small molecules: + +```python +# Create feature finder for metabolomics +ff = ms.FeatureFinder() + +# Get metabolomics-specific parameters +params = ff.getParameters("centroided") + +# Configure for metabolomics +params.setValue("mass_trace:mz_tolerance", 5.0) # Lower tolerance +params.setValue("mass_trace:min_spectra", 5) +params.setValue("isotopic_pattern:charge_low", 1) # Mostly singly charged +params.setValue("isotopic_pattern:charge_high", 2) + +# Run detection +features = ms.FeatureMap() +ff.run("centroided", exp, features, params, ms.FeatureMap()) +``` + +## Accessing Feature Data + +### Iterate Through Features + +```python +# Load features +feature_map = ms.FeatureMap() +ms.FeatureXMLFile().load("features.featureXML", feature_map) + +# Access individual features +for feature in feature_map: + print(f"m/z: {feature.getMZ():.4f}") + print(f"RT: {feature.getRT():.2f}") + print(f"Intensity: {feature.getIntensity():.0f}") + print(f"Charge: {feature.getCharge()}") + print(f"Quality: {feature.getOverallQuality():.3f}") + print(f"Width (RT): {feature.getWidth():.2f}") + + # Get convex hull + hull = feature.getConvexHull() + print(f"Hull points: {hull.getHullPoints().size()}") +``` + +### Feature Subordinates (Isotope Pattern) + +```python +# Access isotopic pattern +for feature in feature_map: + # Get subordinate features (isotopes) + subordinates = feature.getSubordinates() + + if subordinates: + print(f"Main feature m/z: {feature.getMZ():.4f}") + for sub in subordinates: + print(f" Isotope m/z: {sub.getMZ():.4f}") + print(f" Isotope intensity: {sub.getIntensity():.0f}") +``` + +### Export to Pandas + +```python +import pandas as pd + +# Convert to DataFrame +df = feature_map.get_df() + +print(df.columns) +# Typical columns: RT, mz, intensity, charge, quality + +# Analyze features +print(f"Mean intensity: {df['intensity'].mean()}") +print(f"RT range: {df['RT'].min():.1f} - {df['RT'].max():.1f}") +``` + +## Feature Linking + +### Map Alignment + +Align retention times before linking: + +```python +# Load multiple feature maps +fm1 = ms.FeatureMap() +fm2 = ms.FeatureMap() +ms.FeatureXMLFile().load("sample1.featureXML", fm1) +ms.FeatureXMLFile().load("sample2.featureXML", fm2) + +# Create aligner +aligner = ms.MapAlignmentAlgorithmPoseClustering() + +# Align maps +fm_aligned = [] +transformations = [] +aligner.align([fm1, fm2], fm_aligned, transformations) +``` + +### Feature Linking Algorithm + +Link features across samples: + +```python +# Create feature grouping algorithm +grouper = ms.FeatureGroupingAlgorithmQT() + +# Configure parameters +params = grouper.getParameters() +params.setValue("distance_RT:max_difference", 30.0) # Max RT difference (s) +params.setValue("distance_MZ:max_difference", 10.0) # Max m/z difference (ppm) +params.setValue("distance_MZ:unit", "ppm") +grouper.setParameters(params) + +# Prepare feature maps +feature_maps = [fm1, fm2, fm3] + +# Create consensus map +consensus_map = ms.ConsensusMap() + +# Link features +grouper.group(feature_maps, consensus_map) + +print(f"Created {consensus_map.size()} consensus features") + +# Save consensus map +ms.ConsensusXMLFile().store("consensus.consensusXML", consensus_map) +``` + +## Consensus Features + +### Access Consensus Data + +```python +# Load consensus map +consensus_map = ms.ConsensusMap() +ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map) + +# Iterate through consensus features +for cons_feature in consensus_map: + print(f"Consensus m/z: {cons_feature.getMZ():.4f}") + print(f"Consensus RT: {cons_feature.getRT():.2f}") + + # Get features from individual maps + for handle in cons_feature.getFeatureList(): + map_idx = handle.getMapIndex() + intensity = handle.getIntensity() + print(f" Sample {map_idx}: intensity {intensity:.0f}") +``` + +### Consensus Map Metadata + +```python +# Access file descriptions (map metadata) +file_descriptions = consensus_map.getColumnHeaders() + +for map_idx, description in file_descriptions.items(): + print(f"Map {map_idx}:") + print(f" Filename: {description.filename}") + print(f" Label: {description.label}") + print(f" Size: {description.size}") +``` + +## Adduct Detection + +Identify different ionization forms of the same molecule: + +```python +# Create adduct detector +adduct_detector = ms.MetaboliteAdductDecharger() + +# Configure parameters +params = adduct_detector.getParameters() +params.setValue("potential_adducts", "[M+H]+,[M+Na]+,[M+K]+,[M-H]-") +params.setValue("charge_min", 1) +params.setValue("charge_max", 1) +params.setValue("max_neutrals", 1) +adduct_detector.setParameters(params) + +# Detect adducts +feature_map_out = ms.FeatureMap() +adduct_detector.compute(feature_map, feature_map_out, ms.ConsensusMap()) +``` + +## Complete Feature Detection Workflow + +### End-to-End Example + +```python +import pyopenms as ms + +def feature_detection_workflow(input_files, output_consensus): + """ + Complete workflow: feature detection and linking across samples. + + Args: + input_files: List of mzML file paths + output_consensus: Output consensusXML file path + """ + + feature_maps = [] + + # Step 1: Detect features in each file + for mzml_file in input_files: + print(f"Processing {mzml_file}...") + + # Load experiment + exp = ms.MSExperiment() + ms.MzMLFile().load(mzml_file, exp) + + # Find features + ff = ms.FeatureFinder() + params = ff.getParameters("centroided") + params.setValue("mass_trace:mz_tolerance", 10.0) + params.setValue("mass_trace:min_spectra", 7) + + features = ms.FeatureMap() + ff.run("centroided", exp, features, params, ms.FeatureMap()) + + # Store filename in feature map + features.setPrimaryMSRunPath([mzml_file.encode()]) + + feature_maps.append(features) + print(f" Found {features.size()} features") + + # Step 2: Align retention times + print("Aligning retention times...") + aligner = ms.MapAlignmentAlgorithmPoseClustering() + aligned_maps = [] + transformations = [] + aligner.align(feature_maps, aligned_maps, transformations) + + # Step 3: Link features + print("Linking features across samples...") + grouper = ms.FeatureGroupingAlgorithmQT() + params = grouper.getParameters() + params.setValue("distance_RT:max_difference", 30.0) + params.setValue("distance_MZ:max_difference", 10.0) + params.setValue("distance_MZ:unit", "ppm") + grouper.setParameters(params) + + consensus_map = ms.ConsensusMap() + grouper.group(aligned_maps, consensus_map) + + # Save results + ms.ConsensusXMLFile().store(output_consensus, consensus_map) + + print(f"Created {consensus_map.size()} consensus features") + print(f"Results saved to {output_consensus}") + + return consensus_map + +# Run workflow +input_files = ["sample1.mzML", "sample2.mzML", "sample3.mzML"] +consensus = feature_detection_workflow(input_files, "consensus.consensusXML") +``` + +## Feature Filtering + +### Filter by Quality + +```python +# Filter features by quality score +filtered_features = ms.FeatureMap() + +for feature in feature_map: + if feature.getOverallQuality() > 0.5: # Quality threshold + filtered_features.push_back(feature) + +print(f"Kept {filtered_features.size()} high-quality features") +``` + +### Filter by Intensity + +```python +# Keep only intense features +min_intensity = 10000 + +filtered_features = ms.FeatureMap() +for feature in feature_map: + if feature.getIntensity() >= min_intensity: + filtered_features.push_back(feature) +``` + +### Filter by m/z Range + +```python +# Extract features in specific m/z range +mz_min = 200.0 +mz_max = 800.0 + +filtered_features = ms.FeatureMap() +for feature in feature_map: + mz = feature.getMZ() + if mz_min <= mz <= mz_max: + filtered_features.push_back(feature) +``` + +## Feature Annotation + +### Add Identification Information + +```python +# Annotate features with peptide identifications +# Load identifications +protein_ids = [] +peptide_ids = [] +ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids) + +# Create ID mapper +mapper = ms.IDMapper() + +# Map IDs to features +mapper.annotate(feature_map, peptide_ids, protein_ids) + +# Check annotations +for feature in feature_map: + peptide_ids_for_feature = feature.getPeptideIdentifications() + if peptide_ids_for_feature: + print(f"Feature at {feature.getMZ():.4f} m/z identified") +``` + +## Best Practices + +### Parameter Optimization + +Optimize parameters for your data type: + +```python +# Test different tolerance values +mz_tolerances = [5.0, 10.0, 20.0] # ppm + +for tol in mz_tolerances: + ff = ms.FeatureFinder() + params = ff.getParameters("centroided") + params.setValue("mass_trace:mz_tolerance", tol) + + features = ms.FeatureMap() + ff.run("centroided", exp, features, params, ms.FeatureMap()) + + print(f"Tolerance {tol} ppm: {features.size()} features") +``` + +### Visual Inspection + +Export features for visualization: + +```python +# Convert to DataFrame for plotting +df = feature_map.get_df() + +import matplotlib.pyplot as plt + +plt.figure(figsize=(10, 6)) +plt.scatter(df['RT'], df['mz'], s=df['intensity']/1000, alpha=0.5) +plt.xlabel('Retention Time (s)') +plt.ylabel('m/z') +plt.title('Feature Map') +plt.colorbar(label='Intensity (scaled)') +plt.show() +``` diff --git a/scientific-packages/pyopenms/references/file_io.md b/scientific-packages/pyopenms/references/file_io.md new file mode 100644 index 0000000..07d9e32 --- /dev/null +++ b/scientific-packages/pyopenms/references/file_io.md @@ -0,0 +1,349 @@ +# File I/O and Data Formats + +## Overview + +PyOpenMS supports multiple mass spectrometry file formats for reading and writing. This guide covers file handling strategies and format-specific operations. + +## Supported Formats + +### Spectrum Data Formats + +- **mzML**: Standard XML-based format for mass spectrometry data +- **mzXML**: Earlier XML-based format +- **mzData**: XML format (deprecated but supported) + +### Identification Formats + +- **idXML**: OpenMS native identification format +- **mzIdentML**: Standard XML format for identification data +- **pepXML**: X! Tandem format +- **protXML**: Protein identification format + +### Feature and Quantitation Formats + +- **featureXML**: OpenMS format for detected features +- **consensusXML**: Format for consensus features across samples +- **mzTab**: Tab-delimited format for reporting + +### Sequence and Library Formats + +- **FASTA**: Protein/peptide sequences +- **TraML**: Transition lists for targeted experiments + +## Reading mzML Files + +### In-Memory Loading + +Load entire file into memory (suitable for smaller files): + +```python +import pyopenms as ms + +# Create experiment container +exp = ms.MSExperiment() + +# Load file +ms.MzMLFile().load("sample.mzML", exp) + +# Access data +print(f"Spectra: {exp.getNrSpectra()}") +print(f"Chromatograms: {exp.getNrChromatograms()}") +``` + +### Indexed Access + +Efficient random access for large files: + +```python +# Create indexed access +indexed_mzml = ms.IndexedMzMLFileLoader() +indexed_mzml.load("large_file.mzML") + +# Get specific spectrum by index +spec = indexed_mzml.getSpectrumById(100) + +# Access by native ID +spec = indexed_mzml.getSpectrumByNativeId("scan=5000") +``` + +### Streaming Access + +Memory-efficient processing for very large files: + +```python +# Define consumer function +class SpectrumProcessor(ms.MSExperimentConsumer): + def __init__(self): + super().__init__() + self.count = 0 + + def consumeSpectrum(self, spec): + # Process spectrum + if spec.getMSLevel() == 2: + self.count += 1 + +# Stream file +consumer = SpectrumProcessor() +ms.MzMLFile().transform("large.mzML", consumer) +print(f"Processed {consumer.count} MS2 spectra") +``` + +### Cached Access + +Balance between memory usage and speed: + +```python +# Use on-disk caching +options = ms.CachedmzML() +options.setMetaDataOnly(False) + +exp = ms.MSExperiment() +ms.CachedmzMLHandler().load("sample.mzML", exp, options) +``` + +## Writing mzML Files + +### Basic Writing + +```python +# Create or modify experiment +exp = ms.MSExperiment() +# ... add spectra ... + +# Write to file +ms.MzMLFile().store("output.mzML", exp) +``` + +### Compression Options + +```python +# Configure compression +file_handler = ms.MzMLFile() + +options = ms.PeakFileOptions() +options.setCompression(True) # Enable compression +file_handler.setOptions(options) + +file_handler.store("compressed.mzML", exp) +``` + +## Reading Identification Data + +### idXML Format + +```python +# Load identification results +protein_ids = [] +peptide_ids = [] + +ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids) + +# Access peptide identifications +for peptide_id in peptide_ids: + print(f"RT: {peptide_id.getRT()}") + print(f"MZ: {peptide_id.getMZ()}") + + # Get peptide hits + for hit in peptide_id.getHits(): + print(f" Sequence: {hit.getSequence().toString()}") + print(f" Score: {hit.getScore()}") + print(f" Charge: {hit.getCharge()}") +``` + +### mzIdentML Format + +```python +# Read mzIdentML +protein_ids = [] +peptide_ids = [] + +ms.MzIdentMLFile().load("results.mzid", protein_ids, peptide_ids) +``` + +### pepXML Format + +```python +# Load pepXML +protein_ids = [] +peptide_ids = [] + +ms.PepXMLFile().load("results.pep.xml", protein_ids, peptide_ids) +``` + +## Reading Feature Data + +### featureXML + +```python +# Load features +feature_map = ms.FeatureMap() +ms.FeatureXMLFile().load("features.featureXML", feature_map) + +# Access features +for feature in feature_map: + print(f"RT: {feature.getRT()}") + print(f"MZ: {feature.getMZ()}") + print(f"Intensity: {feature.getIntensity()}") + print(f"Quality: {feature.getOverallQuality()}") +``` + +### consensusXML + +```python +# Load consensus features +consensus_map = ms.ConsensusMap() +ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map) + +# Access consensus features +for consensus_feature in consensus_map: + print(f"RT: {consensus_feature.getRT()}") + print(f"MZ: {consensus_feature.getMZ()}") + + # Get feature handles (sub-features from different maps) + for handle in consensus_feature.getFeatureList(): + map_index = handle.getMapIndex() + intensity = handle.getIntensity() + print(f" Map {map_index}: {intensity}") +``` + +## Reading FASTA Files + +```python +# Load protein sequences +fasta_entries = [] +ms.FASTAFile().load("database.fasta", fasta_entries) + +for entry in fasta_entries: + print(f"Identifier: {entry.identifier}") + print(f"Description: {entry.description}") + print(f"Sequence: {entry.sequence}") +``` + +## Reading TraML Files + +```python +# Load transition lists for targeted experiments +targeted_exp = ms.TargetedExperiment() +ms.TraMLFile().load("transitions.TraML", targeted_exp) + +# Access transitions +for transition in targeted_exp.getTransitions(): + print(f"Precursor MZ: {transition.getPrecursorMZ()}") + print(f"Product MZ: {transition.getProductMZ()}") +``` + +## Writing mzTab Files + +```python +# Create mzTab for reporting +mztab = ms.MzTab() + +# Add metadata +metadata = mztab.getMetaData() +metadata.mz_tab_version.set("1.0.0") +metadata.title.set("Proteomics Analysis Results") + +# Add protein data +protein_section = mztab.getProteinSectionRows() +# ... populate protein data ... + +# Write to file +ms.MzTabFile().store("report.mzTab", mztab) +``` + +## Format Conversion + +### mzXML to mzML + +```python +# Read mzXML +exp = ms.MSExperiment() +ms.MzXMLFile().load("data.mzXML", exp) + +# Write as mzML +ms.MzMLFile().store("data.mzML", exp) +``` + +### Extract Chromatograms from mzML + +```python +# Load experiment +exp = ms.MSExperiment() +ms.MzMLFile().load("data.mzML", exp) + +# Extract specific chromatogram +for chrom in exp.getChromatograms(): + if chrom.getNativeID() == "TIC": + rt, intensity = chrom.get_peaks() + print(f"TIC has {len(rt)} data points") +``` + +## File Metadata + +### Access mzML Metadata + +```python +# Load file +exp = ms.MSExperiment() +ms.MzMLFile().load("sample.mzML", exp) + +# Get experimental settings +exp_settings = exp.getExperimentalSettings() + +# Instrument info +instrument = exp_settings.getInstrument() +print(f"Instrument: {instrument.getName()}") +print(f"Model: {instrument.getModel()}") + +# Sample info +sample = exp_settings.getSample() +print(f"Sample name: {sample.getName()}") + +# Source files +for source_file in exp_settings.getSourceFiles(): + print(f"Source: {source_file.getNameOfFile()}") +``` + +## Best Practices + +### Memory Management + +For large files: +1. Use indexed or streaming access instead of full in-memory loading +2. Process data in chunks +3. Clear data structures when no longer needed + +```python +# Good for large files +indexed_mzml = ms.IndexedMzMLFileLoader() +indexed_mzml.load("huge_file.mzML") + +# Process spectra one at a time +for i in range(indexed_mzml.getNrSpectra()): + spec = indexed_mzml.getSpectrumById(i) + # Process spectrum + # Spectrum automatically cleaned up after processing +``` + +### Error Handling + +```python +try: + exp = ms.MSExperiment() + ms.MzMLFile().load("data.mzML", exp) +except Exception as e: + print(f"Failed to load file: {e}") +``` + +### File Validation + +```python +# Check if file exists and is readable +import os + +if os.path.exists("data.mzML") and os.path.isfile("data.mzML"): + exp = ms.MSExperiment() + ms.MzMLFile().load("data.mzML", exp) +else: + print("File not found") +``` diff --git a/scientific-packages/pyopenms/references/identification.md b/scientific-packages/pyopenms/references/identification.md new file mode 100644 index 0000000..3da9796 --- /dev/null +++ b/scientific-packages/pyopenms/references/identification.md @@ -0,0 +1,422 @@ +# Peptide and Protein Identification + +## Overview + +PyOpenMS supports peptide/protein identification through integration with search engines and provides tools for post-processing identification results including FDR control, protein inference, and annotation. + +## Supported Search Engines + +PyOpenMS integrates with these search engines: + +- **Comet**: Fast tandem MS search +- **Mascot**: Commercial search engine +- **MSGFPlus**: Spectral probability-based search +- **XTandem**: Open-source search tool +- **OMSSA**: NCBI search engine +- **Myrimatch**: High-throughput search +- **MSFragger**: Ultra-fast search + +## Reading Identification Data + +### idXML Format + +```python +import pyopenms as ms + +# Load identification results +protein_ids = [] +peptide_ids = [] + +ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids) + +print(f"Protein identifications: {len(protein_ids)}") +print(f"Peptide identifications: {len(peptide_ids)}") +``` + +### Access Peptide Identifications + +```python +# Iterate through peptide IDs +for peptide_id in peptide_ids: + # Spectrum metadata + print(f"RT: {peptide_id.getRT():.2f}") + print(f"m/z: {peptide_id.getMZ():.4f}") + + # Get peptide hits (ranked by score) + hits = peptide_id.getHits() + print(f"Number of hits: {len(hits)}") + + for hit in hits: + sequence = hit.getSequence() + print(f" Sequence: {sequence.toString()}") + print(f" Score: {hit.getScore()}") + print(f" Charge: {hit.getCharge()}") + print(f" Mass error (ppm): {hit.getMetaValue('mass_error_ppm')}") + + # Get modifications + if sequence.isModified(): + for i in range(sequence.size()): + residue = sequence.getResidue(i) + if residue.isModified(): + print(f" Modification at position {i}: {residue.getModificationName()}") +``` + +### Access Protein Identifications + +```python +# Access protein-level information +for protein_id in protein_ids: + # Search parameters + search_params = protein_id.getSearchParameters() + print(f"Search engine: {protein_id.getSearchEngine()}") + print(f"Database: {search_params.db}") + + # Protein hits + hits = protein_id.getHits() + for hit in hits: + print(f" Accession: {hit.getAccession()}") + print(f" Score: {hit.getScore()}") + print(f" Coverage: {hit.getCoverage()}") + print(f" Sequence: {hit.getSequence()}") +``` + +## False Discovery Rate (FDR) + +### FDR Filtering + +Apply FDR filtering to control false positives: + +```python +# Create FDR object +fdr = ms.FalseDiscoveryRate() + +# Apply FDR at PSM level +fdr.apply(peptide_ids) + +# Filter by FDR threshold +fdr_threshold = 0.01 # 1% FDR +filtered_peptide_ids = [] + +for peptide_id in peptide_ids: + # Keep hits below FDR threshold + filtered_hits = [] + for hit in peptide_id.getHits(): + if hit.getScore() <= fdr_threshold: # Lower score = better + filtered_hits.append(hit) + + if filtered_hits: + peptide_id.setHits(filtered_hits) + filtered_peptide_ids.append(peptide_id) + +print(f"Peptides passing FDR: {len(filtered_peptide_ids)}") +``` + +### Score Transformation + +Convert scores to q-values: + +```python +# Apply score transformation +fdr.apply(peptide_ids) + +# Access q-values +for peptide_id in peptide_ids: + for hit in peptide_id.getHits(): + q_value = hit.getMetaValue("q-value") + print(f"Sequence: {hit.getSequence().toString()}, q-value: {q_value}") +``` + +## Protein Inference + +### ID Mapper + +Map peptide identifications to proteins: + +```python +# Create mapper +mapper = ms.IDMapper() + +# Map to features +feature_map = ms.FeatureMap() +ms.FeatureXMLFile().load("features.featureXML", feature_map) + +# Annotate features with IDs +mapper.annotate(feature_map, peptide_ids, protein_ids) + +# Check annotated features +for feature in feature_map: + pep_ids = feature.getPeptideIdentifications() + if pep_ids: + for pep_id in pep_ids: + for hit in pep_id.getHits(): + print(f"Feature {feature.getMZ():.4f}: {hit.getSequence().toString()}") +``` + +### Protein Grouping + +Group proteins by shared peptides: + +```python +# Create protein inference algorithm +inference = ms.BasicProteinInferenceAlgorithm() + +# Run inference +inference.run(peptide_ids, protein_ids) + +# Access protein groups +for protein_id in protein_ids: + hits = protein_id.getHits() + if len(hits) > 1: + print("Protein group:") + for hit in hits: + print(f" {hit.getAccession()}") +``` + +## Peptide Sequence Handling + +### AASequence Object + +Work with peptide sequences: + +```python +# Create peptide sequence +seq = ms.AASequence.fromString("PEPTIDE") + +print(f"Sequence: {seq.toString()}") +print(f"Monoisotopic mass: {seq.getMonoWeight():.4f}") +print(f"Average mass: {seq.getAverageWeight():.4f}") +print(f"Length: {seq.size()}") + +# Access individual amino acids +for i in range(seq.size()): + residue = seq.getResidue(i) + print(f"Position {i}: {residue.getOneLetterCode()}, mass: {residue.getMonoWeight():.4f}") +``` + +### Modified Sequences + +Handle post-translational modifications: + +```python +# Sequence with modifications +mod_seq = ms.AASequence.fromString("PEPTIDEM(Oxidation)K") + +print(f"Modified sequence: {mod_seq.toString()}") +print(f"Mass with mods: {mod_seq.getMonoWeight():.4f}") + +# Check if modified +print(f"Is modified: {mod_seq.isModified()}") + +# Get modification info +for i in range(mod_seq.size()): + residue = mod_seq.getResidue(i) + if residue.isModified(): + print(f"Residue {residue.getOneLetterCode()} at position {i}") + print(f" Modification: {residue.getModificationName()}") +``` + +### Peptide Digestion + +Simulate enzymatic digestion: + +```python +# Create digestion enzyme +enzyme = ms.ProteaseDigestion() +enzyme.setEnzyme("Trypsin") + +# Set missed cleavages +enzyme.setMissedCleavages(2) + +# Digest protein sequence +protein_seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL" + +# Get peptides +peptides = [] +enzyme.digest(ms.AASequence.fromString(protein_seq), peptides) + +print(f"Generated {len(peptides)} peptides") +for peptide in peptides[:5]: # Show first 5 + print(f" {peptide.toString()}, mass: {peptide.getMonoWeight():.2f}") +``` + +## Theoretical Spectrum Generation + +### Fragment Ion Calculation + +Generate theoretical fragment ions: + +```python +# Create peptide +peptide = ms.AASequence.fromString("PEPTIDE") + +# Generate b and y ions +fragments = [] +ms.TheoreticalSpectrumGenerator().getSpectrum(fragments, peptide, 1, 1) + +print(f"Generated {fragments.size()} fragment ions") + +# Access fragments +mz, intensity = fragments.get_peaks() +for m, i in zip(mz[:10], intensity[:10]): # Show first 10 + print(f"m/z: {m:.4f}, intensity: {i}") +``` + +## Complete Identification Workflow + +### End-to-End Example + +```python +import pyopenms as ms + +def identification_workflow(spectrum_file, fasta_file, output_file): + """ + Complete identification workflow with FDR control. + + Args: + spectrum_file: Input mzML file + fasta_file: Protein database (FASTA) + output_file: Output idXML file + """ + + # Step 1: Load spectra + exp = ms.MSExperiment() + ms.MzMLFile().load(spectrum_file, exp) + print(f"Loaded {exp.getNrSpectra()} spectra") + + # Step 2: Configure search parameters + search_params = ms.SearchParameters() + search_params.db = fasta_file + search_params.precursor_mass_tolerance = 10.0 # ppm + search_params.fragment_mass_tolerance = 0.5 # Da + search_params.enzyme = "Trypsin" + search_params.missed_cleavages = 2 + search_params.modifications = ["Oxidation (M)", "Carbamidomethyl (C)"] + + # Step 3: Run search (example with Comet adapter) + # Note: Requires search engine to be installed + # comet = ms.CometAdapter() + # protein_ids, peptide_ids = comet.search(exp, search_params) + + # For this example, load pre-computed results + protein_ids = [] + peptide_ids = [] + ms.IdXMLFile().load("raw_identifications.idXML", protein_ids, peptide_ids) + + print(f"Initial peptide IDs: {len(peptide_ids)}") + + # Step 4: Apply FDR filtering + fdr = ms.FalseDiscoveryRate() + fdr.apply(peptide_ids) + + # Filter by 1% FDR + filtered_peptide_ids = [] + for peptide_id in peptide_ids: + filtered_hits = [] + for hit in peptide_id.getHits(): + q_value = hit.getMetaValue("q-value") + if q_value <= 0.01: + filtered_hits.append(hit) + + if filtered_hits: + peptide_id.setHits(filtered_hits) + filtered_peptide_ids.append(peptide_id) + + print(f"Peptides after FDR (1%): {len(filtered_peptide_ids)}") + + # Step 5: Protein inference + inference = ms.BasicProteinInferenceAlgorithm() + inference.run(filtered_peptide_ids, protein_ids) + + print(f"Identified proteins: {len(protein_ids)}") + + # Step 6: Save results + ms.IdXMLFile().store(output_file, protein_ids, filtered_peptide_ids) + print(f"Results saved to {output_file}") + + return protein_ids, filtered_peptide_ids + +# Run workflow +protein_ids, peptide_ids = identification_workflow( + "spectra.mzML", + "database.fasta", + "identifications_fdr.idXML" +) +``` + +## Spectral Library Search + +### Library Matching + +```python +# Load spectral library +library = ms.MSPFile() +library_spectra = [] +library.load("spectral_library.msp", library_spectra) + +# Load experimental spectra +exp = ms.MSExperiment() +ms.MzMLFile().load("data.mzML", exp) + +# Compare spectra +spectra_compare = ms.SpectraSTSimilarityScore() + +for exp_spec in exp: + if exp_spec.getMSLevel() == 2: + best_match_score = 0 + best_match_lib = None + + for lib_spec in library_spectra: + score = spectra_compare.operator()(exp_spec, lib_spec) + if score > best_match_score: + best_match_score = score + best_match_lib = lib_spec + + if best_match_score > 0.7: # Threshold + print(f"Match found: score {best_match_score:.3f}") +``` + +## Best Practices + +### Decoy Database + +Use target-decoy approach for FDR calculation: + +```python +# Generate decoy database +decoy_generator = ms.DecoyGenerator() + +# Load target database +fasta_entries = [] +ms.FASTAFile().load("target.fasta", fasta_entries) + +# Generate decoys +decoy_entries = [] +for entry in fasta_entries: + decoy_entry = decoy_generator.reverseProtein(entry) + decoy_entries.append(decoy_entry) + +# Save combined database +all_entries = fasta_entries + decoy_entries +ms.FASTAFile().store("target_decoy.fasta", all_entries) +``` + +### Score Interpretation + +Understand score types from different engines: + +```python +# Interpret scores based on search engine +for peptide_id in peptide_ids: + search_engine = peptide_id.getIdentifier() + + for hit in peptide_id.getHits(): + score = hit.getScore() + + # Score interpretation varies by engine + if "Comet" in search_engine: + # Comet: higher E-value = worse + print(f"E-value: {score}") + elif "Mascot" in search_engine: + # Mascot: higher score = better + print(f"Ion score: {score}") +``` diff --git a/scientific-packages/pyopenms/references/metabolomics.md b/scientific-packages/pyopenms/references/metabolomics.md new file mode 100644 index 0000000..617bf64 --- /dev/null +++ b/scientific-packages/pyopenms/references/metabolomics.md @@ -0,0 +1,482 @@ +# Metabolomics Workflows + +## Overview + +PyOpenMS provides specialized tools for untargeted metabolomics analysis including feature detection optimized for small molecules, adduct grouping, compound identification, and integration with metabolomics databases. + +## Untargeted Metabolomics Pipeline + +### Complete Workflow + +```python +import pyopenms as ms + +def metabolomics_pipeline(input_files, output_dir): + """ + Complete untargeted metabolomics workflow. + + Args: + input_files: List of mzML file paths (one per sample) + output_dir: Directory for output files + """ + + # Step 1: Peak picking and feature detection + feature_maps = [] + + for mzml_file in input_files: + print(f"Processing {mzml_file}...") + + # Load data + exp = ms.MSExperiment() + ms.MzMLFile().load(mzml_file, exp) + + # Peak picking if needed + if not exp.getSpectrum(0).isSorted(): + picker = ms.PeakPickerHiRes() + exp_picked = ms.MSExperiment() + picker.pickExperiment(exp, exp_picked) + exp = exp_picked + + # Feature detection + ff = ms.FeatureFinder() + params = ff.getParameters("centroided") + + # Metabolomics-specific parameters + params.setValue("mass_trace:mz_tolerance", 5.0) # ppm, tighter for metabolites + params.setValue("mass_trace:min_spectra", 5) + params.setValue("isotopic_pattern:charge_low", 1) + params.setValue("isotopic_pattern:charge_high", 2) # Mostly singly charged + + features = ms.FeatureMap() + ff.run("centroided", exp, features, params, ms.FeatureMap()) + + features.setPrimaryMSRunPath([mzml_file.encode()]) + feature_maps.append(features) + + print(f" Detected {features.size()} features") + + # Step 2: Adduct detection and grouping + print("Detecting adducts...") + adduct_grouped_maps = [] + + adduct_detector = ms.MetaboliteAdductDecharger() + params = adduct_detector.getParameters() + params.setValue("potential_adducts", "[M+H]+,[M+Na]+,[M+K]+,[M+NH4]+,[M-H]-,[M+Cl]-") + params.setValue("charge_min", 1) + params.setValue("charge_max", 1) + adduct_detector.setParameters(params) + + for fm in feature_maps: + fm_out = ms.FeatureMap() + adduct_detector.compute(fm, fm_out, ms.ConsensusMap()) + adduct_grouped_maps.append(fm_out) + + # Step 3: RT alignment + print("Aligning retention times...") + aligner = ms.MapAlignmentAlgorithmPoseClustering() + + params = aligner.getParameters() + params.setValue("max_num_peaks_considered", 1000) + params.setValue("pairfinder:distance_MZ:max_difference", 10.0) + params.setValue("pairfinder:distance_MZ:unit", "ppm") + aligner.setParameters(params) + + aligned_maps = [] + transformations = [] + aligner.align(adduct_grouped_maps, aligned_maps, transformations) + + # Step 4: Feature linking + print("Linking features...") + grouper = ms.FeatureGroupingAlgorithmQT() + + params = grouper.getParameters() + params.setValue("distance_RT:max_difference", 60.0) # seconds + params.setValue("distance_MZ:max_difference", 5.0) # ppm + params.setValue("distance_MZ:unit", "ppm") + grouper.setParameters(params) + + consensus_map = ms.ConsensusMap() + grouper.group(aligned_maps, consensus_map) + + print(f"Created {consensus_map.size()} consensus features") + + # Step 5: Gap filling (fill missing values) + print("Filling gaps...") + # Gap filling not directly available in Python API + # Would use TOPP tool FeatureFinderMetaboIdent + + # Step 6: Export results + consensus_file = f"{output_dir}/consensus.consensusXML" + ms.ConsensusXMLFile().store(consensus_file, consensus_map) + + # Export to CSV for downstream analysis + df = consensus_map.get_df() + csv_file = f"{output_dir}/metabolite_table.csv" + df.to_csv(csv_file, index=False) + + print(f"Results saved to {output_dir}") + + return consensus_map + +# Run pipeline +input_files = ["sample1.mzML", "sample2.mzML", "sample3.mzML"] +consensus = metabolomics_pipeline(input_files, "output") +``` + +## Adduct Detection + +### Configure Adduct Types + +```python +# Create adduct detector +adduct_detector = ms.MetaboliteAdductDecharger() + +# Configure common adducts +params = adduct_detector.getParameters() + +# Positive mode adducts +positive_adducts = [ + "[M+H]+", + "[M+Na]+", + "[M+K]+", + "[M+NH4]+", + "[2M+H]+", + "[M+H-H2O]+" +] + +# Negative mode adducts +negative_adducts = [ + "[M-H]-", + "[M+Cl]-", + "[M+FA-H]-", # Formate + "[2M-H]-" +] + +# Set for positive mode +params.setValue("potential_adducts", ",".join(positive_adducts)) +params.setValue("charge_min", 1) +params.setValue("charge_max", 1) +params.setValue("max_neutrals", 1) +adduct_detector.setParameters(params) + +# Apply adduct detection +feature_map_out = ms.FeatureMap() +adduct_detector.compute(feature_map, feature_map_out, ms.ConsensusMap()) +``` + +### Access Adduct Information + +```python +# Check adduct annotations +for feature in feature_map_out: + # Get adduct type if annotated + if feature.metaValueExists("adduct"): + adduct = feature.getMetaValue("adduct") + neutral_mass = feature.getMetaValue("neutral_mass") + print(f"m/z: {feature.getMZ():.4f}") + print(f" Adduct: {adduct}") + print(f" Neutral mass: {neutral_mass:.4f}") +``` + +## Compound Identification + +### Mass-Based Annotation + +```python +# Annotate features with compound database +from pyopenms import MassDecomposition + +# Load compound database (example structure) +# In practice, use external database like HMDB, METLIN + +compound_db = [ + {"name": "Glucose", "formula": "C6H12O6", "mass": 180.0634}, + {"name": "Citric acid", "formula": "C6H8O7", "mass": 192.0270}, + # ... more compounds +] + +# Annotate features +mass_tolerance = 5.0 # ppm + +for feature in feature_map: + observed_mz = feature.getMZ() + + # Calculate neutral mass (assuming [M+H]+) + neutral_mass = observed_mz - 1.007276 # Proton mass + + # Search database + for compound in compound_db: + mass_error_ppm = abs(neutral_mass - compound["mass"]) / compound["mass"] * 1e6 + + if mass_error_ppm <= mass_tolerance: + print(f"Potential match: {compound['name']}") + print(f" Observed m/z: {observed_mz:.4f}") + print(f" Expected mass: {compound['mass']:.4f}") + print(f" Error: {mass_error_ppm:.2f} ppm") +``` + +### MS/MS-Based Identification + +```python +# Load MS2 data +exp = ms.MSExperiment() +ms.MzMLFile().load("data_with_ms2.mzML", exp) + +# Extract MS2 spectra +ms2_spectra = [] +for spec in exp: + if spec.getMSLevel() == 2: + ms2_spectra.append(spec) + +print(f"Found {len(ms2_spectra)} MS2 spectra") + +# Match to spectral library +# (Requires external tool or custom implementation) +``` + +## Data Normalization + +### Total Ion Current (TIC) Normalization + +```python +import numpy as np + +# Load consensus map +consensus_map = ms.ConsensusMap() +ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map) + +# Calculate TIC per sample +n_samples = len(consensus_map.getColumnHeaders()) +tic_per_sample = np.zeros(n_samples) + +for cons_feature in consensus_map: + for handle in cons_feature.getFeatureList(): + map_idx = handle.getMapIndex() + tic_per_sample[map_idx] += handle.getIntensity() + +print("TIC per sample:", tic_per_sample) + +# Normalize to median TIC +median_tic = np.median(tic_per_sample) +normalization_factors = median_tic / tic_per_sample + +print("Normalization factors:", normalization_factors) + +# Apply normalization +consensus_map_normalized = ms.ConsensusMap(consensus_map) +for cons_feature in consensus_map_normalized: + feature_list = cons_feature.getFeatureList() + for handle in feature_list: + map_idx = handle.getMapIndex() + normalized_intensity = handle.getIntensity() * normalization_factors[map_idx] + handle.setIntensity(normalized_intensity) +``` + +## Quality Control + +### Coefficient of Variation (CV) Filtering + +```python +import pandas as pd +import numpy as np + +# Export to pandas +df = consensus_map.get_df() + +# Assume QC samples are columns with 'QC' in name +qc_cols = [col for col in df.columns if 'QC' in col] + +if qc_cols: + # Calculate CV for each feature in QC samples + qc_data = df[qc_cols] + cv = (qc_data.std(axis=1) / qc_data.mean(axis=1)) * 100 + + # Filter features with CV < 30% in QC samples + good_features = df[cv < 30] + + print(f"Features before CV filter: {len(df)}") + print(f"Features after CV filter: {len(good_features)}") +``` + +### Blank Filtering + +```python +# Remove features present in blank samples +blank_cols = [col for col in df.columns if 'Blank' in col] +sample_cols = [col for col in df.columns if 'Sample' in col] + +if blank_cols and sample_cols: + # Calculate mean intensity in blanks and samples + blank_mean = df[blank_cols].mean(axis=1) + sample_mean = df[sample_cols].mean(axis=1) + + # Keep features with 3x higher intensity in samples than blanks + ratio = sample_mean / (blank_mean + 1) # Add 1 to avoid division by zero + filtered_df = df[ratio > 3] + + print(f"Features before blank filtering: {len(df)}") + print(f"Features after blank filtering: {len(filtered_df)}") +``` + +## Missing Value Imputation + +```python +import pandas as pd +import numpy as np + +# Load data +df = consensus_map.get_df() + +# Replace zeros with NaN +df = df.replace(0, np.nan) + +# Count missing values +missing_per_feature = df.isnull().sum(axis=1) +print(f"Features with >50% missing: {sum(missing_per_feature > len(df.columns)/2)}") + +# Simple imputation: replace with minimum value +for col in df.columns: + if df[col].dtype in [np.float64, np.int64]: + min_val = df[col].min() / 2 # Half minimum + df[col].fillna(min_val, inplace=True) +``` + +## Metabolite Table Export + +### Create Analysis-Ready Table + +```python +import pandas as pd + +def create_metabolite_table(consensus_map, output_file): + """ + Create metabolite quantification table for statistical analysis. + """ + + # Get column headers (file descriptions) + headers = consensus_map.getColumnHeaders() + + # Initialize data structure + data = { + 'mz': [], + 'rt': [], + 'feature_id': [] + } + + # Add sample columns + for map_idx, header in headers.items(): + sample_name = header.label or f"Sample_{map_idx}" + data[sample_name] = [] + + # Extract feature data + for idx, cons_feature in enumerate(consensus_map): + data['mz'].append(cons_feature.getMZ()) + data['rt'].append(cons_feature.getRT()) + data['feature_id'].append(f"F{idx:06d}") + + # Initialize intensities + intensities = {map_idx: 0.0 for map_idx in headers.keys()} + + # Fill in measured intensities + for handle in cons_feature.getFeatureList(): + map_idx = handle.getMapIndex() + intensities[map_idx] = handle.getIntensity() + + # Add to data structure + for map_idx, header in headers.items(): + sample_name = header.label or f"Sample_{map_idx}" + data[sample_name].append(intensities[map_idx]) + + # Create DataFrame + df = pd.DataFrame(data) + + # Sort by RT + df = df.sort_values('rt') + + # Save to CSV + df.to_csv(output_file, index=False) + + print(f"Metabolite table with {len(df)} features saved to {output_file}") + + return df + +# Create table +df = create_metabolite_table(consensus_map, "metabolite_table.csv") +``` + +## Integration with External Tools + +### Export for MetaboAnalyst + +```python +def export_for_metaboanalyst(df, output_file): + """ + Format data for MetaboAnalyst input. + + Requires sample names as columns, features as rows. + """ + + # Transpose DataFrame + # Remove metadata columns + sample_cols = [col for col in df.columns if col not in ['mz', 'rt', 'feature_id']] + + # Extract sample data + sample_data = df[sample_cols] + + # Transpose (samples as rows, features as columns) + df_transposed = sample_data.T + + # Add feature identifiers as column names + df_transposed.columns = df['feature_id'] + + # Save + df_transposed.to_csv(output_file) + + print(f"MetaboAnalyst format saved to {output_file}") + +# Export +export_for_metaboanalyst(df, "for_metaboanalyst.csv") +``` + +## Best Practices + +### Sample Size and Replicates + +- Include QC samples (pooled sample) every 5-10 injections +- Run blank samples to identify contamination +- Use at least 3 biological replicates per group +- Randomize sample injection order + +### Parameter Optimization + +Test parameters on pooled QC sample: + +```python +# Test different mass trace parameters +mz_tolerances = [3.0, 5.0, 10.0] +min_spectra_values = [3, 5, 7] + +for tol in mz_tolerances: + for min_spec in min_spectra_values: + ff = ms.FeatureFinder() + params = ff.getParameters("centroided") + params.setValue("mass_trace:mz_tolerance", tol) + params.setValue("mass_trace:min_spectra", min_spec) + + features = ms.FeatureMap() + ff.run("centroided", exp, features, params, ms.FeatureMap()) + + print(f"tol={tol}, min_spec={min_spec}: {features.size()} features") +``` + +### Retention Time Windows + +Adjust based on chromatographic method: + +```python +# For 10-minute LC gradient +params.setValue("distance_RT:max_difference", 30.0) # 30 seconds + +# For 60-minute LC gradient +params.setValue("distance_RT:max_difference", 90.0) # 90 seconds +``` diff --git a/scientific-packages/pyopenms/references/signal_processing.md b/scientific-packages/pyopenms/references/signal_processing.md new file mode 100644 index 0000000..f094096 --- /dev/null +++ b/scientific-packages/pyopenms/references/signal_processing.md @@ -0,0 +1,433 @@ +# Signal Processing + +## Overview + +PyOpenMS provides algorithms for processing raw mass spectrometry data including smoothing, filtering, peak picking, centroiding, normalization, and deconvolution. + +## Algorithm Pattern + +Most signal processing algorithms follow a standard pattern: + +```python +import pyopenms as ms + +# 1. Create algorithm instance +algo = ms.AlgorithmName() + +# 2. Get and modify parameters +params = algo.getParameters() +params.setValue("parameter_name", value) +algo.setParameters(params) + +# 3. Apply to data +algo.filterExperiment(exp) # or filterSpectrum(spec) +``` + +## Smoothing + +### Gaussian Filter + +Apply Gaussian smoothing to reduce noise: + +```python +# Create Gaussian filter +gaussian = ms.GaussFilter() + +# Configure parameters +params = gaussian.getParameters() +params.setValue("gaussian_width", 0.2) # Width in m/z or RT units +params.setValue("ppm_tolerance", 10.0) # For m/z dimension +params.setValue("use_ppm_tolerance", "true") +gaussian.setParameters(params) + +# Apply to experiment +gaussian.filterExperiment(exp) + +# Or apply to single spectrum +spec = exp.getSpectrum(0) +gaussian.filterSpectrum(spec) +``` + +### Savitzky-Golay Filter + +Polynomial smoothing that preserves peak shapes: + +```python +# Create Savitzky-Golay filter +sg_filter = ms.SavitzkyGolayFilter() + +# Configure parameters +params = sg_filter.getParameters() +params.setValue("frame_length", 11) # Window size (must be odd) +params.setValue("polynomial_order", 4) # Polynomial degree +sg_filter.setParameters(params) + +# Apply smoothing +sg_filter.filterExperiment(exp) +``` + +## Peak Picking and Centroiding + +### Peak Picker High Resolution + +Detect peaks in high-resolution data: + +```python +# Create peak picker +peak_picker = ms.PeakPickerHiRes() + +# Configure parameters +params = peak_picker.getParameters() +params.setValue("signal_to_noise", 3.0) # S/N threshold +params.setValue("spacing_difference", 1.5) # Minimum peak spacing +peak_picker.setParameters(params) + +# Pick peaks +exp_picked = ms.MSExperiment() +peak_picker.pickExperiment(exp, exp_picked) +``` + +### Peak Picker for CWT + +Continuous wavelet transform-based peak picking: + +```python +# Create CWT peak picker +cwt_picker = ms.PeakPickerCWT() + +# Configure parameters +params = cwt_picker.getParameters() +params.setValue("signal_to_noise", 1.0) +params.setValue("peak_width", 0.15) # Expected peak width +cwt_picker.setParameters(params) + +# Pick peaks +cwt_picker.pickExperiment(exp, exp_picked) +``` + +## Normalization + +### Normalizer + +Normalize peak intensities within spectra: + +```python +# Create normalizer +normalizer = ms.Normalizer() + +# Configure normalization method +params = normalizer.getParameters() +params.setValue("method", "to_one") # Options: "to_one", "to_TIC" +normalizer.setParameters(params) + +# Apply normalization +normalizer.filterExperiment(exp) +``` + +## Peak Filtering + +### Threshold Mower + +Remove peaks below intensity threshold: + +```python +# Create threshold filter +mower = ms.ThresholdMower() + +# Configure threshold +params = mower.getParameters() +params.setValue("threshold", 1000.0) # Absolute intensity threshold +mower.setParameters(params) + +# Apply filter +mower.filterExperiment(exp) +``` + +### Window Mower + +Keep only highest peaks in sliding windows: + +```python +# Create window mower +window_mower = ms.WindowMower() + +# Configure parameters +params = window_mower.getParameters() +params.setValue("windowsize", 50.0) # Window size in m/z +params.setValue("peakcount", 2) # Keep top N peaks per window +window_mower.setParameters(params) + +# Apply filter +window_mower.filterExperiment(exp) +``` + +### N Largest Peaks + +Keep only the N most intense peaks: + +```python +# Create N largest filter +n_largest = ms.NLargest() + +# Configure parameters +params = n_largest.getParameters() +params.setValue("n", 200) # Keep 200 most intense peaks +n_largest.setParameters(params) + +# Apply filter +n_largest.filterExperiment(exp) +``` + +## Baseline Reduction + +### Morphological Filter + +Remove baseline using morphological operations: + +```python +# Create morphological filter +morph_filter = ms.MorphologicalFilter() + +# Configure parameters +params = morph_filter.getParameters() +params.setValue("struc_elem_length", 3.0) # Structuring element size +params.setValue("method", "tophat") # Method: "tophat", "bothat", "erosion", "dilation" +morph_filter.setParameters(params) + +# Apply filter +morph_filter.filterExperiment(exp) +``` + +## Spectrum Merging + +### Spectra Merger + +Combine multiple spectra into one: + +```python +# Create merger +merger = ms.SpectraMerger() + +# Configure parameters +params = merger.getParameters() +params.setValue("average_gaussian:spectrum_type", "profile") +params.setValue("average_gaussian:rt_FWHM", 5.0) # RT window +merger.setParameters(params) + +# Merge spectra +merger.mergeSpectraBlockWise(exp) +``` + +## Deconvolution + +### Charge Deconvolution + +Determine charge states and convert to neutral masses: + +```python +# Create feature deconvoluter +deconvoluter = ms.FeatureDeconvolution() + +# Configure parameters +params = deconvoluter.getParameters() +params.setValue("charge_min", 1) +params.setValue("charge_max", 4) +params.setValue("potential_charge_states", "1,2,3,4") +deconvoluter.setParameters(params) + +# Apply deconvolution +feature_map_out = ms.FeatureMap() +deconvoluter.compute(exp, feature_map, feature_map_out, ms.ConsensusMap()) +``` + +### Isotope Deconvolution + +Remove isotopic patterns: + +```python +# Create isotope wavelet transform +isotope_wavelet = ms.IsotopeWaveletTransform() + +# Configure parameters +params = isotope_wavelet.getParameters() +params.setValue("max_charge", 3) +params.setValue("intensity_threshold", 10.0) +isotope_wavelet.setParameters(params) + +# Apply transformation +isotope_wavelet.transform(exp) +``` + +## Retention Time Alignment + +### Map Alignment + +Align retention times across multiple runs: + +```python +# Create map aligner +aligner = ms.MapAlignmentAlgorithmPoseClustering() + +# Load multiple experiments +exp1 = ms.MSExperiment() +exp2 = ms.MSExperiment() +ms.MzMLFile().load("run1.mzML", exp1) +ms.MzMLFile().load("run2.mzML", exp2) + +# Create reference +reference = ms.MSExperiment() + +# Align experiments +transformations = [] +aligner.align(exp1, exp2, transformations) + +# Apply transformation +transformer = ms.MapAlignmentTransformer() +transformer.transformRetentionTimes(exp2, transformations[0]) +``` + +## Mass Calibration + +### Internal Calibration + +Calibrate mass axis using known reference masses: + +```python +# Create internal calibration +calibration = ms.InternalCalibration() + +# Set reference masses +reference_masses = [500.0, 1000.0, 1500.0] # Known m/z values + +# Calibrate +calibration.calibrate(exp, reference_masses) +``` + +## Quality Control + +### Spectrum Statistics + +Calculate quality metrics: + +```python +# Get spectrum +spec = exp.getSpectrum(0) + +# Calculate statistics +mz, intensity = spec.get_peaks() + +# Total ion current +tic = sum(intensity) + +# Base peak +base_peak_intensity = max(intensity) +base_peak_mz = mz[intensity.argmax()] + +print(f"TIC: {tic}") +print(f"Base peak: {base_peak_mz} m/z at {base_peak_intensity}") +``` + +## Spectrum Preprocessing Pipeline + +### Complete Preprocessing Example + +```python +import pyopenms as ms + +def preprocess_experiment(input_file, output_file): + """Complete preprocessing pipeline.""" + + # Load data + exp = ms.MSExperiment() + ms.MzMLFile().load(input_file, exp) + + # 1. Smooth with Gaussian filter + gaussian = ms.GaussFilter() + gaussian.filterExperiment(exp) + + # 2. Pick peaks + picker = ms.PeakPickerHiRes() + exp_picked = ms.MSExperiment() + picker.pickExperiment(exp, exp_picked) + + # 3. Normalize intensities + normalizer = ms.Normalizer() + params = normalizer.getParameters() + params.setValue("method", "to_TIC") + normalizer.setParameters(params) + normalizer.filterExperiment(exp_picked) + + # 4. Filter low-intensity peaks + mower = ms.ThresholdMower() + params = mower.getParameters() + params.setValue("threshold", 10.0) + mower.setParameters(params) + mower.filterExperiment(exp_picked) + + # Save processed data + ms.MzMLFile().store(output_file, exp_picked) + + return exp_picked + +# Run pipeline +exp_processed = preprocess_experiment("raw_data.mzML", "processed_data.mzML") +``` + +## Best Practices + +### Parameter Optimization + +Test parameters on representative data: + +```python +# Try different Gaussian widths +widths = [0.1, 0.2, 0.5] + +for width in widths: + exp_test = ms.MSExperiment() + ms.MzMLFile().load("test_data.mzML", exp_test) + + gaussian = ms.GaussFilter() + params = gaussian.getParameters() + params.setValue("gaussian_width", width) + gaussian.setParameters(params) + gaussian.filterExperiment(exp_test) + + # Evaluate quality + # ... add evaluation code ... +``` + +### Preserve Original Data + +Keep original data for comparison: + +```python +# Load original +exp_original = ms.MSExperiment() +ms.MzMLFile().load("data.mzML", exp_original) + +# Create copy for processing +exp_processed = ms.MSExperiment(exp_original) + +# Process copy +gaussian = ms.GaussFilter() +gaussian.filterExperiment(exp_processed) + +# Original remains unchanged +``` + +### Profile vs Centroid Data + +Check data type before processing: + +```python +# Check if spectrum is centroided +spec = exp.getSpectrum(0) + +if spec.isSorted(): + # Likely centroided + print("Centroid data") +else: + # Likely profile + print("Profile data - apply peak picking") +```