mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-28 07:33:45 +08:00
Update the PyOpenMS skill
This commit is contained in:
@@ -1,643 +0,0 @@
|
||||
# pyOpenMS Algorithms Reference
|
||||
|
||||
This document provides comprehensive coverage of algorithms available in pyOpenMS for signal processing, feature detection, and quantification.
|
||||
|
||||
## Algorithm Usage Pattern
|
||||
|
||||
Most pyOpenMS algorithms follow a consistent pattern:
|
||||
|
||||
```python
|
||||
import pyopenms as oms
|
||||
|
||||
# 1. Instantiate algorithm
|
||||
algorithm = oms.AlgorithmName()
|
||||
|
||||
# 2. Get parameters
|
||||
params = algorithm.getParameters()
|
||||
|
||||
# 3. Modify parameters
|
||||
params.setValue("parameter_name", value)
|
||||
|
||||
# 4. Set parameters back
|
||||
algorithm.setParameters(params)
|
||||
|
||||
# 5. Apply to data
|
||||
algorithm.filterExperiment(exp) # or .process(), .run(), etc.
|
||||
```
|
||||
|
||||
## Signal Processing Algorithms
|
||||
|
||||
### Smoothing Filters
|
||||
|
||||
#### GaussFilter - Gaussian Smoothing
|
||||
|
||||
Applies Gaussian smoothing to reduce noise.
|
||||
|
||||
```python
|
||||
gauss = oms.GaussFilter()
|
||||
|
||||
# Configure parameters
|
||||
params = gauss.getParameters()
|
||||
params.setValue("gaussian_width", 0.2) # Gaussian width (larger = more smoothing)
|
||||
params.setValue("ppm_tolerance", 10.0) # PPM tolerance for spacing
|
||||
params.setValue("use_ppm_tolerance", "true")
|
||||
gauss.setParameters(params)
|
||||
|
||||
# Apply to experiment
|
||||
gauss.filterExperiment(exp)
|
||||
|
||||
# Or apply to single spectrum
|
||||
spectrum_smoothed = oms.MSSpectrum()
|
||||
gauss.filter(spectrum, spectrum_smoothed)
|
||||
```
|
||||
|
||||
**Key Parameters:**
|
||||
- `gaussian_width`: Width of Gaussian kernel (default: 0.2 Da)
|
||||
- `ppm_tolerance`: Tolerance in ppm for spacing
|
||||
- `use_ppm_tolerance`: Whether to use ppm instead of absolute spacing
|
||||
|
||||
#### SavitzkyGolayFilter
|
||||
|
||||
Applies Savitzky-Golay smoothing (polynomial fitting).
|
||||
|
||||
```python
|
||||
sg_filter = oms.SavitzkyGolayFilter()
|
||||
|
||||
params = sg_filter.getParameters()
|
||||
params.setValue("frame_length", 11) # Window size (must be odd)
|
||||
params.setValue("polynomial_order", 3) # Polynomial degree
|
||||
sg_filter.setParameters(params)
|
||||
|
||||
sg_filter.filterExperiment(exp)
|
||||
```
|
||||
|
||||
**Key Parameters:**
|
||||
- `frame_length`: Size of smoothing window (must be odd)
|
||||
- `polynomial_order`: Degree of polynomial (typically 2-4)
|
||||
|
||||
### Peak Filtering
|
||||
|
||||
#### NLargest - Keep Top N Peaks
|
||||
|
||||
Retains only the N most intense peaks per spectrum.
|
||||
|
||||
```python
|
||||
n_largest = oms.NLargest()
|
||||
|
||||
params = n_largest.getParameters()
|
||||
params.setValue("n", 100) # Keep top 100 peaks
|
||||
params.setValue("threshold", 0.0) # Optional minimum intensity
|
||||
n_largest.setParameters(params)
|
||||
|
||||
n_largest.filterExperiment(exp)
|
||||
```
|
||||
|
||||
**Key Parameters:**
|
||||
- `n`: Number of peaks to keep per spectrum
|
||||
- `threshold`: Minimum absolute intensity threshold
|
||||
|
||||
#### ThresholdMower - Intensity Threshold Filtering
|
||||
|
||||
Removes peaks below a specified intensity threshold.
|
||||
|
||||
```python
|
||||
threshold_filter = oms.ThresholdMower()
|
||||
|
||||
params = threshold_filter.getParameters()
|
||||
params.setValue("threshold", 1000.0) # Absolute intensity threshold
|
||||
threshold_filter.setParameters(params)
|
||||
|
||||
threshold_filter.filterExperiment(exp)
|
||||
```
|
||||
|
||||
**Key Parameters:**
|
||||
- `threshold`: Absolute intensity cutoff
|
||||
|
||||
#### WindowMower - Window-Based Peak Selection
|
||||
|
||||
Divides m/z range into windows and keeps top N peaks per window.
|
||||
|
||||
```python
|
||||
window_mower = oms.WindowMower()
|
||||
|
||||
params = window_mower.getParameters()
|
||||
params.setValue("windowsize", 50.0) # Window size in Da (or Thomson)
|
||||
params.setValue("peakcount", 10) # Peaks to keep per window
|
||||
params.setValue("movetype", "jump") # "jump" or "slide"
|
||||
window_mower.setParameters(params)
|
||||
|
||||
window_mower.filterExperiment(exp)
|
||||
```
|
||||
|
||||
**Key Parameters:**
|
||||
- `windowsize`: Size of m/z window (Da)
|
||||
- `peakcount`: Number of peaks to retain per window
|
||||
- `movetype`: "jump" (non-overlapping) or "slide" (overlapping windows)
|
||||
|
||||
#### BernNorm - Bernoulli Normalization
|
||||
|
||||
Statistical normalization based on Bernoulli distribution.
|
||||
|
||||
```python
|
||||
bern_norm = oms.BernNorm()
|
||||
|
||||
params = bern_norm.getParameters()
|
||||
params.setValue("threshold", 0.7) # Threshold for normalization
|
||||
bern_norm.setParameters(params)
|
||||
|
||||
bern_norm.filterExperiment(exp)
|
||||
```
|
||||
|
||||
### Spectrum Normalization
|
||||
|
||||
#### Normalizer
|
||||
|
||||
Normalizes spectrum intensities to unit total intensity or maximum intensity.
|
||||
|
||||
```python
|
||||
normalizer = oms.Normalizer()
|
||||
|
||||
params = normalizer.getParameters()
|
||||
params.setValue("method", "to_one") # "to_one" or "to_TIC"
|
||||
normalizer.setParameters(params)
|
||||
|
||||
normalizer.filterExperiment(exp)
|
||||
```
|
||||
|
||||
**Methods:**
|
||||
- `to_one`: Normalize max peak to 1.0
|
||||
- `to_TIC`: Normalize to total ion current = 1.0
|
||||
|
||||
#### Scaler
|
||||
|
||||
Scales intensities by a constant factor.
|
||||
|
||||
```python
|
||||
scaler = oms.Scaler()
|
||||
|
||||
params = scaler.getParameters()
|
||||
params.setValue("scaling", 1000.0) # Scaling factor
|
||||
scaler.setParameters(params)
|
||||
|
||||
scaler.filterExperiment(exp)
|
||||
```
|
||||
|
||||
## Centroiding and Peak Picking
|
||||
|
||||
### PeakPickerHiRes - High-Resolution Peak Picking
|
||||
|
||||
Converts profile spectra to centroid mode for high-resolution data.
|
||||
|
||||
```python
|
||||
picker = oms.PeakPickerHiRes()
|
||||
|
||||
params = picker.getParameters()
|
||||
params.setValue("signal_to_noise", 1.0) # S/N threshold
|
||||
params.setValue("spacing_difference", 1.5) # Peak spacing factor
|
||||
params.setValue("sn_win_len", 20.0) # S/N window length
|
||||
params.setValue("sn_bin_count", 30) # Bins for S/N estimation
|
||||
params.setValue("ms1_only", "false") # Process only MS1
|
||||
params.setValue("ms_levels", [1, 2]) # MS levels to process
|
||||
picker.setParameters(params)
|
||||
|
||||
# Pick peaks
|
||||
exp_centroided = oms.MSExperiment()
|
||||
picker.pickExperiment(exp, exp_centroided)
|
||||
```
|
||||
|
||||
**Key Parameters:**
|
||||
- `signal_to_noise`: Minimum signal-to-noise ratio
|
||||
- `spacing_difference`: Minimum spacing between peaks
|
||||
- `ms_levels`: List of MS levels to process
|
||||
|
||||
### PeakPickerWavelet - Wavelet-Based Peak Picking
|
||||
|
||||
Uses continuous wavelet transform for peak detection.
|
||||
|
||||
```python
|
||||
wavelet_picker = oms.PeakPickerWavelet()
|
||||
|
||||
params = wavelet_picker.getParameters()
|
||||
params.setValue("signal_to_noise", 1.0)
|
||||
params.setValue("peak_width", 0.15) # Expected peak width (Da)
|
||||
wavelet_picker.setParameters(params)
|
||||
|
||||
wavelet_picker.pickExperiment(exp, exp_centroided)
|
||||
```
|
||||
|
||||
## Feature Detection
|
||||
|
||||
### FeatureFinder Algorithms
|
||||
|
||||
Feature finders detect 2D features (m/z and RT) in LC-MS data.
|
||||
|
||||
#### FeatureFinderMultiplex
|
||||
|
||||
For multiplex labeling experiments (SILAC, dimethyl labeling).
|
||||
|
||||
```python
|
||||
ff = oms.FeatureFinderMultiplex()
|
||||
|
||||
params = ff.getParameters()
|
||||
params.setValue("algorithm:labels", "[]") # Empty for label-free
|
||||
params.setValue("algorithm:charge", "2:4") # Charge range
|
||||
params.setValue("algorithm:rt_typical", 40.0) # Expected feature RT width
|
||||
params.setValue("algorithm:rt_min", 2.0) # Minimum RT width
|
||||
params.setValue("algorithm:mz_tolerance", 10.0) # m/z tolerance (ppm)
|
||||
params.setValue("algorithm:intensity_cutoff", 1000.0) # Minimum intensity
|
||||
ff.setParameters(params)
|
||||
|
||||
# Run feature detection
|
||||
features = oms.FeatureMap()
|
||||
ff.run(exp, features, oms.Param())
|
||||
|
||||
print(f"Found {features.size()} features")
|
||||
```
|
||||
|
||||
**Key Parameters:**
|
||||
- `algorithm:charge`: Charge state range to consider
|
||||
- `algorithm:rt_typical`: Expected peak width in RT dimension
|
||||
- `algorithm:mz_tolerance`: Mass tolerance in ppm
|
||||
- `algorithm:intensity_cutoff`: Minimum intensity threshold
|
||||
|
||||
#### FeatureFinderCentroided
|
||||
|
||||
For centroided data, identifies isotope patterns and traces over RT.
|
||||
|
||||
```python
|
||||
ff_centroided = oms.FeatureFinderCentroided()
|
||||
|
||||
params = ff_centroided.getParameters()
|
||||
params.setValue("mass_trace:mz_tolerance", 10.0) # ppm
|
||||
params.setValue("mass_trace:min_spectra", 5) # Min consecutive spectra
|
||||
params.setValue("isotopic_pattern:charge_low", 1)
|
||||
params.setValue("isotopic_pattern:charge_high", 4)
|
||||
params.setValue("seed:min_score", 0.5)
|
||||
ff_centroided.setParameters(params)
|
||||
|
||||
features = oms.FeatureMap()
|
||||
seeds = oms.FeatureMap() # Optional seed features
|
||||
ff_centroided.run(exp, features, params, seeds)
|
||||
```
|
||||
|
||||
#### FeatureFinderIdentification
|
||||
|
||||
Uses peptide identifications to guide feature detection.
|
||||
|
||||
```python
|
||||
ff_id = oms.FeatureFinderIdentification()
|
||||
|
||||
params = ff_id.getParameters()
|
||||
params.setValue("extract:mz_window", 10.0) # ppm
|
||||
params.setValue("extract:rt_window", 60.0) # seconds
|
||||
params.setValue("detect:peak_width", 30.0) # Expected peak width
|
||||
ff_id.setParameters(params)
|
||||
|
||||
# Requires peptide identifications
|
||||
protein_ids = []
|
||||
peptide_ids = []
|
||||
features = oms.FeatureMap()
|
||||
|
||||
ff_id.run(exp, protein_ids, peptide_ids, features)
|
||||
```
|
||||
|
||||
## Charge and Isotope Deconvolution
|
||||
|
||||
### Decharging and Charge State Deconvolution
|
||||
|
||||
#### FeatureDeconvolution
|
||||
|
||||
Resolves charge states and combines features.
|
||||
|
||||
```python
|
||||
deconv = oms.FeatureDeconvolution()
|
||||
|
||||
params = deconv.getParameters()
|
||||
params.setValue("charge_min", 1)
|
||||
params.setValue("charge_max", 4)
|
||||
params.setValue("q_value", 0.01) # FDR threshold
|
||||
deconv.setParameters(params)
|
||||
|
||||
features_deconv = oms.FeatureMap()
|
||||
consensus_map = oms.ConsensusMap()
|
||||
deconv.compute(features, features_deconv, consensus_map)
|
||||
```
|
||||
|
||||
## Map Alignment
|
||||
|
||||
### MapAlignmentAlgorithm
|
||||
|
||||
Aligns retention times across multiple LC-MS runs.
|
||||
|
||||
#### MapAlignmentAlgorithmPoseClustering
|
||||
|
||||
Pose clustering-based RT alignment.
|
||||
|
||||
```python
|
||||
aligner = oms.MapAlignmentAlgorithmPoseClustering()
|
||||
|
||||
params = aligner.getParameters()
|
||||
params.setValue("max_num_peaks_considered", 1000)
|
||||
params.setValue("pairfinder:distance_MZ:max_difference", 0.3) # Da
|
||||
params.setValue("pairfinder:distance_RT:max_difference", 60.0) # seconds
|
||||
aligner.setParameters(params)
|
||||
|
||||
# Align multiple feature maps
|
||||
feature_maps = [features1, features2, features3]
|
||||
transformations = []
|
||||
|
||||
# Create reference (e.g., use first map)
|
||||
reference = oms.FeatureMap(feature_maps[0])
|
||||
|
||||
# Align others to reference
|
||||
for fm in feature_maps[1:]:
|
||||
transformation = oms.TransformationDescription()
|
||||
aligner.align(fm, reference, transformation)
|
||||
transformations.append(transformation)
|
||||
|
||||
# Apply transformation
|
||||
transformer = oms.MapAlignmentTransformer()
|
||||
transformer.transformRetentionTimes(fm, transformation)
|
||||
```
|
||||
|
||||
## Feature Linking
|
||||
|
||||
### FeatureGroupingAlgorithm
|
||||
|
||||
Links features across samples to create consensus features.
|
||||
|
||||
#### FeatureGroupingAlgorithmQT
|
||||
|
||||
Quality threshold-based feature linking.
|
||||
|
||||
```python
|
||||
grouper = oms.FeatureGroupingAlgorithmQT()
|
||||
|
||||
params = grouper.getParameters()
|
||||
params.setValue("distance_RT:max_difference", 60.0) # seconds
|
||||
params.setValue("distance_MZ:max_difference", 10.0) # ppm
|
||||
params.setValue("distance_MZ:unit", "ppm")
|
||||
grouper.setParameters(params)
|
||||
|
||||
# Create consensus map
|
||||
consensus_map = oms.ConsensusMap()
|
||||
|
||||
# Group features from multiple samples
|
||||
feature_maps = [features1, features2, features3]
|
||||
grouper.group(feature_maps, consensus_map)
|
||||
|
||||
print(f"Created {consensus_map.size()} consensus features")
|
||||
```
|
||||
|
||||
#### FeatureGroupingAlgorithmKD
|
||||
|
||||
KD-tree based linking (faster for large datasets).
|
||||
|
||||
```python
|
||||
grouper_kd = oms.FeatureGroupingAlgorithmKD()
|
||||
|
||||
params = grouper_kd.getParameters()
|
||||
params.setValue("mz_unit", "ppm")
|
||||
params.setValue("mz_tolerance", 10.0)
|
||||
params.setValue("rt_tolerance", 30.0)
|
||||
grouper_kd.setParameters(params)
|
||||
|
||||
consensus_map = oms.ConsensusMap()
|
||||
grouper_kd.group(feature_maps, consensus_map)
|
||||
```
|
||||
|
||||
## Chromatographic Analysis
|
||||
|
||||
### ElutionPeakDetection
|
||||
|
||||
Detects elution peaks in chromatograms.
|
||||
|
||||
```python
|
||||
epd = oms.ElutionPeakDetection()
|
||||
|
||||
params = epd.getParameters()
|
||||
params.setValue("chrom_peak_snr", 3.0) # Signal-to-noise threshold
|
||||
params.setValue("chrom_fwhm", 5.0) # Expected FWHM (seconds)
|
||||
epd.setParameters(params)
|
||||
|
||||
# Apply to chromatograms
|
||||
for chrom in exp.getChromatograms():
|
||||
peaks = epd.detectPeaks(chrom)
|
||||
```
|
||||
|
||||
### MRMFeatureFinderScoring
|
||||
|
||||
Scoring and peak picking for targeted (MRM/SRM) experiments.
|
||||
|
||||
```python
|
||||
mrm_finder = oms.MRMFeatureFinderScoring()
|
||||
|
||||
params = mrm_finder.getParameters()
|
||||
params.setValue("TransitionGroupPicker:min_peak_width", 2.0)
|
||||
params.setValue("TransitionGroupPicker:recalculate_peaks", "true")
|
||||
params.setValue("TransitionGroupPicker:PeakPickerMRM:signal_to_noise", 1.0)
|
||||
mrm_finder.setParameters(params)
|
||||
|
||||
# Requires chromatograms
|
||||
features = oms.FeatureMap()
|
||||
mrm_finder.pickExperiment(chrom_exp, features, targets, transformation, swath_maps)
|
||||
```
|
||||
|
||||
## Quantification
|
||||
|
||||
### ProteinInference
|
||||
|
||||
Infers proteins from peptide identifications.
|
||||
|
||||
```python
|
||||
protein_inference = oms.BasicProteinInferenceAlgorithm()
|
||||
|
||||
# Apply to identification results
|
||||
protein_inference.run(peptide_ids, protein_ids)
|
||||
```
|
||||
|
||||
### IsobaricQuantification
|
||||
|
||||
Quantification for isobaric labeling (TMT, iTRAQ).
|
||||
|
||||
```python
|
||||
# For TMT/iTRAQ quantification
|
||||
iso_quant = oms.IsobaricQuantification()
|
||||
|
||||
params = iso_quant.getParameters()
|
||||
params.setValue("channel_116_description", "Sample1")
|
||||
params.setValue("channel_117_description", "Sample2")
|
||||
# ... configure all channels
|
||||
iso_quant.setParameters(params)
|
||||
|
||||
# Run quantification
|
||||
quant_method = oms.IsobaricQuantitationMethod.TMT_10PLEX
|
||||
quant_info = oms.IsobaricQuantifierStatistics()
|
||||
iso_quant.quantify(exp, quant_info)
|
||||
```
|
||||
|
||||
## Data Processing
|
||||
|
||||
### BaselineFiltering
|
||||
|
||||
Removes baseline from spectra.
|
||||
|
||||
```python
|
||||
baseline = oms.TopHatFilter()
|
||||
|
||||
params = baseline.getParameters()
|
||||
params.setValue("struc_elem_length", 3.0) # Structuring element size
|
||||
params.setValue("struc_elem_unit", "Thomson")
|
||||
baseline.setParameters(params)
|
||||
|
||||
baseline.filterExperiment(exp)
|
||||
```
|
||||
|
||||
### SpectraMerger
|
||||
|
||||
Merges consecutive similar spectra.
|
||||
|
||||
```python
|
||||
merger = oms.SpectraMerger()
|
||||
|
||||
params = merger.getParameters()
|
||||
params.setValue("mz_binning_width", 0.05) # Binning width (Da)
|
||||
params.setValue("sort_blocks", "RT_ascending")
|
||||
merger.setParameters(params)
|
||||
|
||||
merger.mergeSpectra(exp)
|
||||
```
|
||||
|
||||
## Quality Control
|
||||
|
||||
### MzMLFileQuality
|
||||
|
||||
Analyzes mzML file quality.
|
||||
|
||||
```python
|
||||
# Calculate basic QC metrics
|
||||
def calculate_qc_metrics(exp):
|
||||
metrics = {
|
||||
'n_spectra': exp.getNrSpectra(),
|
||||
'n_ms1': sum(1 for s in exp if s.getMSLevel() == 1),
|
||||
'n_ms2': sum(1 for s in exp if s.getMSLevel() == 2),
|
||||
'rt_range': (exp.getMinRT(), exp.getMaxRT()),
|
||||
'mz_range': (exp.getMinMZ(), exp.getMaxMZ()),
|
||||
}
|
||||
|
||||
# Calculate TIC
|
||||
tics = []
|
||||
for spectrum in exp:
|
||||
if spectrum.getMSLevel() == 1:
|
||||
mz, intensity = spectrum.get_peaks()
|
||||
tics.append(sum(intensity))
|
||||
|
||||
metrics['median_tic'] = np.median(tics)
|
||||
metrics['mean_tic'] = np.mean(tics)
|
||||
|
||||
return metrics
|
||||
```
|
||||
|
||||
## FDR Control
|
||||
|
||||
### FalseDiscoveryRate
|
||||
|
||||
Estimates and controls false discovery rate.
|
||||
|
||||
```python
|
||||
fdr = oms.FalseDiscoveryRate()
|
||||
|
||||
params = fdr.getParameters()
|
||||
params.setValue("add_decoy_peptides", "false")
|
||||
params.setValue("add_decoy_proteins", "false")
|
||||
fdr.setParameters(params)
|
||||
|
||||
# Apply to identifications
|
||||
fdr.apply(protein_ids, peptide_ids)
|
||||
|
||||
# Filter by FDR threshold
|
||||
fdr_threshold = 0.01
|
||||
filtered_peptides = [p for p in peptide_ids if p.getMetaValue("q-value") <= fdr_threshold]
|
||||
```
|
||||
|
||||
## Algorithm Selection Guide
|
||||
|
||||
### When to Use Which Algorithm
|
||||
|
||||
**For Smoothing:**
|
||||
- Use `GaussFilter` for general-purpose smoothing
|
||||
- Use `SavitzkyGolayFilter` for preserving peak shapes
|
||||
|
||||
**For Peak Picking:**
|
||||
- Use `PeakPickerHiRes` for high-resolution Orbitrap/FT-ICR data
|
||||
- Use `PeakPickerWavelet` for lower-resolution TOF data
|
||||
|
||||
**For Feature Detection:**
|
||||
- Use `FeatureFinderCentroided` for label-free proteomics (DDA)
|
||||
- Use `FeatureFinderMultiplex` for SILAC/dimethyl labeling
|
||||
- Use `FeatureFinderIdentification` when you have ID information
|
||||
- Use `MRMFeatureFinderScoring` for targeted (MRM/SRM) experiments
|
||||
|
||||
**For Feature Linking:**
|
||||
- Use `FeatureGroupingAlgorithmQT` for small-medium datasets (<10 samples)
|
||||
- Use `FeatureGroupingAlgorithmKD` for large datasets (>10 samples)
|
||||
|
||||
## Parameter Tuning Tips
|
||||
|
||||
1. **S/N Thresholds**: Start with 1-3 for clean data, increase for noisy data
|
||||
2. **m/z Tolerance**: Use 5-10 ppm for high-resolution instruments, 0.5-1 Da for low-res
|
||||
3. **RT Tolerance**: Typically 30-60 seconds depending on chromatographic stability
|
||||
4. **Peak Width**: Measure from real data - varies by instrument and gradient length
|
||||
5. **Charge States**: Set based on expected analytes (1-2 for metabolites, 2-4 for peptides)
|
||||
|
||||
## Common Algorithm Workflows
|
||||
|
||||
### Complete Proteomics Workflow
|
||||
|
||||
```python
|
||||
# 1. Load data
|
||||
exp = oms.MSExperiment()
|
||||
oms.MzMLFile().load("raw.mzML", exp)
|
||||
|
||||
# 2. Smooth
|
||||
gauss = oms.GaussFilter()
|
||||
gauss.filterExperiment(exp)
|
||||
|
||||
# 3. Peak picking
|
||||
picker = oms.PeakPickerHiRes()
|
||||
exp_centroid = oms.MSExperiment()
|
||||
picker.pickExperiment(exp, exp_centroid)
|
||||
|
||||
# 4. Feature detection
|
||||
ff = oms.FeatureFinderCentroided()
|
||||
features = oms.FeatureMap()
|
||||
ff.run(exp_centroid, features, oms.Param(), oms.FeatureMap())
|
||||
|
||||
# 5. Save results
|
||||
oms.FeatureXMLFile().store("features.featureXML", features)
|
||||
```
|
||||
|
||||
### Multi-Sample Quantification
|
||||
|
||||
```python
|
||||
# Load multiple samples
|
||||
feature_maps = []
|
||||
for filename in ["sample1.mzML", "sample2.mzML", "sample3.mzML"]:
|
||||
exp = oms.MSExperiment()
|
||||
oms.MzMLFile().load(filename, exp)
|
||||
|
||||
# Process and detect features
|
||||
features = detect_features(exp) # Your processing function
|
||||
feature_maps.append(features)
|
||||
|
||||
# Align retention times
|
||||
align_feature_maps(feature_maps) # Implement alignment
|
||||
|
||||
# Link features
|
||||
grouper = oms.FeatureGroupingAlgorithmQT()
|
||||
consensus_map = oms.ConsensusMap()
|
||||
grouper.group(feature_maps, consensus_map)
|
||||
|
||||
# Export quantification matrix
|
||||
export_quant_matrix(consensus_map)
|
||||
```
|
||||
@@ -1,715 +0,0 @@
|
||||
# pyOpenMS Chemistry Reference
|
||||
|
||||
This document provides comprehensive coverage of chemistry-related functionality in pyOpenMS, including elements, isotopes, molecular formulas, amino acids, peptides, proteins, and modifications.
|
||||
|
||||
## Elements and Isotopes
|
||||
|
||||
### ElementDB - Element Database
|
||||
|
||||
Access atomic and isotopic data for all elements.
|
||||
|
||||
```python
|
||||
import pyopenms as oms
|
||||
|
||||
# Get element database instance
|
||||
element_db = oms.ElementDB()
|
||||
|
||||
# Get element by symbol
|
||||
carbon = element_db.getElement("C")
|
||||
nitrogen = element_db.getElement("N")
|
||||
oxygen = element_db.getElement("O")
|
||||
|
||||
# Element properties
|
||||
print(f"Carbon monoisotopic weight: {carbon.getMonoWeight()}")
|
||||
print(f"Carbon average weight: {carbon.getAverageWeight()}")
|
||||
print(f"Atomic number: {carbon.getAtomicNumber()}")
|
||||
print(f"Symbol: {carbon.getSymbol()}")
|
||||
print(f"Name: {carbon.getName()}")
|
||||
```
|
||||
|
||||
### Isotope Information
|
||||
|
||||
```python
|
||||
# Get isotope distribution for an element
|
||||
isotopes = carbon.getIsotopeDistribution()
|
||||
|
||||
# Access specific isotope
|
||||
c12 = element_db.getElement("C", 12) # Carbon-12
|
||||
c13 = element_db.getElement("C", 13) # Carbon-13
|
||||
|
||||
print(f"C-12 abundance: {isotopes.getContainer()[0].getIntensity()}")
|
||||
print(f"C-13 abundance: {isotopes.getContainer()[1].getIntensity()}")
|
||||
|
||||
# Isotope mass
|
||||
print(f"C-12 mass: {c12.getMonoWeight()}")
|
||||
print(f"C-13 mass: {c13.getMonoWeight()}")
|
||||
```
|
||||
|
||||
### Constants
|
||||
|
||||
```python
|
||||
# Physical constants
|
||||
avogadro = oms.Constants.AVOGADRO
|
||||
electron_mass = oms.Constants.ELECTRON_MASS_U
|
||||
proton_mass = oms.Constants.PROTON_MASS_U
|
||||
|
||||
print(f"Avogadro's number: {avogadro}")
|
||||
print(f"Electron mass: {electron_mass} u")
|
||||
print(f"Proton mass: {proton_mass} u")
|
||||
```
|
||||
|
||||
## Empirical Formulas
|
||||
|
||||
### EmpiricalFormula - Molecular Formulas
|
||||
|
||||
Represent and manipulate molecular formulas.
|
||||
|
||||
#### Creating Formulas
|
||||
|
||||
```python
|
||||
# From string
|
||||
glucose = oms.EmpiricalFormula("C6H12O6")
|
||||
water = oms.EmpiricalFormula("H2O")
|
||||
ammonia = oms.EmpiricalFormula("NH3")
|
||||
|
||||
# From element composition
|
||||
formula = oms.EmpiricalFormula()
|
||||
formula.setCharge(1) # Set charge state
|
||||
```
|
||||
|
||||
#### Formula Arithmetic
|
||||
|
||||
```python
|
||||
# Addition
|
||||
sucrose = oms.EmpiricalFormula("C12H22O11")
|
||||
hydrolyzed = sucrose + water # Hydrolysis adds water
|
||||
|
||||
# Subtraction
|
||||
dehydrated = glucose - water # Dehydration removes water
|
||||
|
||||
# Multiplication
|
||||
three_waters = water * 3 # 3 H2O = H6O3
|
||||
|
||||
# Division
|
||||
formula_half = sucrose / 2 # Half the formula
|
||||
```
|
||||
|
||||
#### Mass Calculations
|
||||
|
||||
```python
|
||||
# Monoisotopic mass
|
||||
mono_mass = glucose.getMonoWeight()
|
||||
print(f"Glucose monoisotopic mass: {mono_mass:.6f} Da")
|
||||
|
||||
# Average mass
|
||||
avg_mass = glucose.getAverageWeight()
|
||||
print(f"Glucose average mass: {avg_mass:.6f} Da")
|
||||
|
||||
# Mass difference
|
||||
mass_diff = (glucose - water).getMonoWeight()
|
||||
```
|
||||
|
||||
#### Elemental Composition
|
||||
|
||||
```python
|
||||
# Get element counts
|
||||
formula = oms.EmpiricalFormula("C6H12O6")
|
||||
|
||||
# Access individual elements
|
||||
n_carbon = formula.getNumberOf(element_db.getElement("C"))
|
||||
n_hydrogen = formula.getNumberOf(element_db.getElement("H"))
|
||||
n_oxygen = formula.getNumberOf(element_db.getElement("O"))
|
||||
|
||||
print(f"C: {n_carbon}, H: {n_hydrogen}, O: {n_oxygen}")
|
||||
|
||||
# String representation
|
||||
print(f"Formula: {formula.toString()}")
|
||||
```
|
||||
|
||||
#### Isotope-Specific Formulas
|
||||
|
||||
```python
|
||||
# Specify specific isotopes using parentheses
|
||||
labeled_glucose = oms.EmpiricalFormula("(13)C6H12O6") # All carbons are C-13
|
||||
partially_labeled = oms.EmpiricalFormula("C5(13)CH12O6") # One C-13
|
||||
|
||||
# Deuterium labeling
|
||||
deuterated = oms.EmpiricalFormula("C6D12O6") # D2O instead of H2O
|
||||
```
|
||||
|
||||
#### Charge States
|
||||
|
||||
```python
|
||||
# Set charge
|
||||
formula = oms.EmpiricalFormula("C6H12O6")
|
||||
formula.setCharge(1) # Positive charge
|
||||
|
||||
# Get charge
|
||||
charge = formula.getCharge()
|
||||
|
||||
# Calculate m/z for charged molecule
|
||||
mz = formula.getMonoWeight() / abs(charge) if charge != 0 else formula.getMonoWeight()
|
||||
```
|
||||
|
||||
### Isotope Pattern Generation
|
||||
|
||||
Generate theoretical isotope patterns for formulas.
|
||||
|
||||
#### CoarseIsotopePatternGenerator
|
||||
|
||||
For unit mass resolution (low-resolution instruments).
|
||||
|
||||
```python
|
||||
# Create generator
|
||||
coarse_gen = oms.CoarseIsotopePatternGenerator()
|
||||
|
||||
# Generate pattern
|
||||
formula = oms.EmpiricalFormula("C6H12O6")
|
||||
pattern = coarse_gen.run(formula)
|
||||
|
||||
# Access isotope peaks
|
||||
iso_dist = pattern.getContainer()
|
||||
for peak in iso_dist:
|
||||
mass = peak.getMZ()
|
||||
abundance = peak.getIntensity()
|
||||
print(f"m/z: {mass:.4f}, Abundance: {abundance:.4f}")
|
||||
```
|
||||
|
||||
#### FineIsotopePatternGenerator
|
||||
|
||||
For high-resolution instruments (hyperfine structure).
|
||||
|
||||
```python
|
||||
# Create generator with resolution
|
||||
fine_gen = oms.FineIsotopePatternGenerator(0.01) # 0.01 Da resolution
|
||||
|
||||
# Generate fine pattern
|
||||
fine_pattern = fine_gen.run(formula)
|
||||
|
||||
# Access fine isotope structure
|
||||
for peak in fine_pattern.getContainer():
|
||||
print(f"m/z: {peak.getMZ():.6f}, Abundance: {peak.getIntensity():.6f}")
|
||||
```
|
||||
|
||||
#### Isotope Pattern Matching
|
||||
|
||||
```python
|
||||
# Compare experimental to theoretical
|
||||
def compare_isotope_patterns(experimental_mz, experimental_int, formula):
|
||||
# Generate theoretical
|
||||
coarse_gen = oms.CoarseIsotopePatternGenerator()
|
||||
theoretical = coarse_gen.run(formula)
|
||||
|
||||
# Extract theoretical peaks
|
||||
theo_peaks = theoretical.getContainer()
|
||||
theo_mz = [p.getMZ() for p in theo_peaks]
|
||||
theo_int = [p.getIntensity() for p in theo_peaks]
|
||||
|
||||
# Normalize both patterns
|
||||
exp_int_norm = [i / max(experimental_int) for i in experimental_int]
|
||||
theo_int_norm = [i / max(theo_int) for i in theo_int]
|
||||
|
||||
# Calculate similarity (e.g., cosine similarity)
|
||||
# ... implement similarity calculation
|
||||
return similarity_score
|
||||
```
|
||||
|
||||
## Amino Acids and Residues
|
||||
|
||||
### Residue - Amino Acid Representation
|
||||
|
||||
Access properties of amino acids.
|
||||
|
||||
```python
|
||||
# Get residue database
|
||||
res_db = oms.ResidueDB()
|
||||
|
||||
# Get specific residue
|
||||
leucine = res_db.getResidue("Leucine")
|
||||
# Or by one-letter code
|
||||
leu = res_db.getResidue("L")
|
||||
|
||||
# Residue properties
|
||||
print(f"Name: {leucine.getName()}")
|
||||
print(f"Three-letter code: {leucine.getThreeLetterCode()}")
|
||||
print(f"One-letter code: {leucine.getOneLetterCode()}")
|
||||
print(f"Monoisotopic mass: {leucine.getMonoWeight():.6f}")
|
||||
print(f"Average mass: {leucine.getAverageWeight():.6f}")
|
||||
|
||||
# Chemical formula
|
||||
formula = leucine.getFormula()
|
||||
print(f"Formula: {formula.toString()}")
|
||||
|
||||
# pKa values
|
||||
print(f"pKa (N-term): {leucine.getPka()}")
|
||||
print(f"pKa (C-term): {leucine.getPkb()}")
|
||||
print(f"pKa (side chain): {leucine.getPkc()}")
|
||||
|
||||
# Side chain basicity/acidity
|
||||
print(f"Basicity: {leucine.getBasicity()}")
|
||||
print(f"Hydrophobicity: {leucine.getHydrophobicity()}")
|
||||
```
|
||||
|
||||
### All Standard Amino Acids
|
||||
|
||||
```python
|
||||
# Iterate over all residues
|
||||
for residue_name in ["Alanine", "Cysteine", "Aspartic acid", "Glutamic acid",
|
||||
"Phenylalanine", "Glycine", "Histidine", "Isoleucine",
|
||||
"Lysine", "Leucine", "Methionine", "Asparagine",
|
||||
"Proline", "Glutamine", "Arginine", "Serine",
|
||||
"Threonine", "Valine", "Tryptophan", "Tyrosine"]:
|
||||
res = res_db.getResidue(residue_name)
|
||||
print(f"{res.getOneLetterCode()}: {res.getMonoWeight():.4f} Da")
|
||||
```
|
||||
|
||||
### Internal Residues vs. Termini
|
||||
|
||||
```python
|
||||
# Get internal residue mass (no terminal groups)
|
||||
internal_mass = leucine.getInternalToFull()
|
||||
|
||||
# Get residue with N-terminal modification
|
||||
n_terminal = res_db.getResidue("L[1]") # With NH2
|
||||
|
||||
# Get residue with C-terminal modification
|
||||
c_terminal = res_db.getResidue("L[2]") # With COOH
|
||||
```
|
||||
|
||||
## Peptide Sequences
|
||||
|
||||
### AASequence - Amino Acid Sequences
|
||||
|
||||
Represent and manipulate peptide sequences.
|
||||
|
||||
#### Creating Sequences
|
||||
|
||||
```python
|
||||
# From string
|
||||
peptide = oms.AASequence.fromString("PEPTIDE")
|
||||
longer = oms.AASequence.fromString("MKTAYIAKQRQISFVK")
|
||||
|
||||
# Empty sequence
|
||||
empty_seq = oms.AASequence()
|
||||
```
|
||||
|
||||
#### Sequence Properties
|
||||
|
||||
```python
|
||||
peptide = oms.AASequence.fromString("PEPTIDE")
|
||||
|
||||
# Length
|
||||
length = peptide.size()
|
||||
print(f"Length: {length} residues")
|
||||
|
||||
# Mass
|
||||
mono_mass = peptide.getMonoWeight()
|
||||
avg_mass = peptide.getAverageWeight()
|
||||
print(f"Monoisotopic mass: {mono_mass:.6f} Da")
|
||||
print(f"Average mass: {avg_mass:.6f} Da")
|
||||
|
||||
# Formula
|
||||
formula = peptide.getFormula()
|
||||
print(f"Formula: {formula.toString()}")
|
||||
|
||||
# String representation
|
||||
seq_str = peptide.toString()
|
||||
print(f"Sequence: {seq_str}")
|
||||
```
|
||||
|
||||
#### Accessing Individual Residues
|
||||
|
||||
```python
|
||||
peptide = oms.AASequence.fromString("PEPTIDE")
|
||||
|
||||
# Access by index
|
||||
first_aa = peptide[0] # Returns Residue object
|
||||
print(f"First amino acid: {first_aa.getOneLetterCode()}")
|
||||
|
||||
# Iterate
|
||||
for i in range(peptide.size()):
|
||||
residue = peptide[i]
|
||||
print(f"Position {i}: {residue.getOneLetterCode()}")
|
||||
```
|
||||
|
||||
#### Modifications
|
||||
|
||||
Add post-translational modifications (PTMs) to sequences.
|
||||
|
||||
```python
|
||||
# Modifications in sequence string
|
||||
# Format: AA(ModificationName)
|
||||
oxidized_met = oms.AASequence.fromString("PEPTIDEM(Oxidation)")
|
||||
phospho = oms.AASequence.fromString("PEPTIDES(Phospho)T(Phospho)")
|
||||
|
||||
# Multiple modifications
|
||||
multi_mod = oms.AASequence.fromString("M(Oxidation)PEPTIDEK(Acetyl)")
|
||||
|
||||
# N-terminal modifications
|
||||
n_term_acetyl = oms.AASequence.fromString("(Acetyl)PEPTIDE")
|
||||
|
||||
# C-terminal modifications
|
||||
c_term_amide = oms.AASequence.fromString("PEPTIDE(Amidated)")
|
||||
|
||||
# Check mass change
|
||||
unmodified = oms.AASequence.fromString("PEPTIDE")
|
||||
modified = oms.AASequence.fromString("PEPTIDEM(Oxidation)")
|
||||
mass_diff = modified.getMonoWeight() - unmodified.getMonoWeight()
|
||||
print(f"Mass shift from oxidation: {mass_diff:.6f} Da")
|
||||
```
|
||||
|
||||
#### Sequence Manipulation
|
||||
|
||||
```python
|
||||
# Prefix (N-terminal fragment)
|
||||
prefix = peptide.getPrefix(3) # First 3 residues
|
||||
print(f"Prefix: {prefix.toString()}")
|
||||
|
||||
# Suffix (C-terminal fragment)
|
||||
suffix = peptide.getSuffix(3) # Last 3 residues
|
||||
print(f"Suffix: {suffix.toString()}")
|
||||
|
||||
# Subsequence
|
||||
subseq = peptide.getSubsequence(2, 4) # Residues 2-4
|
||||
print(f"Subsequence: {subseq.toString()}")
|
||||
```
|
||||
|
||||
#### Theoretical Fragmentation
|
||||
|
||||
Generate theoretical fragment ions for MS/MS.
|
||||
|
||||
```python
|
||||
peptide = oms.AASequence.fromString("PEPTIDE")
|
||||
|
||||
# b-ions (N-terminal fragments)
|
||||
b_ions = []
|
||||
for i in range(1, peptide.size()):
|
||||
b_fragment = peptide.getPrefix(i)
|
||||
b_mass = b_fragment.getMonoWeight()
|
||||
b_ions.append(('b', i, b_mass))
|
||||
print(f"b{i}: {b_mass:.4f}")
|
||||
|
||||
# y-ions (C-terminal fragments)
|
||||
y_ions = []
|
||||
for i in range(1, peptide.size()):
|
||||
y_fragment = peptide.getSuffix(i)
|
||||
y_mass = y_fragment.getMonoWeight()
|
||||
y_ions.append(('y', i, y_mass))
|
||||
print(f"y{i}: {y_mass:.4f}")
|
||||
|
||||
# a-ions (b - CO)
|
||||
a_ions = []
|
||||
CO_mass = 27.994915 # CO loss
|
||||
for ion_type, position, mass in b_ions:
|
||||
a_mass = mass - CO_mass
|
||||
a_ions.append(('a', position, a_mass))
|
||||
|
||||
# c-ions (b + NH3)
|
||||
NH3_mass = 17.026549 # NH3 gain
|
||||
c_ions = []
|
||||
for ion_type, position, mass in b_ions:
|
||||
c_mass = mass + NH3_mass
|
||||
c_ions.append(('c', position, c_mass))
|
||||
|
||||
# z-ions (y - NH3)
|
||||
z_ions = []
|
||||
for ion_type, position, mass in y_ions:
|
||||
z_mass = mass - NH3_mass
|
||||
z_ions.append(('z', position, z_mass))
|
||||
```
|
||||
|
||||
#### Calculate m/z for Charge States
|
||||
|
||||
```python
|
||||
peptide = oms.AASequence.fromString("PEPTIDE")
|
||||
proton_mass = 1.007276
|
||||
|
||||
# [M+H]+
|
||||
mz_1 = peptide.getMonoWeight() + proton_mass
|
||||
print(f"[M+H]+: {mz_1:.4f}")
|
||||
|
||||
# [M+2H]2+
|
||||
mz_2 = (peptide.getMonoWeight() + 2 * proton_mass) / 2
|
||||
print(f"[M+2H]2+: {mz_2:.4f}")
|
||||
|
||||
# [M+3H]3+
|
||||
mz_3 = (peptide.getMonoWeight() + 3 * proton_mass) / 3
|
||||
print(f"[M+3H]3+: {mz_3:.4f}")
|
||||
|
||||
# General formula for any charge
|
||||
def calculate_mz(sequence, charge):
|
||||
proton_mass = 1.007276
|
||||
return (sequence.getMonoWeight() + charge * proton_mass) / charge
|
||||
|
||||
for z in range(1, 5):
|
||||
print(f"[M+{z}H]{z}+: {calculate_mz(peptide, z):.4f}")
|
||||
```
|
||||
|
||||
## Protein Digestion
|
||||
|
||||
### ProteaseDigestion - Enzymatic Cleavage
|
||||
|
||||
Simulate enzymatic protein digestion.
|
||||
|
||||
#### Basic Digestion
|
||||
|
||||
```python
|
||||
# Create digestion object
|
||||
dig = oms.ProteaseDigestion()
|
||||
|
||||
# Set enzyme
|
||||
dig.setEnzyme("Trypsin") # Cleaves after K, R
|
||||
|
||||
# Other common enzymes:
|
||||
# - "Trypsin" (K, R)
|
||||
# - "Lys-C" (K)
|
||||
# - "Arg-C" (R)
|
||||
# - "Asp-N" (D)
|
||||
# - "Glu-C" (E, D)
|
||||
# - "Chymotrypsin" (F, Y, W, L)
|
||||
|
||||
# Set missed cleavages
|
||||
dig.setMissedCleavages(0) # No missed cleavages
|
||||
dig.setMissedCleavages(2) # Allow up to 2 missed cleavages
|
||||
|
||||
# Perform digestion
|
||||
protein = oms.AASequence.fromString("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEK")
|
||||
peptides = []
|
||||
dig.digest(protein, peptides)
|
||||
|
||||
# Print results
|
||||
for pep in peptides:
|
||||
print(f"{pep.toString()}: {pep.getMonoWeight():.2f} Da")
|
||||
```
|
||||
|
||||
#### Advanced Digestion Options
|
||||
|
||||
```python
|
||||
# Get enzyme specificity
|
||||
specificity = dig.getSpecificity()
|
||||
# oms.EnzymaticDigestion.SPEC_FULL (both termini)
|
||||
# oms.EnzymaticDigestion.SPEC_SEMI (one terminus)
|
||||
# oms.EnzymaticDigestion.SPEC_NONE (no specificity)
|
||||
|
||||
# Set specificity for semi-tryptic search
|
||||
dig.setSpecificity(oms.EnzymaticDigestion.SPEC_SEMI)
|
||||
|
||||
# Get cleavage sites
|
||||
cleavage_residues = dig.getEnzyme().getCutAfterResidues()
|
||||
restriction_residues = dig.getEnzyme().getRestriction()
|
||||
```
|
||||
|
||||
#### Filter Peptides by Properties
|
||||
|
||||
```python
|
||||
# Filter by mass range
|
||||
min_mass = 600.0
|
||||
max_mass = 4000.0
|
||||
filtered = [p for p in peptides if min_mass <= p.getMonoWeight() <= max_mass]
|
||||
|
||||
# Filter by length
|
||||
min_length = 6
|
||||
max_length = 30
|
||||
length_filtered = [p for p in peptides if min_length <= p.size() <= max_length]
|
||||
|
||||
# Combine filters
|
||||
valid_peptides = [p for p in peptides
|
||||
if min_mass <= p.getMonoWeight() <= max_mass
|
||||
and min_length <= p.size() <= max_length]
|
||||
```
|
||||
|
||||
## Modifications
|
||||
|
||||
### ModificationsDB - Modification Database
|
||||
|
||||
Access and apply post-translational modifications.
|
||||
|
||||
#### Accessing Modifications
|
||||
|
||||
```python
|
||||
# Get modifications database
|
||||
mod_db = oms.ModificationsDB()
|
||||
|
||||
# Get specific modification
|
||||
oxidation = mod_db.getModification("Oxidation")
|
||||
phospho = mod_db.getModification("Phospho")
|
||||
acetyl = mod_db.getModification("Acetyl")
|
||||
|
||||
# Modification properties
|
||||
print(f"Name: {oxidation.getFullName()}")
|
||||
print(f"Mass difference: {oxidation.getDiffMonoMass():.6f} Da")
|
||||
print(f"Formula: {oxidation.getDiffFormula().toString()}")
|
||||
|
||||
# Affected residues
|
||||
print(f"Residues: {oxidation.getResidues()}") # e.g., ['M']
|
||||
|
||||
# Specificity (N-term, C-term, anywhere)
|
||||
print(f"Term specificity: {oxidation.getTermSpecificity()}")
|
||||
```
|
||||
|
||||
#### Common Modifications
|
||||
|
||||
```python
|
||||
# Oxidation (M)
|
||||
oxidation = mod_db.getModification("Oxidation")
|
||||
print(f"Oxidation: +{oxidation.getDiffMonoMass():.4f} Da")
|
||||
|
||||
# Phosphorylation (S, T, Y)
|
||||
phospho = mod_db.getModification("Phospho")
|
||||
print(f"Phospho: +{phospho.getDiffMonoMass():.4f} Da")
|
||||
|
||||
# Carbamidomethylation (C) - common alkylation
|
||||
carbamido = mod_db.getModification("Carbamidomethyl")
|
||||
print(f"Carbamidomethyl: +{carbamido.getDiffMonoMass():.4f} Da")
|
||||
|
||||
# Acetylation (K, N-term)
|
||||
acetyl = mod_db.getModification("Acetyl")
|
||||
print(f"Acetyl: +{acetyl.getDiffMonoMass():.4f} Da")
|
||||
|
||||
# Deamidation (N, Q)
|
||||
deamid = mod_db.getModification("Deamidated")
|
||||
print(f"Deamidation: +{deamid.getDiffMonoMass():.4f} Da")
|
||||
```
|
||||
|
||||
#### Searching Modifications
|
||||
|
||||
```python
|
||||
# Search modifications by mass
|
||||
mass_tolerance = 0.01 # Da
|
||||
target_mass = 15.9949 # Oxidation
|
||||
|
||||
# Get all modifications
|
||||
all_mods = []
|
||||
mod_db.getAllSearchModifications(all_mods)
|
||||
|
||||
# Find matching modifications
|
||||
matching = []
|
||||
for mod_name in all_mods:
|
||||
mod = mod_db.getModification(mod_name)
|
||||
if abs(mod.getDiffMonoMass() - target_mass) < mass_tolerance:
|
||||
matching.append(mod)
|
||||
print(f"Match: {mod.getFullName()} ({mod.getDiffMonoMass():.4f} Da)")
|
||||
```
|
||||
|
||||
#### Variable vs. Fixed Modifications
|
||||
|
||||
```python
|
||||
# In search engines, specify:
|
||||
# Fixed modifications: applied to all occurrences
|
||||
fixed_mods = ["Carbamidomethyl (C)"]
|
||||
|
||||
# Variable modifications: optionally present
|
||||
variable_mods = ["Oxidation (M)", "Phospho (S)", "Phospho (T)", "Phospho (Y)"]
|
||||
```
|
||||
|
||||
## Ribonucleotides (RNA)
|
||||
|
||||
### Ribonucleotide - RNA Building Blocks
|
||||
|
||||
```python
|
||||
# Get ribonucleotide database
|
||||
ribo_db = oms.RibonucleotideDB()
|
||||
|
||||
# Get specific ribonucleotide
|
||||
adenine = ribo_db.getRibonucleotide("A")
|
||||
uracil = ribo_db.getRibonucleotide("U")
|
||||
guanine = ribo_db.getRibonucleotide("G")
|
||||
cytosine = ribo_db.getRibonucleotide("C")
|
||||
|
||||
# Properties
|
||||
print(f"Adenine mono mass: {adenine.getMonoWeight()}")
|
||||
print(f"Formula: {adenine.getFormula().toString()}")
|
||||
|
||||
# Modified ribonucleotides
|
||||
modified_ribo = ribo_db.getRibonucleotide("m6A") # N6-methyladenosine
|
||||
```
|
||||
|
||||
## Practical Examples
|
||||
|
||||
### Calculate Peptide Mass with Modifications
|
||||
|
||||
```python
|
||||
def calculate_peptide_mz(sequence_str, charge):
|
||||
"""Calculate m/z for a peptide sequence string with modifications."""
|
||||
peptide = oms.AASequence.fromString(sequence_str)
|
||||
proton_mass = 1.007276
|
||||
mz = (peptide.getMonoWeight() + charge * proton_mass) / charge
|
||||
return mz
|
||||
|
||||
# Examples
|
||||
print(calculate_peptide_mz("PEPTIDE", 2)) # Unmodified [M+2H]2+
|
||||
print(calculate_peptide_mz("PEPTIDEM(Oxidation)", 2)) # With oxidation
|
||||
print(calculate_peptide_mz("(Acetyl)PEPTIDEK(Acetyl)", 2)) # Acetylated
|
||||
```
|
||||
|
||||
### Generate Complete Fragment Ion Series
|
||||
|
||||
```python
|
||||
def generate_fragment_ions(sequence_str, charge_states=[1, 2]):
|
||||
"""Generate comprehensive fragment ion list."""
|
||||
peptide = oms.AASequence.fromString(sequence_str)
|
||||
proton_mass = 1.007276
|
||||
fragments = []
|
||||
|
||||
for i in range(1, peptide.size()):
|
||||
# b and y ions
|
||||
b_frag = peptide.getPrefix(i)
|
||||
y_frag = peptide.getSuffix(i)
|
||||
|
||||
for z in charge_states:
|
||||
b_mz = (b_frag.getMonoWeight() + z * proton_mass) / z
|
||||
y_mz = (y_frag.getMonoWeight() + z * proton_mass) / z
|
||||
|
||||
fragments.append({
|
||||
'type': 'b',
|
||||
'position': i,
|
||||
'charge': z,
|
||||
'mz': b_mz
|
||||
})
|
||||
fragments.append({
|
||||
'type': 'y',
|
||||
'position': i,
|
||||
'charge': z,
|
||||
'mz': y_mz
|
||||
})
|
||||
|
||||
return fragments
|
||||
|
||||
# Usage
|
||||
ions = generate_fragment_ions("PEPTIDE", charge_states=[1, 2])
|
||||
for ion in ions:
|
||||
print(f"{ion['type']}{ion['position']}^{ion['charge']}+: {ion['mz']:.4f}")
|
||||
```
|
||||
|
||||
### Digest Protein and Calculate Peptide Masses
|
||||
|
||||
```python
|
||||
def digest_and_calculate(protein_seq_str, enzyme="Trypsin", missed_cleavages=2,
|
||||
min_mass=600, max_mass=4000):
|
||||
"""Digest protein and return valid peptides with masses."""
|
||||
dig = oms.ProteaseDigestion()
|
||||
dig.setEnzyme(enzyme)
|
||||
dig.setMissedCleavages(missed_cleavages)
|
||||
|
||||
protein = oms.AASequence.fromString(protein_seq_str)
|
||||
peptides = []
|
||||
dig.digest(protein, peptides)
|
||||
|
||||
results = []
|
||||
for pep in peptides:
|
||||
mass = pep.getMonoWeight()
|
||||
if min_mass <= mass <= max_mass:
|
||||
results.append({
|
||||
'sequence': pep.toString(),
|
||||
'mass': mass,
|
||||
'length': pep.size()
|
||||
})
|
||||
|
||||
return results
|
||||
|
||||
# Usage
|
||||
protein = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEK"
|
||||
peptides = digest_and_calculate(protein)
|
||||
for pep in peptides:
|
||||
print(f"{pep['sequence']}: {pep['mass']:.2f} Da ({pep['length']} aa)")
|
||||
```
|
||||
@@ -1,560 +1,497 @@
|
||||
# pyOpenMS Data Structures Reference
|
||||
# Core Data Structures
|
||||
|
||||
This document provides comprehensive coverage of core data structures in pyOpenMS for representing mass spectrometry data.
|
||||
## Overview
|
||||
|
||||
## Core Hierarchy
|
||||
PyOpenMS uses C++ objects with Python bindings. Understanding these core data structures is essential for effective data manipulation.
|
||||
|
||||
```
|
||||
MSExperiment # Top-level: Complete LC-MS/MS run
|
||||
├── MSSpectrum[] # Collection of mass spectra
|
||||
│ ├── Peak1D[] # Individual m/z, intensity pairs
|
||||
│ └── SpectrumSettings # Metadata (RT, MS level, precursor)
|
||||
└── MSChromatogram[] # Collection of chromatograms
|
||||
├── ChromatogramPeak[] # RT, intensity pairs
|
||||
└── ChromatogramSettings # Metadata
|
||||
```
|
||||
## Spectrum and Experiment Objects
|
||||
|
||||
## MSSpectrum
|
||||
### MSExperiment
|
||||
|
||||
Represents a single mass spectrum (1-dimensional peak data).
|
||||
|
||||
### Creation and Basic Properties
|
||||
Container for complete LC-MS experiment data (spectra and chromatograms).
|
||||
|
||||
```python
|
||||
import pyopenms as oms
|
||||
import pyopenms as ms
|
||||
|
||||
# Create empty spectrum
|
||||
spectrum = oms.MSSpectrum()
|
||||
# Create experiment
|
||||
exp = ms.MSExperiment()
|
||||
|
||||
# Set metadata
|
||||
spectrum.setRT(123.45) # Retention time in seconds
|
||||
spectrum.setMSLevel(1) # MS level (1 for MS1, 2 for MS2, etc.)
|
||||
spectrum.setNativeID("scan=1234") # Native ID from file
|
||||
# Load from file
|
||||
ms.MzMLFile().load("data.mzML", exp)
|
||||
|
||||
# Additional metadata
|
||||
spectrum.setDriftTime(15.2) # Ion mobility drift time
|
||||
spectrum.setName("MyScan") # Optional name
|
||||
```
|
||||
|
||||
### Peak Data Management
|
||||
|
||||
**Setting Peaks (Method 1 - Lists):**
|
||||
```python
|
||||
mz_values = [100.5, 200.3, 300.7, 400.2, 500.1]
|
||||
intensity_values = [1000, 5000, 3000, 2000, 1500]
|
||||
|
||||
spectrum.set_peaks((mz_values, intensity_values))
|
||||
```
|
||||
|
||||
**Setting Peaks (Method 2 - NumPy arrays):**
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
mz_array = np.array([100.5, 200.3, 300.7, 400.2, 500.1])
|
||||
intensity_array = np.array([1000, 5000, 3000, 2000, 1500])
|
||||
|
||||
spectrum.set_peaks((mz_array, intensity_array))
|
||||
```
|
||||
|
||||
**Retrieving Peaks:**
|
||||
```python
|
||||
# Get as numpy arrays (efficient)
|
||||
mz_array, intensity_array = spectrum.get_peaks()
|
||||
|
||||
# Check number of peaks
|
||||
n_peaks = spectrum.size()
|
||||
|
||||
# Get individual peak (slower)
|
||||
for i in range(spectrum.size()):
|
||||
peak = spectrum[i]
|
||||
mz = peak.getMZ()
|
||||
intensity = peak.getIntensity()
|
||||
```
|
||||
|
||||
### Precursor Information (for MS2/MSn spectra)
|
||||
|
||||
```python
|
||||
# Create precursor
|
||||
precursor = oms.Precursor()
|
||||
precursor.setMZ(456.789) # Precursor m/z
|
||||
precursor.setCharge(2) # Precursor charge
|
||||
precursor.setIntensity(50000) # Precursor intensity
|
||||
precursor.setIsolationWindowLowerOffset(1.5) # Lower isolation window
|
||||
precursor.setIsolationWindowUpperOffset(1.5) # Upper isolation window
|
||||
|
||||
# Set activation method
|
||||
activation = oms.Activation()
|
||||
activation.setActivationEnergy(35.0) # Collision energy
|
||||
activation.setMethod(oms.Activation.ActivationMethod.CID)
|
||||
precursor.setActivation(activation)
|
||||
|
||||
# Assign to spectrum
|
||||
spectrum.setPrecursors([precursor])
|
||||
|
||||
# Retrieve precursor information
|
||||
precursors = spectrum.getPrecursors()
|
||||
if len(precursors) > 0:
|
||||
prec = precursors[0]
|
||||
print(f"Precursor m/z: {prec.getMZ()}")
|
||||
print(f"Precursor charge: {prec.getCharge()}")
|
||||
```
|
||||
|
||||
### Spectrum Metadata Access
|
||||
|
||||
```python
|
||||
# Check if spectrum is sorted by m/z
|
||||
is_sorted = spectrum.isSorted()
|
||||
|
||||
# Sort spectrum by m/z
|
||||
spectrum.sortByPosition()
|
||||
|
||||
# Sort by intensity
|
||||
spectrum.sortByIntensity()
|
||||
|
||||
# Clear all peaks
|
||||
spectrum.clear(False) # False = keep metadata, True = clear everything
|
||||
|
||||
# Get retention time
|
||||
rt = spectrum.getRT()
|
||||
|
||||
# Get MS level
|
||||
ms_level = spectrum.getMSLevel()
|
||||
```
|
||||
|
||||
### Spectrum Types and Modes
|
||||
|
||||
```python
|
||||
# Set spectrum type
|
||||
spectrum.setType(oms.SpectrumSettings.SpectrumType.CENTROID) # or PROFILE
|
||||
|
||||
# Get spectrum type
|
||||
spec_type = spectrum.getType()
|
||||
if spec_type == oms.SpectrumSettings.SpectrumType.CENTROID:
|
||||
print("Centroid spectrum")
|
||||
elif spec_type == oms.SpectrumSettings.SpectrumType.PROFILE:
|
||||
print("Profile spectrum")
|
||||
```
|
||||
|
||||
### Data Processing Annotations
|
||||
|
||||
```python
|
||||
# Add processing information
|
||||
processing = oms.DataProcessing()
|
||||
processing.setMetaValue("smoothing", "gaussian")
|
||||
spectrum.setDataProcessing([processing])
|
||||
```
|
||||
|
||||
## MSExperiment
|
||||
|
||||
Represents a complete LC-MS/MS experiment containing multiple spectra and chromatograms.
|
||||
|
||||
### Creation and Population
|
||||
|
||||
```python
|
||||
# Create empty experiment
|
||||
exp = oms.MSExperiment()
|
||||
|
||||
# Add spectra
|
||||
spectrum1 = oms.MSSpectrum()
|
||||
spectrum1.setRT(100.0)
|
||||
spectrum1.set_peaks(([100, 200], [1000, 2000]))
|
||||
|
||||
spectrum2 = oms.MSSpectrum()
|
||||
spectrum2.setRT(200.0)
|
||||
spectrum2.set_peaks(([100, 200], [1500, 2500]))
|
||||
|
||||
exp.addSpectrum(spectrum1)
|
||||
exp.addSpectrum(spectrum2)
|
||||
|
||||
# Add chromatograms
|
||||
chrom = oms.MSChromatogram()
|
||||
chrom.set_peaks(([10.5, 11.0, 11.5], [1000, 5000, 3000]))
|
||||
exp.addChromatogram(chrom)
|
||||
```
|
||||
|
||||
### Accessing Spectra and Chromatograms
|
||||
|
||||
```python
|
||||
# Get number of spectra and chromatograms
|
||||
n_spectra = exp.getNrSpectra()
|
||||
n_chroms = exp.getNrChromatograms()
|
||||
|
||||
# Access by index
|
||||
first_spectrum = exp.getSpectrum(0)
|
||||
last_spectrum = exp.getSpectrum(exp.getNrSpectra() - 1)
|
||||
|
||||
# Iterate over all spectra
|
||||
for spectrum in exp:
|
||||
rt = spectrum.getRT()
|
||||
ms_level = spectrum.getMSLevel()
|
||||
n_peaks = spectrum.size()
|
||||
print(f"RT: {rt:.2f}s, MS{ms_level}, Peaks: {n_peaks}")
|
||||
|
||||
# Get all spectra as list
|
||||
spectra = exp.getSpectra()
|
||||
|
||||
# Access chromatograms
|
||||
chrom = exp.getChromatogram(0)
|
||||
```
|
||||
|
||||
### Filtering Operations
|
||||
|
||||
```python
|
||||
# Filter by MS level
|
||||
exp.filterMSLevel(1) # Keep only MS1 spectra
|
||||
exp.filterMSLevel(2) # Keep only MS2 spectra
|
||||
|
||||
# Filter by retention time range
|
||||
exp.filterRT(100.0, 500.0) # Keep RT between 100-500 seconds
|
||||
|
||||
# Filter by m/z range (all spectra)
|
||||
exp.filterMZ(300.0, 1500.0) # Keep m/z between 300-1500
|
||||
|
||||
# Filter by scan number
|
||||
exp.filterScanNumber(100, 200) # Keep scans 100-200
|
||||
```
|
||||
|
||||
### Metadata and Properties
|
||||
|
||||
```python
|
||||
# Set experiment metadata
|
||||
exp.setMetaValue("operator", "John Doe")
|
||||
exp.setMetaValue("instrument", "Q Exactive HF")
|
||||
|
||||
# Get metadata
|
||||
operator = exp.getMetaValue("operator")
|
||||
# Access properties
|
||||
print(f"Number of spectra: {exp.getNrSpectra()}")
|
||||
print(f"Number of chromatograms: {exp.getNrChromatograms()}")
|
||||
|
||||
# Get RT range
|
||||
rt_range = exp.getMinRT(), exp.getMaxRT()
|
||||
rts = [spec.getRT() for spec in exp]
|
||||
print(f"RT range: {min(rts):.1f} - {max(rts):.1f} seconds")
|
||||
|
||||
# Get m/z range
|
||||
mz_range = exp.getMinMZ(), exp.getMaxMZ()
|
||||
# Access individual spectrum
|
||||
spec = exp.getSpectrum(0)
|
||||
|
||||
# Clear all data
|
||||
exp.clear(False) # False = keep metadata
|
||||
# Iterate through spectra
|
||||
for spec in exp:
|
||||
if spec.getMSLevel() == 2:
|
||||
print(f"MS2 spectrum at RT {spec.getRT():.2f}")
|
||||
|
||||
# Get metadata
|
||||
exp_settings = exp.getExperimentalSettings()
|
||||
instrument = exp_settings.getInstrument()
|
||||
print(f"Instrument: {instrument.getName()}")
|
||||
```
|
||||
|
||||
### Sorting and Organization
|
||||
### MSSpectrum
|
||||
|
||||
Individual mass spectrum with m/z and intensity arrays.
|
||||
|
||||
```python
|
||||
# Sort spectra by retention time
|
||||
exp.sortSpectra()
|
||||
# Create empty spectrum
|
||||
spec = ms.MSSpectrum()
|
||||
|
||||
# Update ranges (call after modifications)
|
||||
exp.updateRanges()
|
||||
# Get from experiment
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load("data.mzML", exp)
|
||||
spec = exp.getSpectrum(0)
|
||||
|
||||
# Check if experiment is empty
|
||||
is_empty = exp.empty()
|
||||
# Basic properties
|
||||
print(f"MS level: {spec.getMSLevel()}")
|
||||
print(f"Retention time: {spec.getRT():.2f} seconds")
|
||||
print(f"Number of peaks: {spec.size()}")
|
||||
|
||||
# Reset (clear everything)
|
||||
exp.reset()
|
||||
# Get peak data as numpy arrays
|
||||
mz, intensity = spec.get_peaks()
|
||||
print(f"m/z range: {mz.min():.2f} - {mz.max():.2f}")
|
||||
print(f"Max intensity: {intensity.max():.0f}")
|
||||
|
||||
# Access individual peaks
|
||||
for i in range(min(5, spec.size())): # First 5 peaks
|
||||
print(f"Peak {i}: m/z={mz[i]:.4f}, intensity={intensity[i]:.0f}")
|
||||
|
||||
# Precursor information (for MS2)
|
||||
if spec.getMSLevel() == 2:
|
||||
precursors = spec.getPrecursors()
|
||||
if precursors:
|
||||
precursor = precursors[0]
|
||||
print(f"Precursor m/z: {precursor.getMZ():.4f}")
|
||||
print(f"Precursor charge: {precursor.getCharge()}")
|
||||
print(f"Precursor intensity: {precursor.getIntensity():.0f}")
|
||||
|
||||
# Set peak data
|
||||
new_mz = [100.0, 200.0, 300.0]
|
||||
new_intensity = [1000.0, 2000.0, 1500.0]
|
||||
spec.set_peaks((new_mz, new_intensity))
|
||||
```
|
||||
|
||||
## MSChromatogram
|
||||
### MSChromatogram
|
||||
|
||||
Represents an extracted or reconstructed chromatogram (retention time vs. intensity).
|
||||
|
||||
### Creation and Basic Usage
|
||||
Chromatographic trace (TIC, XIC, or SRM transition).
|
||||
|
||||
```python
|
||||
# Create chromatogram
|
||||
chrom = oms.MSChromatogram()
|
||||
# Access chromatogram from experiment
|
||||
for chrom in exp.getChromatograms():
|
||||
print(f"Chromatogram ID: {chrom.getNativeID()}")
|
||||
|
||||
# Set peaks (RT, intensity pairs)
|
||||
rt_values = [10.0, 10.5, 11.0, 11.5, 12.0]
|
||||
intensity_values = [1000, 5000, 8000, 6000, 2000]
|
||||
chrom.set_peaks((rt_values, intensity_values))
|
||||
# Get data
|
||||
rt, intensity = chrom.get_peaks()
|
||||
|
||||
# Get peaks
|
||||
rt_array, int_array = chrom.get_peaks()
|
||||
print(f" RT points: {len(rt)}")
|
||||
print(f" Max intensity: {intensity.max():.0f}")
|
||||
|
||||
# Get size
|
||||
n_points = chrom.size()
|
||||
# Precursor info (for XIC)
|
||||
precursor = chrom.getPrecursor()
|
||||
print(f" Precursor m/z: {precursor.getMZ():.4f}")
|
||||
```
|
||||
|
||||
### Chromatogram Types
|
||||
|
||||
```python
|
||||
# Set chromatogram type
|
||||
chrom.setChromatogramType(oms.ChromatogramSettings.ChromatogramType.SELECTED_ION_CURRENT_CHROMATOGRAM)
|
||||
|
||||
# Other types:
|
||||
# - TOTAL_ION_CURRENT_CHROMATOGRAM
|
||||
# - BASEPEAK_CHROMATOGRAM
|
||||
# - SELECTED_ION_CURRENT_CHROMATOGRAM
|
||||
# - SELECTED_REACTION_MONITORING_CHROMATOGRAM
|
||||
```
|
||||
|
||||
### Metadata
|
||||
|
||||
```python
|
||||
# Set native ID
|
||||
chrom.setNativeID("TIC")
|
||||
|
||||
# Set name
|
||||
chrom.setName("Total Ion Current")
|
||||
|
||||
# Access
|
||||
native_id = chrom.getNativeID()
|
||||
name = chrom.getName()
|
||||
```
|
||||
|
||||
### Precursor and Product Information (for SRM/MRM)
|
||||
|
||||
```python
|
||||
# For targeted experiments
|
||||
precursor = oms.Precursor()
|
||||
precursor.setMZ(456.7)
|
||||
chrom.setPrecursor(precursor)
|
||||
|
||||
product = oms.Product()
|
||||
product.setMZ(789.4)
|
||||
chrom.setProduct(product)
|
||||
```
|
||||
|
||||
## Peak1D and ChromatogramPeak
|
||||
|
||||
Individual peak data points.
|
||||
|
||||
### Peak1D (for mass spectra)
|
||||
|
||||
```python
|
||||
# Create individual peak
|
||||
peak = oms.Peak1D()
|
||||
peak.setMZ(456.789)
|
||||
peak.setIntensity(10000)
|
||||
|
||||
# Access
|
||||
mz = peak.getMZ()
|
||||
intensity = peak.getIntensity()
|
||||
|
||||
# Set position and intensity
|
||||
peak.setPosition([456.789])
|
||||
peak.setIntensity(10000)
|
||||
```
|
||||
|
||||
### ChromatogramPeak (for chromatograms)
|
||||
|
||||
```python
|
||||
# Create chromatogram peak
|
||||
chrom_peak = oms.ChromatogramPeak()
|
||||
chrom_peak.setRT(125.5)
|
||||
chrom_peak.setIntensity(5000)
|
||||
|
||||
# Access
|
||||
rt = chrom_peak.getRT()
|
||||
intensity = chrom_peak.getIntensity()
|
||||
```
|
||||
|
||||
## FeatureMap and Feature
|
||||
|
||||
For quantification results.
|
||||
## Feature Objects
|
||||
|
||||
### Feature
|
||||
|
||||
Represents a detected LC-MS feature (peptide or metabolite signal).
|
||||
Detected chromatographic peak with 2D spatial extent (RT-m/z).
|
||||
|
||||
```python
|
||||
# Create feature
|
||||
feature = oms.Feature()
|
||||
# Load features
|
||||
feature_map = ms.FeatureMap()
|
||||
ms.FeatureXMLFile().load("features.featureXML", feature_map)
|
||||
|
||||
# Set properties
|
||||
feature.setMZ(456.789)
|
||||
feature.setRT(123.45)
|
||||
feature.setIntensity(1000000)
|
||||
feature.setCharge(2)
|
||||
feature.setWidth(15.0) # RT width in seconds
|
||||
# Access individual feature
|
||||
feature = feature_map[0]
|
||||
|
||||
# Set quality score
|
||||
feature.setOverallQuality(0.95)
|
||||
# Core properties
|
||||
print(f"m/z: {feature.getMZ():.4f}")
|
||||
print(f"RT: {feature.getRT():.2f} seconds")
|
||||
print(f"Intensity: {feature.getIntensity():.0f}")
|
||||
print(f"Charge: {feature.getCharge()}")
|
||||
|
||||
# Access
|
||||
mz = feature.getMZ()
|
||||
rt = feature.getRT()
|
||||
intensity = feature.getIntensity()
|
||||
charge = feature.getCharge()
|
||||
# Quality metrics
|
||||
print(f"Overall quality: {feature.getOverallQuality():.3f}")
|
||||
print(f"Width (RT): {feature.getWidth():.2f}")
|
||||
|
||||
# Convex hull (spatial extent)
|
||||
hull = feature.getConvexHull()
|
||||
print(f"Hull points: {hull.getHullPoints().size()}")
|
||||
|
||||
# Bounding box
|
||||
bbox = hull.getBoundingBox()
|
||||
print(f"RT range: {bbox.minPosition()[0]:.2f} - {bbox.maxPosition()[0]:.2f}")
|
||||
print(f"m/z range: {bbox.minPosition()[1]:.4f} - {bbox.maxPosition()[1]:.4f}")
|
||||
|
||||
# Subordinate features (isotopes)
|
||||
subordinates = feature.getSubordinates()
|
||||
if subordinates:
|
||||
print(f"Isotopic features: {len(subordinates)}")
|
||||
for sub in subordinates:
|
||||
print(f" m/z: {sub.getMZ():.4f}, intensity: {sub.getIntensity():.0f}")
|
||||
|
||||
# Metadata values
|
||||
if feature.metaValueExists("label"):
|
||||
label = feature.getMetaValue("label")
|
||||
print(f"Label: {label}")
|
||||
```
|
||||
|
||||
### FeatureMap
|
||||
|
||||
Collection of features.
|
||||
Collection of features from a single LC-MS run.
|
||||
|
||||
```python
|
||||
# Create feature map
|
||||
feature_map = oms.FeatureMap()
|
||||
feature_map = ms.FeatureMap()
|
||||
|
||||
# Add features
|
||||
feature1 = oms.Feature()
|
||||
feature1.setMZ(456.789)
|
||||
feature1.setRT(123.45)
|
||||
feature1.setIntensity(1000000)
|
||||
# Load from file
|
||||
ms.FeatureXMLFile().load("features.featureXML", feature_map)
|
||||
|
||||
feature_map.push_back(feature1)
|
||||
# Access properties
|
||||
print(f"Number of features: {feature_map.size()}")
|
||||
|
||||
# Get size
|
||||
n_features = feature_map.size()
|
||||
# Get unique features
|
||||
print(f"Unique features: {feature_map.getUniqueId()}")
|
||||
|
||||
# Iterate
|
||||
# Metadata
|
||||
primary_path = feature_map.getPrimaryMSRunPath()
|
||||
if primary_path:
|
||||
print(f"Source file: {primary_path[0].decode()}")
|
||||
|
||||
# Iterate through features
|
||||
for feature in feature_map:
|
||||
print(f"m/z: {feature.getMZ():.4f}, RT: {feature.getRT():.2f}")
|
||||
print(f"Feature: m/z={feature.getMZ():.4f}, RT={feature.getRT():.2f}")
|
||||
|
||||
# Access by index
|
||||
first_feature = feature_map[0]
|
||||
# Add new feature
|
||||
new_feature = ms.Feature()
|
||||
new_feature.setMZ(500.0)
|
||||
new_feature.setRT(300.0)
|
||||
new_feature.setIntensity(10000.0)
|
||||
feature_map.push_back(new_feature)
|
||||
|
||||
# Clear
|
||||
feature_map.clear()
|
||||
# Sort features
|
||||
feature_map.sortByRT() # or sortByMZ(), sortByIntensity()
|
||||
|
||||
# Export to pandas
|
||||
df = feature_map.get_df()
|
||||
print(df.head())
|
||||
```
|
||||
|
||||
## PeptideIdentification and ProteinIdentification
|
||||
|
||||
For identification results.
|
||||
|
||||
### PeptideIdentification
|
||||
|
||||
```python
|
||||
# Create peptide identification
|
||||
pep_id = oms.PeptideIdentification()
|
||||
pep_id.setRT(123.45)
|
||||
pep_id.setMZ(456.789)
|
||||
|
||||
# Create peptide hit
|
||||
hit = oms.PeptideHit()
|
||||
hit.setSequence(oms.AASequence.fromString("PEPTIDE"))
|
||||
hit.setCharge(2)
|
||||
hit.setScore(25.5)
|
||||
hit.setRank(1)
|
||||
|
||||
# Add to identification
|
||||
pep_id.setHits([hit])
|
||||
pep_id.setHigherScoreBetter(True)
|
||||
pep_id.setScoreType("XCorr")
|
||||
|
||||
# Access
|
||||
hits = pep_id.getHits()
|
||||
for hit in hits:
|
||||
seq = hit.getSequence().toString()
|
||||
score = hit.getScore()
|
||||
print(f"Sequence: {seq}, Score: {score}")
|
||||
```
|
||||
|
||||
### ProteinIdentification
|
||||
|
||||
```python
|
||||
# Create protein identification
|
||||
prot_id = oms.ProteinIdentification()
|
||||
|
||||
# Create protein hit
|
||||
prot_hit = oms.ProteinHit()
|
||||
prot_hit.setAccession("P12345")
|
||||
prot_hit.setSequence("MKTAYIAKQRQISFVK...")
|
||||
prot_hit.setScore(100.5)
|
||||
|
||||
# Add to identification
|
||||
prot_id.setHits([prot_hit])
|
||||
prot_id.setScoreType("Mascot Score")
|
||||
prot_id.setHigherScoreBetter(True)
|
||||
|
||||
# Search parameters
|
||||
search_params = oms.ProteinIdentification.SearchParameters()
|
||||
search_params.db = "uniprot_human.fasta"
|
||||
search_params.enzyme = "Trypsin"
|
||||
prot_id.setSearchParameters(search_params)
|
||||
```
|
||||
|
||||
## ConsensusMap and ConsensusFeature
|
||||
|
||||
For linking features across multiple samples.
|
||||
|
||||
### ConsensusFeature
|
||||
|
||||
```python
|
||||
# Create consensus feature
|
||||
cons_feature = oms.ConsensusFeature()
|
||||
cons_feature.setMZ(456.789)
|
||||
cons_feature.setRT(123.45)
|
||||
cons_feature.setIntensity(5000000) # Combined intensity
|
||||
Feature linked across multiple samples.
|
||||
|
||||
# Access linked features
|
||||
for handle in cons_feature.getFeatureList():
|
||||
map_index = handle.getMapIndex()
|
||||
feature_index = handle.getIndex()
|
||||
```python
|
||||
# Load consensus map
|
||||
consensus_map = ms.ConsensusMap()
|
||||
ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map)
|
||||
|
||||
# Access consensus feature
|
||||
cons_feature = consensus_map[0]
|
||||
|
||||
# Consensus properties
|
||||
print(f"Consensus m/z: {cons_feature.getMZ():.4f}")
|
||||
print(f"Consensus RT: {cons_feature.getRT():.2f}")
|
||||
print(f"Consensus intensity: {cons_feature.getIntensity():.0f}")
|
||||
|
||||
# Get feature handles (individual map features)
|
||||
feature_list = cons_feature.getFeatureList()
|
||||
print(f"Present in {len(feature_list)} maps")
|
||||
|
||||
for handle in feature_list:
|
||||
map_idx = handle.getMapIndex()
|
||||
intensity = handle.getIntensity()
|
||||
mz = handle.getMZ()
|
||||
rt = handle.getRT()
|
||||
|
||||
print(f" Map {map_idx}: m/z={mz:.4f}, RT={rt:.2f}, intensity={intensity:.0f}")
|
||||
|
||||
# Get unique ID in originating map
|
||||
for handle in feature_list:
|
||||
unique_id = handle.getUniqueId()
|
||||
print(f"Unique ID: {unique_id}")
|
||||
```
|
||||
|
||||
### ConsensusMap
|
||||
|
||||
Collection of consensus features across samples.
|
||||
|
||||
```python
|
||||
# Create consensus map
|
||||
consensus_map = oms.ConsensusMap()
|
||||
consensus_map = ms.ConsensusMap()
|
||||
|
||||
# Add consensus features
|
||||
consensus_map.push_back(cons_feature)
|
||||
# Load from file
|
||||
ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map)
|
||||
|
||||
# Iterate
|
||||
for cons_feat in consensus_map:
|
||||
mz = cons_feat.getMZ()
|
||||
rt = cons_feat.getRT()
|
||||
n_features = cons_feat.size() # Number of linked features
|
||||
# Access properties
|
||||
print(f"Consensus features: {consensus_map.size()}")
|
||||
|
||||
# Column headers (file descriptions)
|
||||
headers = consensus_map.getColumnHeaders()
|
||||
print(f"Number of files: {len(headers)}")
|
||||
|
||||
for map_idx, description in headers.items():
|
||||
print(f"Map {map_idx}:")
|
||||
print(f" Filename: {description.filename}")
|
||||
print(f" Label: {description.label}")
|
||||
print(f" Size: {description.size}")
|
||||
|
||||
# Iterate through consensus features
|
||||
for cons_feature in consensus_map:
|
||||
print(f"Consensus feature: m/z={cons_feature.getMZ():.4f}")
|
||||
|
||||
# Export to DataFrame
|
||||
df = consensus_map.get_df()
|
||||
```
|
||||
|
||||
## Identification Objects
|
||||
|
||||
### PeptideIdentification
|
||||
|
||||
Identification results for a single spectrum.
|
||||
|
||||
```python
|
||||
# Load identifications
|
||||
protein_ids = []
|
||||
peptide_ids = []
|
||||
ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids)
|
||||
|
||||
# Access peptide identification
|
||||
peptide_id = peptide_ids[0]
|
||||
|
||||
# Spectrum metadata
|
||||
print(f"RT: {peptide_id.getRT():.2f}")
|
||||
print(f"m/z: {peptide_id.getMZ():.4f}")
|
||||
|
||||
# Identification metadata
|
||||
print(f"Identifier: {peptide_id.getIdentifier()}")
|
||||
print(f"Score type: {peptide_id.getScoreType()}")
|
||||
print(f"Higher score better: {peptide_id.isHigherScoreBetter()}")
|
||||
|
||||
# Get peptide hits
|
||||
hits = peptide_id.getHits()
|
||||
print(f"Number of hits: {len(hits)}")
|
||||
|
||||
for hit in hits:
|
||||
print(f" Sequence: {hit.getSequence().toString()}")
|
||||
print(f" Score: {hit.getScore()}")
|
||||
print(f" Charge: {hit.getCharge()}")
|
||||
```
|
||||
|
||||
### PeptideHit
|
||||
|
||||
Individual peptide match to a spectrum.
|
||||
|
||||
```python
|
||||
# Access hit
|
||||
hit = peptide_id.getHits()[0]
|
||||
|
||||
# Sequence information
|
||||
sequence = hit.getSequence()
|
||||
print(f"Sequence: {sequence.toString()}")
|
||||
print(f"Mass: {sequence.getMonoWeight():.4f}")
|
||||
|
||||
# Score and rank
|
||||
print(f"Score: {hit.getScore()}")
|
||||
print(f"Rank: {hit.getRank()}")
|
||||
|
||||
# Charge state
|
||||
print(f"Charge: {hit.getCharge()}")
|
||||
|
||||
# Protein accessions
|
||||
accessions = hit.extractProteinAccessionsSet()
|
||||
for acc in accessions:
|
||||
print(f"Protein: {acc.decode()}")
|
||||
|
||||
# Meta values (additional scores, errors)
|
||||
if hit.metaValueExists("MS:1002252"): # mass error
|
||||
mass_error = hit.getMetaValue("MS:1002252")
|
||||
print(f"Mass error: {mass_error:.4f} ppm")
|
||||
```
|
||||
|
||||
### ProteinIdentification
|
||||
|
||||
Protein-level identification information.
|
||||
|
||||
```python
|
||||
# Access protein identification
|
||||
protein_id = protein_ids[0]
|
||||
|
||||
# Search engine info
|
||||
print(f"Search engine: {protein_id.getSearchEngine()}")
|
||||
print(f"Search engine version: {protein_id.getSearchEngineVersion()}")
|
||||
|
||||
# Search parameters
|
||||
search_params = protein_id.getSearchParameters()
|
||||
print(f"Database: {search_params.db}")
|
||||
print(f"Enzyme: {search_params.digestion_enzyme.getName()}")
|
||||
print(f"Missed cleavages: {search_params.missed_cleavages}")
|
||||
print(f"Precursor tolerance: {search_params.precursor_mass_tolerance}")
|
||||
|
||||
# Protein hits
|
||||
hits = protein_id.getHits()
|
||||
for hit in hits:
|
||||
print(f"Accession: {hit.getAccession()}")
|
||||
print(f"Score: {hit.getScore()}")
|
||||
print(f"Coverage: {hit.getCoverage():.1f}%")
|
||||
```
|
||||
|
||||
### ProteinHit
|
||||
|
||||
Individual protein identification.
|
||||
|
||||
```python
|
||||
# Access protein hit
|
||||
protein_hit = protein_id.getHits()[0]
|
||||
|
||||
# Protein information
|
||||
print(f"Accession: {protein_hit.getAccession()}")
|
||||
print(f"Description: {protein_hit.getDescription()}")
|
||||
print(f"Sequence: {protein_hit.getSequence()}")
|
||||
|
||||
# Scoring
|
||||
print(f"Score: {protein_hit.getScore()}")
|
||||
print(f"Coverage: {protein_hit.getCoverage():.1f}%")
|
||||
|
||||
# Rank
|
||||
print(f"Rank: {protein_hit.getRank()}")
|
||||
```
|
||||
|
||||
## Sequence Objects
|
||||
|
||||
### AASequence
|
||||
|
||||
Amino acid sequence with modifications.
|
||||
|
||||
```python
|
||||
# Create sequence from string
|
||||
seq = ms.AASequence.fromString("PEPTIDE")
|
||||
|
||||
# Basic properties
|
||||
print(f"Sequence: {seq.toString()}")
|
||||
print(f"Length: {seq.size()}")
|
||||
print(f"Monoisotopic mass: {seq.getMonoWeight():.4f}")
|
||||
print(f"Average mass: {seq.getAverageWeight():.4f}")
|
||||
|
||||
# Individual residues
|
||||
for i in range(seq.size()):
|
||||
residue = seq.getResidue(i)
|
||||
print(f"Position {i}: {residue.getOneLetterCode()}")
|
||||
print(f" Mass: {residue.getMonoWeight():.4f}")
|
||||
print(f" Formula: {residue.getFormula().toString()}")
|
||||
|
||||
# Modified sequence
|
||||
mod_seq = ms.AASequence.fromString("PEPTIDEM(Oxidation)K")
|
||||
print(f"Modified: {mod_seq.isModified()}")
|
||||
|
||||
# Check modifications
|
||||
for i in range(mod_seq.size()):
|
||||
residue = mod_seq.getResidue(i)
|
||||
if residue.isModified():
|
||||
print(f"Modification at {i}: {residue.getModificationName()}")
|
||||
|
||||
# N-terminal and C-terminal modifications
|
||||
term_mod_seq = ms.AASequence.fromString("(Acetyl)PEPTIDE(Amidated)")
|
||||
```
|
||||
|
||||
### EmpiricalFormula
|
||||
|
||||
Molecular formula representation.
|
||||
|
||||
```python
|
||||
# Create formula
|
||||
formula = ms.EmpiricalFormula("C6H12O6") # Glucose
|
||||
|
||||
# Properties
|
||||
print(f"Formula: {formula.toString()}")
|
||||
print(f"Monoisotopic mass: {formula.getMonoWeight():.4f}")
|
||||
print(f"Average mass: {formula.getAverageWeight():.4f}")
|
||||
|
||||
# Element composition
|
||||
print(f"Carbon atoms: {formula.getNumberOf(b'C')}")
|
||||
print(f"Hydrogen atoms: {formula.getNumberOf(b'H')}")
|
||||
print(f"Oxygen atoms: {formula.getNumberOf(b'O')}")
|
||||
|
||||
# Arithmetic operations
|
||||
formula2 = ms.EmpiricalFormula("H2O")
|
||||
combined = formula + formula2 # Add water
|
||||
print(f"Combined: {combined.toString()}")
|
||||
```
|
||||
|
||||
## Parameter Objects
|
||||
|
||||
### Param
|
||||
|
||||
Generic parameter container used by algorithms.
|
||||
|
||||
```python
|
||||
# Get algorithm parameters
|
||||
algo = ms.GaussFilter()
|
||||
params = algo.getParameters()
|
||||
|
||||
# List all parameters
|
||||
for key in params.keys():
|
||||
value = params.getValue(key)
|
||||
print(f"{key}: {value}")
|
||||
|
||||
# Get specific parameter
|
||||
gaussian_width = params.getValue("gaussian_width")
|
||||
print(f"Gaussian width: {gaussian_width}")
|
||||
|
||||
# Set parameter
|
||||
params.setValue("gaussian_width", 0.2)
|
||||
|
||||
# Apply modified parameters
|
||||
algo.setParameters(params)
|
||||
|
||||
# Copy parameters
|
||||
params_copy = ms.Param(params)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Use numpy arrays** for peak data when possible - much faster than individual peak access
|
||||
2. **Sort spectra** by position (m/z) before searching or filtering
|
||||
3. **Update ranges** after modifying MSExperiment: `exp.updateRanges()`
|
||||
4. **Check MS level** before processing - different algorithms for MS1 vs MS2
|
||||
5. **Validate precursor info** for MS2 spectra - ensure charge and m/z are set
|
||||
6. **Use appropriate containers** - MSExperiment for raw data, FeatureMap for quantification
|
||||
7. **Clear metadata carefully** - use `clear(False)` to preserve metadata when clearing peaks
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Create MS2 Spectrum with Precursor
|
||||
### Memory Management
|
||||
|
||||
```python
|
||||
spectrum = oms.MSSpectrum()
|
||||
spectrum.setRT(205.2)
|
||||
spectrum.setMSLevel(2)
|
||||
spectrum.set_peaks(([100, 200, 300], [1000, 5000, 3000]))
|
||||
# For large files, use indexed access instead of full loading
|
||||
indexed_mzml = ms.IndexedMzMLFileLoader()
|
||||
indexed_mzml.load("large_file.mzML")
|
||||
|
||||
precursor = oms.Precursor()
|
||||
precursor.setMZ(450.5)
|
||||
precursor.setCharge(2)
|
||||
spectrum.setPrecursors([precursor])
|
||||
# Access specific spectrum without loading entire file
|
||||
spec = indexed_mzml.getSpectrumById(100)
|
||||
```
|
||||
|
||||
### Extract MS1 Spectra from Experiment
|
||||
### Type Conversion
|
||||
|
||||
```python
|
||||
ms1_exp = oms.MSExperiment()
|
||||
for spectrum in exp:
|
||||
if spectrum.getMSLevel() == 1:
|
||||
ms1_exp.addSpectrum(spectrum)
|
||||
# Convert peak arrays to numpy
|
||||
import numpy as np
|
||||
|
||||
mz, intensity = spec.get_peaks()
|
||||
# These are already numpy arrays
|
||||
|
||||
# Can perform numpy operations
|
||||
filtered_mz = mz[intensity > 1000]
|
||||
```
|
||||
|
||||
### Calculate Total Ion Current (TIC)
|
||||
### Object Copying
|
||||
|
||||
```python
|
||||
tic_values = []
|
||||
rt_values = []
|
||||
for spectrum in exp:
|
||||
if spectrum.getMSLevel() == 1:
|
||||
mz, intensity = spectrum.get_peaks()
|
||||
tic = np.sum(intensity)
|
||||
tic_values.append(tic)
|
||||
rt_values.append(spectrum.getRT())
|
||||
```
|
||||
|
||||
### Find Spectrum Closest to RT
|
||||
|
||||
```python
|
||||
target_rt = 125.5
|
||||
closest_spectrum = None
|
||||
min_diff = float('inf')
|
||||
|
||||
for spectrum in exp:
|
||||
diff = abs(spectrum.getRT() - target_rt)
|
||||
if diff < min_diff:
|
||||
min_diff = diff
|
||||
closest_spectrum = spectrum
|
||||
# Create deep copy
|
||||
exp_copy = ms.MSExperiment(exp)
|
||||
|
||||
# Modifications to copy don't affect original
|
||||
```
|
||||
|
||||
410
scientific-packages/pyopenms/references/feature_detection.md
Normal file
410
scientific-packages/pyopenms/references/feature_detection.md
Normal file
@@ -0,0 +1,410 @@
|
||||
# Feature Detection and Linking
|
||||
|
||||
## Overview
|
||||
|
||||
Feature detection identifies persistent signals (chromatographic peaks) in LC-MS data. Feature linking combines features across multiple samples for quantitative comparison.
|
||||
|
||||
## Feature Detection Basics
|
||||
|
||||
A feature represents a chromatographic peak characterized by:
|
||||
- m/z value (mass-to-charge ratio)
|
||||
- Retention time (RT)
|
||||
- Intensity
|
||||
- Quality score
|
||||
- Convex hull (spatial extent in RT-m/z space)
|
||||
|
||||
## Feature Finding
|
||||
|
||||
### Feature Finder Multiples (FFM)
|
||||
|
||||
Standard algorithm for feature detection in centroided data:
|
||||
|
||||
```python
|
||||
import pyopenms as ms
|
||||
|
||||
# Load centroided data
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load("centroided.mzML", exp)
|
||||
|
||||
# Create feature finder
|
||||
ff = ms.FeatureFinder()
|
||||
|
||||
# Get default parameters
|
||||
params = ff.getParameters("centroided")
|
||||
|
||||
# Modify key parameters
|
||||
params.setValue("mass_trace:mz_tolerance", 10.0) # ppm
|
||||
params.setValue("mass_trace:min_spectra", 7) # Min scans per feature
|
||||
params.setValue("isotopic_pattern:charge_low", 1)
|
||||
params.setValue("isotopic_pattern:charge_high", 4)
|
||||
|
||||
# Run feature detection
|
||||
features = ms.FeatureMap()
|
||||
ff.run("centroided", exp, features, params, ms.FeatureMap())
|
||||
|
||||
print(f"Detected {features.size()} features")
|
||||
|
||||
# Save features
|
||||
ms.FeatureXMLFile().store("features.featureXML", features)
|
||||
```
|
||||
|
||||
### Feature Finder for Metabolomics
|
||||
|
||||
Optimized for small molecules:
|
||||
|
||||
```python
|
||||
# Create feature finder for metabolomics
|
||||
ff = ms.FeatureFinder()
|
||||
|
||||
# Get metabolomics-specific parameters
|
||||
params = ff.getParameters("centroided")
|
||||
|
||||
# Configure for metabolomics
|
||||
params.setValue("mass_trace:mz_tolerance", 5.0) # Lower tolerance
|
||||
params.setValue("mass_trace:min_spectra", 5)
|
||||
params.setValue("isotopic_pattern:charge_low", 1) # Mostly singly charged
|
||||
params.setValue("isotopic_pattern:charge_high", 2)
|
||||
|
||||
# Run detection
|
||||
features = ms.FeatureMap()
|
||||
ff.run("centroided", exp, features, params, ms.FeatureMap())
|
||||
```
|
||||
|
||||
## Accessing Feature Data
|
||||
|
||||
### Iterate Through Features
|
||||
|
||||
```python
|
||||
# Load features
|
||||
feature_map = ms.FeatureMap()
|
||||
ms.FeatureXMLFile().load("features.featureXML", feature_map)
|
||||
|
||||
# Access individual features
|
||||
for feature in feature_map:
|
||||
print(f"m/z: {feature.getMZ():.4f}")
|
||||
print(f"RT: {feature.getRT():.2f}")
|
||||
print(f"Intensity: {feature.getIntensity():.0f}")
|
||||
print(f"Charge: {feature.getCharge()}")
|
||||
print(f"Quality: {feature.getOverallQuality():.3f}")
|
||||
print(f"Width (RT): {feature.getWidth():.2f}")
|
||||
|
||||
# Get convex hull
|
||||
hull = feature.getConvexHull()
|
||||
print(f"Hull points: {hull.getHullPoints().size()}")
|
||||
```
|
||||
|
||||
### Feature Subordinates (Isotope Pattern)
|
||||
|
||||
```python
|
||||
# Access isotopic pattern
|
||||
for feature in feature_map:
|
||||
# Get subordinate features (isotopes)
|
||||
subordinates = feature.getSubordinates()
|
||||
|
||||
if subordinates:
|
||||
print(f"Main feature m/z: {feature.getMZ():.4f}")
|
||||
for sub in subordinates:
|
||||
print(f" Isotope m/z: {sub.getMZ():.4f}")
|
||||
print(f" Isotope intensity: {sub.getIntensity():.0f}")
|
||||
```
|
||||
|
||||
### Export to Pandas
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
# Convert to DataFrame
|
||||
df = feature_map.get_df()
|
||||
|
||||
print(df.columns)
|
||||
# Typical columns: RT, mz, intensity, charge, quality
|
||||
|
||||
# Analyze features
|
||||
print(f"Mean intensity: {df['intensity'].mean()}")
|
||||
print(f"RT range: {df['RT'].min():.1f} - {df['RT'].max():.1f}")
|
||||
```
|
||||
|
||||
## Feature Linking
|
||||
|
||||
### Map Alignment
|
||||
|
||||
Align retention times before linking:
|
||||
|
||||
```python
|
||||
# Load multiple feature maps
|
||||
fm1 = ms.FeatureMap()
|
||||
fm2 = ms.FeatureMap()
|
||||
ms.FeatureXMLFile().load("sample1.featureXML", fm1)
|
||||
ms.FeatureXMLFile().load("sample2.featureXML", fm2)
|
||||
|
||||
# Create aligner
|
||||
aligner = ms.MapAlignmentAlgorithmPoseClustering()
|
||||
|
||||
# Align maps
|
||||
fm_aligned = []
|
||||
transformations = []
|
||||
aligner.align([fm1, fm2], fm_aligned, transformations)
|
||||
```
|
||||
|
||||
### Feature Linking Algorithm
|
||||
|
||||
Link features across samples:
|
||||
|
||||
```python
|
||||
# Create feature grouping algorithm
|
||||
grouper = ms.FeatureGroupingAlgorithmQT()
|
||||
|
||||
# Configure parameters
|
||||
params = grouper.getParameters()
|
||||
params.setValue("distance_RT:max_difference", 30.0) # Max RT difference (s)
|
||||
params.setValue("distance_MZ:max_difference", 10.0) # Max m/z difference (ppm)
|
||||
params.setValue("distance_MZ:unit", "ppm")
|
||||
grouper.setParameters(params)
|
||||
|
||||
# Prepare feature maps
|
||||
feature_maps = [fm1, fm2, fm3]
|
||||
|
||||
# Create consensus map
|
||||
consensus_map = ms.ConsensusMap()
|
||||
|
||||
# Link features
|
||||
grouper.group(feature_maps, consensus_map)
|
||||
|
||||
print(f"Created {consensus_map.size()} consensus features")
|
||||
|
||||
# Save consensus map
|
||||
ms.ConsensusXMLFile().store("consensus.consensusXML", consensus_map)
|
||||
```
|
||||
|
||||
## Consensus Features
|
||||
|
||||
### Access Consensus Data
|
||||
|
||||
```python
|
||||
# Load consensus map
|
||||
consensus_map = ms.ConsensusMap()
|
||||
ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map)
|
||||
|
||||
# Iterate through consensus features
|
||||
for cons_feature in consensus_map:
|
||||
print(f"Consensus m/z: {cons_feature.getMZ():.4f}")
|
||||
print(f"Consensus RT: {cons_feature.getRT():.2f}")
|
||||
|
||||
# Get features from individual maps
|
||||
for handle in cons_feature.getFeatureList():
|
||||
map_idx = handle.getMapIndex()
|
||||
intensity = handle.getIntensity()
|
||||
print(f" Sample {map_idx}: intensity {intensity:.0f}")
|
||||
```
|
||||
|
||||
### Consensus Map Metadata
|
||||
|
||||
```python
|
||||
# Access file descriptions (map metadata)
|
||||
file_descriptions = consensus_map.getColumnHeaders()
|
||||
|
||||
for map_idx, description in file_descriptions.items():
|
||||
print(f"Map {map_idx}:")
|
||||
print(f" Filename: {description.filename}")
|
||||
print(f" Label: {description.label}")
|
||||
print(f" Size: {description.size}")
|
||||
```
|
||||
|
||||
## Adduct Detection
|
||||
|
||||
Identify different ionization forms of the same molecule:
|
||||
|
||||
```python
|
||||
# Create adduct detector
|
||||
adduct_detector = ms.MetaboliteAdductDecharger()
|
||||
|
||||
# Configure parameters
|
||||
params = adduct_detector.getParameters()
|
||||
params.setValue("potential_adducts", "[M+H]+,[M+Na]+,[M+K]+,[M-H]-")
|
||||
params.setValue("charge_min", 1)
|
||||
params.setValue("charge_max", 1)
|
||||
params.setValue("max_neutrals", 1)
|
||||
adduct_detector.setParameters(params)
|
||||
|
||||
# Detect adducts
|
||||
feature_map_out = ms.FeatureMap()
|
||||
adduct_detector.compute(feature_map, feature_map_out, ms.ConsensusMap())
|
||||
```
|
||||
|
||||
## Complete Feature Detection Workflow
|
||||
|
||||
### End-to-End Example
|
||||
|
||||
```python
|
||||
import pyopenms as ms
|
||||
|
||||
def feature_detection_workflow(input_files, output_consensus):
|
||||
"""
|
||||
Complete workflow: feature detection and linking across samples.
|
||||
|
||||
Args:
|
||||
input_files: List of mzML file paths
|
||||
output_consensus: Output consensusXML file path
|
||||
"""
|
||||
|
||||
feature_maps = []
|
||||
|
||||
# Step 1: Detect features in each file
|
||||
for mzml_file in input_files:
|
||||
print(f"Processing {mzml_file}...")
|
||||
|
||||
# Load experiment
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load(mzml_file, exp)
|
||||
|
||||
# Find features
|
||||
ff = ms.FeatureFinder()
|
||||
params = ff.getParameters("centroided")
|
||||
params.setValue("mass_trace:mz_tolerance", 10.0)
|
||||
params.setValue("mass_trace:min_spectra", 7)
|
||||
|
||||
features = ms.FeatureMap()
|
||||
ff.run("centroided", exp, features, params, ms.FeatureMap())
|
||||
|
||||
# Store filename in feature map
|
||||
features.setPrimaryMSRunPath([mzml_file.encode()])
|
||||
|
||||
feature_maps.append(features)
|
||||
print(f" Found {features.size()} features")
|
||||
|
||||
# Step 2: Align retention times
|
||||
print("Aligning retention times...")
|
||||
aligner = ms.MapAlignmentAlgorithmPoseClustering()
|
||||
aligned_maps = []
|
||||
transformations = []
|
||||
aligner.align(feature_maps, aligned_maps, transformations)
|
||||
|
||||
# Step 3: Link features
|
||||
print("Linking features across samples...")
|
||||
grouper = ms.FeatureGroupingAlgorithmQT()
|
||||
params = grouper.getParameters()
|
||||
params.setValue("distance_RT:max_difference", 30.0)
|
||||
params.setValue("distance_MZ:max_difference", 10.0)
|
||||
params.setValue("distance_MZ:unit", "ppm")
|
||||
grouper.setParameters(params)
|
||||
|
||||
consensus_map = ms.ConsensusMap()
|
||||
grouper.group(aligned_maps, consensus_map)
|
||||
|
||||
# Save results
|
||||
ms.ConsensusXMLFile().store(output_consensus, consensus_map)
|
||||
|
||||
print(f"Created {consensus_map.size()} consensus features")
|
||||
print(f"Results saved to {output_consensus}")
|
||||
|
||||
return consensus_map
|
||||
|
||||
# Run workflow
|
||||
input_files = ["sample1.mzML", "sample2.mzML", "sample3.mzML"]
|
||||
consensus = feature_detection_workflow(input_files, "consensus.consensusXML")
|
||||
```
|
||||
|
||||
## Feature Filtering
|
||||
|
||||
### Filter by Quality
|
||||
|
||||
```python
|
||||
# Filter features by quality score
|
||||
filtered_features = ms.FeatureMap()
|
||||
|
||||
for feature in feature_map:
|
||||
if feature.getOverallQuality() > 0.5: # Quality threshold
|
||||
filtered_features.push_back(feature)
|
||||
|
||||
print(f"Kept {filtered_features.size()} high-quality features")
|
||||
```
|
||||
|
||||
### Filter by Intensity
|
||||
|
||||
```python
|
||||
# Keep only intense features
|
||||
min_intensity = 10000
|
||||
|
||||
filtered_features = ms.FeatureMap()
|
||||
for feature in feature_map:
|
||||
if feature.getIntensity() >= min_intensity:
|
||||
filtered_features.push_back(feature)
|
||||
```
|
||||
|
||||
### Filter by m/z Range
|
||||
|
||||
```python
|
||||
# Extract features in specific m/z range
|
||||
mz_min = 200.0
|
||||
mz_max = 800.0
|
||||
|
||||
filtered_features = ms.FeatureMap()
|
||||
for feature in feature_map:
|
||||
mz = feature.getMZ()
|
||||
if mz_min <= mz <= mz_max:
|
||||
filtered_features.push_back(feature)
|
||||
```
|
||||
|
||||
## Feature Annotation
|
||||
|
||||
### Add Identification Information
|
||||
|
||||
```python
|
||||
# Annotate features with peptide identifications
|
||||
# Load identifications
|
||||
protein_ids = []
|
||||
peptide_ids = []
|
||||
ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids)
|
||||
|
||||
# Create ID mapper
|
||||
mapper = ms.IDMapper()
|
||||
|
||||
# Map IDs to features
|
||||
mapper.annotate(feature_map, peptide_ids, protein_ids)
|
||||
|
||||
# Check annotations
|
||||
for feature in feature_map:
|
||||
peptide_ids_for_feature = feature.getPeptideIdentifications()
|
||||
if peptide_ids_for_feature:
|
||||
print(f"Feature at {feature.getMZ():.4f} m/z identified")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Parameter Optimization
|
||||
|
||||
Optimize parameters for your data type:
|
||||
|
||||
```python
|
||||
# Test different tolerance values
|
||||
mz_tolerances = [5.0, 10.0, 20.0] # ppm
|
||||
|
||||
for tol in mz_tolerances:
|
||||
ff = ms.FeatureFinder()
|
||||
params = ff.getParameters("centroided")
|
||||
params.setValue("mass_trace:mz_tolerance", tol)
|
||||
|
||||
features = ms.FeatureMap()
|
||||
ff.run("centroided", exp, features, params, ms.FeatureMap())
|
||||
|
||||
print(f"Tolerance {tol} ppm: {features.size()} features")
|
||||
```
|
||||
|
||||
### Visual Inspection
|
||||
|
||||
Export features for visualization:
|
||||
|
||||
```python
|
||||
# Convert to DataFrame for plotting
|
||||
df = feature_map.get_df()
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
plt.figure(figsize=(10, 6))
|
||||
plt.scatter(df['RT'], df['mz'], s=df['intensity']/1000, alpha=0.5)
|
||||
plt.xlabel('Retention Time (s)')
|
||||
plt.ylabel('m/z')
|
||||
plt.title('Feature Map')
|
||||
plt.colorbar(label='Intensity (scaled)')
|
||||
plt.show()
|
||||
```
|
||||
349
scientific-packages/pyopenms/references/file_io.md
Normal file
349
scientific-packages/pyopenms/references/file_io.md
Normal file
@@ -0,0 +1,349 @@
|
||||
# File I/O and Data Formats
|
||||
|
||||
## Overview
|
||||
|
||||
PyOpenMS supports multiple mass spectrometry file formats for reading and writing. This guide covers file handling strategies and format-specific operations.
|
||||
|
||||
## Supported Formats
|
||||
|
||||
### Spectrum Data Formats
|
||||
|
||||
- **mzML**: Standard XML-based format for mass spectrometry data
|
||||
- **mzXML**: Earlier XML-based format
|
||||
- **mzData**: XML format (deprecated but supported)
|
||||
|
||||
### Identification Formats
|
||||
|
||||
- **idXML**: OpenMS native identification format
|
||||
- **mzIdentML**: Standard XML format for identification data
|
||||
- **pepXML**: X! Tandem format
|
||||
- **protXML**: Protein identification format
|
||||
|
||||
### Feature and Quantitation Formats
|
||||
|
||||
- **featureXML**: OpenMS format for detected features
|
||||
- **consensusXML**: Format for consensus features across samples
|
||||
- **mzTab**: Tab-delimited format for reporting
|
||||
|
||||
### Sequence and Library Formats
|
||||
|
||||
- **FASTA**: Protein/peptide sequences
|
||||
- **TraML**: Transition lists for targeted experiments
|
||||
|
||||
## Reading mzML Files
|
||||
|
||||
### In-Memory Loading
|
||||
|
||||
Load entire file into memory (suitable for smaller files):
|
||||
|
||||
```python
|
||||
import pyopenms as ms
|
||||
|
||||
# Create experiment container
|
||||
exp = ms.MSExperiment()
|
||||
|
||||
# Load file
|
||||
ms.MzMLFile().load("sample.mzML", exp)
|
||||
|
||||
# Access data
|
||||
print(f"Spectra: {exp.getNrSpectra()}")
|
||||
print(f"Chromatograms: {exp.getNrChromatograms()}")
|
||||
```
|
||||
|
||||
### Indexed Access
|
||||
|
||||
Efficient random access for large files:
|
||||
|
||||
```python
|
||||
# Create indexed access
|
||||
indexed_mzml = ms.IndexedMzMLFileLoader()
|
||||
indexed_mzml.load("large_file.mzML")
|
||||
|
||||
# Get specific spectrum by index
|
||||
spec = indexed_mzml.getSpectrumById(100)
|
||||
|
||||
# Access by native ID
|
||||
spec = indexed_mzml.getSpectrumByNativeId("scan=5000")
|
||||
```
|
||||
|
||||
### Streaming Access
|
||||
|
||||
Memory-efficient processing for very large files:
|
||||
|
||||
```python
|
||||
# Define consumer function
|
||||
class SpectrumProcessor(ms.MSExperimentConsumer):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.count = 0
|
||||
|
||||
def consumeSpectrum(self, spec):
|
||||
# Process spectrum
|
||||
if spec.getMSLevel() == 2:
|
||||
self.count += 1
|
||||
|
||||
# Stream file
|
||||
consumer = SpectrumProcessor()
|
||||
ms.MzMLFile().transform("large.mzML", consumer)
|
||||
print(f"Processed {consumer.count} MS2 spectra")
|
||||
```
|
||||
|
||||
### Cached Access
|
||||
|
||||
Balance between memory usage and speed:
|
||||
|
||||
```python
|
||||
# Use on-disk caching
|
||||
options = ms.CachedmzML()
|
||||
options.setMetaDataOnly(False)
|
||||
|
||||
exp = ms.MSExperiment()
|
||||
ms.CachedmzMLHandler().load("sample.mzML", exp, options)
|
||||
```
|
||||
|
||||
## Writing mzML Files
|
||||
|
||||
### Basic Writing
|
||||
|
||||
```python
|
||||
# Create or modify experiment
|
||||
exp = ms.MSExperiment()
|
||||
# ... add spectra ...
|
||||
|
||||
# Write to file
|
||||
ms.MzMLFile().store("output.mzML", exp)
|
||||
```
|
||||
|
||||
### Compression Options
|
||||
|
||||
```python
|
||||
# Configure compression
|
||||
file_handler = ms.MzMLFile()
|
||||
|
||||
options = ms.PeakFileOptions()
|
||||
options.setCompression(True) # Enable compression
|
||||
file_handler.setOptions(options)
|
||||
|
||||
file_handler.store("compressed.mzML", exp)
|
||||
```
|
||||
|
||||
## Reading Identification Data
|
||||
|
||||
### idXML Format
|
||||
|
||||
```python
|
||||
# Load identification results
|
||||
protein_ids = []
|
||||
peptide_ids = []
|
||||
|
||||
ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids)
|
||||
|
||||
# Access peptide identifications
|
||||
for peptide_id in peptide_ids:
|
||||
print(f"RT: {peptide_id.getRT()}")
|
||||
print(f"MZ: {peptide_id.getMZ()}")
|
||||
|
||||
# Get peptide hits
|
||||
for hit in peptide_id.getHits():
|
||||
print(f" Sequence: {hit.getSequence().toString()}")
|
||||
print(f" Score: {hit.getScore()}")
|
||||
print(f" Charge: {hit.getCharge()}")
|
||||
```
|
||||
|
||||
### mzIdentML Format
|
||||
|
||||
```python
|
||||
# Read mzIdentML
|
||||
protein_ids = []
|
||||
peptide_ids = []
|
||||
|
||||
ms.MzIdentMLFile().load("results.mzid", protein_ids, peptide_ids)
|
||||
```
|
||||
|
||||
### pepXML Format
|
||||
|
||||
```python
|
||||
# Load pepXML
|
||||
protein_ids = []
|
||||
peptide_ids = []
|
||||
|
||||
ms.PepXMLFile().load("results.pep.xml", protein_ids, peptide_ids)
|
||||
```
|
||||
|
||||
## Reading Feature Data
|
||||
|
||||
### featureXML
|
||||
|
||||
```python
|
||||
# Load features
|
||||
feature_map = ms.FeatureMap()
|
||||
ms.FeatureXMLFile().load("features.featureXML", feature_map)
|
||||
|
||||
# Access features
|
||||
for feature in feature_map:
|
||||
print(f"RT: {feature.getRT()}")
|
||||
print(f"MZ: {feature.getMZ()}")
|
||||
print(f"Intensity: {feature.getIntensity()}")
|
||||
print(f"Quality: {feature.getOverallQuality()}")
|
||||
```
|
||||
|
||||
### consensusXML
|
||||
|
||||
```python
|
||||
# Load consensus features
|
||||
consensus_map = ms.ConsensusMap()
|
||||
ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map)
|
||||
|
||||
# Access consensus features
|
||||
for consensus_feature in consensus_map:
|
||||
print(f"RT: {consensus_feature.getRT()}")
|
||||
print(f"MZ: {consensus_feature.getMZ()}")
|
||||
|
||||
# Get feature handles (sub-features from different maps)
|
||||
for handle in consensus_feature.getFeatureList():
|
||||
map_index = handle.getMapIndex()
|
||||
intensity = handle.getIntensity()
|
||||
print(f" Map {map_index}: {intensity}")
|
||||
```
|
||||
|
||||
## Reading FASTA Files
|
||||
|
||||
```python
|
||||
# Load protein sequences
|
||||
fasta_entries = []
|
||||
ms.FASTAFile().load("database.fasta", fasta_entries)
|
||||
|
||||
for entry in fasta_entries:
|
||||
print(f"Identifier: {entry.identifier}")
|
||||
print(f"Description: {entry.description}")
|
||||
print(f"Sequence: {entry.sequence}")
|
||||
```
|
||||
|
||||
## Reading TraML Files
|
||||
|
||||
```python
|
||||
# Load transition lists for targeted experiments
|
||||
targeted_exp = ms.TargetedExperiment()
|
||||
ms.TraMLFile().load("transitions.TraML", targeted_exp)
|
||||
|
||||
# Access transitions
|
||||
for transition in targeted_exp.getTransitions():
|
||||
print(f"Precursor MZ: {transition.getPrecursorMZ()}")
|
||||
print(f"Product MZ: {transition.getProductMZ()}")
|
||||
```
|
||||
|
||||
## Writing mzTab Files
|
||||
|
||||
```python
|
||||
# Create mzTab for reporting
|
||||
mztab = ms.MzTab()
|
||||
|
||||
# Add metadata
|
||||
metadata = mztab.getMetaData()
|
||||
metadata.mz_tab_version.set("1.0.0")
|
||||
metadata.title.set("Proteomics Analysis Results")
|
||||
|
||||
# Add protein data
|
||||
protein_section = mztab.getProteinSectionRows()
|
||||
# ... populate protein data ...
|
||||
|
||||
# Write to file
|
||||
ms.MzTabFile().store("report.mzTab", mztab)
|
||||
```
|
||||
|
||||
## Format Conversion
|
||||
|
||||
### mzXML to mzML
|
||||
|
||||
```python
|
||||
# Read mzXML
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzXMLFile().load("data.mzXML", exp)
|
||||
|
||||
# Write as mzML
|
||||
ms.MzMLFile().store("data.mzML", exp)
|
||||
```
|
||||
|
||||
### Extract Chromatograms from mzML
|
||||
|
||||
```python
|
||||
# Load experiment
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load("data.mzML", exp)
|
||||
|
||||
# Extract specific chromatogram
|
||||
for chrom in exp.getChromatograms():
|
||||
if chrom.getNativeID() == "TIC":
|
||||
rt, intensity = chrom.get_peaks()
|
||||
print(f"TIC has {len(rt)} data points")
|
||||
```
|
||||
|
||||
## File Metadata
|
||||
|
||||
### Access mzML Metadata
|
||||
|
||||
```python
|
||||
# Load file
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load("sample.mzML", exp)
|
||||
|
||||
# Get experimental settings
|
||||
exp_settings = exp.getExperimentalSettings()
|
||||
|
||||
# Instrument info
|
||||
instrument = exp_settings.getInstrument()
|
||||
print(f"Instrument: {instrument.getName()}")
|
||||
print(f"Model: {instrument.getModel()}")
|
||||
|
||||
# Sample info
|
||||
sample = exp_settings.getSample()
|
||||
print(f"Sample name: {sample.getName()}")
|
||||
|
||||
# Source files
|
||||
for source_file in exp_settings.getSourceFiles():
|
||||
print(f"Source: {source_file.getNameOfFile()}")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Memory Management
|
||||
|
||||
For large files:
|
||||
1. Use indexed or streaming access instead of full in-memory loading
|
||||
2. Process data in chunks
|
||||
3. Clear data structures when no longer needed
|
||||
|
||||
```python
|
||||
# Good for large files
|
||||
indexed_mzml = ms.IndexedMzMLFileLoader()
|
||||
indexed_mzml.load("huge_file.mzML")
|
||||
|
||||
# Process spectra one at a time
|
||||
for i in range(indexed_mzml.getNrSpectra()):
|
||||
spec = indexed_mzml.getSpectrumById(i)
|
||||
# Process spectrum
|
||||
# Spectrum automatically cleaned up after processing
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
try:
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load("data.mzML", exp)
|
||||
except Exception as e:
|
||||
print(f"Failed to load file: {e}")
|
||||
```
|
||||
|
||||
### File Validation
|
||||
|
||||
```python
|
||||
# Check if file exists and is readable
|
||||
import os
|
||||
|
||||
if os.path.exists("data.mzML") and os.path.isfile("data.mzML"):
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load("data.mzML", exp)
|
||||
else:
|
||||
print("File not found")
|
||||
```
|
||||
422
scientific-packages/pyopenms/references/identification.md
Normal file
422
scientific-packages/pyopenms/references/identification.md
Normal file
@@ -0,0 +1,422 @@
|
||||
# Peptide and Protein Identification
|
||||
|
||||
## Overview
|
||||
|
||||
PyOpenMS supports peptide/protein identification through integration with search engines and provides tools for post-processing identification results including FDR control, protein inference, and annotation.
|
||||
|
||||
## Supported Search Engines
|
||||
|
||||
PyOpenMS integrates with these search engines:
|
||||
|
||||
- **Comet**: Fast tandem MS search
|
||||
- **Mascot**: Commercial search engine
|
||||
- **MSGFPlus**: Spectral probability-based search
|
||||
- **XTandem**: Open-source search tool
|
||||
- **OMSSA**: NCBI search engine
|
||||
- **Myrimatch**: High-throughput search
|
||||
- **MSFragger**: Ultra-fast search
|
||||
|
||||
## Reading Identification Data
|
||||
|
||||
### idXML Format
|
||||
|
||||
```python
|
||||
import pyopenms as ms
|
||||
|
||||
# Load identification results
|
||||
protein_ids = []
|
||||
peptide_ids = []
|
||||
|
||||
ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids)
|
||||
|
||||
print(f"Protein identifications: {len(protein_ids)}")
|
||||
print(f"Peptide identifications: {len(peptide_ids)}")
|
||||
```
|
||||
|
||||
### Access Peptide Identifications
|
||||
|
||||
```python
|
||||
# Iterate through peptide IDs
|
||||
for peptide_id in peptide_ids:
|
||||
# Spectrum metadata
|
||||
print(f"RT: {peptide_id.getRT():.2f}")
|
||||
print(f"m/z: {peptide_id.getMZ():.4f}")
|
||||
|
||||
# Get peptide hits (ranked by score)
|
||||
hits = peptide_id.getHits()
|
||||
print(f"Number of hits: {len(hits)}")
|
||||
|
||||
for hit in hits:
|
||||
sequence = hit.getSequence()
|
||||
print(f" Sequence: {sequence.toString()}")
|
||||
print(f" Score: {hit.getScore()}")
|
||||
print(f" Charge: {hit.getCharge()}")
|
||||
print(f" Mass error (ppm): {hit.getMetaValue('mass_error_ppm')}")
|
||||
|
||||
# Get modifications
|
||||
if sequence.isModified():
|
||||
for i in range(sequence.size()):
|
||||
residue = sequence.getResidue(i)
|
||||
if residue.isModified():
|
||||
print(f" Modification at position {i}: {residue.getModificationName()}")
|
||||
```
|
||||
|
||||
### Access Protein Identifications
|
||||
|
||||
```python
|
||||
# Access protein-level information
|
||||
for protein_id in protein_ids:
|
||||
# Search parameters
|
||||
search_params = protein_id.getSearchParameters()
|
||||
print(f"Search engine: {protein_id.getSearchEngine()}")
|
||||
print(f"Database: {search_params.db}")
|
||||
|
||||
# Protein hits
|
||||
hits = protein_id.getHits()
|
||||
for hit in hits:
|
||||
print(f" Accession: {hit.getAccession()}")
|
||||
print(f" Score: {hit.getScore()}")
|
||||
print(f" Coverage: {hit.getCoverage()}")
|
||||
print(f" Sequence: {hit.getSequence()}")
|
||||
```
|
||||
|
||||
## False Discovery Rate (FDR)
|
||||
|
||||
### FDR Filtering
|
||||
|
||||
Apply FDR filtering to control false positives:
|
||||
|
||||
```python
|
||||
# Create FDR object
|
||||
fdr = ms.FalseDiscoveryRate()
|
||||
|
||||
# Apply FDR at PSM level
|
||||
fdr.apply(peptide_ids)
|
||||
|
||||
# Filter by FDR threshold
|
||||
fdr_threshold = 0.01 # 1% FDR
|
||||
filtered_peptide_ids = []
|
||||
|
||||
for peptide_id in peptide_ids:
|
||||
# Keep hits below FDR threshold
|
||||
filtered_hits = []
|
||||
for hit in peptide_id.getHits():
|
||||
if hit.getScore() <= fdr_threshold: # Lower score = better
|
||||
filtered_hits.append(hit)
|
||||
|
||||
if filtered_hits:
|
||||
peptide_id.setHits(filtered_hits)
|
||||
filtered_peptide_ids.append(peptide_id)
|
||||
|
||||
print(f"Peptides passing FDR: {len(filtered_peptide_ids)}")
|
||||
```
|
||||
|
||||
### Score Transformation
|
||||
|
||||
Convert scores to q-values:
|
||||
|
||||
```python
|
||||
# Apply score transformation
|
||||
fdr.apply(peptide_ids)
|
||||
|
||||
# Access q-values
|
||||
for peptide_id in peptide_ids:
|
||||
for hit in peptide_id.getHits():
|
||||
q_value = hit.getMetaValue("q-value")
|
||||
print(f"Sequence: {hit.getSequence().toString()}, q-value: {q_value}")
|
||||
```
|
||||
|
||||
## Protein Inference
|
||||
|
||||
### ID Mapper
|
||||
|
||||
Map peptide identifications to proteins:
|
||||
|
||||
```python
|
||||
# Create mapper
|
||||
mapper = ms.IDMapper()
|
||||
|
||||
# Map to features
|
||||
feature_map = ms.FeatureMap()
|
||||
ms.FeatureXMLFile().load("features.featureXML", feature_map)
|
||||
|
||||
# Annotate features with IDs
|
||||
mapper.annotate(feature_map, peptide_ids, protein_ids)
|
||||
|
||||
# Check annotated features
|
||||
for feature in feature_map:
|
||||
pep_ids = feature.getPeptideIdentifications()
|
||||
if pep_ids:
|
||||
for pep_id in pep_ids:
|
||||
for hit in pep_id.getHits():
|
||||
print(f"Feature {feature.getMZ():.4f}: {hit.getSequence().toString()}")
|
||||
```
|
||||
|
||||
### Protein Grouping
|
||||
|
||||
Group proteins by shared peptides:
|
||||
|
||||
```python
|
||||
# Create protein inference algorithm
|
||||
inference = ms.BasicProteinInferenceAlgorithm()
|
||||
|
||||
# Run inference
|
||||
inference.run(peptide_ids, protein_ids)
|
||||
|
||||
# Access protein groups
|
||||
for protein_id in protein_ids:
|
||||
hits = protein_id.getHits()
|
||||
if len(hits) > 1:
|
||||
print("Protein group:")
|
||||
for hit in hits:
|
||||
print(f" {hit.getAccession()}")
|
||||
```
|
||||
|
||||
## Peptide Sequence Handling
|
||||
|
||||
### AASequence Object
|
||||
|
||||
Work with peptide sequences:
|
||||
|
||||
```python
|
||||
# Create peptide sequence
|
||||
seq = ms.AASequence.fromString("PEPTIDE")
|
||||
|
||||
print(f"Sequence: {seq.toString()}")
|
||||
print(f"Monoisotopic mass: {seq.getMonoWeight():.4f}")
|
||||
print(f"Average mass: {seq.getAverageWeight():.4f}")
|
||||
print(f"Length: {seq.size()}")
|
||||
|
||||
# Access individual amino acids
|
||||
for i in range(seq.size()):
|
||||
residue = seq.getResidue(i)
|
||||
print(f"Position {i}: {residue.getOneLetterCode()}, mass: {residue.getMonoWeight():.4f}")
|
||||
```
|
||||
|
||||
### Modified Sequences
|
||||
|
||||
Handle post-translational modifications:
|
||||
|
||||
```python
|
||||
# Sequence with modifications
|
||||
mod_seq = ms.AASequence.fromString("PEPTIDEM(Oxidation)K")
|
||||
|
||||
print(f"Modified sequence: {mod_seq.toString()}")
|
||||
print(f"Mass with mods: {mod_seq.getMonoWeight():.4f}")
|
||||
|
||||
# Check if modified
|
||||
print(f"Is modified: {mod_seq.isModified()}")
|
||||
|
||||
# Get modification info
|
||||
for i in range(mod_seq.size()):
|
||||
residue = mod_seq.getResidue(i)
|
||||
if residue.isModified():
|
||||
print(f"Residue {residue.getOneLetterCode()} at position {i}")
|
||||
print(f" Modification: {residue.getModificationName()}")
|
||||
```
|
||||
|
||||
### Peptide Digestion
|
||||
|
||||
Simulate enzymatic digestion:
|
||||
|
||||
```python
|
||||
# Create digestion enzyme
|
||||
enzyme = ms.ProteaseDigestion()
|
||||
enzyme.setEnzyme("Trypsin")
|
||||
|
||||
# Set missed cleavages
|
||||
enzyme.setMissedCleavages(2)
|
||||
|
||||
# Digest protein sequence
|
||||
protein_seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL"
|
||||
|
||||
# Get peptides
|
||||
peptides = []
|
||||
enzyme.digest(ms.AASequence.fromString(protein_seq), peptides)
|
||||
|
||||
print(f"Generated {len(peptides)} peptides")
|
||||
for peptide in peptides[:5]: # Show first 5
|
||||
print(f" {peptide.toString()}, mass: {peptide.getMonoWeight():.2f}")
|
||||
```
|
||||
|
||||
## Theoretical Spectrum Generation
|
||||
|
||||
### Fragment Ion Calculation
|
||||
|
||||
Generate theoretical fragment ions:
|
||||
|
||||
```python
|
||||
# Create peptide
|
||||
peptide = ms.AASequence.fromString("PEPTIDE")
|
||||
|
||||
# Generate b and y ions
|
||||
fragments = []
|
||||
ms.TheoreticalSpectrumGenerator().getSpectrum(fragments, peptide, 1, 1)
|
||||
|
||||
print(f"Generated {fragments.size()} fragment ions")
|
||||
|
||||
# Access fragments
|
||||
mz, intensity = fragments.get_peaks()
|
||||
for m, i in zip(mz[:10], intensity[:10]): # Show first 10
|
||||
print(f"m/z: {m:.4f}, intensity: {i}")
|
||||
```
|
||||
|
||||
## Complete Identification Workflow
|
||||
|
||||
### End-to-End Example
|
||||
|
||||
```python
|
||||
import pyopenms as ms
|
||||
|
||||
def identification_workflow(spectrum_file, fasta_file, output_file):
|
||||
"""
|
||||
Complete identification workflow with FDR control.
|
||||
|
||||
Args:
|
||||
spectrum_file: Input mzML file
|
||||
fasta_file: Protein database (FASTA)
|
||||
output_file: Output idXML file
|
||||
"""
|
||||
|
||||
# Step 1: Load spectra
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load(spectrum_file, exp)
|
||||
print(f"Loaded {exp.getNrSpectra()} spectra")
|
||||
|
||||
# Step 2: Configure search parameters
|
||||
search_params = ms.SearchParameters()
|
||||
search_params.db = fasta_file
|
||||
search_params.precursor_mass_tolerance = 10.0 # ppm
|
||||
search_params.fragment_mass_tolerance = 0.5 # Da
|
||||
search_params.enzyme = "Trypsin"
|
||||
search_params.missed_cleavages = 2
|
||||
search_params.modifications = ["Oxidation (M)", "Carbamidomethyl (C)"]
|
||||
|
||||
# Step 3: Run search (example with Comet adapter)
|
||||
# Note: Requires search engine to be installed
|
||||
# comet = ms.CometAdapter()
|
||||
# protein_ids, peptide_ids = comet.search(exp, search_params)
|
||||
|
||||
# For this example, load pre-computed results
|
||||
protein_ids = []
|
||||
peptide_ids = []
|
||||
ms.IdXMLFile().load("raw_identifications.idXML", protein_ids, peptide_ids)
|
||||
|
||||
print(f"Initial peptide IDs: {len(peptide_ids)}")
|
||||
|
||||
# Step 4: Apply FDR filtering
|
||||
fdr = ms.FalseDiscoveryRate()
|
||||
fdr.apply(peptide_ids)
|
||||
|
||||
# Filter by 1% FDR
|
||||
filtered_peptide_ids = []
|
||||
for peptide_id in peptide_ids:
|
||||
filtered_hits = []
|
||||
for hit in peptide_id.getHits():
|
||||
q_value = hit.getMetaValue("q-value")
|
||||
if q_value <= 0.01:
|
||||
filtered_hits.append(hit)
|
||||
|
||||
if filtered_hits:
|
||||
peptide_id.setHits(filtered_hits)
|
||||
filtered_peptide_ids.append(peptide_id)
|
||||
|
||||
print(f"Peptides after FDR (1%): {len(filtered_peptide_ids)}")
|
||||
|
||||
# Step 5: Protein inference
|
||||
inference = ms.BasicProteinInferenceAlgorithm()
|
||||
inference.run(filtered_peptide_ids, protein_ids)
|
||||
|
||||
print(f"Identified proteins: {len(protein_ids)}")
|
||||
|
||||
# Step 6: Save results
|
||||
ms.IdXMLFile().store(output_file, protein_ids, filtered_peptide_ids)
|
||||
print(f"Results saved to {output_file}")
|
||||
|
||||
return protein_ids, filtered_peptide_ids
|
||||
|
||||
# Run workflow
|
||||
protein_ids, peptide_ids = identification_workflow(
|
||||
"spectra.mzML",
|
||||
"database.fasta",
|
||||
"identifications_fdr.idXML"
|
||||
)
|
||||
```
|
||||
|
||||
## Spectral Library Search
|
||||
|
||||
### Library Matching
|
||||
|
||||
```python
|
||||
# Load spectral library
|
||||
library = ms.MSPFile()
|
||||
library_spectra = []
|
||||
library.load("spectral_library.msp", library_spectra)
|
||||
|
||||
# Load experimental spectra
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load("data.mzML", exp)
|
||||
|
||||
# Compare spectra
|
||||
spectra_compare = ms.SpectraSTSimilarityScore()
|
||||
|
||||
for exp_spec in exp:
|
||||
if exp_spec.getMSLevel() == 2:
|
||||
best_match_score = 0
|
||||
best_match_lib = None
|
||||
|
||||
for lib_spec in library_spectra:
|
||||
score = spectra_compare.operator()(exp_spec, lib_spec)
|
||||
if score > best_match_score:
|
||||
best_match_score = score
|
||||
best_match_lib = lib_spec
|
||||
|
||||
if best_match_score > 0.7: # Threshold
|
||||
print(f"Match found: score {best_match_score:.3f}")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Decoy Database
|
||||
|
||||
Use target-decoy approach for FDR calculation:
|
||||
|
||||
```python
|
||||
# Generate decoy database
|
||||
decoy_generator = ms.DecoyGenerator()
|
||||
|
||||
# Load target database
|
||||
fasta_entries = []
|
||||
ms.FASTAFile().load("target.fasta", fasta_entries)
|
||||
|
||||
# Generate decoys
|
||||
decoy_entries = []
|
||||
for entry in fasta_entries:
|
||||
decoy_entry = decoy_generator.reverseProtein(entry)
|
||||
decoy_entries.append(decoy_entry)
|
||||
|
||||
# Save combined database
|
||||
all_entries = fasta_entries + decoy_entries
|
||||
ms.FASTAFile().store("target_decoy.fasta", all_entries)
|
||||
```
|
||||
|
||||
### Score Interpretation
|
||||
|
||||
Understand score types from different engines:
|
||||
|
||||
```python
|
||||
# Interpret scores based on search engine
|
||||
for peptide_id in peptide_ids:
|
||||
search_engine = peptide_id.getIdentifier()
|
||||
|
||||
for hit in peptide_id.getHits():
|
||||
score = hit.getScore()
|
||||
|
||||
# Score interpretation varies by engine
|
||||
if "Comet" in search_engine:
|
||||
# Comet: higher E-value = worse
|
||||
print(f"E-value: {score}")
|
||||
elif "Mascot" in search_engine:
|
||||
# Mascot: higher score = better
|
||||
print(f"Ion score: {score}")
|
||||
```
|
||||
482
scientific-packages/pyopenms/references/metabolomics.md
Normal file
482
scientific-packages/pyopenms/references/metabolomics.md
Normal file
@@ -0,0 +1,482 @@
|
||||
# Metabolomics Workflows
|
||||
|
||||
## Overview
|
||||
|
||||
PyOpenMS provides specialized tools for untargeted metabolomics analysis including feature detection optimized for small molecules, adduct grouping, compound identification, and integration with metabolomics databases.
|
||||
|
||||
## Untargeted Metabolomics Pipeline
|
||||
|
||||
### Complete Workflow
|
||||
|
||||
```python
|
||||
import pyopenms as ms
|
||||
|
||||
def metabolomics_pipeline(input_files, output_dir):
|
||||
"""
|
||||
Complete untargeted metabolomics workflow.
|
||||
|
||||
Args:
|
||||
input_files: List of mzML file paths (one per sample)
|
||||
output_dir: Directory for output files
|
||||
"""
|
||||
|
||||
# Step 1: Peak picking and feature detection
|
||||
feature_maps = []
|
||||
|
||||
for mzml_file in input_files:
|
||||
print(f"Processing {mzml_file}...")
|
||||
|
||||
# Load data
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load(mzml_file, exp)
|
||||
|
||||
# Peak picking if needed
|
||||
if not exp.getSpectrum(0).isSorted():
|
||||
picker = ms.PeakPickerHiRes()
|
||||
exp_picked = ms.MSExperiment()
|
||||
picker.pickExperiment(exp, exp_picked)
|
||||
exp = exp_picked
|
||||
|
||||
# Feature detection
|
||||
ff = ms.FeatureFinder()
|
||||
params = ff.getParameters("centroided")
|
||||
|
||||
# Metabolomics-specific parameters
|
||||
params.setValue("mass_trace:mz_tolerance", 5.0) # ppm, tighter for metabolites
|
||||
params.setValue("mass_trace:min_spectra", 5)
|
||||
params.setValue("isotopic_pattern:charge_low", 1)
|
||||
params.setValue("isotopic_pattern:charge_high", 2) # Mostly singly charged
|
||||
|
||||
features = ms.FeatureMap()
|
||||
ff.run("centroided", exp, features, params, ms.FeatureMap())
|
||||
|
||||
features.setPrimaryMSRunPath([mzml_file.encode()])
|
||||
feature_maps.append(features)
|
||||
|
||||
print(f" Detected {features.size()} features")
|
||||
|
||||
# Step 2: Adduct detection and grouping
|
||||
print("Detecting adducts...")
|
||||
adduct_grouped_maps = []
|
||||
|
||||
adduct_detector = ms.MetaboliteAdductDecharger()
|
||||
params = adduct_detector.getParameters()
|
||||
params.setValue("potential_adducts", "[M+H]+,[M+Na]+,[M+K]+,[M+NH4]+,[M-H]-,[M+Cl]-")
|
||||
params.setValue("charge_min", 1)
|
||||
params.setValue("charge_max", 1)
|
||||
adduct_detector.setParameters(params)
|
||||
|
||||
for fm in feature_maps:
|
||||
fm_out = ms.FeatureMap()
|
||||
adduct_detector.compute(fm, fm_out, ms.ConsensusMap())
|
||||
adduct_grouped_maps.append(fm_out)
|
||||
|
||||
# Step 3: RT alignment
|
||||
print("Aligning retention times...")
|
||||
aligner = ms.MapAlignmentAlgorithmPoseClustering()
|
||||
|
||||
params = aligner.getParameters()
|
||||
params.setValue("max_num_peaks_considered", 1000)
|
||||
params.setValue("pairfinder:distance_MZ:max_difference", 10.0)
|
||||
params.setValue("pairfinder:distance_MZ:unit", "ppm")
|
||||
aligner.setParameters(params)
|
||||
|
||||
aligned_maps = []
|
||||
transformations = []
|
||||
aligner.align(adduct_grouped_maps, aligned_maps, transformations)
|
||||
|
||||
# Step 4: Feature linking
|
||||
print("Linking features...")
|
||||
grouper = ms.FeatureGroupingAlgorithmQT()
|
||||
|
||||
params = grouper.getParameters()
|
||||
params.setValue("distance_RT:max_difference", 60.0) # seconds
|
||||
params.setValue("distance_MZ:max_difference", 5.0) # ppm
|
||||
params.setValue("distance_MZ:unit", "ppm")
|
||||
grouper.setParameters(params)
|
||||
|
||||
consensus_map = ms.ConsensusMap()
|
||||
grouper.group(aligned_maps, consensus_map)
|
||||
|
||||
print(f"Created {consensus_map.size()} consensus features")
|
||||
|
||||
# Step 5: Gap filling (fill missing values)
|
||||
print("Filling gaps...")
|
||||
# Gap filling not directly available in Python API
|
||||
# Would use TOPP tool FeatureFinderMetaboIdent
|
||||
|
||||
# Step 6: Export results
|
||||
consensus_file = f"{output_dir}/consensus.consensusXML"
|
||||
ms.ConsensusXMLFile().store(consensus_file, consensus_map)
|
||||
|
||||
# Export to CSV for downstream analysis
|
||||
df = consensus_map.get_df()
|
||||
csv_file = f"{output_dir}/metabolite_table.csv"
|
||||
df.to_csv(csv_file, index=False)
|
||||
|
||||
print(f"Results saved to {output_dir}")
|
||||
|
||||
return consensus_map
|
||||
|
||||
# Run pipeline
|
||||
input_files = ["sample1.mzML", "sample2.mzML", "sample3.mzML"]
|
||||
consensus = metabolomics_pipeline(input_files, "output")
|
||||
```
|
||||
|
||||
## Adduct Detection
|
||||
|
||||
### Configure Adduct Types
|
||||
|
||||
```python
|
||||
# Create adduct detector
|
||||
adduct_detector = ms.MetaboliteAdductDecharger()
|
||||
|
||||
# Configure common adducts
|
||||
params = adduct_detector.getParameters()
|
||||
|
||||
# Positive mode adducts
|
||||
positive_adducts = [
|
||||
"[M+H]+",
|
||||
"[M+Na]+",
|
||||
"[M+K]+",
|
||||
"[M+NH4]+",
|
||||
"[2M+H]+",
|
||||
"[M+H-H2O]+"
|
||||
]
|
||||
|
||||
# Negative mode adducts
|
||||
negative_adducts = [
|
||||
"[M-H]-",
|
||||
"[M+Cl]-",
|
||||
"[M+FA-H]-", # Formate
|
||||
"[2M-H]-"
|
||||
]
|
||||
|
||||
# Set for positive mode
|
||||
params.setValue("potential_adducts", ",".join(positive_adducts))
|
||||
params.setValue("charge_min", 1)
|
||||
params.setValue("charge_max", 1)
|
||||
params.setValue("max_neutrals", 1)
|
||||
adduct_detector.setParameters(params)
|
||||
|
||||
# Apply adduct detection
|
||||
feature_map_out = ms.FeatureMap()
|
||||
adduct_detector.compute(feature_map, feature_map_out, ms.ConsensusMap())
|
||||
```
|
||||
|
||||
### Access Adduct Information
|
||||
|
||||
```python
|
||||
# Check adduct annotations
|
||||
for feature in feature_map_out:
|
||||
# Get adduct type if annotated
|
||||
if feature.metaValueExists("adduct"):
|
||||
adduct = feature.getMetaValue("adduct")
|
||||
neutral_mass = feature.getMetaValue("neutral_mass")
|
||||
print(f"m/z: {feature.getMZ():.4f}")
|
||||
print(f" Adduct: {adduct}")
|
||||
print(f" Neutral mass: {neutral_mass:.4f}")
|
||||
```
|
||||
|
||||
## Compound Identification
|
||||
|
||||
### Mass-Based Annotation
|
||||
|
||||
```python
|
||||
# Annotate features with compound database
|
||||
from pyopenms import MassDecomposition
|
||||
|
||||
# Load compound database (example structure)
|
||||
# In practice, use external database like HMDB, METLIN
|
||||
|
||||
compound_db = [
|
||||
{"name": "Glucose", "formula": "C6H12O6", "mass": 180.0634},
|
||||
{"name": "Citric acid", "formula": "C6H8O7", "mass": 192.0270},
|
||||
# ... more compounds
|
||||
]
|
||||
|
||||
# Annotate features
|
||||
mass_tolerance = 5.0 # ppm
|
||||
|
||||
for feature in feature_map:
|
||||
observed_mz = feature.getMZ()
|
||||
|
||||
# Calculate neutral mass (assuming [M+H]+)
|
||||
neutral_mass = observed_mz - 1.007276 # Proton mass
|
||||
|
||||
# Search database
|
||||
for compound in compound_db:
|
||||
mass_error_ppm = abs(neutral_mass - compound["mass"]) / compound["mass"] * 1e6
|
||||
|
||||
if mass_error_ppm <= mass_tolerance:
|
||||
print(f"Potential match: {compound['name']}")
|
||||
print(f" Observed m/z: {observed_mz:.4f}")
|
||||
print(f" Expected mass: {compound['mass']:.4f}")
|
||||
print(f" Error: {mass_error_ppm:.2f} ppm")
|
||||
```
|
||||
|
||||
### MS/MS-Based Identification
|
||||
|
||||
```python
|
||||
# Load MS2 data
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load("data_with_ms2.mzML", exp)
|
||||
|
||||
# Extract MS2 spectra
|
||||
ms2_spectra = []
|
||||
for spec in exp:
|
||||
if spec.getMSLevel() == 2:
|
||||
ms2_spectra.append(spec)
|
||||
|
||||
print(f"Found {len(ms2_spectra)} MS2 spectra")
|
||||
|
||||
# Match to spectral library
|
||||
# (Requires external tool or custom implementation)
|
||||
```
|
||||
|
||||
## Data Normalization
|
||||
|
||||
### Total Ion Current (TIC) Normalization
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
# Load consensus map
|
||||
consensus_map = ms.ConsensusMap()
|
||||
ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map)
|
||||
|
||||
# Calculate TIC per sample
|
||||
n_samples = len(consensus_map.getColumnHeaders())
|
||||
tic_per_sample = np.zeros(n_samples)
|
||||
|
||||
for cons_feature in consensus_map:
|
||||
for handle in cons_feature.getFeatureList():
|
||||
map_idx = handle.getMapIndex()
|
||||
tic_per_sample[map_idx] += handle.getIntensity()
|
||||
|
||||
print("TIC per sample:", tic_per_sample)
|
||||
|
||||
# Normalize to median TIC
|
||||
median_tic = np.median(tic_per_sample)
|
||||
normalization_factors = median_tic / tic_per_sample
|
||||
|
||||
print("Normalization factors:", normalization_factors)
|
||||
|
||||
# Apply normalization
|
||||
consensus_map_normalized = ms.ConsensusMap(consensus_map)
|
||||
for cons_feature in consensus_map_normalized:
|
||||
feature_list = cons_feature.getFeatureList()
|
||||
for handle in feature_list:
|
||||
map_idx = handle.getMapIndex()
|
||||
normalized_intensity = handle.getIntensity() * normalization_factors[map_idx]
|
||||
handle.setIntensity(normalized_intensity)
|
||||
```
|
||||
|
||||
## Quality Control
|
||||
|
||||
### Coefficient of Variation (CV) Filtering
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# Export to pandas
|
||||
df = consensus_map.get_df()
|
||||
|
||||
# Assume QC samples are columns with 'QC' in name
|
||||
qc_cols = [col for col in df.columns if 'QC' in col]
|
||||
|
||||
if qc_cols:
|
||||
# Calculate CV for each feature in QC samples
|
||||
qc_data = df[qc_cols]
|
||||
cv = (qc_data.std(axis=1) / qc_data.mean(axis=1)) * 100
|
||||
|
||||
# Filter features with CV < 30% in QC samples
|
||||
good_features = df[cv < 30]
|
||||
|
||||
print(f"Features before CV filter: {len(df)}")
|
||||
print(f"Features after CV filter: {len(good_features)}")
|
||||
```
|
||||
|
||||
### Blank Filtering
|
||||
|
||||
```python
|
||||
# Remove features present in blank samples
|
||||
blank_cols = [col for col in df.columns if 'Blank' in col]
|
||||
sample_cols = [col for col in df.columns if 'Sample' in col]
|
||||
|
||||
if blank_cols and sample_cols:
|
||||
# Calculate mean intensity in blanks and samples
|
||||
blank_mean = df[blank_cols].mean(axis=1)
|
||||
sample_mean = df[sample_cols].mean(axis=1)
|
||||
|
||||
# Keep features with 3x higher intensity in samples than blanks
|
||||
ratio = sample_mean / (blank_mean + 1) # Add 1 to avoid division by zero
|
||||
filtered_df = df[ratio > 3]
|
||||
|
||||
print(f"Features before blank filtering: {len(df)}")
|
||||
print(f"Features after blank filtering: {len(filtered_df)}")
|
||||
```
|
||||
|
||||
## Missing Value Imputation
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# Load data
|
||||
df = consensus_map.get_df()
|
||||
|
||||
# Replace zeros with NaN
|
||||
df = df.replace(0, np.nan)
|
||||
|
||||
# Count missing values
|
||||
missing_per_feature = df.isnull().sum(axis=1)
|
||||
print(f"Features with >50% missing: {sum(missing_per_feature > len(df.columns)/2)}")
|
||||
|
||||
# Simple imputation: replace with minimum value
|
||||
for col in df.columns:
|
||||
if df[col].dtype in [np.float64, np.int64]:
|
||||
min_val = df[col].min() / 2 # Half minimum
|
||||
df[col].fillna(min_val, inplace=True)
|
||||
```
|
||||
|
||||
## Metabolite Table Export
|
||||
|
||||
### Create Analysis-Ready Table
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
def create_metabolite_table(consensus_map, output_file):
|
||||
"""
|
||||
Create metabolite quantification table for statistical analysis.
|
||||
"""
|
||||
|
||||
# Get column headers (file descriptions)
|
||||
headers = consensus_map.getColumnHeaders()
|
||||
|
||||
# Initialize data structure
|
||||
data = {
|
||||
'mz': [],
|
||||
'rt': [],
|
||||
'feature_id': []
|
||||
}
|
||||
|
||||
# Add sample columns
|
||||
for map_idx, header in headers.items():
|
||||
sample_name = header.label or f"Sample_{map_idx}"
|
||||
data[sample_name] = []
|
||||
|
||||
# Extract feature data
|
||||
for idx, cons_feature in enumerate(consensus_map):
|
||||
data['mz'].append(cons_feature.getMZ())
|
||||
data['rt'].append(cons_feature.getRT())
|
||||
data['feature_id'].append(f"F{idx:06d}")
|
||||
|
||||
# Initialize intensities
|
||||
intensities = {map_idx: 0.0 for map_idx in headers.keys()}
|
||||
|
||||
# Fill in measured intensities
|
||||
for handle in cons_feature.getFeatureList():
|
||||
map_idx = handle.getMapIndex()
|
||||
intensities[map_idx] = handle.getIntensity()
|
||||
|
||||
# Add to data structure
|
||||
for map_idx, header in headers.items():
|
||||
sample_name = header.label or f"Sample_{map_idx}"
|
||||
data[sample_name].append(intensities[map_idx])
|
||||
|
||||
# Create DataFrame
|
||||
df = pd.DataFrame(data)
|
||||
|
||||
# Sort by RT
|
||||
df = df.sort_values('rt')
|
||||
|
||||
# Save to CSV
|
||||
df.to_csv(output_file, index=False)
|
||||
|
||||
print(f"Metabolite table with {len(df)} features saved to {output_file}")
|
||||
|
||||
return df
|
||||
|
||||
# Create table
|
||||
df = create_metabolite_table(consensus_map, "metabolite_table.csv")
|
||||
```
|
||||
|
||||
## Integration with External Tools
|
||||
|
||||
### Export for MetaboAnalyst
|
||||
|
||||
```python
|
||||
def export_for_metaboanalyst(df, output_file):
|
||||
"""
|
||||
Format data for MetaboAnalyst input.
|
||||
|
||||
Requires sample names as columns, features as rows.
|
||||
"""
|
||||
|
||||
# Transpose DataFrame
|
||||
# Remove metadata columns
|
||||
sample_cols = [col for col in df.columns if col not in ['mz', 'rt', 'feature_id']]
|
||||
|
||||
# Extract sample data
|
||||
sample_data = df[sample_cols]
|
||||
|
||||
# Transpose (samples as rows, features as columns)
|
||||
df_transposed = sample_data.T
|
||||
|
||||
# Add feature identifiers as column names
|
||||
df_transposed.columns = df['feature_id']
|
||||
|
||||
# Save
|
||||
df_transposed.to_csv(output_file)
|
||||
|
||||
print(f"MetaboAnalyst format saved to {output_file}")
|
||||
|
||||
# Export
|
||||
export_for_metaboanalyst(df, "for_metaboanalyst.csv")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Sample Size and Replicates
|
||||
|
||||
- Include QC samples (pooled sample) every 5-10 injections
|
||||
- Run blank samples to identify contamination
|
||||
- Use at least 3 biological replicates per group
|
||||
- Randomize sample injection order
|
||||
|
||||
### Parameter Optimization
|
||||
|
||||
Test parameters on pooled QC sample:
|
||||
|
||||
```python
|
||||
# Test different mass trace parameters
|
||||
mz_tolerances = [3.0, 5.0, 10.0]
|
||||
min_spectra_values = [3, 5, 7]
|
||||
|
||||
for tol in mz_tolerances:
|
||||
for min_spec in min_spectra_values:
|
||||
ff = ms.FeatureFinder()
|
||||
params = ff.getParameters("centroided")
|
||||
params.setValue("mass_trace:mz_tolerance", tol)
|
||||
params.setValue("mass_trace:min_spectra", min_spec)
|
||||
|
||||
features = ms.FeatureMap()
|
||||
ff.run("centroided", exp, features, params, ms.FeatureMap())
|
||||
|
||||
print(f"tol={tol}, min_spec={min_spec}: {features.size()} features")
|
||||
```
|
||||
|
||||
### Retention Time Windows
|
||||
|
||||
Adjust based on chromatographic method:
|
||||
|
||||
```python
|
||||
# For 10-minute LC gradient
|
||||
params.setValue("distance_RT:max_difference", 30.0) # 30 seconds
|
||||
|
||||
# For 60-minute LC gradient
|
||||
params.setValue("distance_RT:max_difference", 90.0) # 90 seconds
|
||||
```
|
||||
433
scientific-packages/pyopenms/references/signal_processing.md
Normal file
433
scientific-packages/pyopenms/references/signal_processing.md
Normal file
@@ -0,0 +1,433 @@
|
||||
# Signal Processing
|
||||
|
||||
## Overview
|
||||
|
||||
PyOpenMS provides algorithms for processing raw mass spectrometry data including smoothing, filtering, peak picking, centroiding, normalization, and deconvolution.
|
||||
|
||||
## Algorithm Pattern
|
||||
|
||||
Most signal processing algorithms follow a standard pattern:
|
||||
|
||||
```python
|
||||
import pyopenms as ms
|
||||
|
||||
# 1. Create algorithm instance
|
||||
algo = ms.AlgorithmName()
|
||||
|
||||
# 2. Get and modify parameters
|
||||
params = algo.getParameters()
|
||||
params.setValue("parameter_name", value)
|
||||
algo.setParameters(params)
|
||||
|
||||
# 3. Apply to data
|
||||
algo.filterExperiment(exp) # or filterSpectrum(spec)
|
||||
```
|
||||
|
||||
## Smoothing
|
||||
|
||||
### Gaussian Filter
|
||||
|
||||
Apply Gaussian smoothing to reduce noise:
|
||||
|
||||
```python
|
||||
# Create Gaussian filter
|
||||
gaussian = ms.GaussFilter()
|
||||
|
||||
# Configure parameters
|
||||
params = gaussian.getParameters()
|
||||
params.setValue("gaussian_width", 0.2) # Width in m/z or RT units
|
||||
params.setValue("ppm_tolerance", 10.0) # For m/z dimension
|
||||
params.setValue("use_ppm_tolerance", "true")
|
||||
gaussian.setParameters(params)
|
||||
|
||||
# Apply to experiment
|
||||
gaussian.filterExperiment(exp)
|
||||
|
||||
# Or apply to single spectrum
|
||||
spec = exp.getSpectrum(0)
|
||||
gaussian.filterSpectrum(spec)
|
||||
```
|
||||
|
||||
### Savitzky-Golay Filter
|
||||
|
||||
Polynomial smoothing that preserves peak shapes:
|
||||
|
||||
```python
|
||||
# Create Savitzky-Golay filter
|
||||
sg_filter = ms.SavitzkyGolayFilter()
|
||||
|
||||
# Configure parameters
|
||||
params = sg_filter.getParameters()
|
||||
params.setValue("frame_length", 11) # Window size (must be odd)
|
||||
params.setValue("polynomial_order", 4) # Polynomial degree
|
||||
sg_filter.setParameters(params)
|
||||
|
||||
# Apply smoothing
|
||||
sg_filter.filterExperiment(exp)
|
||||
```
|
||||
|
||||
## Peak Picking and Centroiding
|
||||
|
||||
### Peak Picker High Resolution
|
||||
|
||||
Detect peaks in high-resolution data:
|
||||
|
||||
```python
|
||||
# Create peak picker
|
||||
peak_picker = ms.PeakPickerHiRes()
|
||||
|
||||
# Configure parameters
|
||||
params = peak_picker.getParameters()
|
||||
params.setValue("signal_to_noise", 3.0) # S/N threshold
|
||||
params.setValue("spacing_difference", 1.5) # Minimum peak spacing
|
||||
peak_picker.setParameters(params)
|
||||
|
||||
# Pick peaks
|
||||
exp_picked = ms.MSExperiment()
|
||||
peak_picker.pickExperiment(exp, exp_picked)
|
||||
```
|
||||
|
||||
### Peak Picker for CWT
|
||||
|
||||
Continuous wavelet transform-based peak picking:
|
||||
|
||||
```python
|
||||
# Create CWT peak picker
|
||||
cwt_picker = ms.PeakPickerCWT()
|
||||
|
||||
# Configure parameters
|
||||
params = cwt_picker.getParameters()
|
||||
params.setValue("signal_to_noise", 1.0)
|
||||
params.setValue("peak_width", 0.15) # Expected peak width
|
||||
cwt_picker.setParameters(params)
|
||||
|
||||
# Pick peaks
|
||||
cwt_picker.pickExperiment(exp, exp_picked)
|
||||
```
|
||||
|
||||
## Normalization
|
||||
|
||||
### Normalizer
|
||||
|
||||
Normalize peak intensities within spectra:
|
||||
|
||||
```python
|
||||
# Create normalizer
|
||||
normalizer = ms.Normalizer()
|
||||
|
||||
# Configure normalization method
|
||||
params = normalizer.getParameters()
|
||||
params.setValue("method", "to_one") # Options: "to_one", "to_TIC"
|
||||
normalizer.setParameters(params)
|
||||
|
||||
# Apply normalization
|
||||
normalizer.filterExperiment(exp)
|
||||
```
|
||||
|
||||
## Peak Filtering
|
||||
|
||||
### Threshold Mower
|
||||
|
||||
Remove peaks below intensity threshold:
|
||||
|
||||
```python
|
||||
# Create threshold filter
|
||||
mower = ms.ThresholdMower()
|
||||
|
||||
# Configure threshold
|
||||
params = mower.getParameters()
|
||||
params.setValue("threshold", 1000.0) # Absolute intensity threshold
|
||||
mower.setParameters(params)
|
||||
|
||||
# Apply filter
|
||||
mower.filterExperiment(exp)
|
||||
```
|
||||
|
||||
### Window Mower
|
||||
|
||||
Keep only highest peaks in sliding windows:
|
||||
|
||||
```python
|
||||
# Create window mower
|
||||
window_mower = ms.WindowMower()
|
||||
|
||||
# Configure parameters
|
||||
params = window_mower.getParameters()
|
||||
params.setValue("windowsize", 50.0) # Window size in m/z
|
||||
params.setValue("peakcount", 2) # Keep top N peaks per window
|
||||
window_mower.setParameters(params)
|
||||
|
||||
# Apply filter
|
||||
window_mower.filterExperiment(exp)
|
||||
```
|
||||
|
||||
### N Largest Peaks
|
||||
|
||||
Keep only the N most intense peaks:
|
||||
|
||||
```python
|
||||
# Create N largest filter
|
||||
n_largest = ms.NLargest()
|
||||
|
||||
# Configure parameters
|
||||
params = n_largest.getParameters()
|
||||
params.setValue("n", 200) # Keep 200 most intense peaks
|
||||
n_largest.setParameters(params)
|
||||
|
||||
# Apply filter
|
||||
n_largest.filterExperiment(exp)
|
||||
```
|
||||
|
||||
## Baseline Reduction
|
||||
|
||||
### Morphological Filter
|
||||
|
||||
Remove baseline using morphological operations:
|
||||
|
||||
```python
|
||||
# Create morphological filter
|
||||
morph_filter = ms.MorphologicalFilter()
|
||||
|
||||
# Configure parameters
|
||||
params = morph_filter.getParameters()
|
||||
params.setValue("struc_elem_length", 3.0) # Structuring element size
|
||||
params.setValue("method", "tophat") # Method: "tophat", "bothat", "erosion", "dilation"
|
||||
morph_filter.setParameters(params)
|
||||
|
||||
# Apply filter
|
||||
morph_filter.filterExperiment(exp)
|
||||
```
|
||||
|
||||
## Spectrum Merging
|
||||
|
||||
### Spectra Merger
|
||||
|
||||
Combine multiple spectra into one:
|
||||
|
||||
```python
|
||||
# Create merger
|
||||
merger = ms.SpectraMerger()
|
||||
|
||||
# Configure parameters
|
||||
params = merger.getParameters()
|
||||
params.setValue("average_gaussian:spectrum_type", "profile")
|
||||
params.setValue("average_gaussian:rt_FWHM", 5.0) # RT window
|
||||
merger.setParameters(params)
|
||||
|
||||
# Merge spectra
|
||||
merger.mergeSpectraBlockWise(exp)
|
||||
```
|
||||
|
||||
## Deconvolution
|
||||
|
||||
### Charge Deconvolution
|
||||
|
||||
Determine charge states and convert to neutral masses:
|
||||
|
||||
```python
|
||||
# Create feature deconvoluter
|
||||
deconvoluter = ms.FeatureDeconvolution()
|
||||
|
||||
# Configure parameters
|
||||
params = deconvoluter.getParameters()
|
||||
params.setValue("charge_min", 1)
|
||||
params.setValue("charge_max", 4)
|
||||
params.setValue("potential_charge_states", "1,2,3,4")
|
||||
deconvoluter.setParameters(params)
|
||||
|
||||
# Apply deconvolution
|
||||
feature_map_out = ms.FeatureMap()
|
||||
deconvoluter.compute(exp, feature_map, feature_map_out, ms.ConsensusMap())
|
||||
```
|
||||
|
||||
### Isotope Deconvolution
|
||||
|
||||
Remove isotopic patterns:
|
||||
|
||||
```python
|
||||
# Create isotope wavelet transform
|
||||
isotope_wavelet = ms.IsotopeWaveletTransform()
|
||||
|
||||
# Configure parameters
|
||||
params = isotope_wavelet.getParameters()
|
||||
params.setValue("max_charge", 3)
|
||||
params.setValue("intensity_threshold", 10.0)
|
||||
isotope_wavelet.setParameters(params)
|
||||
|
||||
# Apply transformation
|
||||
isotope_wavelet.transform(exp)
|
||||
```
|
||||
|
||||
## Retention Time Alignment
|
||||
|
||||
### Map Alignment
|
||||
|
||||
Align retention times across multiple runs:
|
||||
|
||||
```python
|
||||
# Create map aligner
|
||||
aligner = ms.MapAlignmentAlgorithmPoseClustering()
|
||||
|
||||
# Load multiple experiments
|
||||
exp1 = ms.MSExperiment()
|
||||
exp2 = ms.MSExperiment()
|
||||
ms.MzMLFile().load("run1.mzML", exp1)
|
||||
ms.MzMLFile().load("run2.mzML", exp2)
|
||||
|
||||
# Create reference
|
||||
reference = ms.MSExperiment()
|
||||
|
||||
# Align experiments
|
||||
transformations = []
|
||||
aligner.align(exp1, exp2, transformations)
|
||||
|
||||
# Apply transformation
|
||||
transformer = ms.MapAlignmentTransformer()
|
||||
transformer.transformRetentionTimes(exp2, transformations[0])
|
||||
```
|
||||
|
||||
## Mass Calibration
|
||||
|
||||
### Internal Calibration
|
||||
|
||||
Calibrate mass axis using known reference masses:
|
||||
|
||||
```python
|
||||
# Create internal calibration
|
||||
calibration = ms.InternalCalibration()
|
||||
|
||||
# Set reference masses
|
||||
reference_masses = [500.0, 1000.0, 1500.0] # Known m/z values
|
||||
|
||||
# Calibrate
|
||||
calibration.calibrate(exp, reference_masses)
|
||||
```
|
||||
|
||||
## Quality Control
|
||||
|
||||
### Spectrum Statistics
|
||||
|
||||
Calculate quality metrics:
|
||||
|
||||
```python
|
||||
# Get spectrum
|
||||
spec = exp.getSpectrum(0)
|
||||
|
||||
# Calculate statistics
|
||||
mz, intensity = spec.get_peaks()
|
||||
|
||||
# Total ion current
|
||||
tic = sum(intensity)
|
||||
|
||||
# Base peak
|
||||
base_peak_intensity = max(intensity)
|
||||
base_peak_mz = mz[intensity.argmax()]
|
||||
|
||||
print(f"TIC: {tic}")
|
||||
print(f"Base peak: {base_peak_mz} m/z at {base_peak_intensity}")
|
||||
```
|
||||
|
||||
## Spectrum Preprocessing Pipeline
|
||||
|
||||
### Complete Preprocessing Example
|
||||
|
||||
```python
|
||||
import pyopenms as ms
|
||||
|
||||
def preprocess_experiment(input_file, output_file):
|
||||
"""Complete preprocessing pipeline."""
|
||||
|
||||
# Load data
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load(input_file, exp)
|
||||
|
||||
# 1. Smooth with Gaussian filter
|
||||
gaussian = ms.GaussFilter()
|
||||
gaussian.filterExperiment(exp)
|
||||
|
||||
# 2. Pick peaks
|
||||
picker = ms.PeakPickerHiRes()
|
||||
exp_picked = ms.MSExperiment()
|
||||
picker.pickExperiment(exp, exp_picked)
|
||||
|
||||
# 3. Normalize intensities
|
||||
normalizer = ms.Normalizer()
|
||||
params = normalizer.getParameters()
|
||||
params.setValue("method", "to_TIC")
|
||||
normalizer.setParameters(params)
|
||||
normalizer.filterExperiment(exp_picked)
|
||||
|
||||
# 4. Filter low-intensity peaks
|
||||
mower = ms.ThresholdMower()
|
||||
params = mower.getParameters()
|
||||
params.setValue("threshold", 10.0)
|
||||
mower.setParameters(params)
|
||||
mower.filterExperiment(exp_picked)
|
||||
|
||||
# Save processed data
|
||||
ms.MzMLFile().store(output_file, exp_picked)
|
||||
|
||||
return exp_picked
|
||||
|
||||
# Run pipeline
|
||||
exp_processed = preprocess_experiment("raw_data.mzML", "processed_data.mzML")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Parameter Optimization
|
||||
|
||||
Test parameters on representative data:
|
||||
|
||||
```python
|
||||
# Try different Gaussian widths
|
||||
widths = [0.1, 0.2, 0.5]
|
||||
|
||||
for width in widths:
|
||||
exp_test = ms.MSExperiment()
|
||||
ms.MzMLFile().load("test_data.mzML", exp_test)
|
||||
|
||||
gaussian = ms.GaussFilter()
|
||||
params = gaussian.getParameters()
|
||||
params.setValue("gaussian_width", width)
|
||||
gaussian.setParameters(params)
|
||||
gaussian.filterExperiment(exp_test)
|
||||
|
||||
# Evaluate quality
|
||||
# ... add evaluation code ...
|
||||
```
|
||||
|
||||
### Preserve Original Data
|
||||
|
||||
Keep original data for comparison:
|
||||
|
||||
```python
|
||||
# Load original
|
||||
exp_original = ms.MSExperiment()
|
||||
ms.MzMLFile().load("data.mzML", exp_original)
|
||||
|
||||
# Create copy for processing
|
||||
exp_processed = ms.MSExperiment(exp_original)
|
||||
|
||||
# Process copy
|
||||
gaussian = ms.GaussFilter()
|
||||
gaussian.filterExperiment(exp_processed)
|
||||
|
||||
# Original remains unchanged
|
||||
```
|
||||
|
||||
### Profile vs Centroid Data
|
||||
|
||||
Check data type before processing:
|
||||
|
||||
```python
|
||||
# Check if spectrum is centroided
|
||||
spec = exp.getSpectrum(0)
|
||||
|
||||
if spec.isSorted():
|
||||
# Likely centroided
|
||||
print("Centroid data")
|
||||
else:
|
||||
# Likely profile
|
||||
print("Profile data - apply peak picking")
|
||||
```
|
||||
Reference in New Issue
Block a user