mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-27 07:09:27 +08:00
Add scVelo RNA velocity analysis workflow and IQ-TREE reference documentation
- Introduced a comprehensive RNA velocity analysis pipeline using scVelo, including data loading, preprocessing, velocity estimation, and visualization. - Added a script for running RNA velocity analysis with customizable parameters and output options. - Created detailed documentation for IQ-TREE 2 phylogenetic inference, covering command syntax, model selection, bootstrapping methods, and output interpretation. - Included references for velocity models and their mathematical framework, along with a comparison of different models. - Enhanced the scVelo skill documentation with installation instructions, use cases, and best practices for RNA velocity analysis.
This commit is contained in:
332
scientific-skills/bindingdb-database/SKILL.md
Normal file
332
scientific-skills/bindingdb-database/SKILL.md
Normal file
@@ -0,0 +1,332 @@
|
||||
---
|
||||
name: bindingdb-database
|
||||
description: Query BindingDB for measured drug-target binding affinities (Ki, Kd, IC50, EC50). Search by target (UniProt ID), compound (SMILES/name), or pathogen. Essential for drug discovery, lead optimization, polypharmacology analysis, and structure-activity relationship (SAR) studies.
|
||||
license: CC-BY-3.0
|
||||
metadata:
|
||||
skill-author: Kuan-lin Huang
|
||||
---
|
||||
|
||||
# BindingDB Database
|
||||
|
||||
## Overview
|
||||
|
||||
BindingDB (https://www.bindingdb.org/) is the primary public database of measured drug-protein binding affinities. It contains over 3 million binding data records for ~1.4 million compounds tested against ~9,200 protein targets, curated from the scientific and patent literature. BindingDB stores quantitative binding measurements (Ki, Kd, IC50, EC50) essential for drug discovery, pharmacology, and computational chemistry research.
|
||||
|
||||
**Key resources:**
|
||||
- BindingDB website: https://www.bindingdb.org/
|
||||
- REST API: https://www.bindingdb.org/axis2/services/BDBService
|
||||
- Downloads: https://www.bindingdb.org/bind/chemsearch/marvin/Download.jsp
|
||||
- GitHub: https://github.com/drugilsberg/bindingdb
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use BindingDB when:
|
||||
|
||||
- **Target-based drug discovery**: What known compounds bind to a target protein? What are their affinities?
|
||||
- **SAR analysis**: How do structural modifications affect binding affinity for a series of analogs?
|
||||
- **Lead compound profiling**: What targets does a compound bind (selectivity/polypharmacology)?
|
||||
- **Benchmark datasets**: Obtain curated protein-ligand affinity data for ML model training
|
||||
- **Repurposing analysis**: Does an approved drug bind to an unintended target?
|
||||
- **Competitive analysis**: What is the best reported affinity for a target class?
|
||||
- **Fragment screening**: Find validated binding data for fragments against a target
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### 1. BindingDB REST API
|
||||
|
||||
Base URL: `https://www.bindingdb.org/axis2/services/BDBService`
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
BASE_URL = "https://www.bindingdb.org/axis2/services/BDBService"
|
||||
|
||||
def bindingdb_query(method, params):
    """Call a BindingDB REST service method and return the parsed JSON.

    Args:
        method: Service method name appended to BASE_URL
            (e.g. "getLigandsByUniprotID").
        params: Dict of query parameters for the GET request.

    Returns:
        Decoded JSON response (dict or list).

    Raises:
        requests.HTTPError: If the service returns a non-2xx status.
    """
    url = f"{BASE_URL}/{method}"
    response = requests.get(url, params=params, headers={"Accept": "application/json"})
    response.raise_for_status()
    return response.json()
|
||||
```
|
||||
|
||||
### 2. Query by Target (UniProt ID)
|
||||
|
||||
```python
|
||||
def get_ligands_for_target(uniprot_id, affinity_type="Ki", cutoff=10000, unit="nM"):
    """
    Get all ligands with measured affinity for a UniProt target.

    Args:
        uniprot_id: UniProt accession (e.g., "P00519" for ABL1)
        affinity_type: "Ki", "Kd", "IC50", "EC50"
        cutoff: Maximum affinity value to return, expressed in `unit`
        unit: "nM" or "uM"; "uM" cutoffs are converted to nM before the
            request (the REST service expects nM)

    Returns:
        Decoded JSON response from getLigandsByUniprotID.
    """
    # Bug fix: `unit` was previously accepted but silently ignored.
    # Convert a micromolar cutoff into the nanomolar value the API expects.
    cutoff_nm = cutoff * 1000 if unit == "uM" else cutoff
    params = {
        "uniprot_id": uniprot_id,
        "affinity_type": affinity_type,
        "affinity_cutoff": cutoff_nm,
        "response": "json"
    }
    return bindingdb_query("getLigandsByUniprotID", params)
|
||||
|
||||
# Example: Get all compounds binding ABL1 (imatinib target)
|
||||
ligands = get_ligands_for_target("P00519", affinity_type="Ki", cutoff=100)
|
||||
```
|
||||
|
||||
### 3. Query by Compound Name or SMILES
|
||||
|
||||
```python
|
||||
def search_by_name(compound_name, limit=100):
    """Search BindingDB affinities by compound name.

    Args:
        compound_name: Trivial or trade name (e.g. "imatinib").
        limit: Maximum number of results to request.

    Returns:
        Decoded JSON response from getAffinitiesByCompoundName.
    """
    params = {
        "compound_name": compound_name,
        "response": "json",
        "max_results": limit
    }
    return bindingdb_query("getAffinitiesByCompoundName", params)
|
||||
|
||||
def search_by_smiles(smiles, similarity=100, limit=50):
    """
    Search BindingDB by SMILES string.

    Args:
        smiles: SMILES string of the compound
        similarity: Tanimoto similarity threshold (1-100, 100 = exact)
        limit: Maximum number of results to request

    Returns:
        Decoded JSON response.

    NOTE(review): the service method name "getAffinitiesByBEI" does not
    obviously correspond to a SMILES similarity search — verify the
    correct method name against the BindingDB REST API documentation.
    """
    params = {
        "SMILES": smiles,
        "similarity": similarity,
        "response": "json",
        "max_results": limit
    }
    return bindingdb_query("getAffinitiesByBEI", params)
|
||||
|
||||
# Example: Search for imatinib binding data
|
||||
result = search_by_name("imatinib")
|
||||
```
|
||||
|
||||
### 4. Download-Based Analysis (Recommended for Large Queries)
|
||||
|
||||
For comprehensive analyses, download BindingDB data directly:
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
def load_bindingdb(filepath="BindingDB_All.tsv"):
    """
    Load a BindingDB TSV dump into a DataFrame.

    Download from: https://www.bindingdb.org/bind/chemsearch/marvin/Download.jsp

    Only a curated subset of key columns is read, and the four main
    affinity columns are coerced to numeric (BindingDB uses qualifiers
    such as ">10000", which become NaN).

    Args:
        filepath: Path to an unzipped BindingDB TSV file.

    Returns:
        pandas.DataFrame with whichever of the key columns the file has.
    """
    # Key columns of interest. Not every BindingDB export contains all of
    # them, so intersect with the actual header before reading — pandas
    # raises ValueError for usecols entries missing from the file.
    usecols = [
        "BindingDB Reactant_set_id",
        "Ligand SMILES",
        "Ligand InChI",
        "Ligand InChI Key",
        "BindingDB Target Chain Sequence",
        "PDB ID(s) for Ligand-Target Complex",
        "UniProt (SwissProt) Entry Name of Target Chain",
        "UniProt (SwissProt) Primary ID of Target Chain",
        "UniProt (TrEMBL) Primary ID of Target Chain",
        "Ki (nM)",
        "IC50 (nM)",
        "Kd (nM)",
        "EC50 (nM)",
        "kon (M-1-s-1)",
        "koff (s-1)",
        "Target Name",
        "Target Source Organism According to Curator or DataSource",
        "Number of Protein Chains in Target (>1 implies a multichain complex)",
        "PubChem CID",
        "PubChem SID",
        "ChEMBL ID of Ligand",
        "DrugBank ID of Ligand",
    ]

    # Read only the header row to learn which columns are present.
    header = pd.read_csv(filepath, sep="\t", nrows=0, on_bad_lines='skip').columns
    available = [c for c in usecols if c in set(header)]

    df = pd.read_csv(filepath, sep="\t", usecols=available,
                     low_memory=False, on_bad_lines='skip')

    # Affinity values may carry qualifiers (e.g. ">10000"); coerce to NaN.
    for col in ["Ki (nM)", "IC50 (nM)", "Kd (nM)", "EC50 (nM)"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df
|
||||
|
||||
def query_target_affinity(df, uniprot_id, affinity_types=None, max_nm=10000):
    """Query a loaded BindingDB frame for ligands of one target.

    Args:
        df: DataFrame from load_bindingdb().
        uniprot_id: UniProt accession of the target chain.
        affinity_types: Affinity columns to consider
            (default: Ki, IC50, Kd in nM).
        max_nm: Keep rows where at least one affinity is <= this value (nM).

    Returns:
        DataFrame of matching ligands sorted by the first affinity column.
    """
    if affinity_types is None:
        affinity_types = ["Ki (nM)", "IC50 (nM)", "Kd (nM)"]

    # Restrict to the requested target.
    mask = df["UniProt (SwissProt) Primary ID of Target Chain"] == uniprot_id
    target_df = df[mask].copy()

    # Keep rows where at least one affinity passes the cutoff; NaN
    # comparisons are False, so missing measurements never pass.
    has_affinity = pd.Series(False, index=target_df.index)
    for col in affinity_types:
        if col in target_df.columns:
            has_affinity |= target_df[col] <= max_nm

    # Select only columns actually present so a slim dump doesn't KeyError.
    wanted = ["Ligand SMILES"] + affinity_types + ["PubChem CID", "ChEMBL ID of Ligand"]
    present = [c for c in wanted if c in target_df.columns]
    result = target_df[has_affinity][present].dropna(how='all')

    sort_col = affinity_types[0]
    return result.sort_values(sort_col) if sort_col in result.columns else result
|
||||
```
|
||||
|
||||
### 5. SAR Analysis
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
def sar_analysis(df, target_uniprot, affinity_col="IC50 (nM)"):
    """
    Structure-activity relationship analysis for a target.

    Retrieves all compounds with affinity data for `target_uniprot`,
    adds a pAffinity column (-log10 of the molar affinity), and ranks
    compounds from most to least potent.

    Args:
        df: DataFrame from load_bindingdb().
        target_uniprot: UniProt accession of the target.
        affinity_col: Affinity column to rank by.

    Returns:
        DataFrame sorted by pAffinity descending; empty if no data.
    """
    import math

    target_data = query_target_affinity(df, target_uniprot, [affinity_col])

    if target_data.empty:
        return target_data

    # pAffinity = -log10(affinity in molar). Clamp at 1e-12 M so that
    # zero/rounded-down entries don't raise a math domain error
    # (same guard used in prepare_ml_dataset).
    if affinity_col in target_data.columns:
        target_data = target_data[target_data[affinity_col].notna()].copy()
        target_data["pAffinity"] = (target_data[affinity_col] * 1e-9).apply(
            lambda x: -math.log10(max(x, 1e-12))
        )
        target_data = target_data.sort_values("pAffinity", ascending=False)

    return target_data
|
||||
|
||||
# Most potent compounds against EGFR (P00533)
|
||||
# sar = sar_analysis(df, "P00533", "IC50 (nM)")
|
||||
# print(sar.head(20))
|
||||
```
|
||||
|
||||
### 6. Polypharmacology Profile
|
||||
|
||||
```python
|
||||
def polypharmacology_profile(df, ligand_smiles_or_name, affinity_cutoff_nM=1000):
    """
    Find all targets a compound binds to.

    Matching is by exact Ligand SMILES only (the parameter name also
    suggests name matching, but no name lookup is performed here).

    Args:
        df: DataFrame from load_bindingdb().
        ligand_smiles_or_name: Exact SMILES string to match.
        affinity_cutoff_nM: Keep targets with at least one affinity
            (Ki/IC50/Kd) at or below this value in nM.

    Returns:
        DataFrame of targets sorted by Ki when available.
    """
    # Exact SMILES match against the ligand column.
    mask = df["Ligand SMILES"] == ligand_smiles_or_name
    ligand_data = df[mask].copy()

    # Keep rows with at least one affinity under the cutoff; NaN
    # comparisons are False, so missing values never pass.
    aff_cols = ["Ki (nM)", "IC50 (nM)", "Kd (nM)"]
    has_aff = pd.Series(False, index=ligand_data.index)
    for col in aff_cols:
        if col in ligand_data.columns:
            has_aff |= ligand_data[col] <= affinity_cutoff_nM

    # Select only columns actually present so a slim dump doesn't KeyError.
    wanted = ["Target Name", "UniProt (SwissProt) Primary ID of Target Chain"] + aff_cols
    present = [c for c in wanted if c in ligand_data.columns]
    result = ligand_data[has_aff][present].dropna(how='all')

    if "Ki (nM)" in result.columns:
        result = result.sort_values("Ki (nM)")
    return result
|
||||
```
|
||||
|
||||
## Query Workflows
|
||||
|
||||
### Workflow 1: Find Best Inhibitors for a Target
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
def find_best_inhibitors(uniprot_id, affinity_type="IC50 (nM)", top_n=20):
    """Find the most potent inhibitors for a target in BindingDB.

    Loads the full BindingDB dump, filters to `uniprot_id`, then prints
    and returns the `top_n` rows with the lowest affinity values.

    Args:
        uniprot_id: UniProt accession of the target.
        affinity_type: Affinity column to rank by.
        top_n: Number of hits to keep.

    Returns:
        DataFrame of top hits; empty if the target has no data.
    """
    # NOTE: loading BindingDB_All.tsv is expensive — in a real workflow,
    # load once and reuse the frame across multiple target queries.
    df = load_bindingdb("BindingDB_All.tsv")
    result = query_target_affinity(df, uniprot_id, [affinity_type])

    if result.empty:
        print(f"No data found for {uniprot_id}")
        return result

    result = result.sort_values(affinity_type).head(top_n)
    print(f"Top {top_n} inhibitors for {uniprot_id} by {affinity_type}:")
    for _, row in result.iterrows():
        print(f" {row['PubChem CID']}: {row[affinity_type]:.1f} nM | SMILES: {row['Ligand SMILES'][:40]}...")
    return result
|
||||
```
|
||||
|
||||
### Workflow 2: Selectivity Profiling
|
||||
|
||||
1. Get all affinity data for your compound across all targets
|
||||
2. Compare affinity ratios between on-target and off-targets
|
||||
3. Identify selectivity cliffs (structural changes that improve selectivity)
|
||||
4. Cross-reference with ChEMBL for additional selectivity data
|
||||
|
||||
### Workflow 3: Machine Learning Dataset Preparation
|
||||
|
||||
```python
|
||||
def prepare_ml_dataset(df, uniprot_ids, affinity_col="IC50 (nM)",
                       max_affinity_nM=100000, min_count=50):
    """Prepare BindingDB data for ML model training.

    Collects ligands for each requested target, keeps only targets with
    at least `min_count` measurements, and adds a pAffinity label
    (-log10 of the molar affinity).

    Args:
        df: DataFrame from load_bindingdb().
        uniprot_ids: Iterable of UniProt accessions to include.
        affinity_col: Affinity column used as the label.
        max_affinity_nM: Discard measurements weaker than this (nM).
        min_count: Minimum measurements per target to keep it.

    Returns:
        DataFrame with [Ligand SMILES, target, pAffinity, affinity_col]
        columns; empty if no target passes the filters.
    """
    import math

    records = []
    for uid in uniprot_ids:
        target_df = query_target_affinity(df, uid, [affinity_col], max_affinity_nM)
        if len(target_df) >= min_count:
            target_df = target_df.copy()
            target_df["target"] = uid
            records.append(target_df)

    if not records:
        return pd.DataFrame()

    combined = pd.concat(records)
    # pAffinity = -log10(molar affinity); clamp at 1e-12 M to avoid a
    # math domain error on zero entries.
    combined["pAffinity"] = (combined[affinity_col] * 1e-9).apply(
        lambda x: -math.log10(max(x, 1e-12))
    )
    return combined[["Ligand SMILES", "target", "pAffinity", affinity_col]].dropna()
|
||||
```
|
||||
|
||||
## Key Data Fields
|
||||
|
||||
| Field | Description |
|
||||
|-------|-------------|
|
||||
| `Ligand SMILES` | 2D structure of the compound |
|
||||
| `Ligand InChI Key` | Unique chemical identifier |
|
||||
| `Ki (nM)` | Inhibition constant (equilibrium, functional) |
|
||||
| `Kd (nM)` | Dissociation constant (thermodynamic, binding) |
|
||||
| `IC50 (nM)` | Half-maximal inhibitory concentration |
|
||||
| `EC50 (nM)` | Half-maximal effective concentration |
|
||||
| `kon (M-1-s-1)` | Association rate constant |
|
||||
| `koff (s-1)` | Dissociation rate constant |
|
||||
| `UniProt (SwissProt) Primary ID` | Target UniProt accession |
|
||||
| `Target Name` | Protein name |
|
||||
| `PDB ID(s) for Ligand-Target Complex` | Crystal structures |
|
||||
| `PubChem CID` | PubChem compound ID |
|
||||
| `ChEMBL ID of Ligand` | ChEMBL compound ID |
|
||||
|
||||
## Affinity Interpretation
|
||||
|
||||
| Affinity | Classification | Drug-likeness |
|
||||
|----------|---------------|---------------|
|
||||
| < 1 nM | Sub-nanomolar | Very potent (picomolar range) |
|
||||
| 1–10 nM | Nanomolar | Potent, typical for approved drugs |
|
||||
| 10–100 nM | Moderate | Common lead compounds |
|
||||
| 100–1000 nM | Weak | Fragment/starting point |
|
||||
| > 1000 nM | Very weak | Generally below drug-relevance threshold |
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Use Ki for direct binding**: Ki reflects true binding affinity independent of enzymatic mechanism
|
||||
- **IC50 context-dependency**: IC50 values depend on substrate concentration (Cheng-Prusoff equation)
|
||||
- **Normalize units**: BindingDB reports in nM; verify units when comparing across studies
|
||||
- **Filter by target organism**: Use `Target Source Organism` to ensure human protein data
|
||||
- **Handle missing values**: Not all compounds have all measurement types
|
||||
- **Cross-reference with ChEMBL**: ChEMBL has more curated activity data for medicinal chemistry
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **BindingDB website**: https://www.bindingdb.org/
|
||||
- **Data downloads**: https://www.bindingdb.org/bind/chemsearch/marvin/Download.jsp
|
||||
- **API documentation**: https://www.bindingdb.org/bind/BindingDBRESTfulAPI.jsp
|
||||
- **Citation**: Gilson MK et al. (2016) Nucleic Acids Research. PMID: 26481362
|
||||
- **Related resources**: ChEMBL (https://www.ebi.ac.uk/chembl/), PubChem BioAssay
|
||||
@@ -0,0 +1,178 @@
|
||||
# BindingDB Affinity Query Reference
|
||||
|
||||
## Affinity Measurement Types
|
||||
|
||||
### Ki (Inhibition Constant)
|
||||
- **Definition**: Equilibrium constant for inhibitor-enzyme complex dissociation
|
||||
- **Equation**: Ki = [E][I]/[EI]
|
||||
- **Usage**: Enzyme inhibition; preferred for mechanistic studies
|
||||
- **Note**: Independent of substrate concentration (unlike IC50)
|
||||
|
||||
### Kd (Dissociation Constant)
|
||||
- **Definition**: Thermodynamic binding equilibrium constant
|
||||
- **Equation**: Kd = [A][B]/[AB]
|
||||
- **Usage**: Direct binding assays (SPR, ITC, fluorescence anisotropy)
|
||||
- **Note**: True measure of binding strength; lower = tighter binding
|
||||
|
||||
### IC50 (Half-Maximal Inhibitory Concentration)
|
||||
- **Definition**: Concentration of inhibitor that reduces target activity by 50%
|
||||
- **Usage**: Most common in drug discovery; assay-dependent
|
||||
- **Conversion to Ki**: Cheng-Prusoff equation: Ki = IC50 / (1 + [S]/Km)
|
||||
- **Note**: Depends on substrate concentration and assay conditions
|
||||
|
||||
### EC50 (Half-Maximal Effective Concentration)
|
||||
- **Definition**: Concentration that produces 50% of maximal effect
|
||||
- **Usage**: Cell-based assays, agonist studies
|
||||
|
||||
### Kinetics Parameters
|
||||
- **kon**: Association rate constant (M⁻¹s⁻¹); describes how fast complex forms
|
||||
- **koff**: Dissociation rate constant (s⁻¹); describes how fast complex dissociates
|
||||
- **Residence time**: τ = 1/koff; longer residence = more sustained effect
|
||||
- **Kd from kinetics**: Kd = koff/kon
|
||||
|
||||
## Common API Query Patterns
|
||||
|
||||
### By UniProt ID (REST API)
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def query_by_uniprot(uniprot_id, affinity_type="Ki"):
    """
    REST API query for BindingDB affinities by UniProt target ID.

    Args:
        uniprot_id: UniProt accession (e.g. "P00519").
        affinity_type: "Ki", "Kd", "IC50", or "EC50".

    Returns:
        Decoded JSON response.

    Raises:
        requests.HTTPError: If the service returns a non-2xx status.
    """
    url = "https://www.bindingdb.org/axis2/services/BDBService/getLigandsByUniprotID"
    params = {
        "uniprot_id": uniprot_id,
        "cutoff": "10000",  # nM threshold
        "affinity_type": affinity_type,
        "response": "json"
    }
    response = requests.get(url, params=params)
    # Surface HTTP failures instead of an opaque JSON decode error.
    response.raise_for_status()
    return response.json()
|
||||
|
||||
# Important targets: HUGO symbol -> UniProt accession, annotated with a
# flagship drug or functional role for context.
COMMON_TARGETS = {
    "ABL1": "P00519",   # Imatinib, dasatinib target
    "EGFR": "P00533",   # Erlotinib, gefitinib target
    "BRAF": "P15056",   # Vemurafenib, dabrafenib target
    "CDK2": "P24941",   # Cell cycle kinase
    "HDAC1": "Q13547",  # Histone deacetylase
    "BRD4": "O60885",   # BET bromodomain reader
    "MDM2": "Q00987",   # p53 negative regulator
    "BCL2": "P10415",   # Antiapoptotic protein
    "PCSK9": "Q8NBP7",  # Cholesterol regulator
    "JAK2": "O60674",   # Cytokine signaling kinase
}
|
||||
```
|
||||
|
||||
### By PubChem CID (REST API)
|
||||
|
||||
```python
|
||||
def query_by_pubchem_cid(pubchem_cid):
    """Get all binding data for a compound by PubChem CID.

    Args:
        pubchem_cid: PubChem compound identifier (e.g. 5291 for imatinib).

    Returns:
        Decoded JSON response.
    """
    url = "https://www.bindingdb.org/axis2/services/BDBService/getAffinitiesByCID"
    params = {"cid": pubchem_cid, "response": "json"}
    response = requests.get(url, params=params)
    # Surface HTTP failures instead of an opaque JSON decode error.
    response.raise_for_status()
    return response.json()
|
||||
|
||||
# Example: Imatinib PubChem CID = 5291
|
||||
imatinib_data = query_by_pubchem_cid(5291)
|
||||
```
|
||||
|
||||
### By Target Name
|
||||
|
||||
```python
|
||||
def query_by_target_name(target_name, affinity_cutoff=100):
    """Query BindingDB by free-text target name.

    Args:
        target_name: Protein target name to search for.
        affinity_cutoff: Maximum affinity (nM) to return.

    Returns:
        Decoded JSON response.
    """
    url = "https://www.bindingdb.org/axis2/services/BDBService/getAffinitiesByTarget"
    params = {
        "target_name": target_name,
        "cutoff": affinity_cutoff,
        "response": "json"
    }
    response = requests.get(url, params=params)
    # Surface HTTP failures instead of an opaque JSON decode error.
    response.raise_for_status()
    return response.json()
|
||||
```
|
||||
|
||||
## Dataset Download Guide
|
||||
|
||||
### Available Files
|
||||
|
||||
| File | Size | Contents |
|
||||
|------|------|---------|
|
||||
| `BindingDB_All.tsv.zip` | ~3.5 GB | All data: ~2.9M records |
|
||||
| `BindingDB_All.sdf.zip` | ~7 GB | All data with 3D structures |
|
||||
| `BindingDB_IC50.tsv` | ~1.5 GB | IC50 data only |
|
||||
| `BindingDB_Ki.tsv` | ~0.8 GB | Ki data only |
|
||||
| `BindingDB_Kd.tsv` | ~0.2 GB | Kd data only |
|
||||
| `BindingDB_EC50.tsv` | ~0.5 GB | EC50 data only |
|
||||
| `tdc_bindingdb_*` | Various | TDC-formatted subsets |
|
||||
|
||||
### Efficient Loading
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
# For large files, use chunking
|
||||
def load_bindingdb_chunked(filepath, uniprot_ids, affinity_col="Ki (nM)", chunk_size=100000):
    """Stream a large BindingDB TSV and keep only rows for given targets.

    Reads in `chunk_size`-row chunks so the multi-GB dump never has to
    fit in memory at once.

    Args:
        filepath: Path to the BindingDB TSV file.
        uniprot_ids: Iterable of UniProt accessions to retain.
        affinity_col: Currently unused; kept for interface compatibility.
        chunk_size: Rows per chunk.

    Returns:
        DataFrame of matching rows; empty DataFrame if nothing matched.
    """
    uniprot_ids = set(uniprot_ids)  # O(1) membership for isin
    results = []
    for chunk in pd.read_csv(filepath, sep="\t", chunksize=chunk_size,
                             low_memory=False, on_bad_lines='skip'):
        # Keep only rows whose target is one of the requested accessions.
        mask = chunk["UniProt (SwissProt) Primary ID of Target Chain"].isin(uniprot_ids)
        if mask.any():
            results.append(chunk[mask])

    if results:
        return pd.concat(results)
    return pd.DataFrame()
|
||||
```
|
||||
|
||||
## pKi / pIC50 Conversion
|
||||
|
||||
Converting raw affinity to logarithmic scale (common in ML):
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
def to_log_affinity(affinity_nM):
    """Convert an nM affinity to pAffinity (negative log10 of molar).

    Args:
        affinity_nM: Affinity in nanomolar (scalar or array-like).

    Returns:
        pAffinity value(s): e.g. 1 nM -> 9.0, 100 nM -> 7.0, 1 uM -> 6.0.
    """
    affinity_M = affinity_nM * 1e-9  # nM -> M
    return -np.log10(affinity_M)
|
||||
|
||||
# Examples:
|
||||
# 1 nM → pAffinity = 9.0
|
||||
# 10 nM → pAffinity = 8.0
|
||||
# 100 nM → pAffinity = 7.0
|
||||
# 1 μM → pAffinity = 6.0
|
||||
# 10 μM → pAffinity = 5.0
|
||||
```
|
||||
|
||||
## Quality Filters
|
||||
|
||||
When using BindingDB data for ML or SAR:
|
||||
|
||||
```python
|
||||
def filter_quality(df):
    """Apply basic quality filters to a BindingDB DataFrame.

    Keeps rows that (1) have a non-empty SMILES, (2) have at least one
    of Ki/IC50, (3) are not weaker than 1 mM in any affinity column
    (likely non-specific binding), and (4) come from human targets when
    organism information is available.

    Args:
        df: DataFrame from load_bindingdb().

    Returns:
        Filtered DataFrame.
    """
    # 1. Require a usable ligand structure.
    df = df[df["Ligand SMILES"].notna() & (df["Ligand SMILES"] != "")]

    # 2. Require at least one primary affinity measurement.
    df = df[df["Ki (nM)"].notna() | df["IC50 (nM)"].notna()]

    # 3. Drop > 1 mM values (non-specific / artifact); NaNs are kept
    #    because NaN > 1e6 is False.
    for col in ["Ki (nM)", "IC50 (nM)", "Kd (nM)"]:
        if col in df.columns:
            df = df[~(df[col] > 1e6)]

    # 4. Restrict to human targets when the organism column exists.
    org_col = "Target Source Organism According to Curator or DataSource"
    if org_col in df.columns:
        df = df[df[org_col].str.contains("Homo sapiens", na=False)]

    return df
|
||||
```
|
||||
367
scientific-skills/cbioportal-database/SKILL.md
Normal file
367
scientific-skills/cbioportal-database/SKILL.md
Normal file
@@ -0,0 +1,367 @@
|
||||
---
|
||||
name: cbioportal-database
|
||||
description: Query cBioPortal for cancer genomics data including somatic mutations, copy number alterations, gene expression, and survival data across hundreds of cancer studies. Essential for cancer target validation, oncogene/tumor suppressor analysis, and patient-level genomic profiling.
|
||||
license: LGPL-3.0
|
||||
metadata:
|
||||
skill-author: Kuan-lin Huang
|
||||
---
|
||||
|
||||
# cBioPortal Database
|
||||
|
||||
## Overview
|
||||
|
||||
cBioPortal for Cancer Genomics (https://www.cbioportal.org/) is an open-access resource for exploring, visualizing, and analyzing multidimensional cancer genomics data. It hosts data from The Cancer Genome Atlas (TCGA), AACR Project GENIE, MSK-IMPACT, and hundreds of other cancer studies — covering mutations, copy number alterations (CNA), structural variants, mRNA/protein expression, methylation, and clinical data for thousands of cancer samples.
|
||||
|
||||
**Key resources:**
|
||||
- cBioPortal website: https://www.cbioportal.org/
|
||||
- REST API: https://www.cbioportal.org/api/
|
||||
- API docs (Swagger): https://www.cbioportal.org/api/swagger-ui/index.html
|
||||
- Python client: `bravado` or `requests`
|
||||
- GitHub: https://github.com/cBioPortal/cbioportal
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use cBioPortal when:
|
||||
|
||||
- **Mutation landscape**: What fraction of a cancer type has mutations in a specific gene?
|
||||
- **Oncogene/TSG validation**: Is a gene frequently mutated, amplified, or deleted in cancer?
|
||||
- **Co-mutation patterns**: Are mutations in gene A and gene B mutually exclusive or co-occurring?
|
||||
- **Survival analysis**: Do mutations in a gene associate with better or worse patient outcomes?
|
||||
- **Alteration profiles**: What types of alterations (missense, truncating, amplification, deletion) affect a gene?
|
||||
- **Pan-cancer analysis**: Compare alteration frequencies across cancer types
|
||||
- **Clinical associations**: Link genomic alterations to clinical variables (stage, grade, treatment response)
|
||||
- **TCGA/GENIE exploration**: Systematic access to TCGA and clinical sequencing datasets
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### 1. cBioPortal REST API
|
||||
|
||||
Base URL: `https://www.cbioportal.org/api`
|
||||
|
||||
The API is RESTful, returns JSON, and requires no API key for public data.
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
BASE_URL = "https://www.cbioportal.org/api"
|
||||
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
|
||||
|
||||
def cbioportal_get(endpoint, params=None):
    """GET a cBioPortal API endpoint and return the parsed JSON.

    Args:
        endpoint: Path relative to BASE_URL (e.g. "studies").
        params: Optional dict of query parameters.

    Raises:
        requests.HTTPError: On non-2xx responses.
    """
    url = f"{BASE_URL}/{endpoint}"
    response = requests.get(url, params=params, headers=HEADERS)
    response.raise_for_status()
    return response.json()
|
||||
|
||||
def cbioportal_post(endpoint, body):
    """POST a JSON body to a cBioPortal API endpoint and return the JSON.

    Args:
        endpoint: Path relative to BASE_URL (e.g. "genes/fetch").
        body: JSON-serializable request body.

    Raises:
        requests.HTTPError: On non-2xx responses.
    """
    url = f"{BASE_URL}/{endpoint}"
    response = requests.post(url, json=body, headers=HEADERS)
    response.raise_for_status()
    return response.json()
|
||||
```
|
||||
|
||||
### 2. Browse Studies
|
||||
|
||||
```python
|
||||
def get_all_studies():
    """List available cancer studies.

    NOTE(review): pageSize=500 caps the response — studies beyond the
    first 500 are not returned; paginate if a complete list is needed.
    """
    return cbioportal_get("studies", {"pageSize": 500})
|
||||
|
||||
# Each study has:
|
||||
# studyId: unique identifier (e.g., "brca_tcga")
|
||||
# name: human-readable name
|
||||
# description: dataset description
|
||||
# cancerTypeId: cancer type abbreviation
|
||||
# referenceGenome: GRCh37 or GRCh38
|
||||
# pmid: associated publication
|
||||
|
||||
studies = get_all_studies()
|
||||
print(f"Total studies: {len(studies)}")
|
||||
|
||||
# Common TCGA study IDs:
|
||||
# brca_tcga, luad_tcga, coadread_tcga, gbm_tcga, prad_tcga,
|
||||
# skcm_tcga, blca_tcga, hnsc_tcga, lihc_tcga, stad_tcga
|
||||
|
||||
# Filter for TCGA studies
|
||||
tcga_studies = [s for s in studies if "tcga" in s["studyId"]]
|
||||
print([s["studyId"] for s in tcga_studies[:10]])
|
||||
```
|
||||
|
||||
### 3. Molecular Profiles
|
||||
|
||||
Each study has multiple molecular profiles (mutation, CNA, expression, etc.):
|
||||
|
||||
```python
|
||||
def get_molecular_profiles(study_id):
    """Get all molecular profiles (mutation, CNA, expression, ...) for a study.

    Args:
        study_id: cBioPortal study identifier (e.g. "brca_tcga").
    """
    return cbioportal_get(f"studies/{study_id}/molecular-profiles")
|
||||
|
||||
profiles = get_molecular_profiles("brca_tcga")
|
||||
for p in profiles:
|
||||
print(f" {p['molecularProfileId']}: {p['name']} ({p['molecularAlterationType']})")
|
||||
|
||||
# Alteration types:
|
||||
# MUTATION_EXTENDED — somatic mutations
|
||||
# COPY_NUMBER_ALTERATION — CNA (GISTIC)
|
||||
# MRNA_EXPRESSION — mRNA expression
|
||||
# PROTEIN_LEVEL — RPPA protein expression
|
||||
# STRUCTURAL_VARIANT — fusions/rearrangements
|
||||
```
|
||||
|
||||
### 4. Mutation Data
|
||||
|
||||
```python
|
||||
def get_mutations(molecular_profile_id, entrez_gene_ids, sample_list_id=None):
    """Fetch mutations for the given genes from a mutation profile.

    Args:
        molecular_profile_id: e.g. "brca_tcga_mutations".
        entrez_gene_ids: List of Entrez gene IDs (e.g. [7157] for TP53).
        sample_list_id: Sample list to query; defaults to the study-wide
            "<study>_all" list derived from the profile id.

    Returns:
        List of mutation records (patientId, proteinChange,
        mutationType, ...).
    """
    body = {
        "entrezGeneIds": entrez_gene_ids,
        # Default to the study-wide sample list, e.g. "brca_tcga_all".
        "sampleListId": sample_list_id or molecular_profile_id.replace("_mutations", "_all")
    }
    return cbioportal_post(
        f"molecular-profiles/{molecular_profile_id}/mutations/fetch",
        body
    )
|
||||
|
||||
# BRCA1 Entrez ID is 672, TP53 is 7157, PTEN is 5728
|
||||
mutations = get_mutations("brca_tcga_mutations", entrez_gene_ids=[7157]) # TP53
|
||||
|
||||
# Each mutation record contains:
|
||||
# patientId, sampleId, entrezGeneId, gene.hugoGeneSymbol
|
||||
# mutationType (Missense_Mutation, Nonsense_Mutation, Frame_Shift_Del, etc.)
|
||||
# proteinChange (e.g., "R175H")
|
||||
# variantClassification, variantType
|
||||
# ncbiBuild, chr, startPosition, endPosition, referenceAllele, variantAllele
|
||||
# mutationStatus (Somatic/Germline)
|
||||
# alleleFreqT (tumor VAF)
|
||||
|
||||
import pandas as pd
|
||||
df = pd.DataFrame(mutations)
|
||||
print(df[["patientId", "mutationType", "proteinChange", "alleleFreqT"]].head())
|
||||
print(f"\nMutation types:\n{df['mutationType'].value_counts()}")
|
||||
```
|
||||
|
||||
### 5. Copy Number Alteration Data
|
||||
|
||||
```python
|
||||
def get_cna(molecular_profile_id, entrez_gene_ids):
    """Get discrete copy-number data (GISTIC calls: -2, -1, 0, 1, 2).

    Args:
        molecular_profile_id: e.g. "brca_tcga_gistic".
        entrez_gene_ids: List of Entrez gene IDs.

    Returns:
        List of CNA records with a GISTIC `value` field.
    """
    body = {
        "entrezGeneIds": entrez_gene_ids,
        # Derive the study-wide sample list from either profile naming style.
        "sampleListId": molecular_profile_id.replace("_gistic", "_all").replace("_cna", "_all")
    }
    return cbioportal_post(
        f"molecular-profiles/{molecular_profile_id}/discrete-copy-number/fetch",
        body
    )
|
||||
|
||||
# GISTIC values:
|
||||
# -2 = Deep deletion (homozygous loss)
|
||||
# -1 = Shallow deletion (heterozygous loss)
|
||||
# 0 = Diploid (neutral)
|
||||
# 1 = Low-level gain
|
||||
# 2 = High-level amplification
|
||||
|
||||
cna_data = get_cna("brca_tcga_gistic", entrez_gene_ids=[1956]) # EGFR
|
||||
df_cna = pd.DataFrame(cna_data)
|
||||
print(df_cna["value"].value_counts())
|
||||
```
|
||||
|
||||
### 6. Alteration Frequency (OncoPrint-style)
|
||||
|
||||
```python
|
||||
def get_alteration_frequency(study_id, gene_symbols, alteration_types=None):
    """Compute per-gene mutation frequencies (%) across one cancer study.

    NOTE(review): despite the name, only mutations are counted here —
    `alteration_types` is currently unused and kept for interface
    compatibility. The frequency divides unique mutated *patients* by
    the total *sample* count; confirm that denominator is intended.

    Args:
        study_id: cBioPortal study identifier (e.g. "brca_tcga").
        gene_symbols: List of HUGO gene symbols.
        alteration_types: Unused (see note above).

    Returns:
        Dict mapping gene symbol -> mutation frequency in percent.
    """
    import requests, pandas as pd

    # Find the "all cases" sample list for the study.
    samples = requests.get(
        f"{BASE_URL}/studies/{study_id}/sample-lists",
        headers=HEADERS
    ).json()
    all_samples_id = next(
        (s["sampleListId"] for s in samples if s["category"] == "all_cases_in_study"), None
    )
    total_samples = len(requests.get(
        f"{BASE_URL}/sample-lists/{all_samples_id}/sample-ids",
        headers=HEADERS
    ).json())

    # Map HUGO symbols to Entrez gene IDs.
    gene_data = requests.post(
        f"{BASE_URL}/genes/fetch",
        json=[{"hugoGeneSymbol": g} for g in gene_symbols],
        headers=HEADERS
    ).json()
    entrez_ids = [g["entrezGeneId"] for g in gene_data]

    # Fetch mutations for all requested genes in a single call.
    mutation_profile = f"{study_id}_mutations"
    mutations = get_mutations(mutation_profile, entrez_ids, all_samples_id)

    freq = {}
    for g_symbol, e_id in zip(gene_symbols, entrez_ids):
        mutated = len({m["patientId"] for m in mutations if m["entrezGeneId"] == e_id})
        # Guard against an empty sample list (avoids ZeroDivisionError).
        freq[g_symbol] = (mutated / total_samples * 100) if total_samples else 0.0

    return freq
|
||||
|
||||
# Example
|
||||
freq = get_alteration_frequency("brca_tcga", ["TP53", "PIK3CA", "BRCA1", "BRCA2"])
|
||||
for gene, pct in sorted(freq.items(), key=lambda x: -x[1]):
|
||||
print(f" {gene}: {pct:.1f}%")
|
||||
```
|
||||
|
||||
### 7. Clinical Data
|
||||
|
||||
```python
|
||||
def get_clinical_data(study_id, attribute_ids=None):
    """Get patient-level clinical data for a study.

    Returns a list of {patientId, studyId, clinicalAttributeId, value}
    records. Common attributes include OS_STATUS, OS_MONTHS, DFS_STATUS,
    DFS_MONTHS (survival), TUMOR_STAGE, GRADE, AGE, SEX, RACE;
    study-specific attributes vary.

    NOTE(review): `attribute_ids` is accepted but not forwarded to the
    API — confirm whether server-side attribute filtering is needed.

    Args:
        study_id: cBioPortal study identifier.
        attribute_ids: Currently unused (see note above).
    """
    params = {"studyId": study_id}
    all_clinical = cbioportal_get(
        "clinical-data/fetch",
        params
    )
    # Bug fix: the fetched data was previously discarded (no return).
    return all_clinical
|
||||
|
||||
def get_clinical_attributes(study_id):
    """List all available clinical attributes for a study.

    Args:
        study_id: cBioPortal study identifier.
    """
    return cbioportal_get(f"studies/{study_id}/clinical-attributes")
|
||||
```
|
||||
|
||||
## Query Workflows
|
||||
|
||||
### Workflow 1: Gene Alteration Profile in a Cancer Type
|
||||
|
||||
```python
|
||||
import requests, pandas as pd
|
||||
|
||||
def alteration_profile(study_id, gene_symbol):
    """Summarize mutation and copy-number burden for one gene in a study.

    Args:
        study_id: cBioPortal study identifier (e.g. "brca_tcga").
        gene_symbol: HUGO symbol (e.g. "PIK3CA").

    Returns:
        Dict with counts of mutated patients, high-level amplifications,
        and deep deletions.
    """

    # 1. Resolve HUGO symbol -> Entrez gene ID.
    gene_info = requests.post(
        f"{BASE_URL}/genes/fetch",
        json=[{"hugoGeneSymbol": gene_symbol}],
        headers=HEADERS
    ).json()[0]
    entrez_id = gene_info["entrezGeneId"]

    # 2. Somatic mutations.
    mutations = get_mutations(f"{study_id}_mutations", [entrez_id])
    mut_df = pd.DataFrame(mutations) if mutations else pd.DataFrame()

    # 3. Discrete copy-number calls (GISTIC).
    cna = get_cna(f"{study_id}_gistic", [entrez_id])
    cna_df = pd.DataFrame(cna) if cna else pd.DataFrame()

    # 4. Summary: unique mutated patients; GISTIC 2 = high-level
    #    amplification, GISTIC -2 = deep deletion.
    n_mut = len(set(mut_df["patientId"])) if not mut_df.empty else 0
    n_amp = len(cna_df[cna_df["value"] == 2]) if not cna_df.empty else 0
    n_del = len(cna_df[cna_df["value"] == -2]) if not cna_df.empty else 0

    return {"mutations": n_mut, "amplifications": n_amp, "deep_deletions": n_del}
|
||||
|
||||
result = alteration_profile("brca_tcga", "PIK3CA")
|
||||
print(result)
|
||||
```
|
||||
|
||||
### Workflow 2: Pan-Cancer Gene Mutation Frequency
|
||||
|
||||
```python
|
||||
import requests, pandas as pd
|
||||
|
||||
def pan_cancer_mutation_freq(gene_symbol, cancer_study_ids=None, max_studies=20):
    """Mutation frequency of a gene across multiple cancer studies.

    Args:
        gene_symbol: HUGO symbol, e.g. "TP53".
        cancer_study_ids: optional collection of study IDs to restrict to.
        max_studies: cap on how many studies are queried (API courtesy limit).

    Returns:
        DataFrame with columns study / cancer / mutation_pct, sorted by
        mutation_pct descending; empty DataFrame if no study could be queried.
    """
    studies = get_all_studies()
    if cancer_study_ids:
        wanted = set(cancer_study_ids)  # O(1) membership tests
        studies = [s for s in studies if s["studyId"] in wanted]

    results = []
    for study in studies[:max_studies]:
        try:
            freq = get_alteration_frequency(study["studyId"], [gene_symbol])
        except Exception:
            # Best-effort sweep: some studies lack a mutation profile; skip them.
            continue
        results.append({
            "study": study["studyId"],
            "cancer": study.get("cancerTypeId", ""),
            "mutation_pct": freq.get(gene_symbol, 0)
        })

    if not results:
        # Avoid KeyError from sort_values on a column-less empty frame.
        return pd.DataFrame(columns=["study", "cancer", "mutation_pct"])
    return pd.DataFrame(results).sort_values("mutation_pct", ascending=False)
|
||||
```
|
||||
|
||||
### Workflow 3: Survival Analysis by Mutation Status
|
||||
|
||||
```python
|
||||
import requests, pandas as pd
|
||||
|
||||
def survival_by_mutation(study_id, gene_symbol):
    """Fetch overall-survival data annotated with a gene's mutation status."""
    # Gathers clinical + mutation data; survival modeling is left to the caller.

    entrez_id = requests.post(
        f"{BASE_URL}/genes/fetch",
        json=[{"hugoGeneSymbol": gene_symbol}],
        headers=HEADERS
    ).json()[0]["entrezGeneId"]

    mutation_hits = get_mutations(f"{study_id}_mutations", [entrez_id])
    carriers = {m["patientId"] for m in mutation_hits}

    clinical_df = pd.DataFrame(
        cbioportal_get("clinical-data/fetch", {"studyId": study_id})
    )

    # Keep only the OS attributes, then pivot one row per patient.
    survival_mask = clinical_df["clinicalAttributeId"].isin(["OS_MONTHS", "OS_STATUS"])
    os_wide = clinical_df[survival_mask].pivot(
        index="patientId", columns="clinicalAttributeId", values="value"
    )
    os_wide["mutated"] = os_wide.index.isin(carriers)

    return os_wide
|
||||
```
|
||||
|
||||
## Key API Endpoints Summary
|
||||
|
||||
| Endpoint | Description |
|
||||
|----------|-------------|
|
||||
| `GET /studies` | List all studies |
|
||||
| `GET /studies/{studyId}/molecular-profiles` | Molecular profiles for a study |
|
||||
| `POST /molecular-profiles/{profileId}/mutations/fetch` | Get mutation data |
|
||||
| `POST /molecular-profiles/{profileId}/discrete-copy-number/fetch` | Get CNA data |
|
||||
| `POST /molecular-profiles/{profileId}/molecular-data/fetch` | Get expression data |
|
||||
| `GET /studies/{studyId}/clinical-attributes` | Available clinical variables |
|
||||
| `GET /clinical-data/fetch` | Clinical data |
|
||||
| `POST /genes/fetch` | Gene metadata by symbol or Entrez ID |
|
||||
| `GET /studies/{studyId}/sample-lists` | Sample lists |
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Know your study IDs**: Use the Swagger UI or `GET /studies` to find the correct study ID
|
||||
- **Use sample lists**: Each study has an `all` sample list and subsets; always specify the appropriate one
|
||||
- **TCGA vs. GENIE**: TCGA data is comprehensive but older; GENIE has more recent clinical sequencing data
|
||||
- **Entrez gene IDs**: The API uses Entrez IDs — use `/genes/fetch` to convert from symbols
|
||||
- **Handle 404s**: Some molecular profiles may not exist for all studies
|
||||
- **Rate limiting**: Add delays for bulk queries; consider downloading data files for large-scale analyses
|
||||
|
||||
## Data Downloads
|
||||
|
||||
For large-scale analyses, download study data directly:
|
||||
```bash
|
||||
# Download TCGA BRCA data
|
||||
wget https://cbioportal-datahub.s3.amazonaws.com/brca_tcga.tar.gz
|
||||
```
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **cBioPortal website**: https://www.cbioportal.org/
|
||||
- **API Swagger UI**: https://www.cbioportal.org/api/swagger-ui/index.html
|
||||
- **Documentation**: https://docs.cbioportal.org/
|
||||
- **GitHub**: https://github.com/cBioPortal/cbioportal
|
||||
- **Data hub**: https://datahub.cbioportal.org/
|
||||
- **Citation**: Cerami E et al. (2012) Cancer Discovery. PMID: 22588877
|
||||
- **API clients**: https://docs.cbioportal.org/web-api-and-clients/
|
||||
@@ -0,0 +1,128 @@
|
||||
# cBioPortal Study Exploration Reference
|
||||
|
||||
## Major Study Collections
|
||||
|
||||
### TCGA (The Cancer Genome Atlas)
|
||||
|
||||
| Study ID | Cancer Type | Samples |
|
||||
|----------|-------------|---------|
|
||||
| `brca_tcga` | Breast Cancer | ~1,000 |
|
||||
| `luad_tcga` | Lung Adenocarcinoma | ~500 |
|
||||
| `lusc_tcga` | Lung Squamous Cell Carcinoma | ~500 |
|
||||
| `coadread_tcga` | Colorectal Cancer | ~600 |
|
||||
| `gbm_tcga` | Glioblastoma | ~600 |
|
||||
| `prad_tcga` | Prostate Cancer | ~500 |
|
||||
| `skcm_tcga` | Skin Cutaneous Melanoma | ~450 |
|
||||
| `blca_tcga` | Bladder Urothelial Carcinoma | ~400 |
|
||||
| `hnsc_tcga` | Head and Neck Squamous | ~500 |
|
||||
| `lihc_tcga` | Liver Hepatocellular Carcinoma | ~370 |
|
||||
| `stad_tcga` | Stomach Adenocarcinoma | ~440 |
|
||||
| `ucec_tcga` | Uterine Endometrial Carcinoma | ~550 |
|
||||
| `ov_tcga` | Ovarian Serous Carcinoma | ~580 |
|
||||
| `kirc_tcga` | Kidney Renal Clear Cell Carcinoma | ~530 |
|
||||
| `thca_tcga` | Thyroid Cancer | ~500 |
|
||||
| `paad_tcga` | Pancreatic Adenocarcinoma | ~180 |
|
||||
| `laml_tcga` | Acute Myeloid Leukemia | ~200 |
|
||||
| `acc_tcga` | Adrenocortical Carcinoma | ~90 |
|
||||
|
||||
### TCGA Pan-Cancer
|
||||
|
||||
| Study ID | Description |
|
||||
|----------|-------------|
|
||||
| `tcga_pan_can_atlas_2018` | TCGA Pan-Cancer Atlas (32 cancer types, ~10K samples) |
|
||||
|
||||
### MSK-IMPACT (Memorial Sloan Kettering)
|
||||
|
||||
| Study ID | Description |
|
||||
|----------|-------------|
|
||||
| `msk_impact_2017` | MSK-IMPACT clinical sequencing |
|
||||
| `mskcc_pd` | MSK pediatric solid tumors |
|
||||
|
||||
### AACR Project GENIE
|
||||
|
||||
| Study ID | Description |
|
||||
|----------|-------------|
|
||||
| `genie_14_1_public` | GENIE v14.1 (multi-center clinical sequencing) |
|
||||
|
||||
## Molecular Profile ID Naming Conventions
|
||||
|
||||
Molecular profile IDs are structured as `{studyId}_{type}`:
|
||||
|
||||
| Type Suffix | Alteration Type |
|
||||
|-------------|----------------|
|
||||
| `_mutations` | Somatic mutations (MAF) |
|
||||
| `_gistic` | Copy number (GISTIC discrete: -2, -1, 0, 1, 2) |
|
||||
| `_cna` | Copy number (continuous log2 ratio) |
|
||||
| `_mrna` | mRNA expression (z-scores or log2) |
|
||||
| `_rna_seq_v2_mrna` | RNA-seq (RSEM) |
|
||||
| `_rna_seq_v2_mrna_median_Zscores` | RNA-seq z-scores relative to normals |
|
||||
| `_rppa` | RPPA protein expression |
|
||||
| `_rppa_Zscores` | RPPA z-scores |
|
||||
| `_sv` | Structural variants/fusions |
|
||||
| `_methylation_hm450` | DNA methylation (450K array) |
|
||||
|
||||
**Example:** For `brca_tcga`:
|
||||
- `brca_tcga_mutations` — mutation data
|
||||
- `brca_tcga_gistic` — CNA data
|
||||
- `brca_tcga_rna_seq_v2_mrna` — RNA-seq expression
|
||||
|
||||
## Sample List Categories
|
||||
|
||||
Each study has sample lists of different subsets:
|
||||
|
||||
| Category | sampleListId Pattern | Contents |
|
||||
|----------|---------------------|----------|
|
||||
| `all_cases_in_study` | `{studyId}_all` | All samples |
|
||||
| `all_cases_with_mutation_data` | `{studyId}_sequenced` | Sequenced samples only |
|
||||
| `all_cases_with_cna_data` | `{studyId}_cna` | Samples with CNA data |
|
||||
| `all_cases_with_mrna_data` | `{studyId}_mrna` | Samples with expression |
|
||||
| `all_cases_with_rppa_data` | `{studyId}_rppa` | Samples with RPPA |
|
||||
| `all_complete_cases` | `{studyId}_complete` | Complete multiplatform data |
|
||||
|
||||
## Common Gene Entrez IDs
|
||||
|
||||
| Gene | Entrez ID | Role |
|
||||
|------|-----------|------|
|
||||
| TP53 | 7157 | Tumor suppressor |
|
||||
| PIK3CA | 5290 | Oncogene |
|
||||
| KRAS | 3845 | Oncogene |
|
||||
| BRCA1 | 672 | Tumor suppressor |
|
||||
| BRCA2 | 675 | Tumor suppressor |
|
||||
| PTEN | 5728 | Tumor suppressor |
|
||||
| EGFR | 1956 | Oncogene |
|
||||
| MYC | 4609 | Oncogene |
|
||||
| RB1 | 5925 | Tumor suppressor |
|
||||
| APC | 324 | Tumor suppressor |
|
||||
| CDKN2A | 1029 | Tumor suppressor |
|
||||
| IDH1 | 3417 | Oncogene (mutant) |
|
||||
| BRAF | 673 | Oncogene |
|
||||
| CDH1 | 999 | Tumor suppressor |
|
||||
| VHL | 7428 | Tumor suppressor |
|
||||
|
||||
## Mutation Type Classifications
|
||||
|
||||
| mutationType | Description |
|
||||
|-------------|-------------|
|
||||
| `Missense_Mutation` | Amino acid change |
|
||||
| `Nonsense_Mutation` | Premature stop codon |
|
||||
| `Frame_Shift_Del` | Frameshift deletion |
|
||||
| `Frame_Shift_Ins` | Frameshift insertion |
|
||||
| `Splice_Site` | Splice site mutation |
|
||||
| `In_Frame_Del` | In-frame deletion |
|
||||
| `In_Frame_Ins` | In-frame insertion |
|
||||
| `Translation_Start_Site` | Start codon mutation |
|
||||
| `Nonstop_Mutation` | Stop codon mutation |
|
||||
| `Silent` | Synonymous |
|
||||
| `5'Flank` | 5' flanking |
|
||||
| `3'UTR` | 3' UTR |
|
||||
|
||||
## OncoPrint Color Legend
|
||||
|
||||
cBioPortal uses consistent colors in OncoPrint:
|
||||
- **Red**: Amplification
|
||||
- **Blue (dark)**: Deep deletion
|
||||
- **Green**: Missense mutation
|
||||
- **Black**: Truncating mutation
|
||||
- **Purple**: Fusion
|
||||
- **Orange**: mRNA upregulation
|
||||
- **Teal**: mRNA downregulation
|
||||
300
scientific-skills/depmap/SKILL.md
Normal file
300
scientific-skills/depmap/SKILL.md
Normal file
@@ -0,0 +1,300 @@
|
||||
---
|
||||
name: depmap
|
||||
description: Query the Cancer Dependency Map (DepMap) for cancer cell line gene dependency scores (CRISPR Chronos), drug sensitivity data, and gene effect profiles. Use for identifying cancer-specific vulnerabilities, synthetic lethal interactions, and validating oncology drug targets.
|
||||
license: CC-BY-4.0
|
||||
metadata:
|
||||
skill-author: Kuan-lin Huang
|
||||
---
|
||||
|
||||
# DepMap — Cancer Dependency Map
|
||||
|
||||
## Overview
|
||||
|
||||
The Cancer Dependency Map (DepMap) project, run by the Broad Institute, systematically characterizes genetic dependencies across hundreds of cancer cell lines using genome-wide CRISPR knockout screens (DepMap CRISPR), RNA interference (RNAi), and compound sensitivity assays (PRISM). DepMap data is essential for:
|
||||
- Identifying which genes are essential for specific cancer types
|
||||
- Finding cancer-selective dependencies (therapeutic targets)
|
||||
- Validating oncology drug targets
|
||||
- Discovering synthetic lethal interactions
|
||||
|
||||
**Key resources:**
|
||||
- DepMap Portal: https://depmap.org/portal/
|
||||
- DepMap data downloads: https://depmap.org/portal/download/all/
|
||||
- Python package: `depmap` (or access via API/downloads)
|
||||
- API: https://depmap.org/portal/api/
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use DepMap when:
|
||||
|
||||
- **Target validation**: Is a gene essential for survival in cancer cell lines with a specific mutation (e.g., KRAS-mutant)?
|
||||
- **Biomarker discovery**: What genomic features predict sensitivity to knockout of a gene?
|
||||
- **Synthetic lethality**: Find genes that are selectively essential when another gene is mutated/deleted
|
||||
- **Drug sensitivity**: What cell line features predict response to a compound?
|
||||
- **Pan-cancer essentiality**: Is a gene broadly essential across all cancer types (bad target) or selectively essential?
|
||||
- **Correlation analysis**: Which pairs of genes have correlated dependency profiles (co-essentiality)?
|
||||
|
||||
## Core Concepts
|
||||
|
||||
### Dependency Scores
|
||||
|
||||
| Score | Range | Meaning |
|
||||
|-------|-------|---------|
|
||||
| **Chronos** (CRISPR) | ~ -3 to 0+ | More negative = more essential. Common essential threshold: −1. Pan-essential genes ~−1 to −2 |
|
||||
| **RNAi DEMETER2** | ~ -3 to 0+ | Similar scale to Chronos |
|
||||
| **Gene Effect** | normalized | Normalized Chronos; −1 = median effect of common essential genes |
|
||||
|
||||
**Key thresholds:**
|
||||
- Chronos ≤ −0.5: likely dependent
|
||||
- Chronos ≤ −1: strongly dependent (common essential range)
|
||||
|
||||
### Cell Line Annotations
|
||||
|
||||
Each cell line has:
|
||||
- `DepMap_ID`: unique identifier (e.g., `ACH-000001`)
|
||||
- `cell_line_name`: human-readable name
|
||||
- `primary_disease`: cancer type
|
||||
- `lineage`: broad tissue lineage
|
||||
- `lineage_subtype`: specific subtype
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### 1. DepMap API
|
||||
|
||||
```python
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
BASE_URL = "https://depmap.org/portal/api"
|
||||
|
||||
def depmap_get(endpoint, params=None):
    """GET a DepMap portal API endpoint and return the decoded JSON payload."""
    response = requests.get(f"{BASE_URL}/{endpoint}", params=params)
    response.raise_for_status()  # surface HTTP errors immediately
    return response.json()
|
||||
```
|
||||
|
||||
### 2. Gene Dependency Scores
|
||||
|
||||
```python
|
||||
def get_gene_dependency(gene_symbol, dataset="Chronos_Combined"):
    """Get CRISPR dependency scores for a gene across all cell lines."""
    response = requests.get(
        f"{BASE_URL}/gene",
        params={"gene_id": gene_symbol, "dataset": dataset},
    )
    return response.json()
|
||||
|
||||
# Alternatively, use the /data endpoint:
|
||||
def get_dependencies_slice(gene_symbol, dataset_name="CRISPRGeneEffect"):
    """Fetch a single gene's dependency profile from a named dataset."""
    query = {"gene_name": gene_symbol, "dataset_name": dataset_name}
    return requests.get(f"{BASE_URL}/data/gene_dependency", params=query).json()
|
||||
```
|
||||
|
||||
### 3. Download-Based Analysis (Recommended for Large Queries)
|
||||
|
||||
For large-scale analysis, download DepMap data files and analyze locally:
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
import requests, os
|
||||
|
||||
def download_depmap_data(url, output_path):
    """Stream a DepMap data file to disk.

    Args:
        url: direct download URL (portal or Figshare).
        output_path: local file path to write.

    Raises:
        requests.HTTPError: if the server returns an error status.
    """
    # Context manager releases the connection; raise_for_status prevents
    # silently saving an HTML error page as if it were the data file.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
|
||||
|
||||
# DepMap 24Q4 data files (update version as needed)
|
||||
FILES = {
|
||||
"crispr_gene_effect": "https://figshare.com/ndownloader/files/...",
|
||||
# OR download from: https://depmap.org/portal/download/all/
|
||||
# Files available:
|
||||
# CRISPRGeneEffect.csv - Chronos gene effect scores
|
||||
# OmicsExpressionProteinCodingGenesTPMLogp1.csv - mRNA expression
|
||||
# OmicsSomaticMutationsMatrixDamaging.csv - mutation binary matrix
|
||||
# OmicsCNGene.csv - copy number
|
||||
# sample_info.csv - cell line metadata
|
||||
}
|
||||
|
||||
def load_depmap_gene_effect(filepath="CRISPRGeneEffect.csv"):
    """
    Load the DepMap CRISPR gene effect matrix.
    Rows = cell lines (DepMap_ID); columns = genes, originally "Symbol (EntrezID)".
    """
    matrix = pd.read_csv(filepath, index_col=0)
    # Keep only the symbol portion of each "Symbol (EntrezID)" column header.
    matrix.columns = [header.split(" ")[0] for header in matrix.columns]
    return matrix
|
||||
|
||||
def load_cell_line_info(filepath="sample_info.csv"):
    """Read the DepMap cell line metadata table (sample_info.csv)."""
    return pd.read_csv(filepath)
|
||||
```
|
||||
|
||||
### 4. Identifying Selective Dependencies
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
def find_selective_dependencies(gene_effect_df, cell_line_info, target_gene,
                                cancer_type=None, threshold=-0.5):
    """Cell lines whose gene-effect score for target_gene is at or below threshold."""

    if target_gene not in gene_effect_df.columns:
        return None  # gene not present in the screen

    effect = gene_effect_df[target_gene].dropna()
    hits = effect[effect <= threshold]

    # Attach human-readable metadata (inner join on DepMap_ID).
    annotated = pd.DataFrame(
        {"DepMap_ID": hits.index, "gene_effect": hits.values}
    ).merge(cell_line_info[["DepMap_ID", "cell_line_name", "primary_disease", "lineage"]])

    if cancer_type:
        disease_match = annotated["primary_disease"].str.contains(
            cancer_type, case=False, na=False
        )
        annotated = annotated[disease_match]

    return annotated.sort_values("gene_effect")
|
||||
|
||||
# Example usage (after loading data)
|
||||
# df_effect = load_depmap_gene_effect("CRISPRGeneEffect.csv")
|
||||
# cell_info = load_cell_line_info("sample_info.csv")
|
||||
# deps = find_selective_dependencies(df_effect, cell_info, "KRAS", cancer_type="Lung")
|
||||
```
|
||||
|
||||
### 5. Biomarker Analysis (Gene Effect vs. Mutation)
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
from scipy import stats
|
||||
|
||||
def biomarker_analysis(gene_effect_df, mutation_df, target_gene, biomarker_gene):
    """
    Test whether biomarker_gene mutation predicts dependency on target_gene.

    Uses a one-sided Mann-Whitney U test (mutant scores lower = more dependent).

    Args:
        gene_effect_df: CRISPR gene effect DataFrame (cell lines x genes)
        mutation_df: Binary mutation DataFrame (1 = mutated)
        target_gene: Gene whose dependency is assessed
        biomarker_gene: Gene whose mutation status is the candidate biomarker
    """
    have_target = target_gene in gene_effect_df.columns
    have_biomarker = biomarker_gene in mutation_df.columns
    if not (have_target and have_biomarker):
        return None

    # Restrict to cell lines present in both matrices.
    shared = gene_effect_df.index.intersection(mutation_df.index)
    effect = gene_effect_df.loc[shared, target_gene].dropna()
    status = mutation_df.loc[effect.index, biomarker_gene]

    mutant_scores = effect[status == 1]
    wt_scores = effect[status == 0]

    _, pval = stats.mannwhitneyu(mutant_scores, wt_scores, alternative='less')

    return {
        "target_gene": target_gene,
        "biomarker_gene": biomarker_gene,
        "n_mutated": len(mutant_scores),
        "n_wt": len(wt_scores),
        "mean_effect_mutated": mutant_scores.mean(),
        "mean_effect_wt": wt_scores.mean(),
        "pval": pval,
        "significant": pval < 0.05
    }
|
||||
```
|
||||
|
||||
### 6. Co-Essentiality Analysis
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
def co_essentiality(gene_effect_df, target_gene, top_n=20):
    """Rank genes by Pearson correlation of dependency profile with target_gene."""
    if target_gene not in gene_effect_df.columns:
        return None

    anchor = gene_effect_df[target_gene].dropna()

    pearson_by_gene = {}
    for candidate in gene_effect_df.columns:
        if candidate == target_gene:
            continue
        profile = gene_effect_df[candidate].dropna()
        overlap = anchor.index.intersection(profile.index)
        # Require a reasonable number of co-screened lines for a stable estimate.
        if len(overlap) < 50:
            continue
        r = anchor[overlap].corr(profile[overlap])
        if not pd.isna(r):
            pearson_by_gene[candidate] = r

    ranked = pd.Series(pearson_by_gene).sort_values(ascending=False)
    return ranked.head(top_n)
|
||||
|
||||
# Co-essential genes often share biological complexes or pathways
|
||||
```
|
||||
|
||||
## Query Workflows
|
||||
|
||||
### Workflow 1: Target Validation for a Cancer Type
|
||||
|
||||
1. Download `CRISPRGeneEffect.csv` and `sample_info.csv`
|
||||
2. Filter cell lines by cancer type
|
||||
3. Compute mean gene effect for target gene in cancer vs. all others
|
||||
4. Calculate selectivity: how specific is the dependency to your cancer type?
|
||||
5. Cross-reference with mutation, expression, or CNA data as biomarkers
|
||||
|
||||
### Workflow 2: Synthetic Lethality Screen
|
||||
|
||||
1. Identify cell lines with mutation/deletion in gene of interest (e.g., BRCA1-mutant)
|
||||
2. Compute gene effect scores for all genes in mutant vs. WT lines
|
||||
3. Identify genes significantly more essential in mutant lines (synthetic lethal partners)
|
||||
4. Filter by selectivity and effect size
|
||||
|
||||
### Workflow 3: Compound Sensitivity Analysis
|
||||
|
||||
1. Download PRISM compound sensitivity data (`primary-screen-replicate-treatment-info.csv`)
|
||||
2. Correlate compound AUC/log2(fold-change) with genomic features
|
||||
3. Identify predictive biomarkers for compound sensitivity
|
||||
|
||||
## DepMap Data Files Reference
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `CRISPRGeneEffect.csv` | CRISPR Chronos gene effect (primary dependency data) |
|
||||
| `CRISPRGeneEffectUnscaled.csv` | Unscaled CRISPR scores |
|
||||
| `RNAi_merged.csv` | DEMETER2 RNAi dependency |
|
||||
| `sample_info.csv` | Cell line metadata (lineage, disease, etc.) |
|
||||
| `OmicsExpressionProteinCodingGenesTPMLogp1.csv` | mRNA expression |
|
||||
| `OmicsSomaticMutationsMatrixDamaging.csv` | Damaging somatic mutations (binary) |
|
||||
| `OmicsCNGene.csv` | Copy number per gene |
|
||||
| `PRISM_Repurposing_Primary_Screens_Data.csv` | Drug sensitivity (repurposing library) |
|
||||
|
||||
Download all files from: https://depmap.org/portal/download/all/
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Use Chronos scores** (not DEMETER2) for current CRISPR analyses — better controlled for cutting efficiency
|
||||
- **Distinguish pan-essential from cancer-selective**: Target genes with low variance (essential in all lines) are poor drug targets
|
||||
- **Validate with expression data**: A gene not expressed in a cell line will score as non-essential regardless of actual function
|
||||
- **Use DepMap ID** for cell line identification — cell_line_name can be ambiguous
|
||||
- **Account for copy number**: Amplified genes may appear essential due to the copy-number cutting artifact (multiple Cas9 double-strand breaks in amplified regions are toxic regardless of gene function)
|
||||
- **Multiple testing correction**: When computing biomarker associations genome-wide, apply FDR correction
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **DepMap Portal**: https://depmap.org/portal/
|
||||
- **Data downloads**: https://depmap.org/portal/download/all/
|
||||
- **DepMap paper**: Behan FM et al. (2019) Nature. PMID: 30971826
|
||||
- **Chronos paper**: Dempster JM et al. (2021) Nature Methods. PMID: 34349281
|
||||
- **GitHub**: https://github.com/broadinstitute/depmap-portal
|
||||
- **Figshare**: https://figshare.com/articles/dataset/DepMap_24Q4_Public/27993966
|
||||
178
scientific-skills/depmap/references/dependency_analysis.md
Normal file
178
scientific-skills/depmap/references/dependency_analysis.md
Normal file
@@ -0,0 +1,178 @@
|
||||
# DepMap Dependency Analysis Guide
|
||||
|
||||
## Understanding Chronos Scores
|
||||
|
||||
Chronos is the current algorithm (successor to CERES) for computing gene dependency scores from CRISPR screen data. It addresses systematic biases including:
|
||||
- Copy number effects (high-copy genes appear essential due to DNA cutting)
|
||||
- Guide RNA efficiency variation
|
||||
- Cell line growth rates
|
||||
|
||||
### Score Interpretation
|
||||
|
||||
| Score Range | Interpretation |
|
||||
|------------|----------------|
|
||||
| > 0 | Likely growth-promoting when knocked out (some noise) |
|
||||
| 0 to −0.3 | Non-essential: minimal fitness effect |
|
||||
| −0.3 to −0.5 | Mild dependency |
|
||||
| −0.5 to −1.0 | Significant dependency |
|
||||
| < −1.0 | Strong dependency (common essential range) |
|
||||
| ≈ −1.0 | Median of pan-essential genes (e.g., proteasome subunits) |
|
||||
|
||||
### Common Essential Genes (Controls)
|
||||
|
||||
Genes that are essential in nearly all cell lines (score ~−1 to −2):
|
||||
- Ribosomal proteins: RPL..., RPS...
|
||||
- Proteasome: PSMA..., PSMB...
|
||||
- Spliceosome: SNRPD1, SNRNP70
|
||||
- DNA replication: MCM2, PCNA
|
||||
- Transcription: POLR2A, TAF...
|
||||
|
||||
These can be used as positive controls for screen quality.
|
||||
|
||||
### Non-Essential Controls
|
||||
|
||||
Genes with negligible fitness effect (score ~ 0):
|
||||
- Non-expressed genes (tissue-specific)
|
||||
- Safe harbor loci
|
||||
|
||||
## Selectivity Assessment
|
||||
|
||||
To determine if a dependency is cancer-selective:
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
def compute_selectivity(gene_effect_df, target_gene, cancer_lineage, cell_info=None):
    """Compute how selectively a lineage depends on target_gene.

    Args:
        gene_effect_df: CRISPR gene effect matrix (cell lines x genes).
        target_gene: gene symbol column to assess.
        cancer_lineage: value of the `lineage` metadata field to compare.
        cell_info: optional cell line metadata DataFrame with DepMap_ID and
            lineage columns; if None, falls back to a local `depmap_utils`
            helper module (must be present on the import path).

    Returns:
        Dict with per-group means, selectivity score (other mean - lineage
        mean; positive = more essential in the lineage), and the fraction of
        lineage lines below the -0.5 dependency threshold.
    """
    scores = gene_effect_df[target_gene].dropna()

    if cell_info is None:
        # Lazy fallback so callers with metadata in hand need no extra module.
        from depmap_utils import load_cell_line_info
        cell_info = load_cell_line_info()

    scores_df = scores.reset_index()
    scores_df.columns = ["DepMap_ID", "score"]
    scores_df = scores_df.merge(cell_info[["DepMap_ID", "lineage"]])

    cancer_scores = scores_df[scores_df["lineage"] == cancer_lineage]["score"]
    other_scores = scores_df[scores_df["lineage"] != cancer_lineage]["score"]

    # Selectivity: how much lower (more essential) the lineage mean is vs. the rest.
    selectivity = other_scores.mean() - cancer_scores.mean()
    return {
        "target_gene": target_gene,
        "cancer_lineage": cancer_lineage,
        "cancer_mean": cancer_scores.mean(),
        "other_mean": other_scores.mean(),
        "selectivity_score": selectivity,
        "n_cancer": len(cancer_scores),
        "fraction_dependent": (cancer_scores < -0.5).mean()
    }
|
||||
```
|
||||
|
||||
## CRISPR Dataset Versions
|
||||
|
||||
| Dataset | Description | Recommended |
|
||||
|---------|-------------|-------------|
|
||||
| `CRISPRGeneEffect` | Chronos-corrected gene effect | Yes (current) |
|
||||
| `Achilles_gene_effect` | Older CERES algorithm | Legacy only |
|
||||
| `RNAi_merged` | DEMETER2 RNAi | For cross-validation |
|
||||
|
||||
## Quality Metrics
|
||||
|
||||
DepMap reports quality control metrics per screen:
|
||||
- **Skewness**: Pan-essential genes should show negative skew
|
||||
- **AUC**: Area under ROC for pan-essential vs non-essential controls
|
||||
|
||||
Good screens: skewness < −1, AUC > 0.85
|
||||
|
||||
## Cancer Lineage Codes
|
||||
|
||||
Common values for `lineage` field in `sample_info.csv`:
|
||||
|
||||
| Lineage | Description |
|
||||
|---------|-------------|
|
||||
| `lung` | Lung cancer |
|
||||
| `breast` | Breast cancer |
|
||||
| `colorectal` | Colorectal cancer |
|
||||
| `brain_cancer` | Brain cancer (GBM, etc.) |
|
||||
| `leukemia` | Leukemia |
|
||||
| `lymphoma` | Lymphoma |
|
||||
| `prostate` | Prostate cancer |
|
||||
| `ovarian` | Ovarian cancer |
|
||||
| `pancreatic` | Pancreatic cancer |
|
||||
| `skin` | Melanoma and other skin |
|
||||
| `liver` | Liver cancer |
|
||||
| `kidney` | Kidney cancer |
|
||||
|
||||
## Synthetic Lethality Analysis
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
|
||||
def find_synthetic_lethal(gene_effect_df, mutation_df, biomarker_gene,
|
||||
fdr_threshold=0.1):
|
||||
"""
|
||||
Find synthetic lethal partners for a loss-of-function mutation.
|
||||
|
||||
For each gene, tests if cell lines mutant in biomarker_gene
|
||||
are more dependent on that gene vs. WT lines.
|
||||
"""
|
||||
if biomarker_gene not in mutation_df.columns:
|
||||
return pd.DataFrame()
|
||||
|
||||
# Get mutant vs WT cell lines
|
||||
common = gene_effect_df.index.intersection(mutation_df.index)
|
||||
is_mutant = mutation_df.loc[common, biomarker_gene] == 1
|
||||
|
||||
mutant_lines = common[is_mutant]
|
||||
wt_lines = common[~is_mutant]
|
||||
|
||||
results = []
|
||||
for gene in gene_effect_df.columns:
|
||||
mut_scores = gene_effect_df.loc[mutant_lines, gene].dropna()
|
||||
wt_scores = gene_effect_df.loc[wt_lines, gene].dropna()
|
||||
|
||||
if len(mut_scores) < 5 or len(wt_scores) < 10:
|
||||
continue
|
||||
|
||||
stat, pval = stats.mannwhitneyu(mut_scores, wt_scores, alternative='less')
|
||||
results.append({
|
||||
"gene": gene,
|
||||
"mean_mutant": mut_scores.mean(),
|
||||
"mean_wt": wt_scores.mean(),
|
||||
"effect_size": wt_scores.mean() - mut_scores.mean(),
|
||||
"pval": pval,
|
||||
"n_mutant": len(mut_scores),
|
||||
"n_wt": len(wt_scores)
|
||||
})
|
||||
|
||||
df = pd.DataFrame(results)
|
||||
# FDR correction
|
||||
from scipy.stats import false_discovery_control
|
||||
df["qval"] = false_discovery_control(df["pval"], method="bh")
|
||||
df = df[df["qval"] < fdr_threshold].sort_values("effect_size", ascending=False)
|
||||
return df
|
||||
```
|
||||
|
||||
## Drug Sensitivity (PRISM)
|
||||
|
||||
DepMap also contains compound sensitivity data from the PRISM assay:
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
def load_prism_data(filepath="primary-screen-replicate-collapsed-logfold-change.csv"):
|
||||
"""
|
||||
Load PRISM drug sensitivity data.
|
||||
Rows = cell lines, Columns = compounds (broad_id::name::dose)
|
||||
Values = log2 fold change (more negative = more sensitive)
|
||||
"""
|
||||
return pd.read_csv(filepath, index_col=0)
|
||||
|
||||
# Available datasets:
|
||||
# primary-screen: 4,518 compounds at single dose
|
||||
# secondary-screen: ~8,000 compounds at multiple doses (AUC available)
|
||||
```
|
||||
338
scientific-skills/glycoengineering/SKILL.md
Normal file
338
scientific-skills/glycoengineering/SKILL.md
Normal file
@@ -0,0 +1,338 @@
|
||||
---
|
||||
name: glycoengineering
|
||||
description: Analyze and engineer protein glycosylation. Scan sequences for N-glycosylation sequons (N-X-S/T), predict O-glycosylation hotspots, and access curated glycoengineering tools (NetOGlyc, GlycoShield, GlycoWorkbench). For glycoprotein engineering, therapeutic antibody optimization, and vaccine design.
|
||||
license: Unknown
|
||||
metadata:
|
||||
skill-author: Kuan-lin Huang
|
||||
---
|
||||
|
||||
# Glycoengineering
|
||||
|
||||
## Overview
|
||||
|
||||
Glycosylation is the most common and complex post-translational modification (PTM) of proteins, affecting over 50% of all human proteins. Glycans regulate protein folding, stability, immune recognition, receptor interactions, and pharmacokinetics of therapeutic proteins. Glycoengineering involves rational modification of glycosylation patterns for improved therapeutic efficacy, stability, or immune evasion.
|
||||
|
||||
**Two major glycosylation types:**
|
||||
- **N-glycosylation**: Attached to asparagine (N) in the sequon N-X-[S/T] where X ≠ Proline; occurs in the ER/Golgi
|
||||
- **O-glycosylation**: Attached to serine (S) or threonine (T); no strict consensus motif; primarily GalNAc initiation
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use this skill when:
|
||||
|
||||
- **Antibody engineering**: Optimize Fc glycosylation for enhanced ADCC, CDC, or reduced immunogenicity
|
||||
- **Therapeutic protein design**: Identify glycosylation sites that affect half-life, stability, or immunogenicity
|
||||
- **Vaccine antigen design**: Engineer glycan shields to focus immune responses on conserved epitopes
|
||||
- **Biosimilar characterization**: Compare glycan patterns between reference and biosimilar
|
||||
- **Drug target analysis**: Does glycosylation affect target engagement for a receptor?
|
||||
- **Protein stability**: N-glycans often stabilize proteins; identify sites for stabilizing mutations
|
||||
|
||||
## N-Glycosylation Sequon Analysis
|
||||
|
||||
### Scanning for N-Glycosylation Sites
|
||||
|
||||
N-glycosylation occurs at the sequon **N-X-[S/T]** where X ≠ Proline.
|
||||
|
||||
```python
|
||||
import re
|
||||
from typing import List, Tuple
|
||||
|
||||
def find_n_glycosylation_sequons(sequence: str) -> List[dict]:
    """
    Scan a protein sequence for canonical N-linked glycosylation sequons.
    Motif: N-X-[S/T], where X ≠ Proline.

    The scan advances one residue at a time so that overlapping sequons
    (e.g. "NNST" carries sequons at both Asn positions) are all reported;
    the previous step-by-3 scan silently skipped them.

    Args:
        sequence: Single-letter amino acid sequence

    Returns:
        List of dicts with position (1-based), motif, context, and sequon_type
    """
    seq = sequence.upper()
    results = []
    for i in range(len(seq) - 2):
        triplet = seq[i:i+3]
        if triplet[0] == 'N' and triplet[1] != 'P' and triplet[2] in {'S', 'T'}:
            context = seq[max(0, i-3):i+6]  # ±3 residue context window
            results.append({
                'position': i + 1,  # 1-based
                'motif': triplet,
                'context': context,
                'sequon_type': 'NXS' if triplet[2] == 'S' else 'NXT'
            })
    return results
|
||||
|
||||
def summarize_glycosylation_sites(sequence: str, protein_name: str = "") -> str:
    """Build a research-log style summary of N-glycosylation sequons."""
    sites = find_n_glycosylation_sequons(sequence)
    label = protein_name or 'Protein'

    report = [
        f"# N-Glycosylation Sequon Analysis: {label}",
        f"Sequence length: {len(sequence)}",
        f"Total N-glycosylation sequons: {len(sites)}",
    ]

    if not sites:
        report.append("No canonical N-glycosylation sequons detected.")
        return "\n".join(report)

    nxs_count = sum(1 for site in sites if site['sequon_type'] == 'NXS')
    nxt_count = sum(1 for site in sites if site['sequon_type'] == 'NXT')
    report.append(f"\nN-X-S sites: {nxs_count}")
    report.append(f"N-X-T sites: {nxt_count}")
    report.append("\nSite details:")
    for site in sites:
        report.append(
            f"  Position {site['position']}: {site['motif']} (context: ...{site['context']}...)"
        )
    return "\n".join(report)
|
||||
|
||||
# Example: IgG1 Fc region
|
||||
fc_sequence = "APELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSREEMTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPGK"
|
||||
print(summarize_glycosylation_sites(fc_sequence, "IgG1 Fc"))
|
||||
```
|
||||
|
||||
### Mutating N-Glycosylation Sites
|
||||
|
||||
```python
|
||||
def eliminate_glycosite(sequence: str, position: int, replacement: str = "Q") -> str:
    """
    Eliminate an N-glycosylation site by substituting the Asn (default N -> Q).

    Args:
        sequence: Protein sequence (single-letter code)
        position: 1-based position of the Asn to mutate
        replacement: Amino acid to substitute (default Q = Gln; similar size,
            not glycosylated)

    Returns:
        Mutated sequence (upper-case)

    Raises:
        ValueError: If position is out of range or the residue is not Asn.
    """
    seq = list(sequence.upper())
    idx = position - 1
    # Validate with exceptions, not assert: asserts are stripped under
    # `python -O`, and an out-of-range/negative position previously raised
    # IndexError or silently wrapped to the end of the sequence.
    if not 0 <= idx < len(seq):
        raise ValueError(f"Position {position} is outside the sequence (length {len(seq)})")
    if seq[idx] != 'N':
        raise ValueError(f"Position {position} is '{seq[idx]}', not 'N'")
    seq[idx] = replacement.upper()
    return ''.join(seq)
|
||||
|
||||
def add_glycosite(sequence: str, position: int, flanking_context: str = "S") -> str:
    """
    Introduce an N-glycosylation sequon (N-X-[S/T], X != P) at a position.

    Mutates the residue at `position` to Asn, then repairs the sequon if
    needed: a Pro at position+1 is replaced with Ala, and position+2 is set
    to `flanking_context` unless it is already Ser/Thr.

    Args:
        sequence: Protein sequence (single-letter code)
        position: 1-based position to place the new Asn
        flanking_context: 'S' or 'T' to install at position+2 when required

    Returns:
        Mutated sequence (upper-case)

    Raises:
        ValueError: If position is out of range or flanking_context is not S/T.
    """
    flank = flanking_context.upper()
    if flank not in ('S', 'T'):
        raise ValueError(f"flanking_context must be 'S' or 'T', got {flanking_context!r}")
    seq = list(sequence.upper())
    idx = position - 1
    if not 0 <= idx < len(seq):
        raise ValueError(f"Position {position} is outside the sequence (length {len(seq)})")

    # Install the Asn of the sequon.
    seq[idx] = 'N'

    # X (position+1) must not be Pro, which blocks glycosylation; use Ala.
    if idx + 1 < len(seq) and seq[idx + 1] == 'P':
        seq[idx + 1] = 'A'

    # Position+2 must be Ser or Thr to complete the sequon. If the Asn sits
    # within the last two residues the sequon cannot be completed and the
    # remaining residues are left unchanged.
    if idx + 2 < len(seq) and seq[idx + 2] not in ('S', 'T'):
        seq[idx + 2] = flank

    return ''.join(seq)
|
||||
```
|
||||
|
||||
## O-Glycosylation Analysis
|
||||
|
||||
### Heuristic O-Glycosylation Hotspot Prediction
|
||||
|
||||
```python
|
||||
def predict_o_glycosylation_hotspots(
    sequence: str,
    window: int = 7,
    min_st_fraction: float = 0.4,
    disallow_proline_next: bool = True
) -> List[dict]:
    """
    Heuristic O-glycosylation hotspot scoring based on local S/T density.
    Not a substitute for NetOGlyc; use as a fast baseline.

    Rules:
    - O-GalNAc glycosylation clusters on Ser/Thr-rich segments
    - Flag Ser/Thr residues in windows enriched for S/T
    - Avoid S/T immediately followed by Pro (TP/SP motifs inhibit GalNAc-T)

    Args:
        sequence: Protein sequence (single-letter code)
        window: Odd window size for local S/T density. An even value is
            rounded up to the next odd number (the previous behavior of
            silently resetting any even value to 7 discarded the caller's
            requested window size).
        min_st_fraction: Minimum fraction of S/T in window to flag a site
        disallow_proline_next: Skip S/T directly followed by Pro

    Returns:
        List of dicts: position (1-based), residue, st_fraction (rounded to
        3 decimals), window (1-based inclusive range string), segment
    """
    if window % 2 == 0:
        window += 1  # keep the window centered on the candidate residue
    seq = sequence.upper()
    half = window // 2
    hotspots = []

    for i, residue in enumerate(seq):
        if residue not in ('S', 'T'):
            continue
        if disallow_proline_next and i + 1 < len(seq) and seq[i + 1] == 'P':
            continue  # S/T-Pro motifs are poor GalNAc-transferase substrates

        start = max(0, i - half)
        end = min(len(seq), i + half + 1)
        segment = seq[start:end]
        st_fraction = sum(1 for c in segment if c in ('S', 'T')) / len(segment)

        if st_fraction >= min_st_fraction:
            hotspots.append({
                'position': i + 1,
                'residue': residue,
                'st_fraction': round(st_fraction, 3),
                'window': f"{start + 1}-{end}",
                'segment': segment
            })

    return hotspots
|
||||
```
|
||||
|
||||
## External Glycoengineering Tools
|
||||
|
||||
### 1. NetOGlyc 4.0 (O-glycosylation prediction)
|
||||
|
||||
Web service for high-accuracy O-GalNAc site prediction:
|
||||
- **URL**: https://services.healthtech.dtu.dk/services/NetOGlyc-4.0/
|
||||
- **Input**: FASTA protein sequence
|
||||
- **Output**: Per-residue O-glycosylation probability scores
|
||||
- **Method**: Neural network trained on experimentally verified O-GalNAc sites
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def submit_netoglycv4(fasta_sequence: str) -> str:
    """
    Direct the user to NetOGlyc 4.0 and return the service's CGI endpoint.

    This is a convenience stub: submission parameters vary with the DTU
    Health Tech web service version, so the recommended path is the web
    interface. Results typically take ~1-5 min.
    """
    endpoint = "https://services.healthtech.dtu.dk/cgi-bin/webface2.cgi"
    print("Submit sequence at: https://services.healthtech.dtu.dk/services/NetOGlyc-4.0/")
    return endpoint
|
||||
|
||||
# Also: NetNGlyc for N-glycosylation prediction
|
||||
# URL: https://services.healthtech.dtu.dk/services/NetNGlyc-1.0/
|
||||
```
|
||||
|
||||
### 2. GlycoShield-MD (Glycan Shielding Analysis)
|
||||
|
||||
GlycoShield-MD analyzes how glycans shield protein surfaces during MD simulations:
|
||||
- **URL**: https://gitlab.mpcdf.mpg.de/dioscuri-biophysics/glycoshield-md/
|
||||
- **Use**: Map glycan shielding on protein surface over MD trajectory
|
||||
- **Output**: Per-residue shielding fraction, visualization
|
||||
|
||||
```bash
|
||||
# Installation
|
||||
pip install glycoshield
|
||||
|
||||
# Basic usage: analyze glycan shielding from glycosylated protein MD trajectory
|
||||
glycoshield \
|
||||
--topology glycoprotein.pdb \
|
||||
--trajectory glycoprotein.xtc \
|
||||
--glycan_resnames BGLCNA FUC \
|
||||
--output shielding_analysis/
|
||||
```
|
||||
|
||||
### 3. GlycoWorkbench (Glycan Structure Drawing/Analysis)
|
||||
|
||||
- **URL**: https://www.eurocarbdb.org/project/glycoworkbench
|
||||
- **Use**: Draw glycan structures, calculate masses, annotate MS spectra
|
||||
- **Format**: GlycoCT, IUPAC condensed glycan notation
|
||||
|
||||
### 4. GlyConnect (Glycan-Protein Database)
|
||||
|
||||
- **URL**: https://glyconnect.expasy.org/
|
||||
- **Use**: Find experimentally verified glycoproteins and glycosylation sites
|
||||
- **Query**: By protein (UniProt ID), glycan structure, or tissue
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def query_glyconnect(uniprot_id: str) -> dict:
    """Fetch GlyConnect glycosylation records for a UniProt accession.

    Returns the parsed JSON payload, or an empty dict on any non-200 status.
    """
    endpoint = f"https://glyconnect.expasy.org/api/proteins/uniprot/{uniprot_id}"
    reply = requests.get(endpoint, headers={"Accept": "application/json"})
    return reply.json() if reply.status_code == 200 else {}
|
||||
|
||||
# Example: query EGFR glycosylation
|
||||
egfr_glyco = query_glyconnect("P00533")
|
||||
```
|
||||
|
||||
### 5. UniCarbKB (Glycan Structure Database)
|
||||
|
||||
- **URL**: https://unicarbkb.org/
|
||||
- **Use**: Browse glycan structures, search by mass or composition
|
||||
- **Format**: GlycoCT or IUPAC notation
|
||||
|
||||
## Key Glycoengineering Strategies
|
||||
|
||||
### For Therapeutic Antibodies
|
||||
|
||||
| Goal | Strategy | Notes |
|
||||
|------|----------|-------|
|
||||
| Enhance ADCC | Defucosylation at Fc Asn297 | Afucosylated IgG1 has ~50× better FcγRIIIa binding |
|
||||
| Reduce immunogenicity | Remove non-human glycans | Eliminate α-Gal, NGNA epitopes |
|
||||
| Improve PK half-life | Sialylation | Sialylated glycans extend half-life |
|
||||
| Reduce inflammation | Hypersialylation | IVIG anti-inflammatory mechanism |
|
||||
| Create glycan shield | Add N-glycosites to surface | Masks vulnerable epitopes (vaccine design) |
|
||||
|
||||
### Common Mutations Used
|
||||
|
||||
| Mutation | Effect |
|
||||
|----------|--------|
|
||||
| N297A/Q (IgG1) | Removes Fc glycosylation (aglycosyl) |
|
||||
| N297D (IgG1) | Removes Fc glycosylation |
|
||||
| S298A/E333A/K334A | Increases FcγRIIIa binding |
|
||||
| F243L (IgG1) | Increases defucosylation |
|
||||
| T299A | Removes Fc glycosylation |
|
||||
|
||||
## Glycan Notation
|
||||
|
||||
### IUPAC Condensed Notation (Monosaccharide abbreviations)
|
||||
|
||||
| Symbol | Full Name | Type |
|
||||
|--------|-----------|------|
|
||||
| Glc | Glucose | Hexose |
|
||||
| GlcNAc | N-Acetylglucosamine | HexNAc |
|
||||
| Man | Mannose | Hexose |
|
||||
| Gal | Galactose | Hexose |
|
||||
| Fuc | Fucose | Deoxyhexose |
|
||||
| Neu5Ac | N-Acetylneuraminic acid (Sialic acid) | Sialic acid |
|
||||
| GalNAc | N-Acetylgalactosamine | HexNAc |
|
||||
|
||||
### Complex N-Glycan Structure
|
||||
|
||||
```
|
||||
Typical complex biantennary N-glycan:
|
||||
Neu5Ac-Gal-GlcNAc-Man\
|
||||
Man-GlcNAc-GlcNAc-[Asn]
|
||||
Neu5Ac-Gal-GlcNAc-Man/
|
||||
(±Core Fuc at innermost GlcNAc)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Start with NetNGlyc/NetOGlyc** for computational prediction before experimental validation
|
||||
- **Verify with mass spectrometry**: Glycoproteomics (Byonic, Mascot) for site-specific glycan profiling
|
||||
- **Consider site context**: Not all predicted sequons are actually glycosylated (accessibility, cell type, protein conformation)
|
||||
- **For antibodies**: Fc N297 glycan is critical — always characterize this site first
|
||||
- **Use GlyConnect** to check if your protein of interest has experimentally verified glycosylation data
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **GlyTouCan** (glycan structure repository): https://glytoucan.org/
|
||||
- **GlyConnect**: https://glyconnect.expasy.org/
|
||||
- **CFG Functional Glycomics**: http://www.functionalglycomics.org/
|
||||
- **DTU Health Tech servers** (NetNGlyc, NetOGlyc): https://services.healthtech.dtu.dk/
|
||||
- **GlycoWorkbench**: https://glycoworkbench.software.informer.com/
|
||||
- **Review**: Apweiler R et al. (1999) Biochim Biophys Acta. PMID: 10564035
|
||||
- **Therapeutic glycoengineering review**: Jefferis R (2009) Nature Reviews Drug Discovery. PMID: 19448661
|
||||
@@ -0,0 +1,165 @@
|
||||
# Glycan Databases and Resources Reference
|
||||
|
||||
## Primary Databases
|
||||
|
||||
### GlyTouCan
|
||||
- **URL**: https://glytoucan.org/
|
||||
- **Content**: Unique accession numbers (GTC IDs) for glycan structures
|
||||
- **Use**: Standardized glycan identification across databases
|
||||
- **Format**: GlycoCT, WURCS, IUPAC
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def lookup_glytoucan(glytoucan_id: str) -> dict:
    """Retrieve structure details for a GlyTouCan accession.

    Returns the parsed JSON record, or an empty dict when the request fails.
    """
    reply = requests.get(
        f"https://api.glytoucan.org/glycan/{glytoucan_id}",
        headers={"Accept": "application/json"},
    )
    if not reply.ok:
        return {}
    return reply.json()
|
||||
```
|
||||
|
||||
### GlyConnect
|
||||
- **URL**: https://glyconnect.expasy.org/
|
||||
- **Content**: Protein glycosylation database with site-specific glycan profiles
|
||||
- **Integration**: Links UniProt proteins to experimentally verified glycosylation
|
||||
- **Use**: Look up known glycosylation for your target protein
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def get_glycoprotein_info(uniprot_id: str) -> dict:
    """Look up site-specific glycosylation data in GlyConnect by UniProt ID.

    Returns the parsed JSON payload, or an empty dict on failure.
    """
    reply = requests.get(f"https://glyconnect.expasy.org/api/proteins/uniprot/{uniprot_id}")
    if reply.ok:
        return reply.json()
    return {}
|
||||
|
||||
def get_glycan_compositions(glyconnect_protein_id: int) -> list:
    """List every glycan composition recorded for a GlyConnect protein entry.

    Returns the 'data' list from the JSON payload, or [] on failure.
    """
    reply = requests.get(
        f"https://glyconnect.expasy.org/api/compositions/protein/{glyconnect_protein_id}"
    )
    if not reply.ok:
        return []
    return reply.json().get("data", [])
|
||||
```
|
||||
|
||||
### UniCarbKB
|
||||
- **URL**: https://unicarbkb.org/
|
||||
- **Content**: Curated glycan structures with biological context
|
||||
- **Features**: Tissue/cell-type specific glycan data, mass spectrometry data
|
||||
|
||||
### KEGG Glycan
|
||||
- **URL**: https://www.genome.jp/kegg/glycan/
|
||||
- **Content**: Glycan structures in KEGG format, biosynthesis pathways
|
||||
- **Integration**: Links to KEGG PATHWAY maps for glycan biosynthesis
|
||||
|
||||
### CAZy (Carbohydrate-Active Enzymes)
|
||||
- **URL**: http://www.cazy.org/
|
||||
- **Content**: Enzymes that build, break, and modify glycans
|
||||
- **Use**: Identify enzymes for glycoengineering applications
|
||||
|
||||
## Prediction Servers
|
||||
|
||||
### NetNGlyc 1.0
|
||||
- **URL**: https://services.healthtech.dtu.dk/services/NetNGlyc-1.0/
|
||||
- **Method**: Neural network for N-glycosylation site prediction
|
||||
- **Input**: Protein FASTA sequence
|
||||
- **Output**: Per-asparagine probability score; threshold ~0.5
|
||||
|
||||
### NetOGlyc 4.0
|
||||
- **URL**: https://services.healthtech.dtu.dk/services/NetOGlyc-4.0/
|
||||
- **Method**: Neural network for O-GalNAc glycosylation prediction
|
||||
- **Input**: Protein FASTA sequence
|
||||
- **Output**: Per-serine/threonine probability; threshold ~0.5
|
||||
|
||||
### GlycoMine (Machine Learning)
|
||||
- Machine learning predictor for N-, O- and C-glycosylation
|
||||
- Multiple glycan types: N-GlcNAc, O-GalNAc, O-GlcNAc, O-Man, O-Fuc, O-Glc, C-Man
|
||||
|
||||
### SymLink (Glycosylation site & sequon predictor)
|
||||
- Species-specific N-glycosylation prediction
|
||||
- More specific than simple sequon scanning
|
||||
|
||||
## Mass Spectrometry Glycoproteomics Tools
|
||||
|
||||
### Byonic (Protein Metrics)
|
||||
- De novo glycopeptide identification from MS2 spectra
|
||||
- Comprehensive glycan database
|
||||
- Site-specific glycoform assignment
|
||||
|
||||
### Mascot Glycan Analysis
|
||||
- Glycan-specific search parameters
|
||||
- Common for bottom-up glycoproteomics
|
||||
|
||||
### GlycoWorkbench
|
||||
- **URL**: https://www.eurocarbdb.org/project/glycoworkbench
|
||||
- Glycan structure drawing and mass calculation
|
||||
- Annotation of MS/MS spectra with glycan fragment ions
|
||||
|
||||
### Skyline
|
||||
- Targeted quantification of glycopeptides
|
||||
- Integrates with glycan database
|
||||
|
||||
## Glycan Nomenclature Systems
|
||||
|
||||
### Oxford Notation (For N-glycans)
|
||||
Codes complex N-glycans as text strings:
|
||||
```
|
||||
G0F = Core-fucosylated, biantennary, no galactose
|
||||
G1F = Core-fucosylated, one galactose
|
||||
G2F = Core-fucosylated, two galactoses
|
||||
G2FS1 = Core-fucosylated, two galactoses, one sialic acid
|
||||
G2FS2 = Core-fucosylated, two galactoses, two sialic acids
|
||||
M5 = High mannose 5 (Man5GlcNAc2)
|
||||
M9 = High mannose 9 (Man9GlcNAc2)
|
||||
```
|
||||
|
||||
### Symbol Nomenclature for Glycans (SNFG)
|
||||
Standard colored symbols for publications:
|
||||
- Blue circle = Glucose
|
||||
- Green circle = Mannose
|
||||
- Yellow circle = Galactose
|
||||
- Blue square = N-Acetylglucosamine
|
||||
- Yellow square = N-Acetylgalactosamine
|
||||
- Purple diamond = N-Acetylneuraminic acid (sialic acid)
|
||||
- Red triangle = Fucose
|
||||
|
||||
## Therapeutic Glycoproteins and Key Glycosylation Sites
|
||||
|
||||
| Therapeutic | Target | Key Glycosylation | Function |
|
||||
|-------------|--------|------------------|---------|
|
||||
| IgG1 antibody | Various | N297 (Fc) | ADCC/CDC effector function |
|
||||
| Erythropoietin | EPOR | N24, N38, N83, O-glycans | Pharmacokinetics |
|
||||
| Etanercept | TNF | N420 (IgG1 Fc) | Half-life |
|
||||
| tPA (alteplase) | Fibrin | N117, N184, N448 | Fibrin binding |
|
||||
| Factor VIII | VWF | 25 N-glycosites | Clearance |
|
||||
|
||||
## Batch Analysis Example
|
||||
|
||||
```python
|
||||
from glycoengineering_tools import find_n_glycosylation_sequons, predict_o_glycosylation_hotspots
|
||||
import pandas as pd
|
||||
|
||||
def analyze_glycosylation_landscape(sequences_dict: dict) -> pd.DataFrame:
    """
    Batch analysis of glycosylation for multiple proteins.

    Args:
        sequences_dict: {protein_name: sequence}

    Returns:
        DataFrame with one row per protein: length, N-glycosite count,
        O-glyco hotspot count, N-glycosite density (sites per 100 residues),
        and 1-based N-glycosite positions.
    """
    rows = []
    for name, seq in sequences_dict.items():
        n_sites = find_n_glycosylation_sequons(seq)
        o_sites = predict_o_glycosylation_hotspots(seq)

        rows.append({
            'protein': name,
            'length': len(seq),
            'n_glycosites': len(n_sites),
            'o_glyco_hotspots': len(o_sites),
            # Guard against empty sequences: the previous version divided by
            # len(seq) unconditionally and raised ZeroDivisionError on "".
            'n_glyco_density': (len(n_sites) / len(seq) * 100) if seq else 0.0,
            'n_glyco_positions': [s['position'] for s in n_sites]
        })

    return pd.DataFrame(rows)
|
||||
```
|
||||
395
scientific-skills/gnomad-database/SKILL.md
Normal file
395
scientific-skills/gnomad-database/SKILL.md
Normal file
@@ -0,0 +1,395 @@
|
||||
---
|
||||
name: gnomad-database
|
||||
description: Query gnomAD (Genome Aggregation Database) for population allele frequencies, variant constraint scores (pLI, LOEUF), and loss-of-function intolerance. Essential for variant pathogenicity interpretation, rare disease genetics, and identifying loss-of-function intolerant genes.
|
||||
license: CC0-1.0
|
||||
metadata:
|
||||
skill-author: Kuan-lin Huang
|
||||
---
|
||||
|
||||
# gnomAD Database
|
||||
|
||||
## Overview
|
||||
|
||||
The Genome Aggregation Database (gnomAD) is the largest publicly available collection of human genetic variation, aggregated from large-scale sequencing projects. gnomAD v4 contains exome sequences from 730,947 individuals and genome sequences from 76,215 individuals across diverse ancestries. It provides population allele frequencies, variant consequence annotations, and gene-level constraint metrics that are essential for interpreting the clinical significance of genetic variants.
|
||||
|
||||
**Key resources:**
|
||||
- gnomAD browser: https://gnomad.broadinstitute.org/
|
||||
- GraphQL API: https://gnomad.broadinstitute.org/api
|
||||
- Data downloads: https://gnomad.broadinstitute.org/downloads
|
||||
- Documentation: https://gnomad.broadinstitute.org/help
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use gnomAD when:
|
||||
|
||||
- **Variant frequency lookup**: Checking if a variant is rare, common, or absent in the general population
|
||||
- **Pathogenicity assessment**: Rare variants (MAF < 1%) are candidates for disease causation; gnomAD helps filter benign common variants
|
||||
- **Loss-of-function intolerance**: Using pLI and LOEUF scores to assess whether a gene tolerates protein-truncating variants
|
||||
- **Population-stratified frequencies**: Comparing allele frequencies across ancestries (African/African American, Admixed American, Ashkenazi Jewish, East Asian, Finnish, Middle Eastern, Non-Finnish European, South Asian)
|
||||
- **ClinVar/ACMG variant classification**: gnomAD frequency data feeds into BA1/BS1 evidence codes for variant classification
|
||||
- **Constraint analysis**: Identifying genes depleted of missense or loss-of-function variation (z-scores, pLI, LOEUF)
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### 1. gnomAD GraphQL API
|
||||
|
||||
gnomAD uses a GraphQL API accessible at `https://gnomad.broadinstitute.org/api`. Most queries fetch variants by gene or specific genomic position.
|
||||
|
||||
**Datasets available:**
|
||||
- `gnomad_r4` — gnomAD v4 exomes (recommended default, GRCh38)
|
||||
- `gnomad_r4_genomes` — gnomAD v4 genomes (GRCh38)
|
||||
- `gnomad_r3` — gnomAD v3 genomes (GRCh38)
|
||||
- `gnomad_r2_1` — gnomAD v2 exomes (GRCh37)
|
||||
|
||||
**Reference genomes:**
|
||||
- `GRCh38` — default for v3/v4
|
||||
- `GRCh37` — for v2
|
||||
|
||||
### 2. Querying Variants by Gene
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def query_gnomad_gene(gene_symbol, dataset="gnomad_r4", reference_genome="GRCh38"):
    """Fetch variants in a gene from gnomAD.

    Args:
        gene_symbol: HGNC gene symbol, e.g. "BRCA1".
        dataset: gnomAD dataset ID ("gnomad_r4" = v4 exomes; see dataset table).
        reference_genome: "GRCh38" for v3/v4 datasets, "GRCh37" for v2.

    Returns:
        dict: parsed GraphQL response. Variant records live under
        result["data"]["gene"]["variants"]; "gene" may be null for an
        unknown symbol, and per-variant "genome"/"exome" may be null when
        the variant was called in only one call set.
    """
    # gnomAD exposes a GraphQL-only API: every request is a POST of
    # {"query": ..., "variables": ...} to this single endpoint.
    url = "https://gnomad.broadinstitute.org/api"

    # Per variant: coordinates, VEP consequence, allele counts (ac), allele
    # numbers (an), frequencies (af), homozygote counts (ac_hom),
    # per-population breakdowns, and the LOFTEE loss-of-function annotation
    # fields (lof / lof_flags / lof_filter).
    query = """
    query GeneVariants($gene_symbol: String!, $dataset: DatasetId!, $reference_genome: ReferenceGenomeId!) {
      gene(gene_symbol: $gene_symbol, reference_genome: $reference_genome) {
        gene_id
        gene_symbol
        variants(dataset: $dataset) {
          variant_id
          pos
          ref
          alt
          consequence
          genome {
            af
            ac
            an
            ac_hom
            populations {
              id
              ac
              an
              af
            }
          }
          exome {
            af
            ac
            an
            ac_hom
          }
          lof
          lof_flags
          lof_filter
        }
      }
    }
    """

    variables = {
        "gene_symbol": gene_symbol,
        "dataset": dataset,
        "reference_genome": reference_genome
    }

    response = requests.post(url, json={"query": query, "variables": variables})
    return response.json()
|
||||
|
||||
# Example
result = query_gnomad_gene("BRCA1")
gene_data = result["data"]["gene"]
variants = gene_data["variants"]

# Filter to rare protein-truncating variants (PTVs). Fixes vs. the previous
# version of this example:
#  - "HC" (LOFTEE high-confidence LoF), not "LC" (low-confidence)
#  - explicit parentheses: `and` binds tighter than `or`, so the frequency
#    filter previously applied only to the consequence branch and every
#    LoF-annotated variant bypassed it
#  - `or {}` guards against a null "genome" field (exome-only variants)
rare_ptvs = [
    v for v in variants
    if (v.get("lof") == "HC" or v.get("consequence") in ["stop_gained", "frameshift_variant"])
    and (v.get("genome") or {}).get("af", 1) < 0.001
]
print(f"Found {len(rare_ptvs)} rare PTVs in {gene_data['gene_symbol']}")
|
||||
```
|
||||
|
||||
### 3. Querying a Specific Variant
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def query_gnomad_variant(variant_id, dataset="gnomad_r4"):
    """Fetch details for a specific variant (e.g., '1-55516888-G-GA').

    Args:
        variant_id: gnomAD variant ID in "chrom-pos-ref-alt" form.
        dataset: gnomAD dataset ID (default "gnomad_r4").

    Returns:
        dict: parsed GraphQL response. The variant record is under
        result["data"]["variant"] and is null when the variant is absent
        from the chosen dataset; the "genome"/"exome" sub-objects may each
        be null when the variant was called in only one call set.
    """
    url = "https://gnomad.broadinstitute.org/api"

    # Requests allele counts/frequencies for both call sets, per-population
    # breakdowns, the LOFTEE annotation, rsIDs, in-silico predictor scores,
    # and the linked ClinVar variation ID.
    query = """
    query VariantDetails($variantId: String!, $dataset: DatasetId!) {
      variant(variantId: $variantId, dataset: $dataset) {
        variant_id
        chrom
        pos
        ref
        alt
        genome {
          af
          ac
          an
          ac_hom
          populations {
            id
            ac
            an
            af
          }
        }
        exome {
          af
          ac
          an
          ac_hom
          populations {
            id
            ac
            an
            af
          }
        }
        consequence
        lof
        rsids
        in_silico_predictors {
          id
          value
          flags
        }
        clinvar_variation_id
      }
    }
    """

    response = requests.post(
        url,
        json={"query": query, "variables": {"variantId": variant_id, "dataset": dataset}}
    )
    return response.json()
|
||||
|
||||
# Example: query a specific variant
result = query_gnomad_variant("17-43094692-G-A")  # BRCA1 missense
variant = result["data"]["variant"]

if variant:
    # "genome"/"exome" can be null (not merely absent) when the variant was
    # observed in only one call set, so `.get("genome", {})` alone is not
    # enough — guard with `or {}` before chaining .get().
    genome_af = (variant.get("genome") or {}).get("af", "N/A")
    exome_af = (variant.get("exome") or {}).get("af", "N/A")
    print(f"Variant: {variant['variant_id']}")
    print(f"  Consequence: {variant['consequence']}")
    print(f"  Genome AF: {genome_af}")
    print(f"  Exome AF: {exome_af}")
    print(f"  LoF: {variant.get('lof')}")
|
||||
```
|
||||
|
||||
### 4. Gene Constraint Scores
|
||||
|
||||
gnomAD constraint scores assess how tolerant a gene is to variation relative to expectation:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def query_gnomad_constraint(gene_symbol, reference_genome="GRCh38"):
    """Fetch constraint scores for a gene.

    Args:
        gene_symbol: HGNC gene symbol, e.g. "KCNQ2".
        reference_genome: "GRCh38" (default) or "GRCh37".

    Returns:
        dict: parsed GraphQL response. Scores live under
        result["data"]["gene"]["gnomad_constraint"]; this may be null for
        genes without constraint estimates — check before formatting.
    """
    url = "https://gnomad.broadinstitute.org/api"

    # Observed/expected variant counts per class (LoF / missense /
    # synonymous), o/e ratios with the LoF confidence-interval bounds
    # (oe_lof_upper is the LOEUF score), per-class z-scores, and pLI.
    query = """
    query GeneConstraint($gene_symbol: String!, $reference_genome: ReferenceGenomeId!) {
      gene(gene_symbol: $gene_symbol, reference_genome: $reference_genome) {
        gene_id
        gene_symbol
        gnomad_constraint {
          exp_lof
          exp_mis
          exp_syn
          obs_lof
          obs_mis
          obs_syn
          oe_lof
          oe_mis
          oe_syn
          oe_lof_lower
          oe_lof_upper
          lof_z
          mis_z
          syn_z
          pLI
        }
      }
    }
    """

    response = requests.post(
        url,
        json={"query": query, "variables": {"gene_symbol": gene_symbol, "reference_genome": reference_genome}}
    )
    return response.json()
|
||||
|
||||
# Example
|
||||
result = query_gnomad_constraint("KCNQ2")
|
||||
gene = result["data"]["gene"]
|
||||
constraint = gene["gnomad_constraint"]
|
||||
|
||||
print(f"Gene: {gene['gene_symbol']}")
|
||||
print(f" pLI: {constraint['pLI']:.3f} (>0.9 = LoF intolerant)")
|
||||
print(f" LOEUF: {constraint['oe_lof_upper']:.3f} (<0.35 = highly constrained)")
|
||||
print(f" Obs/Exp LoF: {constraint['oe_lof']:.3f}")
|
||||
print(f" Missense Z: {constraint['mis_z']:.3f}")
|
||||
```
|
||||
|
||||
**Constraint score interpretation:**
|
||||
| Score | Range | Meaning |
|
||||
|-------|-------|---------|
|
||||
| `pLI` | 0–1 | Probability of LoF intolerance; >0.9 = highly intolerant |
|
||||
| `LOEUF` | 0–∞ | LoF observed/expected upper bound; <0.35 = constrained |
|
||||
| `oe_lof` | 0–∞ | Observed/expected ratio for LoF variants |
|
||||
| `mis_z` | −∞ to ∞ | Missense constraint z-score; >3.09 = constrained |
|
||||
| `syn_z` | −∞ to ∞ | Synonymous z-score (control; should be near 0) |
|
||||
|
||||
### 5. Population Frequency Analysis
|
||||
|
||||
```python
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
def get_population_frequencies(variant_id, dataset="gnomad_r4"):
    """Extract per-population allele frequencies for a variant.

    Args:
        variant_id: gnomAD variant ID, "chrom-pos-ref-alt" (e.g. "1-55516888-G-GA").
        dataset: gnomAD dataset ID (default "gnomad_r4").

    Returns:
        DataFrame (columns id/ac/an/af/ac_hom), one row per population with
        at least one called allele, sorted by descending allele frequency.
        Empty DataFrame when the variant is absent from gnomAD or has no
        genome call set (e.g. exome-only variants).
    """
    url = "https://gnomad.broadinstitute.org/api"

    query = """
    query PopFreqs($variantId: String!, $dataset: DatasetId!) {
      variant(variantId: $variantId, dataset: $dataset) {
        variant_id
        genome {
          populations {
            id
            ac
            an
            af
            ac_hom
          }
        }
      }
    }
    """

    response = requests.post(
        url,
        json={"query": query, "variables": {"variantId": variant_id, "dataset": dataset}}
    )
    data = response.json()

    # The API returns null for unknown variants and for call sets the
    # variant is absent from; the previous chained lookups raised TypeError
    # in both cases.
    variant = (data.get("data") or {}).get("variant")
    genome = (variant or {}).get("genome")
    populations = (genome or {}).get("populations") or []
    if not populations:
        return pd.DataFrame(columns=["id", "ac", "an", "af", "ac_hom"])

    df = pd.DataFrame(populations)
    df = df[df["an"] > 0].copy()  # drop populations with no called alleles
    df["af"] = df["ac"] / df["an"]  # recompute AF from counts for consistency
    df = df.sort_values("af", ascending=False)
    return df
|
||||
|
||||
# Population IDs in gnomAD v4:
|
||||
# afr = African/African American
|
||||
# ami = Amish
|
||||
# amr = Admixed American
|
||||
# asj = Ashkenazi Jewish
|
||||
# eas = East Asian
|
||||
# fin = Finnish
|
||||
# mid = Middle Eastern
|
||||
# nfe = Non-Finnish European
|
||||
# sas = South Asian
|
||||
# remaining = Other
|
||||
```
|
||||
|
||||
### 6. Structural Variants (gnomAD-SV)
|
||||
|
||||
gnomAD also contains a structural variant dataset:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def query_gnomad_sv(gene_symbol):
    """Query structural variants overlapping a gene (gnomAD-SV, GRCh38)."""
    api_url = "https://gnomad.broadinstitute.org/api"

    query = """
    query SVsByGene($gene_symbol: String!) {
      gene(gene_symbol: $gene_symbol, reference_genome: GRCh38) {
        structural_variants {
          variant_id
          type
          chrom
          pos
          end
          af
          ac
          an
        }
      }
    }
    """

    payload = {"query": query, "variables": {"gene_symbol": gene_symbol}}
    return requests.post(api_url, json=payload).json()
|
||||
```
|
||||
|
||||
## Query Workflows
|
||||
|
||||
### Workflow 1: Variant Pathogenicity Assessment
|
||||
|
||||
1. **Check population frequency** — Is the variant rare enough to be pathogenic?
|
||||
- Use gnomAD AF < 1% for recessive, < 0.1% for dominant conditions
|
||||
- Check ancestry-specific frequencies (a variant rare overall may be common in one population)
|
||||
|
||||
2. **Assess functional impact** — LoF variants have highest prior probability
|
||||
- Check `lof` field: `HC` = high-confidence LoF, `LC` = low-confidence
|
||||
- Check `lof_flags` for issues like "NAGNAG_SITE", "PHYLOCSF_WEAK"
|
||||
|
||||
3. **Apply ACMG criteria:**
|
||||
- BA1: AF > 5% → Benign Stand-Alone
|
||||
- BS1: AF > disease prevalence threshold → Benign Supporting
|
||||
- PM2: Absent or very rare in gnomAD → Pathogenic Moderate
|
||||
|
||||
### Workflow 2: Gene Prioritization in Rare Disease
|
||||
|
||||
1. Query constraint scores for candidate genes
|
||||
2. Filter for pLI > 0.9 (haploinsufficient) or LOEUF < 0.35
|
||||
3. Cross-reference with observed LoF variants in the gene
|
||||
4. Integrate with ClinVar and disease databases
|
||||
|
||||
### Workflow 3: Population Genetics Research
|
||||
|
||||
1. Identify variant of interest from GWAS or clinical data
|
||||
2. Query per-population frequencies
|
||||
3. Compare frequency differences across ancestries
|
||||
4. Test for enrichment in specific founder populations
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Use gnomAD v4 (gnomad_r4)** for the most current data; use v2 (gnomad_r2_1) only for GRCh37 compatibility
|
||||
- **Handle null responses**: Variants not observed in gnomAD are not necessarily pathogenic — absence is informative
|
||||
- **Distinguish exome vs. genome data**: Genome data has more uniform coverage; exome data is larger but may have coverage gaps
|
||||
- **Rate limit GraphQL queries**: Add delays between requests; batch queries when possible
|
||||
- **Homozygous counts** (`ac_hom`) are relevant for recessive disease analysis
|
||||
- **LOEUF is preferred over pLI** for gene constraint (less sensitive to sample size)
|
||||
|
||||
## Data Access
|
||||
|
||||
- **Browser**: https://gnomad.broadinstitute.org/ — interactive variant and gene browsing
|
||||
- **GraphQL API**: https://gnomad.broadinstitute.org/api — programmatic access
|
||||
- **Downloads**: https://gnomad.broadinstitute.org/downloads — VCF, Hail tables, constraint tables
|
||||
- **Google Cloud**: gs://gcp-public-data--gnomad/
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **gnomAD website**: https://gnomad.broadinstitute.org/
|
||||
- **gnomAD blog**: https://gnomad.broadinstitute.org/news
|
||||
- **Downloads**: https://gnomad.broadinstitute.org/downloads
|
||||
- **API explorer**: https://gnomad.broadinstitute.org/api (interactive GraphiQL)
|
||||
- **Constraint documentation**: https://gnomad.broadinstitute.org/help/constraint
|
||||
- **Citation**: Karczewski KJ et al. (2020) Nature. PMID: 32461654; Chen S et al. (2024) Nature. PMID: 38057664
|
||||
- **GitHub**: https://github.com/broadinstitute/gnomad-browser
|
||||
219
scientific-skills/gnomad-database/references/graphql_queries.md
Normal file
219
scientific-skills/gnomad-database/references/graphql_queries.md
Normal file
@@ -0,0 +1,219 @@
|
||||
# gnomAD GraphQL Query Reference
|
||||
|
||||
## API Endpoint
|
||||
|
||||
```
|
||||
POST https://gnomad.broadinstitute.org/api
|
||||
Content-Type: application/json
|
||||
|
||||
Body: { "query": "<graphql_query>", "variables": { ... } }
|
||||
```
|
||||
|
||||
## Dataset Identifiers
|
||||
|
||||
| ID | Description | Reference Genome |
|
||||
|----|-------------|-----------------|
|
||||
| `gnomad_r4` | gnomAD v4 exomes (730K individuals) | GRCh38 |
|
||||
| `gnomad_r4_genomes` | gnomAD v4 genomes (76K individuals) | GRCh38 |
|
||||
| `gnomad_r3` | gnomAD v3 genomes (76K individuals) | GRCh38 |
|
||||
| `gnomad_r2_1` | gnomAD v2 exomes (125K individuals) | GRCh37 |
|
||||
| `gnomad_r2_1_non_cancer` | v2 non-cancer subset | GRCh37 |
|
||||
| `gnomad_cnv_r4` | Copy number variants | GRCh38 |
|
||||
|
||||
## Core Query Templates
|
||||
|
||||
### 1. Variants in a Gene
|
||||
|
||||
```graphql
|
||||
query GeneVariants($gene_symbol: String!, $dataset: DatasetId!, $reference_genome: ReferenceGenomeId!) {
|
||||
gene(gene_symbol: $gene_symbol, reference_genome: $reference_genome) {
|
||||
gene_id
|
||||
gene_symbol
|
||||
chrom
|
||||
start
|
||||
stop
|
||||
variants(dataset: $dataset) {
|
||||
variant_id
|
||||
pos
|
||||
ref
|
||||
alt
|
||||
consequence
|
||||
lof
|
||||
lof_flags
|
||||
lof_filter
|
||||
genome {
|
||||
af
|
||||
ac
|
||||
an
|
||||
ac_hom
|
||||
populations { id ac an af ac_hom }
|
||||
}
|
||||
exome {
|
||||
af
|
||||
ac
|
||||
an
|
||||
ac_hom
|
||||
populations { id ac an af ac_hom }
|
||||
}
|
||||
rsids
|
||||
clinvar_variation_id
|
||||
in_silico_predictors { id value flags }
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Single Variant Lookup
|
||||
|
||||
```graphql
|
||||
query VariantDetails($variantId: String!, $dataset: DatasetId!) {
|
||||
variant(variantId: $variantId, dataset: $dataset) {
|
||||
variant_id
|
||||
chrom
|
||||
pos
|
||||
ref
|
||||
alt
|
||||
consequence
|
||||
lof
|
||||
lof_flags
|
||||
rsids
|
||||
genome { af ac an ac_hom populations { id ac an af } }
|
||||
exome { af ac an ac_hom populations { id ac an af } }
|
||||
in_silico_predictors { id value flags }
|
||||
clinvar_variation_id
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Variant ID format:** `{chrom}-{pos}-{ref}-{alt}` (e.g., `17-43094692-G-A`)
|
||||
|
||||
### 3. Gene Constraint
|
||||
|
||||
```graphql
|
||||
query GeneConstraint($gene_symbol: String!, $reference_genome: ReferenceGenomeId!) {
|
||||
gene(gene_symbol: $gene_symbol, reference_genome: $reference_genome) {
|
||||
gene_id
|
||||
gene_symbol
|
||||
gnomad_constraint {
|
||||
exp_lof exp_mis exp_syn
|
||||
obs_lof obs_mis obs_syn
|
||||
oe_lof oe_mis oe_syn
|
||||
oe_lof_lower oe_lof_upper
|
||||
oe_mis_lower oe_mis_upper
|
||||
lof_z mis_z syn_z
|
||||
pLI
|
||||
flags
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4. Region Query (by genomic position)
|
||||
|
||||
```graphql
|
||||
query RegionVariants($chrom: String!, $start: Int!, $stop: Int!, $dataset: DatasetId!, $reference_genome: ReferenceGenomeId!) {
|
||||
region(chrom: $chrom, start: $start, stop: $stop, reference_genome: $reference_genome) {
|
||||
variants(dataset: $dataset) {
|
||||
variant_id
|
||||
pos
|
||||
ref
|
||||
alt
|
||||
consequence
|
||||
genome { af ac an }
|
||||
exome { af ac an }
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 5. ClinVar Variants in Gene
|
||||
|
||||
```graphql
|
||||
query ClinVarVariants($gene_symbol: String!, $reference_genome: ReferenceGenomeId!) {
|
||||
gene(gene_symbol: $gene_symbol, reference_genome: $reference_genome) {
|
||||
clinvar_variants {
|
||||
variant_id
|
||||
pos
|
||||
ref
|
||||
alt
|
||||
clinical_significance
|
||||
clinvar_variation_id
|
||||
gold_stars
|
||||
major_consequence
|
||||
in_gnomad
|
||||
gnomad_exomes { ac an af }
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Population IDs
|
||||
|
||||
| ID | Population |
|
||||
|----|-----------|
|
||||
| `afr` | African/African American |
|
||||
| `ami` | Amish |
|
||||
| `amr` | Admixed American |
|
||||
| `asj` | Ashkenazi Jewish |
|
||||
| `eas` | East Asian |
|
||||
| `fin` | Finnish |
|
||||
| `mid` | Middle Eastern |
|
||||
| `nfe` | Non-Finnish European |
|
||||
| `sas` | South Asian |
|
||||
| `remaining` | Other/Unassigned |
|
||||
| `XX` | Female (appended to above, e.g., `afr_XX`) |
|
||||
| `XY` | Male |
|
||||
|
||||
## LoF Annotation Fields
|
||||
|
||||
| Field | Values | Meaning |
|
||||
|-------|--------|---------|
|
||||
| `lof` | `HC`, `LC`, `null` | High/low-confidence LoF, or not annotated as LoF |
|
||||
| `lof_flags` | comma-separated strings | Quality flags (e.g., `NAGNAG_SITE`, `NON_CANONICAL_SPLICE_SITE`) |
|
||||
| `lof_filter` | string or null | Reason for LC classification |
|
||||
|
||||
## In Silico Predictor IDs
|
||||
|
||||
Common values for `in_silico_predictors[].id`:
|
||||
- `cadd` — CADD PHRED score
|
||||
- `revel` — REVEL score
|
||||
- `spliceai_ds_max` — SpliceAI max delta score
|
||||
- `pangolin_largest_ds` — Pangolin splicing score
|
||||
- `polyphen` — PolyPhen-2 prediction
|
||||
- `sift` — SIFT prediction
|
||||
|
||||
## Python Helper
|
||||
|
||||
```python
|
||||
import requests
|
||||
import time
|
||||
|
||||
def gnomad_query(query: str, variables: dict, retries: int = 3) -> dict:
    """Execute a gnomAD GraphQL query with retry logic.

    Args:
        query: GraphQL query string.
        variables: Variable bindings for the query.
        retries: Number of attempts before giving up.

    Returns:
        The parsed JSON response. The payload may contain an "errors"
        key (GraphQL reports query-level problems in the body, not via
        HTTP status codes); errors are printed but the payload is still
        returned to the caller.

    Raises:
        requests.exceptions.RequestException: if every attempt fails.
    """
    url = "https://gnomad.broadinstitute.org/api"
    headers = {"Content-Type": "application/json"}

    for attempt in range(retries):
        try:
            response = requests.post(
                url,
                json={"query": query, "variables": variables},
                headers=headers,
                timeout=60,
            )
            response.raise_for_status()
            result = response.json()

            # Surface GraphQL-level errors but still hand back the payload.
            if "errors" in result:
                print(f"GraphQL errors: {result['errors']}")
            return result
        except requests.exceptions.RequestException:
            if attempt < retries - 1:
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s...
            else:
                raise

    return {}  # unreachable when retries >= 1; defensive default
|
||||
```
|
||||
@@ -0,0 +1,85 @@
|
||||
# gnomAD Variant Interpretation Guide
|
||||
|
||||
## Allele Frequency Thresholds for Disease Interpretation
|
||||
|
||||
### ACMG/AMP Criteria
|
||||
|
||||
| Criterion | AF threshold | Classification |
|
||||
|-----------|-------------|----------------|
|
||||
| BA1 | > 0.05 (5%) | Benign Stand-Alone |
|
||||
| BS1 | > disease prevalence | Benign Supporting |
|
||||
| PM2_Supporting | < 0.0001 (0.01%) for dominant; absent for recessive | Pathogenic Moderate → Supporting |
|
||||
|
||||
**Notes:**
|
||||
- BA1 applies to most conditions; exceptions include autosomal dominant with high penetrance (e.g., LDLR for FH: BA1 threshold is ~0.1%)
|
||||
- BS1 requires knowing disease prevalence; for rare diseases (1:10,000), BS1 if AF > 0.01%
|
||||
- Homozygous counts (`ac_hom`) matter for recessive diseases
|
||||
|
||||
### Practical Thresholds
|
||||
|
||||
| Inheritance | Suggested max AF |
|
||||
|-------------|-----------------|
|
||||
| Autosomal Dominant (high penetrance) | < 0.001 (0.1%) |
|
||||
| Autosomal Dominant (reduced penetrance) | < 0.01 (1%) |
|
||||
| Autosomal Recessive | < 0.01 (1%) |
|
||||
| X-linked recessive | < 0.001 in females |
|
||||
|
||||
## Absence in gnomAD
|
||||
|
||||
A variant **absent in gnomAD** (ac = 0) is evidence of rarity, but interpret carefully:
|
||||
- gnomAD does not capture all rare variants (sequencing depth, coverage, calling thresholds)
|
||||
- A variant absent in 730K exomes is very strong evidence of rarity for PM2
|
||||
- Check coverage at the position: if < 10x, absence is less informative
|
||||
|
||||
## Loss-of-Function Variant Assessment
|
||||
|
||||
### LOFTEE Classification (lof field)
|
||||
|
||||
- **HC (High Confidence):** Predicted to truncate functional protein
|
||||
- Stop-gained, splice site (±1,2), frameshift variants
|
||||
- Passes all LOFTEE quality filters
|
||||
|
||||
- **LC (Low Confidence):** LoF annotation with quality concerns
|
||||
- Check `lof_flags` for specific reason
|
||||
- May still be pathogenic — requires manual review
|
||||
|
||||
### Common lof_flags
|
||||
|
||||
| Flag | Meaning |
|
||||
|------|---------|
|
||||
| `NAGNAG_SITE` | Splice site may be rescued by nearby alternative site |
|
||||
| `NON_CANONICAL_SPLICE_SITE` | Not a canonical splice donor/acceptor |
|
||||
| `PHYLOCSF_WEAK` | Weak phylogenetic conservation signal |
|
||||
| `SMALL_INTRON` | Intron too small to affect splicing |
|
||||
| `SINGLE_EXON` | Single-exon gene (no splicing) |
|
||||
| `LAST_EXON` | In last exon (NMD may not apply) |
|
||||
|
||||
## Homozygous Observations
|
||||
|
||||
The `ac_hom` field counts homozygous (or hemizygous in males for chrX) observations.
|
||||
|
||||
**For recessive diseases:**
|
||||
- If a variant is observed homozygous in healthy individuals in gnomAD, it is strong evidence against pathogenicity (BS2 criterion)
|
||||
- Even a single homozygous observation can be informative
|
||||
|
||||
## Coverage at Position
|
||||
|
||||
Always check that gnomAD has adequate coverage at the variant position before concluding absence is meaningful. The gnomAD browser shows coverage tracks, and coverage data can be downloaded from:
|
||||
- https://gnomad.broadinstitute.org/downloads#v4-coverage
|
||||
|
||||
## In Silico Predictor Scores
|
||||
|
||||
| Predictor | Score Range | Pathogenic Threshold |
|
||||
|-----------|-------------|---------------------|
|
||||
| CADD PHRED | 0–99 | > 20 deleterious; > 30 highly deleterious |
|
||||
| REVEL | 0–1 | > 0.75 likely pathogenic (for missense) |
|
||||
| SpliceAI max_ds | 0–1 | > 0.5 likely splice-altering |
|
||||
| SIFT | 0–1 | < 0.05 deleterious |
|
||||
| PolyPhen-2 | 0–1 | > 0.909 probably damaging |
|
||||
|
||||
## Ancestry-Specific Considerations
|
||||
|
||||
- A variant rare overall may be a common founder variant in a specific population
|
||||
- Always check all ancestry-specific AFs, not just the total
|
||||
- Finnish and Ashkenazi Jewish populations have high rates of founder variants
|
||||
- Report ancestry-specific frequencies when relevant to patient ancestry
|
||||
315
scientific-skills/gtex-database/SKILL.md
Normal file
315
scientific-skills/gtex-database/SKILL.md
Normal file
@@ -0,0 +1,315 @@
|
||||
---
|
||||
name: gtex-database
|
||||
description: Query GTEx (Genotype-Tissue Expression) portal for tissue-specific gene expression, eQTLs (expression quantitative trait loci), and sQTLs. Essential for linking GWAS variants to gene regulation, understanding tissue-specific expression, and interpreting non-coding variant effects.
|
||||
license: CC-BY-4.0
|
||||
metadata:
|
||||
skill-author: Kuan-lin Huang
|
||||
---
|
||||
|
||||
# GTEx Database
|
||||
|
||||
## Overview
|
||||
|
||||
The Genotype-Tissue Expression (GTEx) project provides a comprehensive resource for studying tissue-specific gene expression and genetic regulation across 54 non-diseased human tissues from nearly 1,000 individuals. GTEx v10 (the latest release) enables researchers to understand how genetic variants regulate gene expression (eQTLs) and splicing (sQTLs) in a tissue-specific manner, which is critical for interpreting GWAS loci and identifying regulatory mechanisms.
|
||||
|
||||
**Key resources:**
|
||||
- GTEx Portal: https://gtexportal.org/
|
||||
- GTEx API v2: https://gtexportal.org/api/v2/
|
||||
- Data downloads: https://gtexportal.org/home/downloads/adult-gtex/
|
||||
- Documentation: https://gtexportal.org/home/documentationPage
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use GTEx when:
|
||||
|
||||
- **GWAS locus interpretation**: Identifying which gene a non-coding GWAS variant regulates via eQTLs
|
||||
- **Tissue-specific expression**: Comparing gene expression levels across 54 human tissues
|
||||
- **eQTL colocalization**: Testing if a GWAS signal and an eQTL signal share the same causal variant
|
||||
- **Multi-tissue eQTL analysis**: Finding variants that regulate expression in multiple tissues
|
||||
- **Splicing QTLs (sQTLs)**: Identifying variants that affect splicing ratios
|
||||
- **Tissue specificity analysis**: Determining which tissues express a gene of interest
|
||||
- **Gene expression exploration**: Retrieving normalized expression levels (TPM) per tissue
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### 1. GTEx REST API v2
|
||||
|
||||
Base URL: `https://gtexportal.org/api/v2/`
|
||||
|
||||
The API returns JSON and does not require authentication. All endpoints support pagination.
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
BASE_URL = "https://gtexportal.org/api/v2"
|
||||
|
||||
def gtex_get(endpoint, params=None):
    """Fetch *endpoint* from the GTEx REST API and return the parsed JSON."""
    resp = requests.get(
        f"{BASE_URL}/{endpoint}",
        params=params,
        headers={"Accept": "application/json"},
    )
    resp.raise_for_status()
    return resp.json()
|
||||
```
|
||||
|
||||
### 2. Gene Expression by Tissue
|
||||
|
||||
```python
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
def get_gene_expression_by_tissue(gene_id_or_symbol, dataset_id="gtex_v10"):
    """Return a DataFrame of median gene expression per tissue, highest first."""
    response = requests.get(
        "https://gtexportal.org/api/v2/expression/medianGeneExpression",
        params={
            "gencodeId": gene_id_or_symbol,
            "datasetId": dataset_id,
            "itemsPerPage": 100,
        },
    )
    df = pd.DataFrame(response.json().get("data", []))
    if not df.empty:
        # Keep only the columns of interest, ranked by median TPM.
        df = df[["tissueSiteDetailId", "tissueSiteDetail", "median", "unit"]]
        df = df.sort_values("median", ascending=False)
    return df
|
||||
|
||||
# Example: get expression of APOE across tissues
|
||||
df = get_gene_expression_by_tissue("ENSG00000130203.10") # APOE GENCODE ID
|
||||
# Or use gene symbol (some endpoints accept both)
|
||||
print(df.head(10))
|
||||
# Output: tissue name, median TPM, sorted by highest expression
|
||||
```
|
||||
|
||||
### 3. eQTL Lookup
|
||||
|
||||
```python
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
def query_eqtl(gene_id, tissue_id=None, dataset_id="gtex_v10"):
    """Collect all significant eQTLs for a gene, optionally in one tissue.

    Paginates through the API until an empty or partial page is returned,
    then returns a DataFrame sorted by ascending p-value.
    """
    url = "https://gtexportal.org/api/v2/association/singleTissueEqtl"
    per_page = 250
    params = {"gencodeId": gene_id, "datasetId": dataset_id, "itemsPerPage": per_page}
    if tissue_id:
        params["tissueSiteDetailId"] = tissue_id

    rows = []
    page = 0
    while True:
        params["page"] = page
        batch = requests.get(url, params=params).json().get("data", [])
        if not batch:
            break
        rows.extend(batch)
        if len(batch) < per_page:
            break  # last (partial) page reached
        page += 1

    df = pd.DataFrame(rows)
    return df.sort_values("pval", ascending=True) if not df.empty else df
|
||||
|
||||
# Example: Find eQTLs for PCSK9
|
||||
df = query_eqtl("ENSG00000169174.14")
|
||||
print(df[["snpId", "tissueSiteDetailId", "slope", "pval", "gencodeId"]].head(20))
|
||||
```
|
||||
|
||||
### 4. Single-Tissue eQTL by Variant
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def query_variant_eqtl(variant_id, tissue_id=None, dataset_id="gtex_v10"):
    """Return every eQTL association for one variant as a JSON payload."""
    params = {
        "variantId": variant_id,  # e.g. "chr1_55516888_G_GA_b38"
        "datasetId": dataset_id,
        "itemsPerPage": 250,
    }
    if tissue_id:
        params["tissueSiteDetailId"] = tissue_id
    resp = requests.get(
        "https://gtexportal.org/api/v2/association/singleTissueEqtl",
        params=params,
    )
    return resp.json()
|
||||
|
||||
# GTEx variant ID format: chr{chrom}_{pos}_{ref}_{alt}_b38
|
||||
# Example: "chr17_43094692_G_A_b38"
|
||||
```
|
||||
|
||||
### 5. Multi-Tissue eQTL (eGenes)
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def get_egenes(tissue_id, dataset_id="gtex_v10"):
    """Return every eGene (gene with >=1 significant eQTL) in a tissue."""
    url = "https://gtexportal.org/api/v2/association/egene"
    per_page = 500
    params = {
        "tissueSiteDetailId": tissue_id,
        "datasetId": dataset_id,
        "itemsPerPage": per_page,
    }

    egenes = []
    page = 0
    while True:
        params["page"] = page
        batch = requests.get(url, params=params).json().get("data", [])
        if not batch:
            break
        egenes.extend(batch)
        if len(batch) < per_page:
            break  # final page was not full
        page += 1
    return egenes
|
||||
|
||||
# Example: all eGenes in whole blood
|
||||
egenes = get_egenes("Whole_Blood")
|
||||
print(f"Found {len(egenes)} eGenes in Whole Blood")
|
||||
```
|
||||
|
||||
### 6. Tissue List
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def get_tissues(dataset_id="gtex_v10"):
    """Return metadata for every tissue available in the dataset."""
    response = requests.get(
        "https://gtexportal.org/api/v2/dataset/tissueSiteDetail",
        params={"datasetId": dataset_id, "itemsPerPage": 100},
    )
    return response.json()["data"]
|
||||
|
||||
tissues = get_tissues()
|
||||
# Key fields: tissueSiteDetailId, tissueSiteDetail, colorHex, samplingSite
|
||||
# Common tissue IDs:
|
||||
# Whole_Blood, Brain_Cortex, Liver, Kidney_Cortex, Heart_Left_Ventricle,
|
||||
# Lung, Muscle_Skeletal, Adipose_Subcutaneous, Colon_Transverse, ...
|
||||
```
|
||||
|
||||
### 7. sQTL (Splicing QTLs)
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def query_sqtl(gene_id, tissue_id=None, dataset_id="gtex_v10"):
    """Return significant splicing QTLs (sQTLs) for a gene as JSON."""
    params = {
        "gencodeId": gene_id,
        "datasetId": dataset_id,
        "itemsPerPage": 250,
    }
    if tissue_id:
        params["tissueSiteDetailId"] = tissue_id
    response = requests.get(
        "https://gtexportal.org/api/v2/association/singleTissueSqtl",
        params=params,
    )
    return response.json()
|
||||
```
|
||||
|
||||
## Query Workflows
|
||||
|
||||
### Workflow 1: Interpreting a GWAS Variant via eQTLs
|
||||
|
||||
1. **Identify the GWAS variant** (rs ID or chromosome position)
|
||||
2. **Convert to GTEx variant ID format** (`chr{chrom}_{pos}_{ref}_{alt}_b38`)
|
||||
3. **Query all eQTL associations** for that variant across tissues
|
||||
4. **Check effect direction**: is the GWAS risk allele the same as the eQTL effect allele?
|
||||
5. **Prioritize tissues**: select tissues biologically relevant to the disease
|
||||
6. **Consider colocalization** using `coloc` (R package) with full summary statistics
|
||||
|
||||
```python
|
||||
import requests, pandas as pd
|
||||
|
||||
def interpret_gwas_variant(variant_id, dataset_id="gtex_v10"):
    """List all gene/tissue eQTL hits for a GWAS variant, sorted by p-value."""
    resp = requests.get(
        "https://gtexportal.org/api/v2/association/singleTissueEqtl",
        params={"variantId": variant_id, "datasetId": dataset_id, "itemsPerPage": 500},
    )
    df = pd.DataFrame(resp.json().get("data", []))
    if df.empty:
        return df
    cols = ["geneSymbol", "tissueSiteDetailId", "slope", "pval", "maf"]
    return df[cols].sort_values("pval")
|
||||
|
||||
# Example
|
||||
results = interpret_gwas_variant("chr1_154453788_A_T_b38")
|
||||
print(results.groupby("geneSymbol")["tissueSiteDetailId"].count().sort_values(ascending=False))
|
||||
```
|
||||
|
||||
### Workflow 2: Gene Expression Atlas
|
||||
|
||||
1. Get median expression for a gene across all tissues
|
||||
2. Identify the primary expression site(s)
|
||||
3. Compare with disease-relevant tissues
|
||||
4. Download raw data for statistical comparisons
|
||||
|
||||
### Workflow 3: Tissue-Specific eQTL Analysis
|
||||
|
||||
1. Select tissues relevant to your disease
|
||||
2. Query all eGenes in that tissue
|
||||
3. Cross-reference with GWAS-significant loci
|
||||
4. Identify co-localized signals
|
||||
|
||||
## Key API Endpoints
|
||||
|
||||
| Endpoint | Description |
|
||||
|----------|-------------|
|
||||
| `/expression/medianGeneExpression` | Median TPM by tissue for a gene |
|
||||
| `/expression/geneExpression` | Full distribution of expression per tissue |
|
||||
| `/association/singleTissueEqtl` | Significant eQTL associations |
|
||||
| `/association/singleTissueSqtl` | Significant sQTL associations |
|
||||
| `/association/egene` | eGenes in a tissue |
|
||||
| `/dataset/tissueSiteDetail` | Available tissues with metadata |
|
||||
| `/reference/gene` | Gene metadata (GENCODE IDs, coordinates) |
|
||||
| `/variant/variantPage` | Variant lookup by rsID or position |
|
||||
|
||||
## Datasets Available
|
||||
|
||||
| ID | Description |
|
||||
|----|-------------|
|
||||
| `gtex_v10` | GTEx v10 (current; ~960 donors, 54 tissues) |
|
||||
| `gtex_v8` | GTEx v8 (838 donors, 49 tissues) — older but widely cited |
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Use GENCODE IDs** (e.g., `ENSG00000130203.10`) for gene queries; the `.version` suffix matters for some endpoints
|
||||
- **GTEx variant IDs** use the format `chr{chrom}_{pos}_{ref}_{alt}_b38` (GRCh38) — different from rs IDs
|
||||
- **Handle pagination**: Large queries (e.g., all eGenes) require iterating through pages
|
||||
- **Tissue nomenclature**: Use `tissueSiteDetailId` (e.g., `Whole_Blood`) not display names for API calls
|
||||
- **FDR correction**: GTEx uses FDR < 0.05 (q-value) as the significance threshold for eQTLs
|
||||
- **Effect alleles**: The `slope` field is the effect of the alternative allele; positive = higher expression with alt allele
|
||||
|
||||
## Data Downloads (for large-scale analysis)
|
||||
|
||||
For genome-wide analyses, download full summary statistics rather than using the API:
|
||||
|
||||
```bash
|
||||
# All significant eQTLs (v10)
|
||||
wget https://storage.googleapis.com/adult-gtex/bulk-qtl/v10/single-tissue-cis-qtl/GTEx_Analysis_v10_eQTL.tar
|
||||
|
||||
# Normalized expression matrices
|
||||
wget https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/GTEx_Analysis_v10_RNASeQCv2.4.2_gene_reads.gct.gz
|
||||
```
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **GTEx Portal**: https://gtexportal.org/
|
||||
- **API documentation**: https://gtexportal.org/api/v2/
|
||||
- **Data downloads**: https://gtexportal.org/home/downloads/adult-gtex/
|
||||
- **GitHub**: https://github.com/broadinstitute/gtex-pipeline
|
||||
- **Citation**: GTEx Consortium (2020) Science. PMID: 32913098
|
||||
191
scientific-skills/gtex-database/references/api_reference.md
Normal file
191
scientific-skills/gtex-database/references/api_reference.md
Normal file
@@ -0,0 +1,191 @@
|
||||
# GTEx API v2 Reference
|
||||
|
||||
## Base URL
|
||||
|
||||
```
|
||||
https://gtexportal.org/api/v2/
|
||||
```
|
||||
|
||||
All endpoints accept GET requests. Responses are JSON. No authentication required.
|
||||
|
||||
## Common Query Parameters
|
||||
|
||||
| Parameter | Description | Example |
|
||||
|-----------|-------------|---------|
|
||||
| `gencodeId` | GENCODE gene ID with version | `ENSG00000130203.10` |
|
||||
| `geneSymbol` | Gene symbol | `APOE` |
|
||||
| `variantId` | GTEx variant ID | `chr17_45413693_C_T_b38` |
|
||||
| `tissueSiteDetailId` | Tissue identifier | `Whole_Blood` |
|
||||
| `datasetId` | Dataset version | `gtex_v10` |
|
||||
| `itemsPerPage` | Results per page | `250` |
|
||||
| `page` | Page number (0-indexed) | `0` |
|
||||
|
||||
## Endpoint Reference
|
||||
|
||||
### Expression Endpoints
|
||||
|
||||
#### `GET /expression/medianGeneExpression`
|
||||
|
||||
Median TPM expression for a gene across tissues.
|
||||
|
||||
**Parameters:** `gencodeId`, `datasetId`, `itemsPerPage`
|
||||
|
||||
**Response fields:**
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"gencodeId": "ENSG00000130203.10",
|
||||
"geneSymbol": "APOE",
|
||||
"tissueSiteDetailId": "Liver",
|
||||
"tissueSiteDetail": "Liver",
|
||||
"median": 2847.9,
|
||||
"unit": "TPM",
|
||||
"datasetId": "gtex_v10"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
#### `GET /expression/geneExpression`
|
||||
|
||||
Full expression distribution (box plot data) per tissue.
|
||||
|
||||
**Parameters:** `gencodeId`, `tissueSiteDetailId`, `datasetId`
|
||||
|
||||
**Response fields:**
|
||||
- `data[].tissueExpressionData.data`: array of TPM values per sample
|
||||
|
||||
### Association (QTL) Endpoints
|
||||
|
||||
#### `GET /association/singleTissueEqtl`
|
||||
|
||||
Significant single-tissue cis-eQTLs.
|
||||
|
||||
**Parameters:** `gencodeId` OR `variantId`, `tissueSiteDetailId` (optional), `datasetId`
|
||||
|
||||
**Response fields:**
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"gencodeId": "ENSG00000169174.14",
|
||||
"geneSymbol": "PCSK9",
|
||||
"variantId": "chr1_55516888_G_GA_b38",
|
||||
"snpId": "rs72646508",
|
||||
"tissueSiteDetailId": "Liver",
|
||||
"slope": -0.342,
|
||||
"slopeStandardError": 0.051,
|
||||
"pval": 3.2e-11,
|
||||
"qval": 2.1e-8,
|
||||
"maf": 0.089,
|
||||
"datasetId": "gtex_v10"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
**Key fields:**
|
||||
- `slope`: effect of alt allele on expression (log2 scale after rank normalization)
|
||||
- `pval`: nominal p-value
|
||||
- `qval`: FDR-adjusted q-value
|
||||
- `maf`: minor allele frequency in the GTEx cohort
|
||||
|
||||
#### `GET /association/singleTissueSqtl`
|
||||
|
||||
Significant single-tissue sQTLs (splicing).
|
||||
|
||||
**Parameters:** Same as eQTL endpoint
|
||||
|
||||
#### `GET /association/egene`
|
||||
|
||||
All eGenes (genes with ≥1 significant eQTL) in a tissue.
|
||||
|
||||
**Parameters:** `tissueSiteDetailId`, `datasetId`
|
||||
|
||||
**Response fields:** gene ID, gene symbol, best eQTL variant, p-value, q-value
|
||||
|
||||
### Dataset/Metadata Endpoints
|
||||
|
||||
#### `GET /dataset/tissueSiteDetail`
|
||||
|
||||
List of all available tissues.
|
||||
|
||||
**Parameters:** `datasetId`, `itemsPerPage`
|
||||
|
||||
**Response fields:**
|
||||
- `tissueSiteDetailId`: API identifier (use this in queries)
|
||||
- `tissueSiteDetail`: Display name
|
||||
- `colorHex`: Color for visualization
|
||||
- `samplingSite`: Anatomical location
|
||||
|
||||
#### `GET /reference/gene`
|
||||
|
||||
Gene metadata from GENCODE.
|
||||
|
||||
**Parameters:** `geneSymbol` OR `gencodeId`, `referenceGenomeId` (GRCh38)
|
||||
|
||||
### Variant Endpoints
|
||||
|
||||
#### `GET /variant/variantPage`
|
||||
|
||||
Variant metadata and lookup.
|
||||
|
||||
**Parameters:** `snpId` (rsID) OR `variantId`
|
||||
|
||||
## Tissue IDs Reference (Common Tissues)
|
||||
|
||||
| ID | Display Name |
|
||||
|----|-------------|
|
||||
| `Whole_Blood` | Whole Blood |
|
||||
| `Brain_Cortex` | Brain - Cortex |
|
||||
| `Brain_Hippocampus` | Brain - Hippocampus |
|
||||
| `Brain_Frontal_Cortex_BA9` | Brain - Frontal Cortex (BA9) |
|
||||
| `Liver` | Liver |
|
||||
| `Kidney_Cortex` | Kidney - Cortex |
|
||||
| `Heart_Left_Ventricle` | Heart - Left Ventricle |
|
||||
| `Lung` | Lung |
|
||||
| `Muscle_Skeletal` | Muscle - Skeletal |
|
||||
| `Adipose_Subcutaneous` | Adipose - Subcutaneous |
|
||||
| `Colon_Transverse` | Colon - Transverse |
|
||||
| `Small_Intestine_Terminal_Ileum` | Small Intestine - Terminal Ileum |
|
||||
| `Skin_Sun_Exposed_Lower_leg` | Skin - Sun Exposed (Lower leg) |
|
||||
| `Thyroid` | Thyroid |
|
||||
| `Nerve_Tibial` | Nerve - Tibial |
|
||||
| `Artery_Coronary` | Artery - Coronary |
|
||||
| `Artery_Aorta` | Artery - Aorta |
|
||||
| `Pancreas` | Pancreas |
|
||||
| `Pituitary` | Pituitary |
|
||||
| `Spleen` | Spleen |
|
||||
| `Prostate` | Prostate |
|
||||
| `Ovary` | Ovary |
|
||||
| `Uterus` | Uterus |
|
||||
| `Testis` | Testis |
|
||||
|
||||
## Error Handling
|
||||
|
||||
```python
|
||||
import requests
|
||||
from requests.exceptions import HTTPError, Timeout
|
||||
|
||||
def safe_gtex_query(endpoint, params):
    """GTEx GET wrapper that reports errors and returns None on any failure."""
    try:
        response = requests.get(
            f"https://gtexportal.org/api/v2/{endpoint}", params=params, timeout=30
        )
        response.raise_for_status()
        # .json() stays inside the try so decode failures are also caught.
        return response.json()
    except HTTPError as e:
        print(f"HTTP error {e.response.status_code}: {e.response.text}")
    except Timeout:
        print("Request timed out")
    except Exception as e:
        print(f"Error: {e}")
    return None
|
||||
```
|
||||
|
||||
## Rate Limiting
|
||||
|
||||
GTEx API does not publish explicit rate limits but:
|
||||
- Add 0.5–1s delays between bulk queries
|
||||
- Use data downloads for genome-wide analyses instead of API
|
||||
- Cache results locally for repeated queries
|
||||
305
scientific-skills/interpro-database/SKILL.md
Normal file
305
scientific-skills/interpro-database/SKILL.md
Normal file
@@ -0,0 +1,305 @@
|
||||
---
|
||||
name: interpro-database
|
||||
description: Query InterPro for protein family, domain, and functional site annotations. Integrates Pfam, PANTHER, PRINTS, SMART, SUPERFAMILY, and 11 other member databases. Use for protein function prediction, domain architecture analysis, evolutionary classification, and GO term mapping.
|
||||
license: CC0-1.0
|
||||
metadata:
|
||||
skill-author: Kuan-lin Huang
|
||||
---
|
||||
|
||||
# InterPro Database
|
||||
|
||||
## Overview
|
||||
|
||||
InterPro (https://www.ebi.ac.uk/interpro/) is a comprehensive resource for protein family and domain classification maintained by EMBL-EBI. It integrates signatures from 13 member databases including Pfam, PANTHER, PRINTS, ProSite, SMART, TIGRFAM, SUPERFAMILY, CDD, and others, providing a unified view of protein functional annotations for over 100 million protein sequences.
|
||||
|
||||
InterPro classifies proteins into:
|
||||
- **Families**: Groups of proteins sharing common ancestry and function
|
||||
- **Domains**: Independently folding structural/functional units
|
||||
- **Homologous superfamilies**: Structurally similar protein regions
|
||||
- **Repeats**: Short tandem sequences
|
||||
- **Sites**: Functional sites (active, binding, PTM)
|
||||
|
||||
**Key resources:**
|
||||
- InterPro website: https://www.ebi.ac.uk/interpro/
|
||||
- REST API: https://www.ebi.ac.uk/interpro/api/
|
||||
- API documentation: https://github.com/ProteinsWebTeam/interpro7-api/blob/master/docs/
|
||||
- Python client: via `requests`
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use InterPro when:
|
||||
|
||||
- **Protein function prediction**: What function(s) does an uncharacterized protein likely have?
|
||||
- **Domain architecture**: What domains make up a protein, and in what order?
|
||||
- **Protein family classification**: Which family/superfamily does a protein belong to?
|
||||
- **GO term annotation**: Map protein sequences to Gene Ontology terms via InterPro
|
||||
- **Evolutionary analysis**: Are two proteins in the same homologous superfamily?
|
||||
- **Structure prediction context**: What domains should a new protein structure be compared against?
|
||||
- **Pipeline annotation**: Batch-annotate proteomes or novel sequences
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### 1. InterPro REST API
|
||||
|
||||
Base URL: `https://www.ebi.ac.uk/interpro/api/`
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
BASE_URL = "https://www.ebi.ac.uk/interpro/api"
|
||||
|
||||
def interpro_get(endpoint, params=None):
    """Issue a GET against the InterPro REST API and return the parsed JSON.

    Raises requests.HTTPError on a non-2xx response.
    """
    response = requests.get(
        f"{BASE_URL}/{endpoint}",
        params=params,
        headers={"Accept": "application/json"},
    )
    response.raise_for_status()
    return response.json()
|
||||
```
|
||||
|
||||
### 2. Look Up a Protein
|
||||
|
||||
```python
|
||||
def get_protein_entries(uniprot_id):
    """Return the InterPro entries matching a UniProt protein accession."""
    return interpro_get(f"protein/UniProt/{uniprot_id}/entry/InterPro/")
|
||||
|
||||
# Example: Human p53 (TP53)
|
||||
result = get_protein_entries("P04637")
|
||||
entries = result.get("results", [])
|
||||
|
||||
for entry in entries:
|
||||
meta = entry["metadata"]
|
||||
print(f" {meta['accession']} ({meta['type']}): {meta['name']}")
|
||||
# e.g., IPR011615 (domain): p53, tetramerisation domain
|
||||
# IPR010991 (domain): p53, DNA-binding domain
|
||||
# IPR013872 (family): p53 family
|
||||
```
|
||||
|
||||
### 3. Get Specific InterPro Entry
|
||||
|
||||
```python
|
||||
def get_entry(interpro_id):
    """Fetch the full record for a single InterPro entry (e.g. 'IPR001202')."""
    entry = interpro_get(f"entry/InterPro/{interpro_id}/")
    return entry
|
||||
|
||||
# Example: Get Pfam domain PF00397 (WW domain)
|
||||
ww_entry = get_entry("IPR001202")
|
||||
print(f"Name: {ww_entry['metadata']['name']}")
|
||||
print(f"Type: {ww_entry['metadata']['type']}")
|
||||
|
||||
# Also supports member database IDs:
|
||||
def get_pfam_entry(pfam_id):
    """Fetch a Pfam member-database entry (e.g. 'PF00397') from InterPro."""
    record = interpro_get(f"entry/Pfam/{pfam_id}/")
    return record
|
||||
|
||||
pfam = get_pfam_entry("PF00397")
|
||||
```
|
||||
|
||||
### 4. Search Proteins by InterPro Entry
|
||||
|
||||
```python
|
||||
def get_proteins_for_entry(interpro_id, database="UniProt", page_size=25):
    """List proteins in `database` annotated with the given InterPro entry."""
    endpoint = f"entry/InterPro/{interpro_id}/protein/{database}/"
    return interpro_get(endpoint, {"page_size": page_size})
|
||||
|
||||
# Example: Find all human kinase-domain proteins
|
||||
kinase_proteins = get_proteins_for_entry("IPR000719") # Protein kinase domain
|
||||
print(f"Total proteins: {kinase_proteins['count']}")
|
||||
```
|
||||
|
||||
### 5. Domain Architecture
|
||||
|
||||
```python
|
||||
def get_domain_architecture(uniprot_id):
    """Return the full InterPro record for a protein, including all
    matched entries and their locations on the sequence."""
    return interpro_get(f"protein/UniProt/{uniprot_id}/")
|
||||
|
||||
# Example: Get full domain architecture for EGFR
|
||||
egfr = get_domain_architecture("P00533")
|
||||
|
||||
# The response includes locations of all matching entries on the sequence
|
||||
for entry in egfr.get("entries", []):
|
||||
for fragment in entry.get("entry_protein_locations", []):
|
||||
for loc in fragment.get("fragments", []):
|
||||
print(f" {entry['accession']}: {loc['start']}-{loc['end']}")
|
||||
```
|
||||
|
||||
### 6. GO Term Mapping
|
||||
|
||||
```python
|
||||
def get_go_terms_for_protein(uniprot_id):
    """Collect the unique GO terms attached to a protein's InterPro entries.

    GO terms are embedded in each entry's metadata; duplicates across
    entries are removed, keeping first occurrence order.
    """
    data = interpro_get(f"protein/UniProt/{uniprot_id}/")

    collected = []
    for entry in data.get("entries", []):
        collected.extend(entry.get("metadata", {}).get("go_terms", []))

    unique, seen = [], set()
    for term in collected:
        ident = term["identifier"]
        if ident not in seen:
            seen.add(ident)
            unique.append(term)

    return unique
|
||||
|
||||
# GO terms include:
|
||||
# {"identifier": "GO:0004672", "name": "protein kinase activity", "category": {"code": "F", "name": "Molecular Function"}}
|
||||
```
|
||||
|
||||
### 7. Batch Protein Lookup
|
||||
|
||||
```python
|
||||
def batch_lookup_proteins(uniprot_ids, database="UniProt"):
    """Look up several proteins and summarise their InterPro entries.

    Returns a dict keyed by accession; each value is a list of
    {accession, name, type} dicts, or {"error": ...} if the lookup failed.
    """
    import time
    summaries = {}
    for accession in uniprot_ids:
        try:
            payload = interpro_get(f"protein/{database}/{accession}/entry/InterPro/")
            summaries[accession] = [
                {
                    "accession": hit["metadata"]["accession"],
                    "name": hit["metadata"]["name"],
                    "type": hit["metadata"]["type"]
                }
                for hit in payload.get("results", [])
            ]
        except Exception as exc:  # record the failure, keep processing the rest
            summaries[accession] = {"error": str(exc)}
        time.sleep(0.3)  # be polite to the public API
    return summaries
|
||||
|
||||
# Example
|
||||
proteins = ["P04637", "P00533", "P38398", "Q9Y6I9"]
|
||||
domain_info = batch_lookup_proteins(proteins)
|
||||
for uid, entries in domain_info.items():
|
||||
print(f"\n{uid}:")
|
||||
for e in entries[:3]:
|
||||
print(f" - {e['accession']} ({e['type']}): {e['name']}")
|
||||
```
|
||||
|
||||
### 8. Search by Text or Taxonomy
|
||||
|
||||
```python
|
||||
def search_entries(query, entry_type=None, taxonomy_id=None):
    """Text-search InterPro entries, optionally filtered by type and taxon."""
    params = {"search": query, "page_size": 20}
    if entry_type:
        params["type"] = entry_type  # family, domain, homologous_superfamily, etc.

    if taxonomy_id:
        endpoint = f"entry/InterPro/taxonomy/UniProt/{taxonomy_id}/"
    else:
        endpoint = "entry/InterPro/"

    return interpro_get(endpoint, params)
|
||||
|
||||
# Search for kinase-related entries
|
||||
kinase_entries = search_entries("kinase", entry_type="domain")
|
||||
```
|
||||
|
||||
## Query Workflows
|
||||
|
||||
### Workflow 1: Characterize an Unknown Protein
|
||||
|
||||
1. **Run InterProScan** locally or via the web (https://www.ebi.ac.uk/interpro/search/sequence/) to scan a protein sequence
|
||||
2. **Parse results** to identify domain architecture
|
||||
3. **Look up each InterPro entry** for biological context
|
||||
4. **Get GO terms** from associated InterPro entries for functional inference
|
||||
|
||||
```python
|
||||
# After running InterProScan and getting a UniProt ID:
|
||||
def characterize_protein(uniprot_id):
    """Summarise a protein's InterPro families, domains, superfamilies and GO terms."""

    # 1. Get all annotations
    annotations = get_protein_entries(uniprot_id)

    # 2. Group entries by their InterPro type
    grouped = {}
    for hit in annotations.get("results", []):
        meta = hit["metadata"]
        grouped.setdefault(meta["type"], []).append(
            {"accession": meta["accession"], "name": meta["name"]}
        )

    # 3. Attach GO terms and return the structured summary
    return {
        "families": grouped.get("family", []),
        "domains": grouped.get("domain", []),
        "superfamilies": grouped.get("homologous_superfamily", []),
        "go_terms": get_go_terms_for_protein(uniprot_id)
    }
|
||||
```
|
||||
|
||||
### Workflow 2: Find All Members of a Protein Family
|
||||
|
||||
1. Identify the InterPro family entry ID (e.g., IPR000719 for protein kinases)
|
||||
2. Query all UniProt proteins annotated with that entry
|
||||
3. Filter by organism/taxonomy if needed
|
||||
4. Download FASTA sequences for phylogenetic analysis
|
||||
|
||||
### Workflow 3: Comparative Domain Analysis
|
||||
|
||||
1. Collect proteins of interest (e.g., all paralogs)
|
||||
2. Get domain architecture for each protein
|
||||
3. Compare domain compositions and orders
|
||||
4. Identify domain gain/loss events
|
||||
|
||||
## API Endpoint Summary
|
||||
|
||||
| Endpoint | Description |
|
||||
|----------|-------------|
|
||||
| `/protein/UniProt/{id}/` | Full annotation for a protein |
|
||||
| `/protein/UniProt/{id}/entry/InterPro/` | InterPro entries for a protein |
|
||||
| `/entry/InterPro/{id}/` | Details of an InterPro entry |
|
||||
| `/entry/Pfam/{id}/` | Pfam entry details |
|
||||
| `/entry/InterPro/{id}/protein/UniProt/` | Proteins with an entry |
|
||||
| `/entry/InterPro/` | Search/list InterPro entries |
|
||||
| `/taxonomy/UniProt/{tax_id}/` | Proteins from a taxon |
|
||||
| `/structure/PDB/{pdb_id}/` | Structures mapped to InterPro |
|
||||
|
||||
## Member Databases
|
||||
|
||||
| Database | Focus |
|
||||
|----------|-------|
|
||||
| Pfam | Protein domains (HMM profiles) |
|
||||
| PANTHER | Protein families and subfamilies |
|
||||
| PRINTS | Protein fingerprints |
|
||||
| ProSitePatterns | Amino acid patterns |
|
||||
| ProSiteProfiles | Protein profile patterns |
|
||||
| SMART | Protein domain analysis |
|
||||
| TIGRFAM | JCVI curated protein families |
|
||||
| SUPERFAMILY | Structural classification |
|
||||
| CDD | Conserved Domain Database (NCBI) |
|
||||
| HAMAP | Microbial protein families |
|
||||
| NCBIfam | NCBI curated TIGRFAMs |
|
||||
| Gene3D | CATH structural classification |
|
||||
| PIRSR | PIR site rules |
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Use UniProt accession numbers** (not gene names) for the most reliable lookups
|
||||
- **Distinguish types**: `family` gives broad classification; `domain` gives specific structural/functional units
|
||||
- **InterProScan is faster for novel sequences**: For sequences not in UniProt, submit to the web service
|
||||
- **Handle pagination**: Large result sets require iterating through pages
|
||||
- **Combine with UniProt data**: InterPro entries often include links to UniProt, PDB, and GO
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **InterPro website**: https://www.ebi.ac.uk/interpro/
|
||||
- **InterProScan** (run locally): https://github.com/ebi-pf-team/interproscan
|
||||
- **API documentation**: https://github.com/ProteinsWebTeam/interpro7-api/blob/master/docs/
|
||||
- **Pfam**: https://www.ebi.ac.uk/interpro/entry/pfam/
|
||||
- **Citation**: Paysan-Lafosse T et al. (2023) Nucleic Acids Research. PMID: 36350672
|
||||
@@ -0,0 +1,159 @@
|
||||
# InterPro Domain Analysis Reference
|
||||
|
||||
## Entry Types
|
||||
|
||||
| Type | Description | Example |
|
||||
|------|-------------|---------|
|
||||
| `family` | Group of related proteins sharing common evolutionary origin | IPR013872: p53 family |
|
||||
| `domain` | Distinct structural/functional unit that can exist independently | IPR011615: p53 tetramerisation domain |
|
||||
| `homologous_superfamily` | Proteins related by structure but not necessarily sequence | IPR009003: Peptidase, aspartic |
|
||||
| `repeat` | Short sequence unit that occurs in multiple copies | IPR000822: Ankyrin repeat |
|
||||
| `site` | Residues important for function | IPR018060: Metalloprotease active site |
|
||||
| `conserved_site` | Conserved sequence motif (functional) | IPR016152: PTB/PI domain binding site |
|
||||
| `active_site` | Catalytic residues | IPR000743: RING domain |
|
||||
| `binding_site` | Residues involved in binding | — |
|
||||
| `ptm` | Post-translational modification site | — |
|
||||
|
||||
## Common Domain Accessions
|
||||
|
||||
### Signaling Domains
|
||||
|
||||
| Accession | Name | Function |
|
||||
|-----------|------|---------|
|
||||
| IPR000719 | Protein kinase domain | ATP-dependent phosphorylation |
|
||||
| IPR001245 | Serine-threonine/tyrosine-protein kinase | Kinase catalytic domain |
|
||||
| IPR000980 | SH2 domain | Phosphotyrosine binding |
|
||||
| IPR001452 | SH3 domain | Proline-rich sequence binding |
|
||||
| IPR011993 | PH domain | Phosphoinositide binding |
|
||||
| IPR000048 | IQ motif | Calmodulin binding |
|
||||
| IPR000008 | C2 domain | Ca2+/phospholipid binding |
|
||||
| IPR001849 | PH domain | Pleckstrin homology |
|
||||
|
||||
### DNA Binding Domains
|
||||
|
||||
| Accession | Name | Function |
|
||||
|-----------|------|---------|
|
||||
| IPR013087 | Zinc finger, C2H2 | DNA binding |
|
||||
| IPR017456 | CCCH zinc finger | RNA binding |
|
||||
| IPR011991 | Winged helix-turn-helix | Transcription factor DNA binding |
|
||||
| IPR011607 | MH1 domain | SMAD DNA binding |
|
||||
| IPR003313 | ARID domain | AT-rich DNA binding |
|
||||
| IPR014756 | E1-E2 ATPase, nucleotide-binding | — |
|
||||
|
||||
### Structural Domains
|
||||
|
||||
| Accession | Name | Function |
|
||||
|-----------|------|---------|
|
||||
| IPR001357 | BRCT domain | DNA repair protein interaction |
|
||||
| IPR000536 | Nuclear hormone receptor, ligand-binding | Hormone binding |
|
||||
| IPR001628 | Zinc finger, nuclear hormone receptor | DNA binding (NHR) |
|
||||
| IPR003961 | Fibronectin type III | Cell adhesion |
|
||||
| IPR000742 | EGF-like domain | Receptor-ligand interaction |
|
||||
|
||||
## Domain Architecture Patterns
|
||||
|
||||
Common multi-domain architectures and their biological meanings:
|
||||
|
||||
### Receptor Tyrosine Kinases
|
||||
```
|
||||
[EGF domain]... - [TM] - [Kinase domain]
|
||||
e.g., EGFR: IPR000742 (EGF) + IPR000719 (kinase)
|
||||
```
|
||||
|
||||
### Adapter Proteins
|
||||
```
|
||||
[SH3] - [SH2] - [SH3]
|
||||
e.g., Grb2, Crk — signaling adapters
|
||||
```
|
||||
|
||||
### Nuclear Receptors
|
||||
```
|
||||
[DBD (C4-type zinc finger)] - [Ligand binding domain]
|
||||
e.g., ERα (ESR1)
|
||||
```
|
||||
|
||||
### Kinases
|
||||
```
|
||||
[N-lobe] - [Activation loop] - [C-lobe]
|
||||
Standard protein kinase fold (IPR000719)
|
||||
```
|
||||
|
||||
## GO Term Categories
|
||||
|
||||
InterPro GO annotations use three ontologies:
|
||||
|
||||
| Code | Ontology | Examples |
|
||||
|------|----------|---------|
|
||||
| P | Biological Process | GO:0006468 (protein phosphorylation) |
|
||||
| F | Molecular Function | GO:0004672 (protein kinase activity) |
|
||||
| C | Cellular Component | GO:0005886 (plasma membrane) |
|
||||
|
||||
## InterProScan for Novel Sequences
|
||||
|
||||
For protein sequences not in UniProt (novel/predicted sequences), run InterProScan:
|
||||
|
||||
```bash
|
||||
# Command-line (install InterProScan locally)
|
||||
./interproscan.sh -i my_proteins.fasta -f tsv,json -dp
|
||||
|
||||
# Options:
|
||||
# -i: input FASTA
|
||||
# -f: output formats (tsv, json, xml, gff3, html)
|
||||
# -dp: disable precalculation lookup (use for non-UniProt sequences)
|
||||
# --goterms: include GO term mappings
|
||||
# --pathways: include pathway mappings
|
||||
|
||||
# Or use the web service:
|
||||
# https://www.ebi.ac.uk/interpro/search/sequence/
|
||||
```
|
||||
|
||||
**Output fields (TSV):**
|
||||
1. Protein accession
|
||||
2. Sequence MD5
|
||||
3. Sequence length
|
||||
4. Analysis (e.g., Pfam, SMART)
|
||||
5. Signature accession (e.g., PF00397)
|
||||
6. Signature description
|
||||
7. Start
|
||||
8. Stop
|
||||
9. Score
|
||||
10. Status (T = true)
|
||||
11. Date
|
||||
12. InterPro accession (if integrated)
|
||||
13. InterPro description
|
||||
|
||||
## Useful Entry ID Collections
|
||||
|
||||
### Human Disease-Relevant Domains
|
||||
|
||||
```python
|
||||
DISEASE_DOMAINS = {
|
||||
# Cancer
|
||||
"IPR011615": "p53 tetramerization",
|
||||
"IPR012346": "p53/p63/p73, tetramerization domain",
|
||||
"IPR000719": "Protein kinase domain",
|
||||
"IPR004827": "Basic-leucine zipper (bZIP) TF",
|
||||
|
||||
# Neurodegenerative
|
||||
"IPR003527": "MAP kinase, ERK1/2",
|
||||
"IPR016024": "ARM-type fold",
|
||||
|
||||
# Metabolic
|
||||
"IPR001764": "Glycoside hydrolase, family 13 (amylase)",
|
||||
"IPR006047": "Glycoside hydrolase superfamily",
|
||||
}
|
||||
```
|
||||
|
||||
### Commonly Referenced Pfam IDs
|
||||
|
||||
| Pfam ID | Domain Name |
|
||||
|---------|-------------|
|
||||
| PF00069 | Pkinase (protein kinase) |
|
||||
| PF00076 | RRM_1 (RNA recognition motif) |
|
||||
| PF00096 | zf-C2H2 (zinc finger) |
|
||||
| PF00397 | WW domain |
|
||||
| PF00400 | WD40 repeat |
|
||||
| PF00415 | RasGEF domain |
|
||||
| PF00018 | SH3 domain |
|
||||
| PF00017 | SH2 domain |
|
||||
| PF02196 | zf-C3HC4 (RING finger) |
|
||||
351
scientific-skills/jaspar-database/SKILL.md
Normal file
351
scientific-skills/jaspar-database/SKILL.md
Normal file
@@ -0,0 +1,351 @@
|
||||
---
|
||||
name: jaspar-database
|
||||
description: Query JASPAR for transcription factor binding site (TFBS) profiles (PWMs/PFMs). Search by TF name, species, or class; scan DNA sequences for TF binding sites; compare matrices; essential for regulatory genomics, motif analysis, and GWAS regulatory variant interpretation.
|
||||
license: CC0-1.0
|
||||
metadata:
|
||||
skill-author: Kuan-lin Huang
|
||||
---
|
||||
|
||||
# JASPAR Database
|
||||
|
||||
## Overview
|
||||
|
||||
JASPAR (https://jaspar.elixir.no/) is the gold-standard open-access database of curated, non-redundant transcription factor (TF) binding profiles stored as position frequency matrices (PFMs). JASPAR 2024 contains 1,210 non-redundant TF binding profiles for 164 eukaryotic species. Each profile is experimentally derived (ChIP-seq, SELEX, HT-SELEX, protein binding microarray, etc.) and rigorously validated.
|
||||
|
||||
**Key resources:**
|
||||
- JASPAR portal: https://jaspar.elixir.no/
|
||||
- REST API: https://jaspar.elixir.no/api/v1/
|
||||
- API docs: https://jaspar.elixir.no/api/v1/docs/
|
||||
- Python package: `jaspar` (via Biopython) or direct API
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use JASPAR when:
|
||||
|
||||
- **TF binding site prediction**: Scan a DNA sequence for potential binding sites of a TF
|
||||
- **Regulatory variant interpretation**: Does a GWAS/eQTL variant disrupt a TF binding motif?
|
||||
- **Promoter/enhancer analysis**: What TFs are predicted to bind to a regulatory element?
|
||||
- **Gene regulatory network construction**: Link TFs to their target genes via motif scanning
|
||||
- **TF family analysis**: Compare binding profiles across a TF family (e.g., all homeobox factors)
|
||||
- **ChIP-seq analysis**: Find known TF motifs enriched in ChIP-seq peaks
|
||||
- **ENCODE/ATAC-seq interpretation**: Match open chromatin regions to TF binding profiles
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### 1. JASPAR REST API
|
||||
|
||||
Base URL: `https://jaspar.elixir.no/api/v1/`
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
BASE_URL = "https://jaspar.elixir.no/api/v1"
|
||||
|
||||
def jaspar_get(endpoint, params=None):
    """GET a JASPAR REST endpoint and return the decoded JSON payload."""
    url = f"{BASE_URL}/{endpoint}"
    headers = {"Accept": "application/json"}
    resp = requests.get(url, params=params, headers=headers)
    resp.raise_for_status()
    return resp.json()
|
||||
```
|
||||
|
||||
### 2. Search for TF Profiles
|
||||
|
||||
```python
|
||||
def search_jaspar(
    tf_name=None,
    species=None,
    collection="CORE",
    tf_class=None,
    tf_family=None,
    page=1,
    page_size=25
):
    """Search JASPAR for TF binding profiles.

    Only truthy optional filters are forwarded to the API.
    """
    params = {
        "collection": collection,
        "page": page,
        "page_size": page_size,
        "format": "json"
    }
    optional = {
        "name": tf_name,
        "species": species,  # taxonomy ID or name, e.g. "9606" for human
        "tf_class": tf_class,
        "tf_family": tf_family,
    }
    params.update({key: value for key, value in optional.items() if value})

    return jaspar_get("matrix", params)
|
||||
|
||||
# Examples:
|
||||
# Search for human CTCF profile
|
||||
ctcf = search_jaspar("CTCF", species="9606")
|
||||
print(f"Found {ctcf['count']} CTCF profiles")
|
||||
|
||||
# Search for all homeobox TFs in human
|
||||
hox_tfs = search_jaspar(tf_class="Homeodomain", species="9606")
|
||||
|
||||
# Search for a TF family
|
||||
nfkb = search_jaspar(tf_family="NF-kappaB")
|
||||
```
|
||||
|
||||
### 3. Fetch a Specific Matrix (PFM/PWM)
|
||||
|
||||
```python
|
||||
def get_matrix(matrix_id):
    """Fetch one JASPAR matrix record by its ID (e.g. 'MA0139.1' for CTCF)."""
    record = jaspar_get(f"matrix/{matrix_id}/")
    return record
|
||||
|
||||
# Example: Get CTCF matrix
|
||||
ctcf_matrix = get_matrix("MA0139.1")
|
||||
|
||||
# Matrix structure:
|
||||
# {
|
||||
# "matrix_id": "MA0139.1",
|
||||
# "name": "CTCF",
|
||||
# "collection": "CORE",
|
||||
# "tax_group": "vertebrates",
|
||||
# "pfm": { "A": [...], "C": [...], "G": [...], "T": [...] },
|
||||
# "consensus": "CCGCGNGGNGGCAG",
|
||||
# "length": 19,
|
||||
# "species": [{"tax_id": 9606, "name": "Homo sapiens"}],
|
||||
# "class": ["C2H2 zinc finger factors"],
|
||||
# "family": ["BEN domain factors"],
|
||||
# "type": "ChIP-seq",
|
||||
# "uniprot_ids": ["P49711"]
|
||||
# }
|
||||
```
|
||||
|
||||
### 4. Download PFM/PWM as Matrix
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
def get_pwm(matrix_id, pseudocount=0.8):
    """
    Fetch a PFM from JASPAR and convert it to a log-odds PWM.

    Args:
        matrix_id: JASPAR matrix ID, e.g. "MA0139.1".
        pseudocount: value added to every PFM cell to avoid log(0).

    Returns:
        (pwm, name): `pwm` is a numpy array of shape (4, L) in A, C, G, T
        row order holding log2 odds against a uniform 0.25 background;
        `name` is the TF name from the matrix record.
        (The original docstring claimed a bare array was returned, but the
        function has always returned this tuple.)
    """
    matrix = get_matrix(matrix_id)
    counts = matrix["pfm"]

    # Stack rows in fixed A, C, G, T order and apply the pseudocount.
    pfm_array = np.array(
        [counts["A"], counts["C"], counts["G"], counts["T"]], dtype=float
    )
    pfm_array += pseudocount

    # Column-normalise to probabilities (PPM) ...
    ppm = pfm_array / pfm_array.sum(axis=0, keepdims=True)

    # ... then take log-odds relative to a uniform 0.25 background.
    background = 0.25
    pwm = np.log2(ppm / background)

    return pwm, matrix["name"]
|
||||
|
||||
# Example
|
||||
pwm, name = get_pwm("MA0139.1") # CTCF
|
||||
print(f"PWM for {name}: shape {pwm.shape}")
|
||||
max_score = pwm.max(axis=0).sum()
|
||||
print(f"Maximum possible score: {max_score:.2f} bits")
|
||||
```
|
||||
|
||||
### 5. Scan a DNA Sequence for TF Binding Sites
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
from typing import List, Tuple
|
||||
|
||||
NUCLEOTIDE_MAP = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
|
||||
'a': 0, 'c': 1, 'g': 2, 't': 3}
|
||||
|
||||
def scan_sequence(sequence: str, pwm: np.ndarray, threshold_pct: float = 0.8) -> List[dict]:
    """
    Scan a DNA sequence for TF binding sites using a PWM.

    Args:
        sequence: DNA sequence string (case-insensitive; windows containing
            characters other than A/C/G/T, such as N, are skipped)
        pwm: PWM array (4 x L) of log-odds scores in ACGT row order
        threshold_pct: fraction of the PWM's score range (min..max) a window
            must reach to be reported (0-1)

    Returns:
        List of hits, each with 1-based "position", raw "score",
        "relative_score" (0-1 within the PWM's score range), matched
        "sequence", and "strand" (always "+"; callers scanning the
        reverse complement relabel it).
    """
    # Local base->row map; the function no longer depends on the
    # module-level NUCLEOTIDE_MAP global (whose lowercase entries were
    # dead code anyway, since the sequence is upper-cased below).
    base_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    motif_len = pwm.shape[1]
    # Best/worst attainable scores define the reporting threshold.
    max_score = pwm.max(axis=0).sum()
    min_score = pwm.min(axis=0).sum()
    threshold = min_score + threshold_pct * (max_score - min_score)

    hits = []
    seq = sequence.upper()

    for i in range(len(seq) - motif_len + 1):
        subseq = seq[i:i + motif_len]
        # Skip windows with ambiguous bases — they cannot be scored.
        if any(c not in base_index for c in subseq):
            continue

        score = sum(pwm[base_index[c], j] for j, c in enumerate(subseq))

        if score >= threshold:
            relative_score = (score - min_score) / (max_score - min_score)
            hits.append({
                "position": i + 1,  # 1-based
                "score": score,
                "relative_score": relative_score,
                "sequence": subseq,
                "strand": "+"
            })

    return hits
|
||||
|
||||
# Example: Scan a promoter sequence for CTCF binding sites
|
||||
promoter = "AGCCCGCGAGGNGGCAGTTGCCTGGAGCAGGATCAGCAGATC"
|
||||
pwm, name = get_pwm("MA0139.1")
|
||||
hits = scan_sequence(promoter, pwm, threshold_pct=0.75)
|
||||
for hit in hits:
|
||||
print(f" Position {hit['position']}: {hit['sequence']} (score: {hit['score']:.2f}, {hit['relative_score']:.0%})")
|
||||
```
|
||||
|
||||
### 6. Scan Both Strands
|
||||
|
||||
```python
|
||||
def reverse_complement(seq: str) -> str:
    """Return the reverse complement of a DNA sequence.

    Input is upper-cased first; any character without a defined
    complement (other than A/C/G/T/N) becomes 'N'.
    """
    pairs = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    out = []
    for base in reversed(seq.upper()):
        out.append(pairs.get(base, 'N'))
    return ''.join(out)
|
||||
|
||||
def scan_both_strands(sequence: str, pwm: np.ndarray, threshold_pct: float = 0.8):
    """Scan both the forward strand and its reverse complement.

    Reverse-strand hits are reported in forward-strand coordinates
    (1-based start of the matched window), and all hits are returned
    sorted by position.
    """
    forward = scan_sequence(sequence, pwm, threshold_pct)
    for hit in forward:
        hit["strand"] = "+"

    reverse = scan_sequence(reverse_complement(sequence), pwm, threshold_pct)
    n = len(sequence)
    for hit in reverse:
        hit["strand"] = "-"
        # Map a 1-based start on the reverse strand back onto the forward
        # strand: start_fwd = n - start_rev - motif_len + 2.
        hit["position"] = n - hit["position"] - len(hit["sequence"]) + 2

    return sorted(forward + reverse, key=lambda h: h["position"])
|
||||
```
|
||||
|
||||
### 7. Variant Impact on TF Binding
|
||||
|
||||
```python
|
||||
def variant_tfbs_impact(ref_seq: str, alt_seq: str, pwm: np.ndarray,
                        tf_name: str, threshold_pct: float = 0.7):
    """
    Assess impact of a SNP on TF binding by comparing ref vs alt sequences.

    Both sequences should be centered on the variant with flanking context
    (at least motif_length - 1 bp on each side).

    Returns a dict with the best hit score on each allele, whether each
    allele has any site, and an "effect" of "gained", "disrupted",
    "unchanged" (equal best scores on both alleles), or "no_site".
    """
    ref_hits = scan_both_strands(ref_seq, pwm, threshold_pct)
    alt_hits = scan_both_strands(alt_seq, pwm, threshold_pct)

    max_ref = max((h["score"] for h in ref_hits), default=None)
    max_alt = max((h["score"] for h in alt_hits), default=None)

    result = {
        "tf": tf_name,
        "ref_max_score": max_ref,
        "alt_max_score": max_alt,
        "ref_has_site": len(ref_hits) > 0,
        "alt_has_site": len(alt_hits) > 0,
    }

    # Use explicit None checks: a legitimate best score of 0.0 is falsy and
    # the previous truthiness tests misclassified it as "no site".
    if max_ref is not None and max_alt is not None:
        result["score_change"] = max_alt - max_ref
        if max_alt > max_ref:
            result["effect"] = "gained"
        elif max_alt < max_ref:
            result["effect"] = "disrupted"
        else:
            result["effect"] = "unchanged"
    elif max_ref is not None:
        result["effect"] = "disrupted"
    elif max_alt is not None:
        result["effect"] = "gained"
    else:
        result["effect"] = "no_site"

    return result
|
||||
```
|
||||
|
||||
## Query Workflows
|
||||
|
||||
### Workflow 1: Find All TF Binding Sites in a Promoter
|
||||
|
||||
```python
|
||||
import requests, numpy as np

# 1. Get relevant TF matrices (e.g., all human TFs in CORE collection)
response = requests.get(
    "https://jaspar.elixir.no/api/v1/matrix/",
    params={"species": "9606", "collection": "CORE", "page_size": 500, "page": 1}
)
matrices = response.json()["results"]

# 2. For each matrix, compute PWM and scan promoter
promoter = "CCCGCCCGCCCGCCGCCCGCAGTTAATGAGCCCAGCGTGCC"  # Example

all_hits = []
for m in matrices[:10]:  # Limit for demo
    # Fetch the full matrix record (the original snippet fetched into
    # `pwm_data` but then read `pfm_data["pfm"]` — a NameError).
    matrix_detail = requests.get(
        f"https://jaspar.elixir.no/api/v1/matrix/{m['matrix_id']}/"
    ).json()
    pfm = matrix_detail["pfm"]
    # PFM (+0.8 pseudocount) -> PPM -> log-odds PWM vs uniform background.
    pfm_arr = np.array([pfm["A"], pfm["C"], pfm["G"], pfm["T"]], dtype=float) + 0.8
    ppm = pfm_arr / pfm_arr.sum(axis=0)
    pwm = np.log2(ppm / 0.25)

    hits = scan_sequence(promoter, pwm, threshold_pct=0.8)
    for h in hits:
        h["tf_name"] = m["name"]
        h["matrix_id"] = m["matrix_id"]
    all_hits.extend(hits)

print(f"Found {len(all_hits)} TF binding sites")
for h in sorted(all_hits, key=lambda x: -x["score"])[:5]:
    print(f"  {h['tf_name']} ({h['matrix_id']}): pos {h['position']}, score {h['score']:.2f}")
|
||||
```
|
||||
|
||||
### Workflow 2: SNP Impact on TF Binding (Regulatory Variant Analysis)
|
||||
|
||||
1. Retrieve the genomic sequence flanking the SNP (±20 bp each side)
|
||||
2. Construct ref and alt sequences
|
||||
3. Scan with all relevant TF PWMs
|
||||
4. Report TFs whose binding is created or destroyed by the SNP
|
||||
|
||||
### Workflow 3: Motif Enrichment Analysis
|
||||
|
||||
1. Identify a set of peak sequences (e.g., from ChIP-seq or ATAC-seq)
|
||||
2. Scan all peaks with JASPAR PWMs
|
||||
3. Compare hit rates in peaks vs. background sequences
|
||||
4. Report significantly enriched motifs (Fisher's exact test or FIMO-style scoring)
|
||||
|
||||
## Collections Available
|
||||
|
||||
| Collection | Description | Profiles |
|
||||
|------------|-------------|----------|
|
||||
| `CORE` | Non-redundant, high-quality profiles | ~1,210 |
|
||||
| `UNVALIDATED` | Experimentally derived but not validated | ~500 |
|
||||
| `PHYLOFACTS` | Phylogenetically conserved sites | ~50 |
|
||||
| `CNE` | Conserved non-coding elements | ~30 |
|
||||
| `POLII` | RNA Pol II binding profiles | ~20 |
|
||||
| `FAM` | TF family representative profiles | ~170 |
|
||||
| `SPLICE` | Splice factor profiles | ~20 |
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Use CORE collection** for most analyses — best validated and non-redundant
|
||||
- **Threshold selection**: 80% of max score is common for de novo prediction; 90% for high-confidence
|
||||
- **Always scan both strands** — TFs can bind in either orientation
|
||||
- **Provide flanking context** for variant analysis: at least (motif_length - 1) bp on each side
|
||||
- **Consider background**: PWM scores relative to uniform (0.25) background; adjust for actual GC content
|
||||
- **Cross-validate with ChIP-seq data** when available — motif scanning has many false positives
|
||||
- **Use Biopython's motifs module** for full-featured scanning: `from Bio import motifs`
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **JASPAR portal**: https://jaspar.elixir.no/
|
||||
- **API documentation**: https://jaspar.elixir.no/api/v1/docs/
|
||||
- **JASPAR 2022 paper**: Castro-Mondragon et al. (2022) Nucleic Acids Research. PMID: 34875674
|
||||
- **Biopython motifs**: https://biopython.org/docs/latest/Tutorial/chapter_motifs.html
|
||||
- **FIMO tool** (for large-scale scanning): https://meme-suite.org/meme/tools/fimo
|
||||
- **HOMER** (motif enrichment): http://homer.ucsd.edu/homer/
|
||||
- **GitHub**: https://github.com/wassermanlab/JASPAR-UCSC-tracks
|
||||
182
scientific-skills/jaspar-database/references/api_reference.md
Normal file
182
scientific-skills/jaspar-database/references/api_reference.md
Normal file
@@ -0,0 +1,182 @@
|
||||
# JASPAR API v1 Reference
|
||||
|
||||
## Base URL
|
||||
|
||||
```
|
||||
https://jaspar.elixir.no/api/v1/
|
||||
```
|
||||
|
||||
No authentication required. Responses are JSON.
|
||||
|
||||
## Core Endpoints
|
||||
|
||||
### `GET /matrix/`
|
||||
|
||||
Search and list TF binding profiles.
|
||||
|
||||
**Parameters:**
|
||||
|
||||
| Parameter | Type | Description | Example |
|
||||
|-----------|------|-------------|---------|
|
||||
| `name` | string | TF name (partial match) | `CTCF` |
|
||||
| `matrix_id` | string | Exact matrix ID | `MA0139.1` |
|
||||
| `collection` | string | Collection name | `CORE` |
|
||||
| `tax_group` | string | Taxonomic group | `vertebrates` |
|
||||
| `species` | string | Species name or tax ID | `9606`, `Homo sapiens` |
|
||||
| `tf_class` | string | TF structural class | `C2H2 zinc finger factors` |
|
||||
| `tf_family` | string | TF family | `BEN domain factors` |
|
||||
| `type` | string | Experimental method | `ChIP-seq`, `SELEX` |
|
||||
| `version` | string | `latest` or specific version | `latest` |
|
||||
| `page` | int | Page number | `1` |
|
||||
| `page_size` | int | Results per page (max 500) | `25` |
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"count": 1210,
|
||||
"next": "https://jaspar.elixir.no/api/v1/matrix/?page=2",
|
||||
"previous": null,
|
||||
"results": [
|
||||
{
|
||||
"matrix_id": "MA0139.1",
|
||||
"name": "CTCF",
|
||||
"collection": "CORE",
|
||||
"tax_group": "vertebrates"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### `GET /matrix/{id}/`
|
||||
|
||||
Fetch a specific matrix with full details.
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"matrix_id": "MA0139.1",
|
||||
"name": "CTCF",
|
||||
"collection": "CORE",
|
||||
"tax_group": "vertebrates",
|
||||
"pfm": {
|
||||
"A": [87, 167, 281, ...],
|
||||
"C": [291, 145, 49, ...],
|
||||
"G": [76, 414, 504, ...],
|
||||
"T": [205, 114, 27, ...]
|
||||
},
|
||||
"consensus": "CCGCGNGGNGGCAG",
|
||||
"length": 19,
|
||||
"species": [{"tax_id": 9606, "name": "Homo sapiens"}],
|
||||
"class": ["C2H2 zinc finger factors"],
|
||||
"family": ["BEN domain factors"],
|
||||
"type": "ChIP-seq",
|
||||
"uniprot_ids": ["P49711"],
|
||||
"pubmed_ids": ["19172222"],
|
||||
"version": 1,
|
||||
"latest": true
|
||||
}
|
||||
```
|
||||
|
||||
### `GET /matrix/{id}/logo/`
|
||||
|
||||
Returns SVG/PNG logo for the matrix.
|
||||
|
||||
**Parameters:** `format` = `svg` (default) or `png`
|
||||
|
||||
### `GET /taxon/`
|
||||
|
||||
List available species/taxa.
|
||||
|
||||
### `GET /tf_class/`
|
||||
|
||||
List available TF structural classes.
|
||||
|
||||
### `GET /tf_family/`
|
||||
|
||||
List available TF families.
|
||||
|
||||
### `GET /collection/`
|
||||
|
||||
List available collections.
|
||||
|
||||
## Matrix ID Format
|
||||
|
||||
```
|
||||
MA{number}.{version}
|
||||
```
|
||||
|
||||
- `MA` prefix = Manual curation
|
||||
- `PB` prefix = Published (automatic)
|
||||
- `UN` prefix = Unvalidated
|
||||
- Version increments when profile is updated
|
||||
|
||||
## Common Matrix IDs
|
||||
|
||||
| Matrix ID | TF | Species | Method |
|
||||
|-----------|-----|---------|--------|
|
||||
| `MA0139.1` | CTCF | Human | ChIP-seq |
|
||||
| `MA0098.3` | ETS1 | Human | SELEX |
|
||||
| `MA0107.1` | RELA (NF-kB p65) | Human | SELEX |
|
||||
| `MA0048.2` | NHLH1 | Human | SELEX |
|
||||
| `MA0079.4` | SP1 | Human | SELEX |
|
||||
| `MA0080.4` | SPI1 (PU.1) | Human | ChIP-seq |
|
||||
| `MA0025.2` | NFIL3 | Human | SELEX |
|
||||
| `MA0002.2` | RUNX1 | Human | SELEX |
|
||||
| `MA0004.1` | Arnt | Mouse | SELEX |
|
||||
| `MA0009.2` | TAL1::GATA1 | Human | SELEX |
|
||||
|
||||
## TF Classes (partial list)
|
||||
|
||||
- `C2H2 zinc finger factors`
|
||||
- `Basic leucine zipper factors (bZIP)`
|
||||
- `Basic helix-loop-helix factors (bHLH)`
|
||||
- `Homeodomain factors`
|
||||
- `Forkhead box (FOX) factors`
|
||||
- `ETS-domain factors`
|
||||
- `Nuclear hormone receptors`
|
||||
- `Tryptophan cluster factors`
|
||||
- `p53-like transcription factors`
|
||||
- `STAT factors`
|
||||
- `MADS box factors`
|
||||
- `T-box factors`
|
||||
|
||||
## Python Example: Batch Download
|
||||
|
||||
```python
|
||||
import requests, json, time
|
||||
|
||||
def download_all_human_profiles(output_file="jaspar_human_profiles.json"):
    """Download all human TF profiles from the JASPAR CORE collection.

    Pages through the JASPAR REST list endpoint, then fetches the full
    record for each profile and saves everything to *output_file* as a
    JSON list.

    Args:
        output_file: Path of the JSON file to write.

    Returns:
        List of full profile dicts (one per matrix).

    Raises:
        requests.HTTPError: If any API request returns an error status.
    """
    url = "https://jaspar.elixir.no/api/v1/matrix/"
    params = {
        "collection": "CORE",
        "species": "9606",       # human (NCBI taxonomy ID)
        "version": "latest",
        "page_size": 500,
        "page": 1,
    }

    profiles = []
    while True:
        response = requests.get(url, params=params, timeout=30)
        # Fail fast on HTTP errors instead of trying to parse an error page
        response.raise_for_status()
        data = response.json()
        profiles.extend(data["results"])

        if not data["next"]:
            break
        params["page"] += 1
        time.sleep(0.5)  # throttle paging requests

    # Fetch full details for each profile (the list endpoint returns summaries)
    full_profiles = []
    for p in profiles:
        detail_url = f"https://jaspar.elixir.no/api/v1/matrix/{p['matrix_id']}/"
        detail_resp = requests.get(detail_url, timeout=30)
        detail_resp.raise_for_status()
        full_profiles.append(detail_resp.json())
        time.sleep(0.1)  # Be respectful

    with open(output_file, "w") as f:
        json.dump(full_profiles, f)

    return full_profiles
|
||||
```
|
||||
457
scientific-skills/molecular-dynamics/SKILL.md
Normal file
457
scientific-skills/molecular-dynamics/SKILL.md
Normal file
@@ -0,0 +1,457 @@
|
||||
---
|
||||
name: molecular-dynamics
|
||||
description: Run and analyze molecular dynamics simulations with OpenMM and MDAnalysis. Set up protein/small molecule systems, define force fields, run energy minimization and production MD, analyze trajectories (RMSD, RMSF, contact maps, free energy surfaces). For structural biology, drug binding, and biophysics.
|
||||
license: MIT
|
||||
metadata:
|
||||
skill-author: Kuan-lin Huang
|
||||
---
|
||||
|
||||
# Molecular Dynamics
|
||||
|
||||
## Overview
|
||||
|
||||
Molecular dynamics (MD) simulation computationally models the time evolution of molecular systems by integrating Newton's equations of motion. This skill covers two complementary tools:
|
||||
|
||||
- **OpenMM** (https://openmm.org/): High-performance MD simulation engine with GPU support, Python API, and flexible force field support
|
||||
- **MDAnalysis** (https://mdanalysis.org/): Python library for reading, writing, and analyzing MD trajectories from all major simulation packages
|
||||
|
||||
**Installation:**
|
||||
```bash
|
||||
conda install -c conda-forge openmm mdanalysis nglview
|
||||
# or
|
||||
pip install openmm mdanalysis
|
||||
```
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use molecular dynamics when:
|
||||
|
||||
- **Protein stability analysis**: How does a mutation affect protein dynamics?
|
||||
- **Drug binding simulations**: Characterize binding mode and residence time of a ligand
|
||||
- **Conformational sampling**: Explore protein flexibility and conformational changes
|
||||
- **Protein-protein interaction**: Model interface dynamics and binding energetics
|
||||
- **RMSD/RMSF analysis**: Quantify structural fluctuations from a reference structure
|
||||
- **Free energy estimation**: Compute binding free energy or conformational free energy
|
||||
- **Membrane simulations**: Model proteins in lipid bilayers
|
||||
- **Intrinsically disordered proteins**: Study IDR conformational ensembles
|
||||
|
||||
## Core Workflow: OpenMM Simulation
|
||||
|
||||
### 1. System Preparation
|
||||
|
||||
```python
|
||||
from openmm.app import *
|
||||
from openmm import *
|
||||
from openmm.unit import *
|
||||
import sys
|
||||
|
||||
def prepare_system_from_pdb(pdb_file, forcefield_name="amber14-all.xml",
                            water_model="amber14/tip3pfb.xml"):
    """
    Prepare an OpenMM system from a PDB file.

    Loads the structure, adds hydrogens, solvates it in a padded box with
    150 mM NaCl, and builds an OpenMM System using PME electrostatics and
    H-bond constraints.

    Args:
        pdb_file: Path to cleaned PDB file (use PDBFixer for raw PDB files)
        forcefield_name: Force field XML file
        water_model: Water model XML file

    Returns:
        (modeller, system): the solvated Modeller (topology + positions)
        and the constructed OpenMM System.
    """
    # Load PDB
    pdb = PDBFile(pdb_file)

    # Load force field
    forcefield = ForceField(forcefield_name, water_model)

    # Add hydrogens and solvate
    modeller = Modeller(pdb.topology, pdb.positions)
    modeller.addHydrogens(forcefield)

    # Add solvent box (10 Å padding, 150 mM NaCl)
    # NOTE(review): model='tip3p' here while the default water_model XML is
    # TIP3P-FB ('amber14/tip3pfb.xml') — confirm these are meant to match.
    modeller.addSolvent(
        forcefield,
        model='tip3p',
        padding=10*angstroms,
        ionicStrength=0.15*molar
    )

    print(f"System: {modeller.topology.getNumAtoms()} atoms, "
          f"{modeller.topology.getNumResidues()} residues")

    # Create system
    system = forcefield.createSystem(
        modeller.topology,
        nonbondedMethod=PME,  # Particle Mesh Ewald for long-range electrostatics
        nonbondedCutoff=1.0*nanometer,
        constraints=HBonds,  # Constrain hydrogen bonds (allows 2 fs timestep)
        rigidWater=True,
        ewaldErrorTolerance=0.0005
    )

    return modeller, system
|
||||
```
|
||||
|
||||
### 2. Energy Minimization
|
||||
|
||||
```python
|
||||
from openmm.app import *
|
||||
from openmm import *
|
||||
from openmm.unit import *
|
||||
|
||||
def minimize_energy(modeller, system, output_pdb="minimized.pdb",
                    max_iterations=1000, tolerance=10.0):
    """
    Energy minimize the system to remove steric clashes.

    Args:
        modeller: Modeller object with topology and positions
        system: OpenMM System
        output_pdb: Path to save minimized structure
        max_iterations: Maximum minimization steps
        tolerance: Convergence criterion in kJ/mol/nm

    Returns:
        Simulation object whose context holds the minimized positions
    """
    # Set up integrator (its parameters don't matter for minimization,
    # but this Simulation is typically reused for the subsequent MD steps)
    integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)

    # Create simulation
    # Use GPU if available (CUDA or OpenCL), fall back to CPU
    try:
        platform = Platform.getPlatformByName('CUDA')
        properties = {'DeviceIndex': '0', 'Precision': 'mixed'}
    except Exception:
        try:
            platform = Platform.getPlatformByName('OpenCL')
            properties = {}
        except Exception:
            platform = Platform.getPlatformByName('CPU')
            properties = {}

    simulation = Simulation(
        modeller.topology, system, integrator,
        platform, properties
    )
    simulation.context.setPositions(modeller.positions)

    # Check initial energy (often very large/positive for raw structures)
    state = simulation.context.getState(getEnergy=True)
    print(f"Initial energy: {state.getPotentialEnergy()}")

    # Minimize until the force tolerance or iteration cap is reached
    simulation.minimizeEnergy(
        tolerance=tolerance*kilojoules_per_mole/nanometer,
        maxIterations=max_iterations
    )

    state = simulation.context.getState(getEnergy=True, getPositions=True)
    print(f"Minimized energy: {state.getPotentialEnergy()}")

    # Save minimized structure
    with open(output_pdb, 'w') as f:
        PDBFile.writeFile(simulation.topology, state.getPositions(), f)

    return simulation
|
||||
```
|
||||
|
||||
### 3. NVT Equilibration
|
||||
|
||||
```python
|
||||
from openmm.app import *
|
||||
from openmm import *
|
||||
from openmm.unit import *
|
||||
|
||||
def run_nvt_equilibration(simulation, n_steps=50000, temperature=300,
                          report_interval=1000, output_prefix="nvt"):
    """
    NVT equilibration: constant N, V, T.
    Equilibrate velocities to the target temperature.

    Args:
        simulation: OpenMM Simulation (after minimization)
        n_steps: Number of MD steps
        temperature: Temperature in Kelvin
        report_interval: Steps between data reports
        output_prefix: File prefix for trajectory and log

    Returns:
        The same Simulation, advanced by n_steps.
    """
    # Draw initial velocities from the Maxwell-Boltzmann distribution
    simulation.context.setVelocitiesToTemperature(temperature*kelvin)

    # Replace any reporters left over from earlier stages
    simulation.reporters = []

    # Thermodynamic log file
    simulation.reporters.append(
        StateDataReporter(
            f"{output_prefix}_log.txt",
            report_interval,
            step=True,
            potentialEnergy=True,
            kineticEnergy=True,
            temperature=True,
            volume=True,
            speed=True
        )
    )

    # DCD trajectory (compact binary format)
    simulation.reporters.append(
        DCDReporter(f"{output_prefix}_traj.dcd", report_interval)
    )

    # FIX: report elapsed time from the integrator's actual step size
    # instead of assuming a 2 fs timestep (the integrator created in
    # minimize_energy uses 0.004 ps = 4 fs).
    dt_ps = simulation.integrator.getStepSize().value_in_unit(picoseconds)
    print(f"Running NVT equilibration: {n_steps} steps ({n_steps*dt_ps:.1f} ps)")
    simulation.step(n_steps)
    print("NVT equilibration complete")

    return simulation
|
||||
```
|
||||
|
||||
### 4. NPT Equilibration and Production
|
||||
|
||||
```python
|
||||
def run_npt_production(simulation, n_steps=500000, temperature=300, pressure=1.0,
                       report_interval=5000, output_prefix="npt"):
    """
    NPT production run: constant N, P, T.

    Args:
        simulation: OpenMM Simulation (after NVT equilibration)
        n_steps: Production steps
        temperature: Temperature in Kelvin
        pressure: Pressure in bar
        report_interval: Steps between reports
        output_prefix: File prefix for trajectory, log, and checkpoints

    Returns:
        The same Simulation, advanced by n_steps.
    """
    # Add Monte Carlo barostat for pressure control; the context must be
    # reinitialized for the new force to take effect (preserving state).
    system = simulation.context.getSystem()
    system.addForce(MonteCarloBarostat(pressure*bar, temperature*kelvin, 25))
    simulation.context.reinitialize(preserveState=True)

    # Update reporters (density is the key NPT convergence indicator)
    simulation.reporters = []
    simulation.reporters.append(
        StateDataReporter(
            f"{output_prefix}_log.txt",
            report_interval,
            step=True,
            potentialEnergy=True,
            temperature=True,
            density=True,
            speed=True
        )
    )
    simulation.reporters.append(
        DCDReporter(f"{output_prefix}_traj.dcd", report_interval)
    )

    # Save checkpoints so a crashed run can be restarted
    simulation.reporters.append(
        CheckpointReporter(f"{output_prefix}_checkpoint.chk", 50000)
    )

    # FIX: compute elapsed time from the integrator's actual step size
    # instead of assuming a 2 fs timestep (the integrator created in
    # minimize_energy uses 0.004 ps = 4 fs).
    dt_ps = simulation.integrator.getStepSize().value_in_unit(picoseconds)
    print(f"Running NPT production: {n_steps} steps ({n_steps*dt_ps/1000:.2f} ns)")
    simulation.step(n_steps)
    print("Production MD complete")
    return simulation
|
||||
```
|
||||
|
||||
## Trajectory Analysis with MDAnalysis
|
||||
|
||||
### 1. Load Trajectory
|
||||
|
||||
```python
|
||||
import MDAnalysis as mda
|
||||
from MDAnalysis.analysis import rms, align, contacts
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def load_trajectory(topology_file, trajectory_file):
    """Load an MD trajectory into an MDAnalysis Universe.

    Args:
        topology_file: PDB, PSF, or other topology file.
        trajectory_file: DCD, XTC, TRR, or other trajectory file.

    Returns:
        The loaded MDAnalysis Universe.
    """
    universe = mda.Universe(topology_file, trajectory_file)
    print(f"Universe: {universe.atoms.n_atoms} atoms, {universe.trajectory.n_frames} frames")
    print(f"Time range: 0 to {universe.trajectory.totaltime:.0f} ps")
    return universe
|
||||
```
|
||||
|
||||
### 2. RMSD Analysis
|
||||
|
||||
```python
|
||||
def compute_rmsd(u, selection="backbone", reference_frame=0):
    """
    Compute RMSD of selected atoms relative to a reference frame.

    The trajectory is first least-squares aligned on the selection to
    remove rigid-body motion, then RMSD is computed per frame.

    Args:
        u: MDAnalysis Universe
        selection: Atom selection string (MDAnalysis syntax)
        reference_frame: Frame index for reference structure

    Returns:
        numpy array of shape (n_frames, 3): [frame, time (ps), RMSD (Å)]
    """
    # Align trajectory to minimize RMSD
    # NOTE(review): AlignTraj(u, u) aligns against the reference Universe's
    # current (first) frame; if reference_frame != 0 the alignment reference
    # and the RMSD reference differ — confirm this is intended.
    aligner = align.AlignTraj(u, u, select=selection, in_memory=True)
    aligner.run()

    # Compute RMSD against the chosen reference frame
    R = rms.RMSD(u, select=selection, ref_frame=reference_frame)
    R.run()

    rmsd_data = R.results.rmsd  # columns: frame, time, RMSD
    return rmsd_data
|
||||
|
||||
def plot_rmsd(rmsd_data, title="RMSD over time", output_file="rmsd.png"):
    """Plot RMSD versus simulation time and save the figure.

    Args:
        rmsd_data: Array of shape (n_frames, 3): [frame, time (ps), RMSD (Å)].
        title: Plot title.
        output_file: Path for the saved image.

    Returns:
        The matplotlib Figure.
    """
    times_ns = rmsd_data[:, 1] / 1000  # ps -> ns
    rmsd_vals = rmsd_data[:, 2]
    mean_rmsd = rmsd_vals.mean()

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(times_ns, rmsd_vals, 'b-', linewidth=0.5)
    ax.set_xlabel("Time (ns)")
    ax.set_ylabel("RMSD (Å)")
    ax.set_title(title)
    ax.axhline(mean_rmsd, color='r', linestyle='--',
               label=f'Mean: {mean_rmsd:.2f} Å')
    ax.legend()
    plt.tight_layout()
    plt.savefig(output_file, dpi=150)
    return fig
|
||||
```
|
||||
|
||||
### 3. RMSF Analysis (Per-Residue Flexibility)
|
||||
|
||||
```python
|
||||
def compute_rmsf(u, selection="backbone", start_frame=0):
    """
    Compute per-residue RMSF (flexibility).

    Args:
        u: MDAnalysis Universe (trajectory should already be aligned)
        selection: Atom selection string (MDAnalysis syntax)
        start_frame: First frame to include (skip equilibration)

    Returns:
        (resids, rmsf_values) numpy arrays, one entry per residue with
        at least one selected atom
    """
    # Select atoms
    atoms = u.select_atoms(selection)

    # Compute per-atom RMSF
    R = rms.RMSF(atoms)
    R.run(start=start_frame)

    # FIX: R.results.rmsf is ordered by position within `atoms`, NOT by
    # universe-wide atom index, so indexing it with res_atoms.indices
    # (global indices) returned wrong values. Map global index -> position
    # within the selection first.
    pos_of = {ix: i for i, ix in enumerate(atoms.indices)}

    # Average per residue
    resids = []
    rmsf_per_res = []
    for res in atoms.residues:
        res_atoms = res.atoms.intersection(atoms)
        if len(res_atoms) > 0:
            resids.append(res.resid)
            positions = [pos_of[ix] for ix in res_atoms.indices]
            rmsf_per_res.append(R.results.rmsf[positions].mean())

    return np.array(resids), np.array(rmsf_per_res)
|
||||
```
|
||||
|
||||
### 4. Protein-Ligand Contacts
|
||||
|
||||
```python
|
||||
def analyze_contacts(u, protein_sel="protein", ligand_sel="resname LIG",
                     radius=4.5, start_frame=0):
    """
    Track protein-ligand contacts over a trajectory.

    Args:
        u: MDAnalysis Universe
        protein_sel: Selection string for the protein atoms
        ligand_sel: Selection string for the ligand atoms
        radius: Contact distance cutoff in Angstroms
        start_frame: First frame to analyze

    Returns:
        List (one entry per analyzed frame) of sets of protein resids
        with at least one atom within `radius` of any ligand atom
    """
    protein = u.select_atoms(protein_sel)
    ligand = u.select_atoms(ligand_sel)

    contact_frames = []
    for ts in u.trajectory[start_frame:]:
        # FIX: MDAnalysis' contact_matrix (in analysis.distances) takes a
        # single coordinate array, not two — the original call signature
        # did not match the API. Compute pairwise protein-ligand distances
        # directly with numpy instead (no periodic-boundary handling;
        # ligands are small so the (n_protein, n_ligand) matrix is cheap).
        diff = protein.positions[:, None, :] - ligand.positions[None, :, :]
        dist_sq = np.einsum("ijk,ijk->ij", diff, diff)
        in_contact = (dist_sq < radius * radius).any(axis=1)
        contact_frames.append(set(protein.atoms[in_contact].resids))

    return contact_frames
|
||||
```
|
||||
|
||||
## Force Field Selection Guide
|
||||
|
||||
| System | Recommended Force Field | Water Model |
|
||||
|--------|------------------------|-------------|
|
||||
| Standard proteins | AMBER14 (`amber14-all.xml`) | TIP3P-FB |
|
||||
| Proteins + small molecules | AMBER14 + GAFF2 | TIP3P-FB |
|
||||
| Membrane proteins | CHARMM36m | TIP3P |
|
||||
| Nucleic acids | AMBER99-bsc1 or AMBER14 | TIP3P |
|
||||
| Disordered proteins | ff19SB or CHARMM36m | TIP3P |
|
||||
|
||||
## System Preparation Tools
|
||||
|
||||
### PDBFixer (for raw PDB files)
|
||||
|
||||
```python
|
||||
from pdbfixer import PDBFixer
|
||||
from openmm.app import PDBFile
|
||||
|
||||
def fix_pdb(input_pdb, output_pdb, ph=7.0):
    """Repair common PDB problems before simulation.

    Fills in missing residues and atoms, replaces non-standard residues,
    strips heterogens, and protonates at the given pH.

    Args:
        input_pdb: Path to the raw PDB file.
        output_pdb: Path for the repaired structure.
        ph: pH used when adding hydrogens.

    Returns:
        The output_pdb path.
    """
    fixer = PDBFixer(filename=input_pdb)

    # Identify and repair structural gaps
    fixer.findMissingResidues()
    fixer.findNonstandardResidues()
    fixer.replaceNonstandardResidues()
    fixer.removeHeterogens(True)  # remove ligands/heterogens (keepWater=True)
    fixer.findMissingAtoms()
    fixer.addMissingAtoms()
    fixer.addMissingHydrogens(ph)

    with open(output_pdb, 'w') as out:
        PDBFile.writeFile(fixer.topology, fixer.positions, out)

    return output_pdb
|
||||
```
|
||||
|
||||
### GAFF2 for Small Molecules (via OpenFF Toolkit)
|
||||
|
||||
```python
|
||||
# For ligand parameterization, use OpenFF toolkit or ACPYPE
|
||||
# pip install openff-toolkit
|
||||
from openff.toolkit import Molecule, ForceField as OFFForceField
|
||||
from openff.interchange import Interchange
|
||||
|
||||
def parameterize_ligand(smiles, ff_name="openff-2.0.0.offxml"):
    """Generate OpenFF (SMIRNOFF) parameters for a small molecule.

    NOTE(review): despite the section heading, this uses the OpenFF force
    field, not GAFF2 — for GAFF2 use ACPYPE/antechamber instead.

    Args:
        smiles: SMILES string of the ligand.
        ff_name: OpenFF force field file name.

    Returns:
        An openff.interchange.Interchange with parameters assigned.
    """
    mol = Molecule.from_smiles(smiles)
    mol.generate_conformers(n_conformers=1)  # a 3D conformer is required

    off_ff = OFFForceField(ff_name)
    interchange = off_ff.create_interchange(mol.to_topology())
    return interchange
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Always minimize before MD**: Raw PDB structures have steric clashes
|
||||
- **Equilibrate before production**: NVT (50–100 ps) → NPT (100–500 ps) → Production
|
||||
- **Use GPU**: Simulations are 10–100× faster on GPU (CUDA/OpenCL)
|
||||
- **2 fs timestep with HBonds constraints**: Standard; use 4 fs with HMR (hydrogen mass repartitioning)
|
||||
- **Analyze only equilibrated trajectory**: Discard first 20–50% as equilibration
|
||||
- **Save checkpoints**: MD runs can fail; checkpoints allow restart
|
||||
- **Periodic boundary conditions**: Required for solvated systems
|
||||
- **PME for electrostatics**: More accurate than cutoff methods for charged systems
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **OpenMM documentation**: https://openmm.org/documentation.html
|
||||
- **MDAnalysis user guide**: https://docs.mdanalysis.org/
|
||||
- **GROMACS** (alternative MD engine): https://manual.gromacs.org/
|
||||
- **NAMD** (alternative): https://www.ks.uiuc.edu/Research/namd/
|
||||
- **CHARMM-GUI** (web-based system builder): https://charmm-gui.org/
|
||||
- **AmberTools** (free Amber tools): https://ambermd.org/AmberTools.php
|
||||
- **OpenMM paper**: Eastman P et al. (2017) PLOS Computational Biology. PMID: 28278240
|
||||
- **MDAnalysis paper**: Michaud-Agrawal N et al. (2011) J Computational Chemistry. PMID: 21500218
|
||||
@@ -0,0 +1,208 @@
|
||||
# MDAnalysis Analysis Reference
|
||||
|
||||
## MDAnalysis Universe and AtomGroup
|
||||
|
||||
```python
|
||||
import MDAnalysis as mda
|
||||
|
||||
# Load Universe
|
||||
u = mda.Universe("topology.pdb", "trajectory.dcd")
|
||||
# or for single structure:
|
||||
u = mda.Universe("structure.pdb")
|
||||
|
||||
# Key attributes
|
||||
print(u.atoms.n_atoms) # Total atoms
|
||||
print(u.residues.n_residues) # Total residues
|
||||
print(u.trajectory.n_frames) # Number of frames
|
||||
print(u.trajectory.dt) # Time step in ps
|
||||
print(u.trajectory.totaltime) # Total simulation time in ps
|
||||
```
|
||||
|
||||
## Atom Selection Language
|
||||
|
||||
MDAnalysis uses a rich selection language:
|
||||
|
||||
```python
|
||||
# Basic selections
|
||||
protein = u.select_atoms("protein")
|
||||
backbone = u.select_atoms("backbone") # CA, N, C, O
|
||||
calpha = u.select_atoms("name CA")
|
||||
water = u.select_atoms("resname WAT or resname HOH or resname TIP3")
|
||||
ligand = u.select_atoms("resname LIG")
|
||||
|
||||
# By residue number
|
||||
region = u.select_atoms("resid 10:50")
|
||||
specific = u.select_atoms("resid 45 and name CA")
|
||||
|
||||
# By proximity
|
||||
near_ligand = u.select_atoms("protein and around 5.0 resname LIG")
|
||||
|
||||
# By property
|
||||
charged = u.select_atoms("resname ARG LYS ASP GLU")
|
||||
hydrophobic = u.select_atoms("resname ALA VAL LEU ILE PRO PHE TRP MET")
|
||||
|
||||
# Boolean combinations
|
||||
active_site = u.select_atoms("(resid 100 102 145 200) and protein")
|
||||
|
||||
# Inverse
|
||||
not_water = u.select_atoms("not (resname WAT HOH)")
|
||||
```
|
||||
|
||||
## Common Analysis Modules
|
||||
|
||||
### RMSD and RMSF
|
||||
|
||||
```python
|
||||
from MDAnalysis.analysis import rms, align
|
||||
|
||||
# Align trajectory to first frame
|
||||
align.AlignTraj(u, u, select='backbone', in_memory=True).run()
|
||||
|
||||
# RMSD
|
||||
R = rms.RMSD(u, u, select='backbone', groupselections=['name CA'])
|
||||
R.run()
|
||||
# R.results.rmsd: shape (n_frames, 3) = [frame, time, RMSD]
|
||||
|
||||
# RMSF (per-atom fluctuations)
|
||||
from MDAnalysis.analysis.rms import RMSF
|
||||
rmsf = RMSF(u.select_atoms('backbone')).run()
|
||||
# rmsf.results.rmsf: per-atom RMSF values in Angstroms
|
||||
```
|
||||
|
||||
### Radius of Gyration
|
||||
|
||||
```python
|
||||
rg = []
|
||||
for ts in u.trajectory:
|
||||
rg.append(u.select_atoms("protein").radius_of_gyration())
|
||||
import numpy as np
|
||||
print(f"Mean Rg: {np.mean(rg):.2f} Å")
|
||||
```
|
||||
|
||||
### Secondary Structure Analysis
|
||||
|
||||
```python
|
||||
from MDAnalysis.analysis.dssp import DSSP
|
||||
|
||||
# DSSP secondary structure assignment per frame
|
||||
dssp = DSSP(u).run()
|
||||
# dssp.results.dssp: per-residue per-frame secondary structure codes
|
||||
# H = alpha-helix, E = beta-strand, C = coil
|
||||
```
|
||||
|
||||
### Hydrogen Bonds
|
||||
|
||||
```python
|
||||
from MDAnalysis.analysis.hydrogenbonds import HydrogenBondAnalysis
|
||||
|
||||
hbonds = HydrogenBondAnalysis(
|
||||
u,
|
||||
donors_sel="protein and name N",
|
||||
acceptors_sel="protein and name O",
|
||||
d_h_cutoff=1.2, # donor-H distance (Å)
|
||||
d_a_cutoff=3.0, # donor-acceptor distance (Å)
|
||||
d_h_a_angle_cutoff=150 # D-H-A angle (degrees)
|
||||
)
|
||||
hbonds.run()
|
||||
|
||||
# Count H-bonds per frame
|
||||
import pandas as pd
|
||||
df = pd.DataFrame(hbonds.results.hbonds,
|
||||
columns=['frame', 'donor_ix', 'hydrogen_ix', 'acceptor_ix',
|
||||
'DA_dist', 'DHA_angle'])
|
||||
```
|
||||
|
||||
### Principal Component Analysis (PCA)
|
||||
|
||||
```python
|
||||
from MDAnalysis.analysis import pca
|
||||
|
||||
pca_analysis = pca.PCA(u, select='backbone', align=True).run()
|
||||
|
||||
# PC variances
|
||||
print(pca_analysis.results.variance[:5]) # % variance of first 5 PCs
|
||||
|
||||
# Project trajectory onto PCs
|
||||
projected = pca_analysis.transform(u.select_atoms('backbone'), n_components=3)
|
||||
# Shape: (n_frames, n_components)
|
||||
```
|
||||
|
||||
### Free Energy Surface (FES)
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from scipy.stats import gaussian_kde
|
||||
|
||||
def plot_free_energy_surface(x, y, bins=50, T=300, xlabel="PC1", ylabel="PC2",
                             output="fes.png"):
    """
    Compute and plot a 2D free energy surface from two order parameters.

    FES(x, y) = -kT * ln P(x, y), shifted so the global minimum is 0.

    Args:
        x, y: 1D arrays of the two order parameters (one value per frame)
        bins: Number of histogram bins per dimension
        T: Temperature in Kelvin
        xlabel, ylabel: Axis labels
        output: Output image path

    Returns:
        The matplotlib Figure.
    """
    kB = 0.0083144621  # Boltzmann constant, kJ/mol/K
    kT = kB * T

    # 2D probability density
    H, xedges, yedges = np.histogram2d(x, y, bins=bins, density=True)
    H = H.T  # histogram2d returns H[x_bin, y_bin]; transpose for plotting

    # Free energy; empty bins -> NaN (undefined, left blank in the plot)
    H_safe = np.where(H > 0, H, np.nan)
    fes = -kT * np.log(H_safe)
    fes -= np.nanmin(fes)  # Shift minimum to 0

    # FIX: plot against bin centers — using xedges[:-1]/yedges[:-1]
    # (left edges) shifted the whole surface by half a bin.
    xcenters = 0.5 * (xedges[:-1] + xedges[1:])
    ycenters = 0.5 * (yedges[:-1] + yedges[1:])

    # Plot
    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.contourf(xcenters, ycenters, fes, levels=20, cmap='RdYlBu_r')
    plt.colorbar(im, ax=ax, label='Free Energy (kJ/mol)')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.savefig(output, dpi=150, bbox_inches='tight')
    return fig
|
||||
```
|
||||
|
||||
## Trajectory Formats Supported
|
||||
|
||||
| Format | Extension | Notes |
|
||||
|--------|-----------|-------|
|
||||
| DCD | `.dcd` | CHARMM/NAMD binary; widely used |
|
||||
| XTC | `.xtc` | GROMACS compressed |
|
||||
| TRR | `.trr` | GROMACS full precision |
|
||||
| NetCDF | `.nc` | AMBER format |
|
||||
| LAMMPS | `.lammpstrj` | LAMMPS dump |
|
||||
| HDF5 | `.h5md` | H5MD standard |
|
||||
| PDB | `.pdb` | Multi-model PDB |
|
||||
|
||||
## MDAnalysis Interoperability
|
||||
|
||||
```python
|
||||
# Convert to numpy
|
||||
positions = u.atoms.positions # Current frame: shape (N, 3)
|
||||
|
||||
# Write to PDB
|
||||
with mda.Writer("frame_10.pdb", u.atoms.n_atoms) as W:
|
||||
u.trajectory[10] # Move to frame 10
|
||||
W.write(u.atoms)
|
||||
|
||||
# Write trajectory subset
|
||||
with mda.Writer("protein_traj.dcd", u.select_atoms("protein").n_atoms) as W:
|
||||
for ts in u.trajectory:
|
||||
W.write(u.select_atoms("protein"))
|
||||
|
||||
# Convert to MDTraj (for compatibility)
|
||||
# import mdtraj as md
|
||||
# traj = md.load("trajectory.dcd", top="topology.pdb")
|
||||
```
|
||||
|
||||
## Performance Tips
|
||||
|
||||
- **Use `in_memory=True`** for AlignTraj when RAM allows (much faster iteration)
|
||||
- **Select minimal atoms** before analysis to reduce memory/compute
|
||||
- **Use multiprocessing** for independent frame analyses
|
||||
- **Process in chunks** for very long trajectories using `start`/`stop`/`step` parameters:
|
||||
|
||||
```python
|
||||
# Analyze every 10th frame from frame 100 to 1000
|
||||
R.run(start=100, stop=1000, step=10)
|
||||
```
|
||||
372
scientific-skills/monarch-database/SKILL.md
Normal file
372
scientific-skills/monarch-database/SKILL.md
Normal file
@@ -0,0 +1,372 @@
|
||||
---
|
||||
name: monarch-database
|
||||
description: Query the Monarch Initiative knowledge graph for disease-gene-phenotype associations across species. Integrates OMIM, ORPHANET, HPO, ClinVar, and model organism databases. Use for rare disease gene discovery, phenotype-to-gene mapping, cross-species disease modeling, and HPO term lookup.
|
||||
license: CC0-1.0
|
||||
metadata:
|
||||
skill-author: Kuan-lin Huang
|
||||
---
|
||||
|
||||
# Monarch Initiative Database
|
||||
|
||||
## Overview
|
||||
|
||||
The Monarch Initiative (https://monarchinitiative.org/) is a multi-species integrated knowledgebase that links genes, diseases, and phenotypes across humans and model organisms. It integrates data from over 40 sources including OMIM, ORPHANET, HPO (Human Phenotype Ontology), ClinVar, MGI (Mouse Genome Informatics), ZFIN (Zebrafish), RGD (Rat), FlyBase, and WormBase.
|
||||
|
||||
Monarch enables:
|
||||
- Mapping phenotypes across species to identify candidate disease genes
|
||||
- Finding all genes associated with a disease or phenotype
|
||||
- Discovering model organisms for human diseases
|
||||
- Navigating the HPO hierarchy for phenotype ontology queries
|
||||
|
||||
**Key resources:**
|
||||
- Monarch portal: https://monarchinitiative.org/
|
||||
- Monarch API v3: https://api-v3.monarchinitiative.org/v3/
|
||||
- API docs: https://api-v3.monarchinitiative.org/v3/docs
|
||||
- HPO browser: https://hpo.jax.org/
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use Monarch when:
|
||||
|
||||
- **Rare disease gene discovery**: What genes are associated with my patient's phenotypes (HPO terms)?
|
||||
- **Phenotype similarity**: Are two diseases similar based on their phenotypic profiles?
|
||||
- **Cross-species modeling**: Are there mouse/zebrafish models for my disease of interest?
|
||||
- **HPO term lookup**: Retrieve HPO term names, definitions, and ontology hierarchy
|
||||
- **Disease-phenotype mapping**: List all HPO terms associated with a specific disease
|
||||
- **Gene-phenotype associations**: What phenotypes are caused by variants in a gene?
|
||||
- **Ortholog-phenotype mapping**: Use animal model phenotypes to infer human gene function
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### 1. Monarch API v3
|
||||
|
||||
Base URL: `https://api-v3.monarchinitiative.org/v3/`
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
BASE_URL = "https://api-v3.monarchinitiative.org/v3"
|
||||
|
||||
def monarch_get(endpoint, params=None):
    """Make a GET request to the Monarch API and return the parsed JSON.

    Args:
        endpoint: API path relative to BASE_URL (no leading slash).
        params: Optional dict of query parameters.

    Raises:
        requests.HTTPError: If the API returns an error status.
    """
    response = requests.get(
        f"{BASE_URL}/{endpoint}",
        params=params,
        headers={"Accept": "application/json"},
    )
    response.raise_for_status()
    return response.json()
|
||||
```
|
||||
|
||||
### 2. Phenotype-to-Gene Association (Pheno2Gene)
|
||||
|
||||
```python
|
||||
def get_genes_for_phenotypes(hpo_ids, limit=50, offset=0):
    """
    Query the Monarch semantic-similarity service with a set of HPO terms.
    Core use case: rare disease differential diagnosis.

    NOTE(review): the semsim "analyze" endpoint may require a POST with a
    JSON body rather than GET query parameters — verify against the
    Monarch API docs before relying on this.

    Args:
        hpo_ids: List of HPO term IDs (e.g., ["HP:0001250", "HP:0004322"])
        limit: Maximum number of results
        offset: Pagination offset

    Returns:
        Parsed JSON response from the semsim analyze endpoint.
    """
    params = {
        "terms": hpo_ids,
        "limit": limit,
        "offset": offset
    }
    return monarch_get("semsim/termset-pairwise-similarity/analyze", params)
|
||||
|
||||
def phenotype_to_gene(hpo_ids):
    """
    Return genes directly annotated with the given HPO phenotype terms.

    FIX: in Monarch gene-to-phenotype associations the gene is the
    *subject* and the HPO term is the *object* (see get_phenotypes_for_gene
    below, which queries subject=gene_id with the same predicate), so the
    phenotype must be passed as "object" and the gene read from "subject";
    the original queried subject=hpo_id and read the gene from "object".

    Args:
        hpo_ids: List of HPO term IDs (e.g., ["HP:0001250"])

    Returns:
        List of dicts with phenotype_id, gene_id, gene_name, evidence.
    """
    all_genes = []
    for hpo_id in hpo_ids:
        data = monarch_get("association/all", {
            "object": hpo_id,
            "predicate": "biolink:has_phenotype",
            "category": "biolink:GeneToPhenotypicFeatureAssociation",
            "limit": 50
        })
        for assoc in data.get("items", []):
            all_genes.append({
                "phenotype_id": hpo_id,
                "gene_id": assoc.get("subject", {}).get("id"),
                "gene_name": assoc.get("subject", {}).get("name"),
                "evidence": assoc.get("evidence_type")
            })
    return all_genes
|
||||
|
||||
# Example: Find genes associated with seizures and short stature
|
||||
hpo_terms = ["HP:0001250", "HP:0004322"] # Seizures, Short stature
|
||||
genes = phenotype_to_gene(hpo_terms)
|
||||
```
|
||||
|
||||
### 3. Disease-to-Gene Associations
|
||||
|
||||
```python
|
||||
def get_genes_for_disease(disease_id, limit=100):
    """
    Get genes associated with a disease.
    Disease IDs: OMIM:146300, MONDO:0007739, ORPHANET:558, etc.

    FIX: the original built two parameter dicts that were never used and
    then queried predicate=biolink:has_phenotype with the disease as
    object, which returns disease-phenotype associations rather than
    genes. Query gene->disease associations instead (matching
    get_disease_genes below, but without restricting the predicate).

    Args:
        disease_id: Disease CURIE
        limit: Maximum number of associations

    Returns:
        Raw Monarch response dict; associations are under "items".
    """
    return monarch_get("association/all", {
        "subject_category": "biolink:Gene",
        "object": disease_id,
        "limit": limit
    })
|
||||
|
||||
def get_disease_genes(disease_id, limit=100):
    """Get genes causally linked to a disease.

    Args:
        disease_id: Disease CURIE (e.g. "MONDO:0007739")
        limit: Maximum number of associations

    Returns:
        List of association dicts (may be empty).

    NOTE(review): "biolink:causes" may exclude susceptibility/association
    links; consider "biolink:gene_associated_with_condition" as well —
    confirm predicate coverage against the Monarch API docs.
    """
    data = monarch_get("association/all", {
        "subject_category": "biolink:Gene",
        "object": disease_id,
        "predicate": "biolink:causes",
        "limit": limit
    })
    return data.get("items", [])
|
||||
|
||||
# MONDO disease IDs (preferred over OMIM for cross-ontology queries)
|
||||
# MONDO:0007739 - Huntington disease
|
||||
# MONDO:0009061 - Cystic fibrosis
|
||||
# OMIM:104300 - Alzheimer disease, susceptibility to, type 1
|
||||
```
|
||||
|
||||
### 4. Gene-to-Phenotype and Disease
|
||||
|
||||
```python
|
||||
def get_phenotypes_for_gene(gene_id, limit=100):
    """
    Get all phenotypes associated with a gene.
    Gene IDs: HGNC:7884, NCBIGene:4137, etc.

    Returns:
        List of association dicts (may be empty).
    """
    query = {
        "subject": gene_id,
        "predicate": "biolink:has_phenotype",
        "limit": limit,
    }
    response = monarch_get("association/all", query)
    return response.get("items", [])
|
||||
|
||||
def get_diseases_for_gene(gene_id, limit=100):
    """Get diseases caused by variants in a gene.

    Returns:
        List of association dicts (may be empty).
    """
    query = {
        "subject": gene_id,
        "object_category": "biolink:Disease",
        "limit": limit,
    }
    response = monarch_get("association/all", query)
    return response.get("items", [])
|
||||
|
||||
# Example: What diseases does BRCA1 cause?
|
||||
brca1_diseases = get_diseases_for_gene("HGNC:1100")
|
||||
for assoc in brca1_diseases:
|
||||
print(f" {assoc.get('object', {}).get('name')} ({assoc.get('object', {}).get('id')})")
|
||||
```
|
||||
|
||||
### 5. HPO Term Lookup
|
||||
|
||||
```python
|
||||
def get_hpo_term(hpo_id):
    """Fetch name, definition, and metadata for an HPO term by its CURIE."""
    endpoint = f"entity/{hpo_id}"
    return monarch_get(endpoint)
|
||||
|
||||
def search_hpo_terms(query, limit=20):
    """Search for HPO phenotype terms matching a free-text query.

    Returns:
        Parsed JSON search response; matches are under "items".
    """
    search_params = {
        "q": query,
        "category": "biolink:PhenotypicFeature",
        "limit": limit,
    }
    return monarch_get("search", search_params)
|
||||
|
||||
# Example: look up the HPO term for seizures
|
||||
seizure_term = get_hpo_term("HP:0001250")
|
||||
print(f"Name: {seizure_term.get('name')}")
|
||||
print(f"Definition: {seizure_term.get('description')}")
|
||||
|
||||
# Search for related terms
|
||||
epilepsy_terms = search_hpo_terms("epilepsy")
|
||||
for term in epilepsy_terms.get("items", [])[:5]:
|
||||
print(f" {term['id']}: {term['name']}")
|
||||
```
|
||||
|
||||
### 6. Semantic Similarity (Disease Comparison)
|
||||
|
||||
```python
|
||||
def compare_disease_phenotypes(disease_id_1, disease_id_2):
    """
    Compare two diseases by semantic similarity of their phenotype profiles.

    Uses the HPO hierarchy via the Monarch semsim endpoint; returns the raw
    similarity payload.
    """
    query = {
        "subjects": [disease_id_1],
        "objects": [disease_id_2],
        "metric": "ancestor_information_content",
    }
    return monarch_get("semsim/compare", query)
|
||||
|
||||
# Example: Compare Dravet syndrome with CDKL5-deficiency disorder
|
||||
similarity = compare_disease_phenotypes("MONDO:0100135", "MONDO:0014917")
|
||||
```
|
||||
|
||||
### 7. Cross-Species Orthologs
|
||||
|
||||
```python
|
||||
def get_orthologs(gene_id, species=None, limit=50):
    """
    Get orthologs of a gene in other species.

    Useful for finding animal models of human diseases.

    Args:
        gene_id: CURIE of the query gene (e.g. HGNC:1100).
        species: Optional taxon identifier used to restrict results.
        limit: Maximum number of associations to return.

    Returns:
        List of ortholog association records.
    """
    params = {
        "subject": gene_id,
        "predicate": "biolink:orthologous_to",
        "limit": limit,
    }
    # Bug fix: the species filter was previously built into a dict that was
    # never sent, so the `species` argument had no effect.
    # NOTE(review): the original used "subject_taxon"; confirm against the
    # Monarch API whether the ortholog's taxon is the subject or the object.
    if species:
        params["subject_taxon"] = species

    data = monarch_get("association/all", params)
    return data.get("items", [])
|
||||
|
||||
# NCBI Taxonomy IDs for common model organisms:
|
||||
# Mouse: 10090 (Mus musculus)
|
||||
# Zebrafish: 7955 (Danio rerio)
|
||||
# Fruit fly: 7227 (Drosophila melanogaster)
|
||||
# C. elegans: 6239
|
||||
# Rat: 10116 (Rattus norvegicus)
|
||||
```
|
||||
|
||||
### 8. Full Workflow: Rare Disease Gene Prioritization
|
||||
|
||||
```python
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
def rare_disease_gene_finder(patient_hpo_terms, candidate_gene_ids=None, top_n=20):
    """
    Find genes that match a patient's HPO phenotype profile.

    Args:
        patient_hpo_terms: List of HPO IDs from clinical assessment
        candidate_gene_ids: Optional list to restrict search
        top_n: Number of top candidates to return

    Returns:
        pandas.DataFrame of candidate genes ranked by phenotype overlap.
    """
    BASE_URL = "https://api-v3.monarchinitiative.org/v3"

    # 1. Tally, per gene, how many of the patient's phenotypes it is linked to.
    tallies = {}
    for hpo_id in patient_hpo_terms:
        payload = requests.get(
            f"{BASE_URL}/association/all",
            params={
                "object": hpo_id,
                "subject_category": "biolink:Gene",
                "limit": 100,
            },
        ).json()

        for assoc in payload.get("items", []):
            subject = assoc.get("subject", {})
            gene_id = subject.get("id")
            if not gene_id:
                continue
            entry = tallies.setdefault(
                gene_id,
                {"name": subject.get("name"), "count": 0, "phenotypes": []},
            )
            entry["count"] += 1
            entry["phenotypes"].append(hpo_id)

    # 2. Rank genes by number of matching phenotypes (descending).
    ranked = sorted(tallies.items(), key=lambda kv: -kv[1]["count"])[:top_n]

    n_patient = len(patient_hpo_terms)
    rows = [
        {
            "gene_id": gene_id,
            "gene_name": info["name"],
            "matching_phenotypes": info["count"],
            "total_patient_phenotypes": n_patient,
            "phenotype_overlap": info["count"] / n_patient,
            "matching_hpo_terms": info["phenotypes"],
        }
        for gene_id, info in ranked
    ]
    return pd.DataFrame(rows)
|
||||
|
||||
# Example usage
|
||||
patient_phenotypes = [
|
||||
"HP:0001250", # Seizures
|
||||
"HP:0004322", # Short stature
|
||||
"HP:0001252", # Hypotonia
|
||||
"HP:0000252", # Microcephaly
|
||||
"HP:0001263", # Global developmental delay
|
||||
]
|
||||
candidates = rare_disease_gene_finder(patient_phenotypes)
|
||||
print(candidates[["gene_name", "matching_phenotypes", "phenotype_overlap"]].to_string())
|
||||
```
|
||||
|
||||
## Query Workflows
|
||||
|
||||
### Workflow 1: HPO-Based Differential Diagnosis
|
||||
|
||||
1. Extract HPO terms from clinical notes or genetics consultation
|
||||
2. Run phenotype-to-gene query against Monarch
|
||||
3. Rank candidate genes by number of matching phenotypes
|
||||
4. Cross-reference with gnomAD (constraint scores) and ClinVar (variant evidence)
|
||||
5. Prioritize genes with high pLI and known pathogenic variants
|
||||
|
||||
### Workflow 2: Disease Model Discovery
|
||||
|
||||
1. Identify gene or disease of interest
|
||||
2. Query Monarch for cross-species orthologs
|
||||
3. Find phenotype associations in model organism databases
|
||||
4. Identify experimental models that recapitulate human disease features
|
||||
|
||||
### Workflow 3: Phenotype Annotation of Novel Genes
|
||||
|
||||
1. For a gene with unknown function, query all known phenotype associations
|
||||
2. Map to HPO hierarchy to understand affected body systems
|
||||
3. Cross-reference with OMIM and ORPHANET for disease links
|
||||
|
||||
## Common Identifier Prefixes
|
||||
|
||||
| Prefix | Namespace | Example |
|
||||
|--------|-----------|---------|
|
||||
| `HP:` | Human Phenotype Ontology | HP:0001250 (Seizures) |
|
||||
| `MONDO:` | Monarch Disease Ontology | MONDO:0007739 |
|
||||
| `OMIM:` | OMIM disease | OMIM:104300 |
|
||||
| `ORPHANET:` | Orphanet rare disease | ORPHANET:558 |
|
||||
| `HGNC:` | HGNC gene symbol | HGNC:7884 |
|
||||
| `NCBIGene:` | NCBI gene ID | NCBIGene:4137 |
|
||||
| `ENSEMBL:` | Ensembl gene | ENSEMBL:ENSG... |
|
||||
| `MGI:` | Mouse gene | MGI:1338833 |
|
||||
| `ZFIN:` | Zebrafish gene | ZFIN:ZDB-GENE... |
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Use MONDO IDs** for diseases — they unify OMIM/ORPHANET/MESH identifiers
|
||||
- **Use HPO IDs** for phenotypes — the standard for clinical phenotype description
|
||||
- **Handle pagination**: Large queries may require iterating with offset parameter
|
||||
- **Semantic similarity is better than exact match**: Ancestor HPO terms catch related phenotypes
|
||||
- **Cross-validate with ClinVar and OMIM**: Monarch aggregates many sources; quality varies
|
||||
- **Use HGNC IDs for genes**: More stable than gene symbols across database versions
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **Monarch portal**: https://monarchinitiative.org/
|
||||
- **API v3 docs**: https://api-v3.monarchinitiative.org/v3/docs
|
||||
- **HPO browser**: https://hpo.jax.org/
|
||||
- **MONDO ontology**: https://mondo.monarchinitiative.org/
|
||||
- **Citation**: Shefchek KA et al. (2020) Nucleic Acids Research. PMID: 31701156
|
||||
- **Phenomizer** (HPO-based diagnosis): https://compbio.charite.de/phenomizer/
|
||||
@@ -0,0 +1,160 @@
|
||||
# HPO and Disease Ontology Reference for Monarch
|
||||
|
||||
## Human Phenotype Ontology (HPO)
|
||||
|
||||
### HPO Structure
|
||||
|
||||
HPO is organized hierarchically:
|
||||
- **Root**: HP:0000001 (All)
|
||||
- HP:0000118 (Phenotypic abnormality)
|
||||
- HP:0000478 (Abnormality of the eye)
|
||||
- HP:0000707 (Abnormality of the nervous system)
|
||||
- HP:0001507 (Growth abnormality)
|
||||
- HP:0001626 (Abnormality of the cardiovascular system)
|
||||
- etc.
|
||||
|
||||
### Top-Level HPO Categories
|
||||
|
||||
| HPO ID | Name |
|
||||
|--------|------|
|
||||
| HP:0000924 | Abnormality of the skeletal system |
|
||||
| HP:0000707 | Abnormality of the nervous system |
|
||||
| HP:0000478 | Abnormality of the eye |
|
||||
| HP:0000598 | Abnormality of the ear |
|
||||
| HP:0001507 | Growth abnormality |
|
||||
| HP:0001626 | Abnormality of the cardiovascular system |
|
||||
| HP:0002086 | Abnormality of the respiratory system |
|
||||
| HP:0001939 | Abnormality of metabolism/homeostasis |
|
||||
| HP:0002664 | Neoplasm |
|
||||
| HP:0000818 | Abnormality of the endocrine system |
|
||||
| HP:0000119 | Abnormality of the genitourinary system |
|
||||
| HP:0001197 | Abnormality of prenatal development/birth |
|
||||
|
||||
### Common HPO Terms in Rare Disease Genetics
|
||||
|
||||
#### Neurological
|
||||
| HPO ID | Term |
|
||||
|--------|------|
|
||||
| HP:0001250 | Seizures |
|
||||
| HP:0001251 | Ataxia |
|
||||
| HP:0001252 | Muscular hypotonia |
|
||||
| HP:0001263 | Global developmental delay |
|
||||
| HP:0001270 | Motor delay |
|
||||
| HP:0002167 | Neurological speech impairment |
|
||||
| HP:0000716 | Depressivity |
|
||||
| HP:0000729 | Autistic behavior |
|
||||
| HP:0001332 | Dystonia |
|
||||
| HP:0002071 | Abnormality of extrapyramidal motor function |
|
||||
|
||||
#### Growth/Morphology
|
||||
| HPO ID | Term |
|
||||
|--------|------|
|
||||
| HP:0004322 | Short stature |
|
||||
| HP:0001508 | Failure to thrive |
|
||||
| HP:0000252 | Microcephaly |
|
||||
| HP:0000256 | Macrocephaly |
|
||||
| HP:0001511 | Intrauterine growth retardation |
|
||||
|
||||
#### Facial Features
|
||||
| HPO ID | Term |
|
||||
|--------|------|
|
||||
| HP:0000324 | Facial asymmetry |
|
||||
| HP:0001249 | Intellectual disability |
|
||||
| HP:0000219 | Thin upper lip vermilion |
|
||||
| HP:0000303 | Mandibular prognathia |
|
||||
| HP:0000463 | Anteverted nares |
|
||||
|
||||
#### Metabolic
|
||||
| HPO ID | Term |
|
||||
|--------|------|
|
||||
| HP:0001943 | Hypoglycemia |
|
||||
| HP:0001944 | Hyperglycemia (Diabetes mellitus) |
|
||||
| HP:0000822 | Hypertension |
|
||||
| HP:0001712 | Left ventricular hypertrophy |
|
||||
|
||||
## MONDO Disease Ontology
|
||||
|
||||
MONDO integrates disease classifications from multiple sources:
|
||||
- OMIM (Mendelian diseases)
|
||||
- ORPHANET (rare diseases)
|
||||
- MeSH (medical subject headings)
|
||||
- SNOMED CT
|
||||
- DOID (Disease Ontology)
|
||||
- EFO (Experimental Factor Ontology)
|
||||
|
||||
### Key MONDO IDs for Common Rare Diseases
|
||||
|
||||
| MONDO ID | Disease | OMIM |
|
||||
|----------|---------|------|
|
||||
| MONDO:0007739 | Huntington disease | OMIM:143100 |
|
||||
| MONDO:0009061 | Cystic fibrosis | OMIM:219700 |
|
||||
| MONDO:0008608 | Down syndrome | OMIM:190685 |
|
||||
| MONDO:0019391 | Fragile X syndrome | OMIM:300624 |
|
||||
| MONDO:0010726 | Rett syndrome | OMIM:312750 |
|
||||
| MONDO:0014517 | Dravet syndrome | OMIM:607208 |
|
||||
| MONDO:0024522 | SCN1A-related epilepsy | — |
|
||||
| MONDO:0014817 | CHARGE syndrome | OMIM:214800 |
|
||||
| MONDO:0009764 | Marfan syndrome | OMIM:154700 |
|
||||
| MONDO:0013282 | Alpha-1-antitrypsin deficiency | OMIM:613490 |
|
||||
|
||||
### OMIM ID Patterns
|
||||
|
||||
- **Phenotype only**: OMIM number alone (e.g., OMIM:104300)
|
||||
- **Gene and phenotype**: Same gene, multiple phenotype entries
|
||||
- **Phenotype series**: Grouped phenotypes at a locus
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def omim_to_mondo(omim_id):
    """Look up an OMIM entity in Monarch; the record carries MONDO links."""
    curie = omim_id if omim_id.startswith("OMIM:") else f"OMIM:{omim_id}"
    response = requests.get(
        f"https://api-v3.monarchinitiative.org/v3/entity/{curie}"
    )
    # Check the returned record for same_as/equivalent_id links to MONDO.
    return response.json()
|
||||
```
|
||||
|
||||
## Association Evidence Codes
|
||||
|
||||
Monarch associations include evidence types:
|
||||
|
||||
| Code | Evidence Type |
|
||||
|------|--------------|
|
||||
| `IEA` | Inferred from electronic annotation |
|
||||
| `TAS` | Traceable author statement |
|
||||
| `IMP` | Inferred from mutant phenotype |
|
||||
| `IGI` | Inferred from genetic interaction |
|
||||
| `IDA` | Inferred from direct assay |
|
||||
| `ISS` | Inferred from sequence or structural similarity |
|
||||
| `IBA` | Inferred from biological aspect of ancestor |
|
||||
|
||||
Higher-quality evidence: IDA > TAS > IMP > IEA
|
||||
|
||||
## Semantic Similarity Metrics
|
||||
|
||||
Monarch supports multiple similarity metrics:
|
||||
|
||||
| Metric | Description | Use case |
|
||||
|--------|-------------|---------|
|
||||
| `ancestor_information_content` | IC of most informative common ancestor (MICA) | Disease similarity |
|
||||
| `jaccard_similarity` | Overlap coefficient | Simple set comparison |
|
||||
| `cosine` | Cosine similarity of IC vectors | Large-scale comparisons |
|
||||
| `phenodigm` | Combined MICA + Jaccard | Model organism matching |
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def compute_disease_similarity(disease_ids_1, disease_ids_2, metric="ancestor_information_content"):
    """Compute semantic similarity between two sets of disease phenotypes."""
    endpoint = "https://api-v3.monarchinitiative.org/v3/semsim/compare"
    payload = {
        "subjects": disease_ids_1,
        "objects": disease_ids_2,
        "metric": metric,
    }
    return requests.get(endpoint, params=payload).json()
|
||||
```
|
||||
404
scientific-skills/phylogenetics/SKILL.md
Normal file
404
scientific-skills/phylogenetics/SKILL.md
Normal file
@@ -0,0 +1,404 @@
|
||||
---
|
||||
name: phylogenetics
|
||||
description: Build and analyze phylogenetic trees using MAFFT (multiple alignment), IQ-TREE 2 (maximum likelihood), and FastTree (fast NJ/ML). Visualize with ETE3 or FigTree. For evolutionary analysis, microbial genomics, viral phylodynamics, protein family analysis, and molecular clock studies.
|
||||
license: Unknown
|
||||
metadata:
|
||||
skill-author: Kuan-lin Huang
|
||||
---
|
||||
|
||||
# Phylogenetics
|
||||
|
||||
## Overview
|
||||
|
||||
Phylogenetic analysis reconstructs the evolutionary history of biological sequences (genes, proteins, genomes) by inferring the branching pattern of descent. This skill covers the standard pipeline:
|
||||
|
||||
1. **MAFFT** — Multiple sequence alignment
|
||||
2. **IQ-TREE 2** — Maximum likelihood tree inference with model selection
|
||||
3. **FastTree** — Fast approximate maximum likelihood (for large datasets)
|
||||
4. **ETE3** — Python library for tree manipulation and visualization
|
||||
|
||||
**Installation:**
|
||||
```bash
|
||||
# Conda (recommended for CLI tools)
|
||||
conda install -c bioconda mafft iqtree fasttree
|
||||
pip install ete3
|
||||
```
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use phylogenetics when:
|
||||
|
||||
- **Evolutionary relationships**: Which organism/gene is most closely related to my sequence?
|
||||
- **Viral phylodynamics**: Trace outbreak spread and estimate transmission dates
|
||||
- **Protein family analysis**: Infer evolutionary relationships within a gene family
|
||||
- **Horizontal gene transfer detection**: Identify genes with discordant species/gene trees
|
||||
- **Ancestral sequence reconstruction**: Infer ancestral protein sequences
|
||||
- **Molecular clock analysis**: Estimate divergence dates using temporal sampling
|
||||
- **GWAS companion**: Place variants in evolutionary context (e.g., SARS-CoV-2 variants)
|
||||
- **Microbiology**: Species phylogeny from 16S rRNA or core genome phylogeny
|
||||
|
||||
## Standard Workflow
|
||||
|
||||
### 1. Multiple Sequence Alignment with MAFFT
|
||||
|
||||
```python
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
def run_mafft(input_fasta: str, output_fasta: str, method: str = "auto",
              n_threads: int = 4) -> str:
    """
    Align sequences with MAFFT.

    Args:
        input_fasta: Path to unaligned FASTA file
        output_fasta: Path for aligned output
        method: 'auto' (auto-select), 'einsi' (accurate), 'linsi' (accurate, slow),
                'fftnsi' (medium), 'fftns' (fast), 'retree2' (fast)
        n_threads: Number of CPU threads

    Returns:
        Path to aligned FASTA file
    """
    base_commands = {
        "auto": ["mafft", "--auto"],
        "einsi": ["mafft", "--genafpair", "--maxiterate", "1000"],
        "linsi": ["mafft", "--localpair", "--maxiterate", "1000"],
        "fftnsi": ["mafft", "--fftnsi"],
        "fftns": ["mafft", "--fftns"],
        "retree2": ["mafft", "--retree", "2"],
    }

    # Unknown method names fall back to --auto, matching the original behavior.
    command = list(base_commands.get(method, base_commands["auto"]))
    command.extend(["--thread", str(n_threads), "--inputorder", input_fasta])

    # MAFFT writes the alignment to stdout; capture it into the output file.
    with open(output_fasta, 'w') as out:
        proc = subprocess.run(command, stdout=out, stderr=subprocess.PIPE, text=True)

    if proc.returncode != 0:
        raise RuntimeError(f"MAFFT failed:\n{proc.stderr}")

    # Report how many sequences made it into the alignment.
    with open(output_fasta) as handle:
        n_seqs = sum(line.startswith('>') for line in handle)
    print(f"MAFFT: aligned {n_seqs} sequences → {output_fasta}")

    return output_fasta
|
||||
|
||||
# MAFFT method selection guide:
|
||||
# Few sequences (<200), accurate: linsi or einsi
|
||||
# Many sequences (<1000), moderate: fftnsi
|
||||
# Large datasets (>1000): fftns or auto
|
||||
# Ultra-fast (>10000): mafft --retree 1
|
||||
```
|
||||
|
||||
### 2. Trim Alignment (Optional but Recommended)
|
||||
|
||||
```python
|
||||
def trim_alignment_trimal(aligned_fasta: str, output_fasta: str,
                          method: str = "automated1") -> str:
    """
    Trim poorly aligned columns with TrimAl.

    Methods:
      - 'automated1': Automatic heuristic (recommended)
      - 'gappyout': Remove gappy columns
      - 'strict': Strict gap threshold
    """
    command = ["trimal", f"-{method}", "-in", aligned_fasta,
               "-out", output_fasta, "-fasta"]
    proc = subprocess.run(command, capture_output=True, text=True)
    if proc.returncode != 0:
        # Best effort: warn and fall back to the untrimmed alignment.
        print(f"TrimAl warning: {proc.stderr}")
        import shutil
        shutil.copy(aligned_fasta, output_fasta)
    return output_fasta
|
||||
```
|
||||
|
||||
### 3. IQ-TREE 2 — Maximum Likelihood Tree
|
||||
|
||||
```python
|
||||
def run_iqtree(aligned_fasta: str, output_prefix: str,
               model: str = "TEST", bootstrap: int = 1000,
               n_threads: int = 4, extra_args: list = None) -> dict:
    """
    Build a maximum likelihood tree with IQ-TREE 2.

    Args:
        aligned_fasta: Aligned FASTA file
        output_prefix: Prefix for output files
        model: 'TEST' for automatic model selection, or an explicit model
            (e.g. 'GTR+G' for DNA, 'LG+G4' or 'JTT+G' for proteins)
        bootstrap: Number of ultrafast bootstrap replicates (1000 recommended)
        n_threads: Number of threads ('AUTO' to auto-detect)
        extra_args: Additional IQ-TREE arguments

    Returns:
        Dict with paths to output files
    """
    command = [
        "iqtree2",
        "-s", aligned_fasta,
        "--prefix", output_prefix,
        "-m", model,
        "-B", str(bootstrap),   # Ultrafast bootstrap
        "-T", str(n_threads),
        "--redo",               # Overwrite existing results
    ]
    if extra_args:
        command += list(extra_args)

    proc = subprocess.run(command, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"IQ-TREE failed:\n{proc.stderr}")

    # Echo the model-selection result from the log, when present.
    log_file = f"{output_prefix}.log"
    if os.path.exists(log_file):
        with open(log_file) as handle:
            for line in handle:
                if "Best-fit model" in line:
                    print(f"IQ-TREE: {line.strip()}")

    outputs = {
        "tree": f"{output_prefix}.treefile",
        "log": log_file,
        "iqtree": f"{output_prefix}.iqtree",   # Full report
        "model": f"{output_prefix}.model.gz",
    }
    print(f"IQ-TREE: Tree saved to {outputs['tree']}")
    return outputs
|
||||
|
||||
# IQ-TREE model selection guide:
|
||||
# DNA: TEST → GTR+G, HKY+G, TrN+G
|
||||
# Protein: TEST → LG+G4, WAG+G, JTT+G, Q.pfam+G
|
||||
# Codon: TEST → MG+F3X4
|
||||
|
||||
# For temporal (molecular clock) analysis, add:
|
||||
# extra_args = ["--date", "dates.txt", "--clock-test", "--date-CI", "95"]
|
||||
```
|
||||
|
||||
### 4. FastTree — Fast Approximate ML
|
||||
|
||||
For large datasets (>1000 sequences) where IQ-TREE is too slow:
|
||||
|
||||
```python
|
||||
def run_fasttree(aligned_fasta: str, output_tree: str,
                 sequence_type: str = "nt", model: str = "gtr",
                 n_threads: int = 4) -> str:
    """
    Build a fast approximate ML tree with FastTree.

    Args:
        aligned_fasta: Aligned FASTA input file.
        output_tree: Destination path for the Newick tree.
        sequence_type: 'nt' for nucleotide or 'aa' for amino acid
        model: For nt: 'gtr' (recommended) or 'jc'; for aa: 'lg', 'wag', 'jtt'
        n_threads: Accepted for API symmetry with run_iqtree; not passed to
            FastTree (the plain binary takes no thread option — use FastTreeMP
            with OMP_NUM_THREADS for parallelism).

    Returns:
        Path to the output tree file.
    """
    if sequence_type == "nt":
        cmd = ["FastTree", "-nt"]
        # Bug fix: '-gtr' was previously hard-coded, silently ignoring
        # model='jc'. JC is FastTree's default nucleotide model, so it
        # needs no extra flag.
        if model == "gtr":
            cmd.append("-gtr")
    else:
        cmd = ["FastTree"]
        # Bug fix: JTT is FastTree's default protein model and has no flag;
        # '-jtt' would be rejected. Only pass a flag for non-default models.
        if model != "jtt":
            cmd.append(f"-{model}")

    cmd += [aligned_fasta]

    # FastTree writes the tree to stdout; capture it into the output file.
    with open(output_tree, 'w') as out:
        result = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE, text=True)

    if result.returncode != 0:
        raise RuntimeError(f"FastTree failed:\n{result.stderr}")

    print(f"FastTree: Tree saved to {output_tree}")
    return output_tree
|
||||
```
|
||||
|
||||
### 5. Tree Analysis and Visualization with ETE3
|
||||
|
||||
```python
|
||||
from ete3 import Tree, TreeStyle, NodeStyle, TextFace, PhyloTree
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def load_tree(tree_file: str) -> Tree:
    """Load a Newick tree file into an ETE3 Tree and report its size."""
    tree = Tree(tree_file)
    print(f"Tree: {len(tree)} leaves, {len(list(tree.traverse()))} nodes")
    return tree
|
||||
|
||||
def basic_tree_stats(t: Tree) -> dict:
    """
    Compute basic tree statistics.

    Pairwise leaf distances are sampled from at most the first 50 leaves to
    keep the quadratic distance computation bounded on large trees.

    Returns:
        Dict with leaf/internal-node counts, total branch length, and
        max/mean sampled leaf-to-leaf distances.
    """
    leaves = t.get_leaves()
    n_nodes = len(list(t.traverse()))

    sample = leaves[:min(50, len(leaves))]
    distances = [t.get_distance(l1, l2)
                 for l1 in sample for l2 in sample if l1 != l2]

    stats = {
        "n_leaves": len(leaves),
        # Bug fix: in ETE3, len(tree) counts *leaves*, so the original
        # `len(t) - len(leaves)` was always 0. Internal nodes are all
        # traversed nodes minus the leaves.
        "n_internal_nodes": n_nodes - len(leaves),
        "total_branch_length": sum(n.dist for n in t.traverse()),
        "max_leaf_distance": max(distances) if distances else 0,
        "mean_leaf_distance": sum(distances) / len(distances) if distances else 0,
    }
    return stats
|
||||
|
||||
def find_mrca(t: Tree, leaf_names: list) -> Tree:
    """Return the most recent common ancestor of the named leaves."""
    ancestor = t.get_common_ancestor(*leaf_names)
    return ancestor
|
||||
|
||||
def visualize_tree(t: Tree, output_file: str = "tree.png",
                   show_branch_support: bool = True,
                   color_groups: dict = None,
                   width: int = 800) -> None:
    """
    Render a phylogenetic tree to an image file.

    Args:
        t: ETE3 Tree object
        output_file: Destination image path
        show_branch_support: Show bootstrap values on branches
        color_groups: Dict mapping leaf_name → color (for coloring taxa)
        width: Output image width in pixels
    """
    style = TreeStyle()
    style.show_leaf_name = True
    style.show_branch_support = show_branch_support
    style.mode = "r"  # 'r' = rectangular, 'c' = circular

    # Apply per-leaf colors where requested.
    if color_groups:
        for leaf in t.traverse():
            if not leaf.is_leaf():
                continue
            if leaf.name not in color_groups:
                continue
            node_style = NodeStyle()
            node_style["fgcolor"] = color_groups[leaf.name]
            node_style["size"] = 8
            leaf.set_style(node_style)

    t.render(output_file, tree_style=style, w=width, units="px")
    print(f"Tree saved to: {output_file}")
|
||||
|
||||
def midpoint_root(t: Tree) -> Tree:
    """Root the tree at its midpoint (useful when no outgroup is known)."""
    outgroup = t.get_midpoint_outgroup()
    t.set_outgroup(outgroup)
    return t
|
||||
|
||||
def prune_tree(t: Tree, keep_leaves: list) -> Tree:
    """Prune the tree in place so that only the given leaves remain."""
    t.prune(keep_leaves, preserve_branch_length=True)
    return t
|
||||
```
|
||||
|
||||
### 6. Complete Analysis Script
|
||||
|
||||
```python
|
||||
import subprocess, os
|
||||
from ete3 import Tree
|
||||
|
||||
def full_phylogenetic_analysis(
    input_fasta: str,
    output_dir: str = "phylo_results",
    sequence_type: str = "nt",
    n_threads: int = 4,
    bootstrap: int = 1000,
    use_fasttree: bool = False
) -> dict:
    """
    Complete phylogenetic pipeline: align → trim → tree → visualize.

    Args:
        input_fasta: Unaligned FASTA
        output_dir: Directory for all pipeline outputs (created if missing)
        sequence_type: 'nt' (nucleotide) or 'aa' (amino acid/protein)
        n_threads: CPU threads for alignment/tree inference
        bootstrap: Ultrafast bootstrap replicates for IQ-TREE
        use_fasttree: Use FastTree instead of IQ-TREE (faster for large datasets)

    Returns:
        Dict with paths to the alignment, trees, visualization, and tree stats.
    """
    os.makedirs(output_dir, exist_ok=True)
    prefix = os.path.join(output_dir, "phylo")

    print("=" * 50)
    print("Step 1: Multiple Sequence Alignment (MAFFT)")
    aligned = run_mafft(input_fasta, f"{prefix}_aligned.fasta",
                        method="auto", n_threads=n_threads)

    print("\nStep 2: Tree Inference")
    if use_fasttree:
        tree_file = run_fasttree(
            aligned, f"{prefix}.tree",
            sequence_type=sequence_type,
            model="gtr" if sequence_type == "nt" else "lg"
        )
    else:
        # Let IQ-TREE select the best-fit model for either alphabet.
        # (Fixes a redundant ternary that chose "TEST" in both branches.)
        iqtree_files = run_iqtree(
            aligned, prefix,
            model="TEST",
            bootstrap=bootstrap,
            n_threads=n_threads
        )
        tree_file = iqtree_files["tree"]

    print("\nStep 3: Tree Analysis")
    t = Tree(tree_file)
    t = midpoint_root(t)

    stats = basic_tree_stats(t)
    print(f"Tree statistics: {stats}")

    print("\nStep 4: Visualization")
    visualize_tree(t, f"{prefix}_tree.png", show_branch_support=True)

    # Save rooted tree alongside the raw inference output.
    rooted_tree_file = f"{prefix}_rooted.nwk"
    t.write(format=1, outfile=rooted_tree_file)

    results = {
        "aligned_fasta": aligned,
        "tree_file": tree_file,
        "rooted_tree": rooted_tree_file,
        "visualization": f"{prefix}_tree.png",
        "stats": stats
    }

    print("\n" + "=" * 50)
    print("Phylogenetic analysis complete!")
    print(f"Results in: {output_dir}/")
    return results
|
||||
```
|
||||
|
||||
## IQ-TREE Model Guide
|
||||
|
||||
### DNA Models
|
||||
|
||||
| Model | Description | Use case |
|
||||
|-------|-------------|---------|
|
||||
| `GTR+G4` | General Time Reversible + Gamma | Most flexible DNA model |
|
||||
| `HKY+G4` | Hasegawa-Kishino-Yano + Gamma | Two-rate model (common) |
|
||||
| `TrN+G4` | Tamura-Nei | Unequal transitions |
|
||||
| `JC` | Jukes-Cantor | Simplest; all rates equal |
|
||||
|
||||
### Protein Models
|
||||
|
||||
| Model | Description | Use case |
|
||||
|-------|-------------|---------|
|
||||
| `LG+G4` | Le-Gascuel + Gamma | Best average protein model |
|
||||
| `WAG+G4` | Whelan-Goldman | Widely used |
|
||||
| `JTT+G4` | Jones-Taylor-Thornton | Classical model |
|
||||
| `Q.pfam+G4` | pfam-trained | For Pfam-like protein families |
|
||||
| `Q.bird+G4` | Bird-specific | Vertebrate proteins |
|
||||
|
||||
**Tip:** Use `-m TEST` to let IQ-TREE automatically select the best model.
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Alignment quality first**: Poor alignment → unreliable trees; check alignment manually
|
||||
- **Use `linsi` for small (<200 seq), `fftns` or `auto` for large alignments**
|
||||
- **Model selection**: Always use `-m TEST` for IQ-TREE unless you have a specific reason
|
||||
- **Bootstrap**: Use ≥1000 ultrafast bootstraps (`-B 1000`) for branch support
|
||||
- **Root the tree**: Unrooted trees can be misleading; use outgroup or midpoint rooting
|
||||
- **FastTree for >5000 sequences**: IQ-TREE becomes slow; FastTree is 10–100× faster
|
||||
- **Trim long alignments**: TrimAl removes unreliable columns; improves tree accuracy
|
||||
- **Check for recombination** in viral/bacterial sequences before building trees (`RDP4`, `GARD`)
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **MAFFT**: https://mafft.cbrc.jp/alignment/software/
|
||||
- **IQ-TREE 2**: http://www.iqtree.org/ | Tutorial: https://www.iqtree.org/workshop/molevol2022
|
||||
- **FastTree**: http://www.microbesonline.org/fasttree/
|
||||
- **ETE3**: http://etetoolkit.org/
|
||||
- **FigTree** (GUI visualization): https://tree.bio.ed.ac.uk/software/figtree/
|
||||
- **iTOL** (web visualization): https://itol.embl.de/
|
||||
- **MUSCLE** (alternative aligner): https://www.drive5.com/muscle/
|
||||
- **TrimAl** (alignment trimming): https://vicfero.github.io/trimal/
|
||||
181
scientific-skills/phylogenetics/references/iqtree_inference.md
Normal file
181
scientific-skills/phylogenetics/references/iqtree_inference.md
Normal file
@@ -0,0 +1,181 @@
|
||||
# IQ-TREE 2 Phylogenetic Inference Reference
|
||||
|
||||
## Basic Command Syntax
|
||||
|
||||
```bash
|
||||
iqtree2 -s alignment.fasta --prefix output -m TEST -B 1000 -T AUTO --redo
|
||||
```
|
||||
|
||||
## Key Parameters
|
||||
|
||||
| Flag | Description | Default |
|
||||
|------|-------------|---------|
|
||||
| `-s` | Input alignment file | Required |
|
||||
| `--prefix` | Output file prefix | alignment name |
|
||||
| `-m` | Substitution model (or TEST) | GTR+G |
|
||||
| `-B` | Ultrafast bootstrap replicates | Off |
|
||||
| `-b` | Standard bootstrap replicates (slow) | Off |
|
||||
| `-T` | Number of threads (or AUTO) | 1 |
|
||||
| `-o` | Outgroup taxa name(s) | None (unrooted) |
|
||||
| `--redo` | Overwrite existing results | Off |
|
||||
| `-alrt` | SH-aLRT test replicates | Off |
|
||||
|
||||
## Model Selection
|
||||
|
||||
```bash
|
||||
# Full model testing (automatically selects best model)
|
||||
iqtree2 -s alignment.fasta -m TEST --prefix test_run -B 1000 -T 4
|
||||
|
||||
# Specify model explicitly
|
||||
iqtree2 -s alignment.fasta -m GTR+G4 --prefix gtr_run -B 1000
|
||||
|
||||
# Protein sequences
|
||||
iqtree2 -s protein.fasta -m TEST --prefix prot_tree -B 1000
|
||||
|
||||
# Codon-based analysis
|
||||
iqtree2 -s codon.fasta -m GY --prefix codon_tree -B 1000
|
||||
```
|
||||
|
||||
## Bootstrapping Methods
|
||||
|
||||
### Ultrafast Bootstrap (UFBoot, recommended)
|
||||
```bash
|
||||
iqtree2 -s alignment.fasta -B 1000 # 1000 replicates
|
||||
# Values ≥95 are reliable
|
||||
# ~10× faster than standard bootstrap
|
||||
```
|
||||
|
||||
### Standard Bootstrap
|
||||
```bash
|
||||
iqtree2 -s alignment.fasta -b 100 # 100 replicates (very slow)
|
||||
```
|
||||
|
||||
### SH-aLRT Test (fast alternative)
|
||||
```bash
|
||||
iqtree2 -s alignment.fasta -alrt 1000 -B 1000 # Both SH-aLRT and UFBoot
|
||||
# SH-aLRT ≥80 AND UFBoot ≥95 = well-supported branch
|
||||
```
|
||||
|
||||
## Branch Support Interpretation
|
||||
|
||||
| Bootstrap Value | Interpretation |
|
||||
|----------------|----------------|
|
||||
| ≥ 95 | Well-supported (strongly supported) |
|
||||
| 70–94 | Moderately supported |
|
||||
| 50–69 | Weakly supported |
|
||||
| < 50 | Unreliable (not supported) |
|
||||
|
||||
## Output Files
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `{prefix}.treefile` | Best ML tree in Newick format |
|
||||
| `{prefix}.iqtree` | Full analysis report |
|
||||
| `{prefix}.log` | Computation log |
|
||||
| `{prefix}.contree` | Consensus tree from bootstrap |
|
||||
| `{prefix}.splits.nex` | Network splits |
|
||||
| `{prefix}.bionj` | BioNJ starting tree |
|
||||
| `{prefix}.model.gz` | Saved model parameters |
|
||||
|
||||
## Advanced Analyses
|
||||
|
||||
### Molecular Clock (Dating)
|
||||
|
||||
```bash
|
||||
# Temporal analysis with sampling dates
|
||||
iqtree2 -s alignment.fasta -m GTR+G \
|
||||
--date dates.tsv \ # Tab-separated: taxon_name YYYY-MM-DD
|
||||
--clock-test \ # Test for clock-like evolution
|
||||
--date-CI 95 \ # 95% CI for node dates
|
||||
--prefix dated_tree
|
||||
```
|
||||
|
||||
### Concordance Factors
|
||||
|
||||
```bash
|
||||
# Gene concordance factor (gCF) - requires multiple gene alignments
|
||||
iqtree2 --gcf gene_trees.nwk \
|
||||
--tree main_tree.treefile \
|
||||
--cf-verbose \
|
||||
--prefix cf_analysis
|
||||
```
|
||||
|
||||
### Ancestral Sequence Reconstruction
|
||||
|
||||
```bash
|
||||
iqtree2 -s alignment.fasta -m LG+G4 \
|
||||
-asr \ # Marginal ancestral state reconstruction
|
||||
--prefix anc_tree
|
||||
# Output: {prefix}.state (ancestral sequences per node)
|
||||
```
|
||||
|
||||
### Partition Model (Multi-Gene)
|
||||
|
||||
```bash
|
||||
# Create partition file (partitions.txt):
|
||||
# DNA, gene1 = 1-500
|
||||
# DNA, gene2 = 501-1000
|
||||
|
||||
iqtree2 -s concat_alignment.fasta \
|
||||
-p partitions.txt \
|
||||
-m TEST \
|
||||
-B 1000 \
|
||||
--prefix partition_tree
|
||||
```
|
||||
|
||||
## IQ-TREE Log Parsing
|
||||
|
||||
```python
|
||||
def parse_iqtree_log(log_file: str) -> dict:
|
||||
"""Extract key results from IQ-TREE log file."""
|
||||
results = {}
|
||||
with open(log_file) as f:
|
||||
for line in f:
|
||||
if "Best-fit model" in line:
|
||||
results["best_model"] = line.split(":")[1].strip()
|
||||
elif "Log-likelihood of the tree:" in line:
|
||||
results["log_likelihood"] = float(line.split(":")[1].strip())
|
||||
elif "Number of free parameters" in line:
|
||||
results["free_params"] = int(line.split(":")[1].strip())
|
||||
elif "Akaike information criterion" in line:
|
||||
results["AIC"] = float(line.split(":")[1].strip())
|
||||
elif "Bayesian information criterion" in line:
|
||||
results["BIC"] = float(line.split(":")[1].strip())
|
||||
elif "Total CPU time used" in line:
|
||||
results["cpu_time"] = line.split(":")[1].strip()
|
||||
return results
|
||||
|
||||
# Example:
|
||||
# results = parse_iqtree_log("output.log")
|
||||
# print(f"Best model: {results['best_model']}")
|
||||
# print(f"Log-likelihood: {results['log_likelihood']:.2f}")
|
||||
```
|
||||
|
||||
## Common Issues and Solutions
|
||||
|
||||
| Issue | Likely Cause | Solution |
|
||||
|-------|-------------|---------|
|
||||
| All bootstrap values = 0 | Too few taxa | Need ≥4 taxa for bootstrap |
|
||||
| Very long branches | Alignment artifacts | Re-trim alignment; check for outliers |
|
||||
| Memory error | Too many sequences | Use FastTree; or reduce `-T` to 1 |
|
||||
| Poor model fit | Wrong alphabet | Check nucleotide vs. protein specification |
|
||||
| Identical sequences | Duplicate sequences | Remove duplicates before alignment |
|
||||
|
||||
## MAFFT Alignment Guide
|
||||
|
||||
```bash
|
||||
# Accurate (< 200 sequences)
|
||||
mafft --localpair --maxiterate 1000 input.fasta > aligned.fasta
|
||||
|
||||
# Medium (200-1000 sequences)
|
||||
mafft --auto input.fasta > aligned.fasta
|
||||
|
||||
# Fast (> 1000 sequences)
|
||||
mafft --fftns input.fasta > aligned.fasta
|
||||
|
||||
# Very large (> 10000 sequences)
|
||||
mafft --retree 1 input.fasta > aligned.fasta
|
||||
|
||||
# Using multiple threads
|
||||
mafft --thread 8 --auto input.fasta > aligned.fasta
|
||||
```
|
||||
270
scientific-skills/phylogenetics/scripts/phylogenetic_analysis.py
Normal file
270
scientific-skills/phylogenetics/scripts/phylogenetic_analysis.py
Normal file
@@ -0,0 +1,270 @@
|
||||
"""
|
||||
Phylogenetic Analysis Pipeline
|
||||
===============================
|
||||
Complete workflow: MAFFT alignment → IQ-TREE tree → ETE3 visualization.
|
||||
|
||||
Requirements:
|
||||
conda install -c bioconda mafft iqtree
|
||||
pip install ete3
|
||||
|
||||
Usage:
|
||||
python phylogenetic_analysis.py sequences.fasta --type nt --threads 4
|
||||
python phylogenetic_analysis.py proteins.fasta --type aa --fasttree
|
||||
"""
|
||||
|
||||
import argparse
import os
import shutil
import subprocess
import sys
from pathlib import Path
|
||||
|
||||
|
||||
def check_dependencies():
    """Verify that the required external tools (MAFFT, IQ-TREE 2) are on PATH.

    Prints an installation hint for every missing tool and exits with
    status 1 if any are absent; otherwise reports success.
    """
    tools = {
        "mafft": "conda install -c bioconda mafft",
        "iqtree2": "conda install -c bioconda iqtree",
    }
    # shutil.which is portable (works on Windows, where the external `which`
    # command does not exist) and avoids spawning a subprocess per lookup.
    missing = [f" {tool}: {install_cmd}"
               for tool, install_cmd in tools.items()
               if shutil.which(tool) is None]

    if missing:
        print("Missing dependencies:")
        for m in missing:
            print(m)
        sys.exit(1)
    print("All dependencies found.")
|
||||
|
||||
|
||||
def count_sequences(fasta_file: str) -> int:
    """Count sequences in a FASTA file (one per '>' header line)."""
    n_records = 0
    with open(fasta_file) as handle:
        for record_line in handle:
            if record_line.startswith('>'):
                n_records += 1
    return n_records
|
||||
|
||||
|
||||
def run_mafft(input_fasta: str, output_fasta: str, n_threads: int = 4,
              method: str = "auto") -> str:
    """Run MAFFT multiple sequence alignment.

    With method="auto" the strategy is picked by dataset size: accurate
    L-INS-i for small sets, --auto for medium, progressive FFT-NS-2 for
    large ones.  Any explicit MAFFT method name is passed through as-is.
    Returns the path to the aligned FASTA file.
    """
    n_seqs = count_sequences(input_fasta)
    print(f"MAFFT: Aligning {n_seqs} sequences...")

    common = ["--thread", str(n_threads), "--inputorder", input_fasta]
    if method != "auto":
        cmd = ["mafft", f"--{method}"] + common
    elif n_seqs <= 200:
        cmd = ["mafft", "--localpair", "--maxiterate", "1000"] + common
    elif n_seqs <= 1000:
        cmd = ["mafft", "--auto"] + common
    else:
        cmd = ["mafft", "--fftns"] + common

    # MAFFT writes the alignment to stdout; capture it into the output file.
    with open(output_fasta, 'w') as out:
        proc = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE, text=True)

    if proc.returncode != 0:
        raise RuntimeError(f"MAFFT failed:\n{proc.stderr[:500]}")

    print(f" Alignment complete → {output_fasta}")
    return output_fasta
|
||||
|
||||
|
||||
def run_iqtree(aligned_fasta: str, prefix: str, seq_type: str = "nt",
               bootstrap: int = 1000, n_threads: int = 4,
               outgroup: str = None) -> str:
    """Run IQ-TREE 2 maximum likelihood phylogenetic inference.

    Uses automatic model selection (-m TEST), ultrafast bootstrap (-B),
    and the SH-aLRT branch test.  Note: `seq_type` is accepted for
    interface symmetry with run_fasttree but is not forwarded — IQ-TREE
    detects the sequence alphabet automatically.
    Returns the path to the Newick tree file.
    """
    print("IQ-TREE 2: Building maximum likelihood tree...")

    cmd = [
        "iqtree2",
        "-s", aligned_fasta,
        "--prefix", prefix,
        "-m", "TEST",            # ModelFinder: automatic model selection
        "-B", str(bootstrap),    # ultrafast bootstrap replicates
        "-T", str(n_threads),
        "--redo",
        "-alrt", "1000",         # SH-aLRT branch support test
    ]
    if outgroup:
        cmd += ["-o", outgroup]

    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"IQ-TREE failed:\n{proc.stderr[:500]}")

    tree_file = f"{prefix}.treefile"

    # Report the selected substitution model when the run log is available.
    log_file = f"{prefix}.log"
    if os.path.exists(log_file):
        with open(log_file) as handle:
            for log_line in handle:
                if "Best-fit model" in log_line:
                    print(f" {log_line.strip()}")

    print(f" Tree saved → {tree_file}")
    return tree_file
|
||||
|
||||
|
||||
def run_fasttree(aligned_fasta: str, output_tree: str, seq_type: str = "nt") -> str:
    """Run FastTree (faster, approximate alternative for large datasets).

    Uses GTR+Gamma for nucleotide input and LG+Gamma for protein input.
    Returns the path to the Newick tree file.
    """
    print("FastTree: Building approximate ML tree (faster)...")

    model_flags = ["-nt", "-gtr"] if seq_type == "nt" else ["-lg"]
    cmd = ["FastTree"] + model_flags + ["-gamma", aligned_fasta]

    # FastTree writes the tree to stdout; redirect it into the output file.
    with open(output_tree, 'w') as tree_out:
        proc = subprocess.run(cmd, stdout=tree_out, stderr=subprocess.PIPE, text=True)

    if proc.returncode != 0:
        raise RuntimeError(f"FastTree failed:\n{proc.stderr[:500]}")

    print(f" Tree saved → {output_tree}")
    return output_tree
|
||||
|
||||
|
||||
def visualize_tree(tree_file: str, output_png: str, outgroup: str = None) -> None:
    """Render the phylogenetic tree to PNG with ETE3.

    Skips gracefully when ETE3 is not installed.  If rendering fails
    (e.g. no display/Qt backend), the rooted tree is written to a Newick
    file next to the requested PNG instead.
    """
    try:
        from ete3 import Tree, TreeStyle, NodeStyle
    except ImportError:
        print("ETE3 not installed. Skipping visualization.")
        print(" Install: pip install ete3")
        return

    tree = Tree(tree_file)

    # Root the tree: prefer the requested outgroup, fall back to midpoint.
    leaf_names = [leaf.name for leaf in tree.get_leaves()]
    if outgroup and outgroup in leaf_names:
        tree.set_outgroup(outgroup)
        print(f" Rooted at outgroup: {outgroup}")
    else:
        tree.set_outgroup(tree.get_midpoint_outgroup())
        print(" Applied midpoint rooting")

    # Rectangular layout showing leaf names and branch-support values.
    style = TreeStyle()
    style.show_leaf_name = True
    style.show_branch_support = True
    style.mode = "r"

    try:
        tree.render(output_png, tree_style=style, w=800, units="px")
        print(f" Visualization saved → {output_png}")
    except Exception as exc:
        print(f" Visualization failed (display issue?): {exc}")
        # Fall back to saving the rooted tree in Newick format.
        rooted_nwk = output_png.replace(".png", "_rooted.nwk")
        tree.write(format=1, outfile=rooted_nwk)
        print(f" Rooted tree saved → {rooted_nwk}")
|
||||
|
||||
|
||||
def tree_summary(tree_file: str) -> dict:
    """Print and return summary statistics for a Newick tree.

    Returns an empty dict (after printing a notice) when ETE3 is missing
    or the tree cannot be parsed.
    """
    try:
        from ete3 import Tree
        tree = Tree(tree_file)
        tree.set_outgroup(tree.get_midpoint_outgroup())

        leaves = tree.get_leaves()
        lengths = [node.dist for node in tree.traverse() if node.dist > 0]

        stats = {
            "n_taxa": len(leaves),
            "total_branch_length": sum(lengths),
            "mean_branch_length": sum(lengths) / len(lengths) if lengths else 0,
            "max_branch_length": max(lengths) if lengths else 0,
        }

        print("\nTree Summary:")
        for key, value in stats.items():
            if isinstance(value, float):
                print(f" {key}: {value:.6f}")
            else:
                print(f" {key}: {value}")

        return stats
    except Exception as exc:
        print(f"Could not compute tree stats: {exc}")
        return {}
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: align → infer tree → visualize → summarize."""
    parser = argparse.ArgumentParser(description="Phylogenetic analysis pipeline")
    parser.add_argument("input", help="Input FASTA file (unaligned)")
    parser.add_argument("--type", choices=["nt", "aa"], default="nt",
                        help="Sequence type: nt (nucleotide) or aa (amino acid)")
    parser.add_argument("--threads", type=int, default=4, help="Number of threads")
    parser.add_argument("--bootstrap", type=int, default=1000,
                        help="Bootstrap replicates for IQ-TREE")
    parser.add_argument("--fasttree", action="store_true",
                        help="Use FastTree instead of IQ-TREE (faster, less accurate)")
    parser.add_argument("--outgroup", help="Outgroup taxon name for rooting")
    parser.add_argument("--mafft-method", default="auto",
                        choices=["auto", "linsi", "einsi", "fftnsi", "fftns"],
                        help="MAFFT alignment method")
    parser.add_argument("--output-dir", default="phylo_results",
                        help="Output directory")
    opts = parser.parse_args()

    # Prepare the output location; all artifacts share one file prefix.
    os.makedirs(opts.output_dir, exist_ok=True)
    prefix = os.path.join(opts.output_dir, Path(opts.input).stem)

    banner = "=" * 60
    print(banner)
    print("Phylogenetic Analysis Pipeline")
    print(banner)
    print(f"Input: {opts.input}")
    print(f"Sequence type: {opts.type}")
    print(f"Output dir: {opts.output_dir}")

    # Step 1: multiple sequence alignment with MAFFT.
    print("\n[Step 1/3] Multiple Sequence Alignment (MAFFT)")
    aligned = run_mafft(
        opts.input,
        f"{prefix}_aligned.fasta",
        n_threads=opts.threads,
        method=opts.mafft_method,
    )

    # Step 2: tree inference — FastTree for speed, IQ-TREE 2 otherwise.
    print("\n[Step 2/3] Tree Inference")
    if opts.fasttree:
        tree_file = run_fasttree(aligned, f"{prefix}.tree", seq_type=opts.type)
    else:
        tree_file = run_iqtree(
            aligned, prefix,
            seq_type=opts.type,
            bootstrap=opts.bootstrap,
            n_threads=opts.threads,
            outgroup=opts.outgroup,
        )

    # Step 3: render the tree and report summary statistics.
    print("\n[Step 3/3] Visualization (ETE3)")
    visualize_tree(tree_file, f"{prefix}_tree.png", outgroup=opts.outgroup)
    tree_summary(tree_file)

    print("\n" + banner)
    print("Analysis complete!")
    print(f"Key outputs:")
    print(f" Aligned sequences: {aligned}")
    print(f" Tree file: {tree_file}")
    print(f" Visualization: {prefix}_tree.png")


if __name__ == "__main__":
    main()
|
||||
321
scientific-skills/scvelo/SKILL.md
Normal file
321
scientific-skills/scvelo/SKILL.md
Normal file
@@ -0,0 +1,321 @@
|
||||
---
|
||||
name: scvelo
|
||||
description: RNA velocity analysis with scVelo. Estimate cell state transitions from unspliced/spliced mRNA dynamics, infer trajectory directions, compute latent time, and identify driver genes in single-cell RNA-seq data. Complements Scanpy/scVI-tools for trajectory inference.
|
||||
license: BSD-3-Clause
|
||||
metadata:
|
||||
skill-author: Kuan-lin Huang
|
||||
---
|
||||
|
||||
# scVelo — RNA Velocity Analysis
|
||||
|
||||
## Overview
|
||||
|
||||
scVelo is the leading Python package for RNA velocity analysis in single-cell RNA-seq data. It infers cell state transitions by modeling the kinetics of mRNA splicing — using the ratio of unspliced (pre-mRNA) to spliced (mature mRNA) abundances to determine whether a gene is being upregulated or downregulated in each cell. This allows reconstruction of developmental trajectories and identification of cell fate decisions without requiring time-course data.
|
||||
|
||||
**Installation:** `pip install scvelo`
|
||||
|
||||
**Key resources:**
|
||||
- Documentation: https://scvelo.readthedocs.io/
|
||||
- GitHub: https://github.com/theislab/scvelo
|
||||
- Paper: Bergen et al. (2020) Nature Biotechnology. PMID: 32747759
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
Use scVelo when:
|
||||
|
||||
- **Trajectory inference from snapshot data**: Determine which direction cells are differentiating
|
||||
- **Cell fate prediction**: Identify progenitor cells and their downstream fates
|
||||
- **Driver gene identification**: Find genes whose dynamics best explain observed trajectories
|
||||
- **Developmental biology**: Model hematopoiesis, neurogenesis, epithelial-to-mesenchymal transitions
|
||||
- **Latent time estimation**: Order cells along a pseudotime derived from splicing dynamics
|
||||
- **Complement to Scanpy**: Add directional information to UMAP embeddings
|
||||
|
||||
## Prerequisites
|
||||
|
||||
scVelo requires count matrices for both **unspliced** and **spliced** RNA. These are generated by:
|
||||
1. **STARsolo** or **kallisto|bustools** with `lamanno` mode
|
||||
2. **velocyto** CLI: `velocyto run10x` / `velocyto run`
|
||||
3. **alevin-fry** / **simpleaf** with spliced/unspliced output
|
||||
|
||||
Data is stored in an `AnnData` object with `layers["spliced"]` and `layers["unspliced"]`.
|
||||
|
||||
## Standard RNA Velocity Workflow
|
||||
|
||||
### 1. Setup and Data Loading
|
||||
|
||||
```python
|
||||
import scvelo as scv
|
||||
import scanpy as sc
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Configure settings
|
||||
scv.settings.verbosity = 3 # Show computation steps
|
||||
scv.settings.presenter_view = True
|
||||
scv.settings.set_figure_params('scvelo')
|
||||
|
||||
# Load data (AnnData with spliced/unspliced layers)
|
||||
# Option A: Load from loom (velocyto output)
|
||||
adata = scv.read("cellranger_output.loom", cache=True)
|
||||
|
||||
# Option B: Merge velocyto loom with Scanpy-processed AnnData
|
||||
adata_processed = sc.read_h5ad("processed.h5ad") # Has UMAP, clusters
|
||||
adata_velocity = scv.read("velocyto.loom")
|
||||
adata = scv.utils.merge(adata_processed, adata_velocity)
|
||||
|
||||
# Verify layers
|
||||
print(adata)
|
||||
# obs × var: N × G
|
||||
# layers: 'spliced', 'unspliced' (required)
|
||||
# obsm['X_umap'] (required for visualization)
|
||||
```
|
||||
|
||||
### 2. Preprocessing
|
||||
|
||||
```python
|
||||
# Filter and normalize (follows Scanpy conventions)
|
||||
scv.pp.filter_and_normalize(
|
||||
adata,
|
||||
min_shared_counts=20, # Minimum counts in spliced+unspliced
|
||||
n_top_genes=2000 # Top highly variable genes
|
||||
)
|
||||
|
||||
# Compute first and second order moments (means and variances)
|
||||
# knn_connectivities must be computed first
|
||||
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
|
||||
scv.pp.moments(
|
||||
adata,
|
||||
n_pcs=30,
|
||||
n_neighbors=30
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Velocity Estimation — Stochastic Model
|
||||
|
||||
The stochastic model is fast and suitable for exploratory analysis:
|
||||
|
||||
```python
|
||||
# Stochastic velocity (faster, less accurate)
|
||||
scv.tl.velocity(adata, mode='stochastic')
|
||||
scv.tl.velocity_graph(adata)
|
||||
|
||||
# Visualize
|
||||
scv.pl.velocity_embedding_stream(
|
||||
adata,
|
||||
basis='umap',
|
||||
color='leiden',
|
||||
title="RNA Velocity (Stochastic)"
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Velocity Estimation — Dynamical Model (Recommended)
|
||||
|
||||
The dynamical model fits the full splicing kinetics and is more accurate:
|
||||
|
||||
```python
|
||||
# Recover dynamics (computationally intensive; ~10-30 min for 10K cells)
|
||||
scv.tl.recover_dynamics(adata, n_jobs=4)
|
||||
|
||||
# Compute velocity from dynamical model
|
||||
scv.tl.velocity(adata, mode='dynamical')
|
||||
scv.tl.velocity_graph(adata)
|
||||
```
|
||||
|
||||
### 5. Latent Time
|
||||
|
||||
The dynamical model enables computation of a shared latent time (pseudotime):
|
||||
|
||||
```python
|
||||
# Compute latent time
|
||||
scv.tl.latent_time(adata)
|
||||
|
||||
# Visualize latent time on UMAP
|
||||
scv.pl.scatter(
|
||||
adata,
|
||||
color='latent_time',
|
||||
color_map='gnuplot',
|
||||
size=80,
|
||||
title='Latent time'
|
||||
)
|
||||
|
||||
# Identify top genes ordered by latent time
|
||||
top_genes = adata.var['fit_likelihood'].sort_values(ascending=False).index[:300]
|
||||
scv.pl.heatmap(
|
||||
adata,
|
||||
var_names=top_genes,
|
||||
sortby='latent_time',
|
||||
col_color='leiden',
|
||||
n_convolve=100
|
||||
)
|
||||
```
|
||||
|
||||
### 6. Driver Gene Analysis
|
||||
|
||||
```python
|
||||
# Identify genes with highest velocity fit
|
||||
scv.tl.rank_velocity_genes(adata, groupby='leiden', min_corr=0.3)
|
||||
df = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])
|
||||
print(df.head(10))
|
||||
|
||||
# Speed and coherence
|
||||
scv.tl.velocity_confidence(adata)
|
||||
scv.pl.scatter(
|
||||
adata,
|
||||
c=['velocity_length', 'velocity_confidence'],
|
||||
cmap='coolwarm',
|
||||
perc=[5, 95]
|
||||
)
|
||||
|
||||
# Phase portraits for specific genes
|
||||
scv.pl.velocity(adata, ['Cpe', 'Gnao1', 'Ins2'],
|
||||
ncols=3, figsize=(16, 4))
|
||||
```
|
||||
|
||||
### 7. Velocity Arrows and Pseudotime
|
||||
|
||||
```python
|
||||
# Arrow plot on UMAP
|
||||
scv.pl.velocity_embedding(
|
||||
adata,
|
||||
arrow_length=3,
|
||||
arrow_size=2,
|
||||
color='leiden',
|
||||
basis='umap'
|
||||
)
|
||||
|
||||
# Stream plot (cleaner visualization)
|
||||
scv.pl.velocity_embedding_stream(
|
||||
adata,
|
||||
basis='umap',
|
||||
color='leiden',
|
||||
smooth=0.8,
|
||||
min_mass=4
|
||||
)
|
||||
|
||||
# Velocity pseudotime (alternative to latent time)
|
||||
scv.tl.velocity_pseudotime(adata)
|
||||
scv.pl.scatter(adata, color='velocity_pseudotime', cmap='gnuplot')
|
||||
```
|
||||
|
||||
### 8. PAGA Trajectory Graph
|
||||
|
||||
```python
|
||||
# PAGA graph with velocity-informed transitions
|
||||
scv.tl.paga(adata, groups='leiden')
|
||||
df = scv.get_df(adata, 'paga/transitions_confidence', precision=2).T
|
||||
df.style.background_gradient(cmap='Blues').format('{:.2g}')
|
||||
|
||||
# Plot PAGA with velocity
|
||||
scv.pl.paga(
|
||||
adata,
|
||||
basis='umap',
|
||||
size=50,
|
||||
alpha=0.1,
|
||||
min_edge_width=2,
|
||||
node_size_scale=1.5
|
||||
)
|
||||
```
|
||||
|
||||
## Complete Workflow Script
|
||||
|
||||
```python
|
||||
import scvelo as scv
|
||||
import scanpy as sc
|
||||
|
||||
def run_rna_velocity(adata, n_top_genes=2000, mode='dynamical', n_jobs=4):
|
||||
"""
|
||||
Complete RNA velocity workflow.
|
||||
|
||||
Args:
|
||||
adata: AnnData with 'spliced' and 'unspliced' layers, UMAP in obsm
|
||||
n_top_genes: Number of top HVGs for velocity
|
||||
mode: 'stochastic' (fast) or 'dynamical' (accurate)
|
||||
n_jobs: Parallel jobs for dynamical model
|
||||
|
||||
Returns:
|
||||
Processed AnnData with velocity information
|
||||
"""
|
||||
scv.settings.verbosity = 2
|
||||
|
||||
# 1. Preprocessing
|
||||
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=n_top_genes)
|
||||
|
||||
if 'neighbors' not in adata.uns:
|
||||
sc.pp.neighbors(adata, n_neighbors=30)
|
||||
|
||||
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
|
||||
|
||||
# 2. Velocity estimation
|
||||
if mode == 'dynamical':
|
||||
scv.tl.recover_dynamics(adata, n_jobs=n_jobs)
|
||||
|
||||
scv.tl.velocity(adata, mode=mode)
|
||||
scv.tl.velocity_graph(adata)
|
||||
|
||||
# 3. Downstream analyses
|
||||
if mode == 'dynamical':
|
||||
scv.tl.latent_time(adata)
|
||||
scv.tl.rank_velocity_genes(adata, groupby='leiden', min_corr=0.3)
|
||||
|
||||
scv.tl.velocity_confidence(adata)
|
||||
scv.tl.velocity_pseudotime(adata)
|
||||
|
||||
return adata
|
||||
```
|
||||
|
||||
## Key Output Fields in AnnData
|
||||
|
||||
After running the workflow, the following fields are added:
|
||||
|
||||
| Location | Key | Description |
|
||||
|----------|-----|-------------|
|
||||
| `adata.layers` | `velocity` | RNA velocity per gene per cell |
|
||||
| `adata.layers` | `fit_t` | Fitted latent time per gene per cell |
|
||||
| `adata.obsm` | `velocity_umap` | 2D velocity vectors on UMAP |
|
||||
| `adata.obs` | `velocity_pseudotime` | Pseudotime from velocity |
|
||||
| `adata.obs` | `latent_time` | Latent time from dynamical model |
|
||||
| `adata.obs` | `velocity_length` | Speed of each cell |
|
||||
| `adata.obs` | `velocity_confidence` | Confidence score per cell |
|
||||
| `adata.var` | `fit_likelihood` | Gene-level model fit quality |
|
||||
| `adata.var` | `fit_alpha` | Transcription rate |
|
||||
| `adata.var` | `fit_beta` | Splicing rate |
|
||||
| `adata.var` | `fit_gamma` | Degradation rate |
|
||||
| `adata.uns` | `velocity_graph` | Cell-cell transition probability matrix |
|
||||
|
||||
## Velocity Models Comparison
|
||||
|
||||
| Model | Speed | Accuracy | When to Use |
|
||||
|-------|-------|----------|-------------|
|
||||
| `stochastic` | Fast | Moderate | Exploratory; large datasets |
|
||||
| `deterministic` | Medium | Moderate | Simple linear kinetics |
|
||||
| `dynamical` | Slow | High | Publication-quality; identifies driver genes |
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Start with stochastic mode** for exploration; switch to dynamical for final analysis
|
||||
- **Need good coverage of unspliced reads**: Short reads (< 100 bp) may miss intron coverage
|
||||
- **Minimum 2,000 cells**: RNA velocity is noisy with fewer cells
|
||||
- **Velocity should be coherent**: Arrows should follow known biology; randomness indicates issues
|
||||
- **k-NN bandwidth matters**: Too few neighbors → noisy velocity; too many → oversmoothed
|
||||
- **Sanity check**: Root cells (progenitors) should have high unspliced/spliced ratios for marker genes
|
||||
- **Dynamical model requires distinct kinetic states**: Works best for clear differentiation processes
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
| Problem | Solution |
|
||||
|---------|---------|
|
||||
| Missing unspliced layer | Re-run velocyto or use STARsolo with `--soloFeatures Gene Velocyto` |
|
||||
| Very few velocity genes | Lower `min_shared_counts`; check sequencing depth |
|
||||
| Random-looking arrows | Try different `n_neighbors` or velocity model |
|
||||
| Memory error with dynamical | Set `n_jobs=1`; reduce `n_top_genes` |
|
||||
| Negative velocity everywhere | Check that spliced/unspliced layers are not swapped |
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **scVelo documentation**: https://scvelo.readthedocs.io/
|
||||
- **Tutorial notebooks**: https://scvelo.readthedocs.io/tutorials/
|
||||
- **GitHub**: https://github.com/theislab/scvelo
|
||||
- **Paper**: Bergen V et al. (2020) Nature Biotechnology. PMID: 32747759
|
||||
- **velocyto** (preprocessing): http://velocyto.org/
|
||||
- **CellRank** (fate prediction, extends scVelo): https://cellrank.readthedocs.io/
|
||||
- **dynamo** (metabolic labeling alternative): https://dynamo-release.readthedocs.io/
|
||||
168
scientific-skills/scvelo/references/velocity_models.md
Normal file
168
scientific-skills/scvelo/references/velocity_models.md
Normal file
@@ -0,0 +1,168 @@
|
||||
# scVelo Velocity Models Reference
|
||||
|
||||
## Mathematical Framework
|
||||
|
||||
RNA velocity is based on the kinetic model of transcription:
|
||||
|
||||
```
|
||||
dx_s/dt = β·x_u - γ·x_s (spliced dynamics)
|
||||
dx_u/dt = α(t) - β·x_u (unspliced dynamics)
|
||||
```
|
||||
|
||||
Where:
|
||||
- `x_s`: spliced mRNA abundance
|
||||
- `x_u`: unspliced (pre-mRNA) abundance
|
||||
- `α(t)`: transcription rate (varies over time)
|
||||
- `β`: splicing rate
|
||||
- `γ`: degradation rate
|
||||
|
||||
**Velocity** is defined as: `v = dx_s/dt = β·x_u - γ·x_s`
|
||||
|
||||
- **v > 0**: Gene is being upregulated (more unspliced than expected at steady state)
|
||||
- **v < 0**: Gene is being downregulated (less unspliced than expected)
|
||||
|
||||
## Model Comparison
|
||||
|
||||
### Steady-State (Velocyto, original)
|
||||
|
||||
- Assumes constant α (transcription rate)
|
||||
- Fits γ using linear regression on steady-state cells
|
||||
- **Limitation**: Requires identifiable steady states; assumes constant transcription
|
||||
|
||||
```python
|
||||
# Use with scVelo for backward compatibility
|
||||
scv.tl.velocity(adata, mode='steady_state')
|
||||
```
|
||||
|
||||
### Stochastic Model (scVelo v1)
|
||||
|
||||
- Extends steady-state with variance/covariance terms
|
||||
- Models cell-to-cell variability in mRNA counts
|
||||
- More robust to noise than steady-state
|
||||
|
||||
```python
|
||||
scv.tl.velocity(adata, mode='stochastic')
|
||||
```
|
||||
|
||||
### Dynamical Model (scVelo v2, recommended)
|
||||
|
||||
- Jointly estimates all kinetic rates (α, β, γ) and cell-specific latent time
|
||||
- Does not assume steady state
|
||||
- Identifies induction vs. repression phases
|
||||
- Computes fit_likelihood per gene (quality measure)
|
||||
|
||||
```python
|
||||
scv.tl.recover_dynamics(adata, n_jobs=4)
|
||||
scv.tl.velocity(adata, mode='dynamical')
|
||||
```
|
||||
|
||||
**Kinetic states identified by dynamical model:**
|
||||
|
||||
| State | Description |
|
||||
|-------|-------------|
|
||||
| Induction | α > 0, x_u increasing |
|
||||
| Steady-state on | α > 0, constant high expression |
|
||||
| Repression | α = 0, x_u decreasing |
|
||||
| Steady-state off | α = 0, constant low expression |
|
||||
|
||||
## Velocity Graph
|
||||
|
||||
The velocity graph connects cells based on their velocity similarity to neighboring cells' states:
|
||||
|
||||
```python
|
||||
scv.tl.velocity_graph(adata)
|
||||
# Stored in adata.uns['velocity_graph']
|
||||
# Entry [i,j] = probability that cell i transitions to cell j
|
||||
```
|
||||
|
||||
**Parameters:**
|
||||
- `n_neighbors`: Number of neighbors considered
|
||||
- `sqrt_transform`: Apply sqrt transform to data (default: False for spliced)
|
||||
- `approx`: Use approximate nearest neighbor search (faster for large datasets)
|
||||
|
||||
## Latent Time Interpretation
|
||||
|
||||
Latent time τ ∈ [0, 1] for each gene represents:
|
||||
- τ = 0: Gene is at onset of induction
|
||||
- τ = 0.5: Gene is at peak of induction (for a complete cycle)
|
||||
- τ = 1: Gene has returned to steady-state off
|
||||
|
||||
**Shared latent time** is computed by taking the average over all velocity genes, weighted by fit_likelihood.
|
||||
|
||||
## Quality Metrics
|
||||
|
||||
### Gene-level
|
||||
- `fit_likelihood`: Goodness-of-fit of dynamical model (0-1; higher = better)
|
||||
- Use for filtering driver genes: `adata.var[adata.var['fit_likelihood'] > 0.1]`
|
||||
- `fit_alpha`: Transcription rate during induction
|
||||
- `fit_gamma`: mRNA degradation rate
|
||||
- `fit_r2`: R² of kinetic fit
|
||||
|
||||
### Cell-level
|
||||
- `velocity_length`: Magnitude of velocity vector (cell speed)
|
||||
- `velocity_confidence`: Coherence of velocity with neighboring cells (0-1)
|
||||
|
||||
### Dataset-level
|
||||
```python
|
||||
# Check overall velocity quality
|
||||
scv.pl.proportions(adata) # Ratio of spliced/unspliced per cell
|
||||
scv.pl.velocity_confidence(adata, groupby='leiden')
|
||||
```
|
||||
|
||||
## Parameter Tuning Guide
|
||||
|
||||
| Parameter | Function | Default | When to Change |
|
||||
|-----------|----------|---------|----------------|
|
||||
| `min_shared_counts` | Filter genes | 20 | Increase for deep sequencing; decrease for shallow |
|
||||
| `n_top_genes` | HVG selection | 2000 | Increase for complex datasets |
|
||||
| `n_neighbors` | kNN graph | 30 | Decrease for small datasets; increase for noisy |
|
||||
| `n_pcs` | PCA dimensions | 30 | Match to elbow in scree plot |
|
||||
| `t_max_rank` | Latent time constraint | None | Set if known developmental direction |
|
||||
|
||||
## Integration with Other Tools
|
||||
|
||||
### CellRank (Fate Prediction)
|
||||
|
||||
```python
|
||||
import cellrank as cr
|
||||
from cellrank.kernels import VelocityKernel, ConnectivityKernel
|
||||
|
||||
# Combine velocity and connectivity kernels
|
||||
vk = VelocityKernel(adata).compute_transition_matrix()
|
||||
ck = ConnectivityKernel(adata).compute_transition_matrix()
|
||||
combined = 0.8 * vk + 0.2 * ck
|
||||
|
||||
# Compute macrostates (terminal and initial states)
|
||||
g = cr.estimators.GPCCA(combined)
|
||||
g.compute_macrostates(n_states=4, cluster_key='leiden')
|
||||
g.plot_macrostates(which="all")
|
||||
|
||||
# Compute fate probabilities
|
||||
g.compute_fate_probabilities()
|
||||
g.plot_fate_probabilities()
|
||||
```
|
||||
|
||||
### Scanpy Integration
|
||||
|
||||
scVelo works natively with Scanpy's AnnData:
|
||||
|
||||
```python
|
||||
import scanpy as sc
|
||||
import scvelo as scv
|
||||
|
||||
# Run standard Scanpy pipeline first
|
||||
sc.pp.normalize_total(adata)
|
||||
sc.pp.log1p(adata)
|
||||
sc.pp.highly_variable_genes(adata)
|
||||
sc.pp.pca(adata)
|
||||
sc.pp.neighbors(adata)
|
||||
sc.tl.umap(adata)
|
||||
sc.tl.leiden(adata)
|
||||
|
||||
# Then add velocity on top
|
||||
scv.pp.moments(adata)
|
||||
scv.tl.recover_dynamics(adata)
|
||||
scv.tl.velocity(adata, mode='dynamical')
|
||||
scv.tl.velocity_graph(adata)
|
||||
scv.tl.latent_time(adata)
|
||||
```
|
||||
232
scientific-skills/scvelo/scripts/rna_velocity_workflow.py
Normal file
232
scientific-skills/scvelo/scripts/rna_velocity_workflow.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
RNA Velocity Analysis Workflow using scVelo
|
||||
===========================================
|
||||
Complete pipeline from raw data to velocity visualization.
|
||||
|
||||
Usage:
|
||||
python rna_velocity_workflow.py
|
||||
|
||||
Or import and use run_velocity_analysis() with your AnnData object.
|
||||
"""
|
||||
|
||||
import scvelo as scv
|
||||
import scanpy as sc
|
||||
import numpy as np
|
||||
import matplotlib
|
||||
matplotlib.use('Agg') # Non-interactive backend
|
||||
import matplotlib.pyplot as plt
|
||||
import os
|
||||
|
||||
|
||||
def run_velocity_analysis(
    adata,
    groupby="leiden",
    n_top_genes=2000,
    n_neighbors=30,
    mode="dynamical",
    n_jobs=4,
    output_dir="velocity_results",
):
    """
    Complete RNA velocity analysis workflow.

    Parameters
    ----------
    adata : AnnData
        AnnData object with 'spliced' and 'unspliced' layers.
        Should already have UMAP and cluster annotations.
    groupby : str
        Column in adata.obs for cell type labels.
    n_top_genes : int
        Number of top highly variable genes.
    n_neighbors : int
        Number of neighbors for moment computation.
    mode : str
        Velocity model: 'stochastic' (fast) or 'dynamical' (accurate).
    n_jobs : int
        Parallel jobs for dynamical model fitting.
    output_dir : str
        Directory for saving output figures and the final .h5ad file.

    Returns
    -------
    AnnData
        The same object, annotated in place with velocity layers, obs
        columns (pseudotime, confidence, latent_time) and uns entries.

    Raises
    ------
    ValueError
        If the 'spliced' or 'unspliced' layer is missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    # ── Settings ──────────────────────────────────────────────────────────────
    scv.settings.verbosity = 2
    # scVelo prepends `figdir` to relative `save=` paths, so the plot calls
    # below pass bare filenames. Passing f"{output_dir}/..." here would write
    # figures to "<output_dir>/<output_dir>/..." instead.
    scv.settings.figdir = output_dir

    # ── Step 1: Check layers ──────────────────────────────────────────────────
    # Explicit exceptions instead of `assert`: asserts are stripped under
    # `python -O`, which would silently skip this validation.
    if "spliced" not in adata.layers:
        raise ValueError("Missing 'spliced' layer. Run velocyto first.")
    if "unspliced" not in adata.layers:
        raise ValueError("Missing 'unspliced' layer. Run velocyto first.")
    print(f"Input: {adata.n_obs} cells × {adata.n_vars} genes")

    # ── Step 2: Preprocessing ─────────────────────────────────────────────────
    print("Step 1/5: Preprocessing...")
    scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=n_top_genes)

    # Only build the kNN graph if the caller has not already done so.
    if "neighbors" not in adata.uns:
        sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=30)

    # First/second moments of spliced/unspliced counts over the kNN graph.
    scv.pp.moments(adata, n_pcs=30, n_neighbors=n_neighbors)
    print(f"  {adata.n_vars} velocity genes selected")

    # ── Step 3: Velocity estimation ───────────────────────────────────────────
    print(f"Step 2/5: Fitting velocity model ({mode})...")
    if mode == "dynamical":
        # Full transcriptional-dynamics fit; slower but enables latent time.
        scv.tl.recover_dynamics(adata, n_jobs=n_jobs)
    scv.tl.velocity(adata, mode=mode)
    scv.tl.velocity_graph(adata)
    print("  Velocity graph computed")

    # ── Step 4: Downstream analyses ───────────────────────────────────────────
    print("Step 3/5: Computing latent time and confidence...")
    scv.tl.velocity_confidence(adata)
    scv.tl.velocity_pseudotime(adata)

    if mode == "dynamical":
        scv.tl.latent_time(adata)

    if groupby in adata.obs.columns:
        scv.tl.rank_velocity_genes(adata, groupby=groupby, min_corr=0.3)

    # ── Step 5: Visualization ─────────────────────────────────────────────────
    print("Step 4/5: Generating figures...")

    # Stream plot (bare filename: scVelo joins it with settings.figdir)
    scv.pl.velocity_embedding_stream(
        adata,
        basis="umap",
        color=groupby,
        title="RNA Velocity",
        save="velocity_stream.png",
    )

    # Arrow plot
    scv.pl.velocity_embedding(
        adata,
        arrow_length=3,
        arrow_size=2,
        color=groupby,
        basis="umap",
        save="velocity_arrows.png",
    )

    # Pseudotime
    scv.pl.scatter(
        adata,
        color="velocity_pseudotime",
        cmap="gnuplot",
        title="Velocity Pseudotime",
        save="pseudotime.png",
    )

    if mode == "dynamical" and "latent_time" in adata.obs:
        scv.pl.scatter(
            adata,
            color="latent_time",
            color_map="gnuplot",
            title="Latent Time",
            save="latent_time.png",
        )

    # Speed (velocity_length) and coherence (velocity_confidence)
    scv.pl.scatter(
        adata,
        c=["velocity_length", "velocity_confidence"],
        cmap="coolwarm",
        perc=[5, 95],
        save="velocity_quality.png",
    )

    # Top driver genes heatmap (dynamical only: fit_likelihood comes from
    # recover_dynamics)
    if mode == "dynamical" and "fit_likelihood" in adata.var:
        top_genes = adata.var["fit_likelihood"].sort_values(ascending=False).index[:50]
        scv.pl.heatmap(
            adata,
            var_names=top_genes,
            sortby="latent_time",
            col_color=groupby,
            n_convolve=50,
            save="driver_gene_heatmap.png",
        )

    # ── Step 6: Save results ──────────────────────────────────────────────────
    print("Step 5/5: Saving results...")
    output_h5ad = os.path.join(output_dir, "adata_velocity.h5ad")
    adata.write_h5ad(output_h5ad)
    print(f"  Saved to {output_h5ad}")

    # Summary statistics
    confidence = adata.obs["velocity_confidence"].dropna()
    print("\nSummary:")
    print(f"  Velocity model: {mode}")
    print(f"  Cells: {adata.n_obs}")
    print(f"  Velocity genes: {adata.n_vars}")
    print(f"  Mean velocity confidence: {confidence.mean():.3f}")
    print(f"  High-confidence cells (>0.7): {(confidence > 0.7).sum()} ({(confidence > 0.7).mean():.1%})")

    if mode == "dynamical" and "fit_likelihood" in adata.var:
        good_genes = (adata.var["fit_likelihood"] > 0.1).sum()
        print(f"  Well-fit genes (likelihood>0.1): {good_genes}")

    print(f"\nOutput files saved to: {output_dir}/")
    return adata
|
||||
|
||||
|
||||
def load_from_loom(loom_path, processed_h5ad=None, n_top_genes=3000, resolution=0.5):
    """
    Load velocity data from a velocyto loom file.

    Parameters
    ----------
    loom_path : str
        Path to the velocyto output .loom file (spliced/unspliced counts).
    processed_h5ad : str, optional
        Path to a pre-processed Scanpy .h5ad file. If given, its metadata
        (embeddings, clusters, annotations) is kept and the loom velocity
        layers are merged in; otherwise a basic Scanpy pipeline is run from
        scratch on the loom data.
    n_top_genes : int
        Number of highly variable genes for the from-scratch pipeline.
        Default 3000 (previous hard-coded value).
    resolution : float
        Leiden clustering resolution for the from-scratch pipeline.
        Default 0.5 (previous hard-coded value).

    Returns
    -------
    AnnData
        Object with 'spliced'/'unspliced' layers, neighbors graph, UMAP and
        cluster labels, ready for run_velocity_analysis().
    """
    adata_loom = scv.read(loom_path, cache=True)

    if processed_h5ad:
        adata_processed = sc.read_h5ad(processed_h5ad)
        # Merge: keep processed metadata and add velocity layers
        adata = scv.utils.merge(adata_processed, adata_loom)
    else:
        adata = adata_loom
        # Run basic Scanpy pipeline
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes)
        sc.pp.pca(adata)
        sc.pp.neighbors(adata)
        sc.tl.umap(adata)
        sc.tl.leiden(adata, resolution=resolution)

    return adata
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Example usage with a public example dataset (for testing the pipeline).
    print("scVelo RNA Velocity Workflow - Demo Mode")
    print("=" * 50)

    # Load example dataset (downloaded and cached by scVelo on first use)
    adata = scv.datasets.pancreas()
    print(f"Loaded pancreas dataset: {adata}")

    # Run the full analysis; the pancreas dataset labels clusters in
    # adata.obs['clusters'], hence groupby="clusters".
    adata = run_velocity_analysis(
        adata,
        groupby="clusters",
        n_top_genes=2000,
        mode="dynamical",
        n_jobs=2,
        output_dir="pancreas_velocity",
    )

    print("\nAnalysis complete!")
    # Plain strings here: these lines have no placeholders, so the f-prefix
    # was unnecessary.
    print("Key results:")
    print("  adata.layers['velocity']: velocity per gene per cell")
    print("  adata.obs['latent_time']: pseudotime from dynamics")
    print("  adata.obs['velocity_confidence']: per-cell confidence")
    if "rank_velocity_genes" in adata.uns:
        print("  adata.uns['rank_velocity_genes']: driver genes per cluster")
|
||||
Reference in New Issue
Block a user