Add scVelo RNA velocity analysis workflow and IQ-TREE reference documentation

- Introduced a comprehensive RNA velocity analysis pipeline using scVelo, including data loading, preprocessing, velocity estimation, and visualization.
- Added a script for running RNA velocity analysis with customizable parameters and output options.
- Created detailed documentation for IQ-TREE 2 phylogenetic inference, covering command syntax, model selection, bootstrapping methods, and output interpretation.
- Included references for velocity models and their mathematical framework, along with a comparison of different models.
- Enhanced the scVelo skill documentation with installation instructions, use cases, and best practices for RNA velocity analysis.
This commit is contained in:
huangkuanlin
2026-03-03 07:15:36 -05:00
parent b271271df4
commit 7f94783fab
27 changed files with 6961 additions and 0 deletions

View File

@@ -0,0 +1,332 @@
---
name: bindingdb-database
description: Query BindingDB for measured drug-target binding affinities (Ki, Kd, IC50, EC50). Search by target (UniProt ID), compound (SMILES/name), or pathogen. Essential for drug discovery, lead optimization, polypharmacology analysis, and structure-activity relationship (SAR) studies.
license: CC-BY-3.0
metadata:
skill-author: Kuan-lin Huang
---
# BindingDB Database
## Overview
BindingDB (https://www.bindingdb.org/) is the primary public database of measured drug-protein binding affinities. It contains over 3 million binding data records for ~1.4 million compounds tested against ~9,200 protein targets, curated from scientific literature and patent literature. BindingDB stores quantitative binding measurements (Ki, Kd, IC50, EC50) essential for drug discovery, pharmacology, and computational chemistry research.
**Key resources:**
- BindingDB website: https://www.bindingdb.org/
- REST API: https://www.bindingdb.org/axis2/services/BDBService
- Downloads: https://www.bindingdb.org/bind/chemsearch/marvin/Download.jsp
- GitHub: https://github.com/drugilsberg/bindingdb
## When to Use This Skill
Use BindingDB when:
- **Target-based drug discovery**: What known compounds bind to a target protein? What are their affinities?
- **SAR analysis**: How do structural modifications affect binding affinity for a series of analogs?
- **Lead compound profiling**: What targets does a compound bind (selectivity/polypharmacology)?
- **Benchmark datasets**: Obtain curated protein-ligand affinity data for ML model training
- **Repurposing analysis**: Does an approved drug bind to an unintended target?
- **Competitive analysis**: What is the best reported affinity for a target class?
- **Fragment screening**: Find validated binding data for fragments against a target
## Core Capabilities
### 1. BindingDB REST API
Base URL: `https://www.bindingdb.org/axis2/services/BDBService`
```python
import requests
BASE_URL = "https://www.bindingdb.org/axis2/services/BDBService"
def bindingdb_query(method, params):
    """Issue a GET request against one BindingDB REST service method.

    Args:
        method: Name of the BDBService endpoint (e.g. "getLigandsByUniprotID").
        params: Dict of query-string parameters for that endpoint.

    Returns:
        The parsed JSON payload returned by the service.

    Raises:
        requests.HTTPError: If the service responds with an error status.
    """
    endpoint = f"{BASE_URL}/{method}"
    resp = requests.get(endpoint, params=params, headers={"Accept": "application/json"})
    resp.raise_for_status()
    return resp.json()
```
### 2. Query by Target (UniProt ID)
```python
def get_ligands_for_target(uniprot_id, affinity_type="Ki", cutoff=10000, unit="nM"):
    """
    Get all ligands with measured affinity for a UniProt target.

    Args:
        uniprot_id: UniProt accession (e.g., "P00519" for ABL1)
        affinity_type: "Ki", "Kd", "IC50", "EC50"
        cutoff: Maximum affinity value to return, expressed in `unit`
        unit: "nM" or "uM"; micromolar cutoffs are converted to nM

    Returns:
        Parsed JSON response from the getLigandsByUniprotID endpoint.

    Raises:
        ValueError: If `unit` is neither "nM" nor "uM".
    """
    # BUG FIX: `unit` was previously accepted but silently ignored. The API
    # cutoff is in nM, so uM values are scaled here (1 uM = 1000 nM).
    if unit == "uM":
        cutoff = cutoff * 1000
    elif unit != "nM":
        raise ValueError(f"unit must be 'nM' or 'uM', got {unit!r}")
    params = {
        "uniprot_id": uniprot_id,
        "affinity_type": affinity_type,
        "affinity_cutoff": cutoff,
        "response": "json"
    }
    return bindingdb_query("getLigandsByUniprotID", params)
# Example: Get all compounds binding ABL1 (imatinib target)
ligands = get_ligands_for_target("P00519", affinity_type="Ki", cutoff=100)
```
### 3. Query by Compound Name or SMILES
```python
def search_by_name(compound_name, limit=100):
    """Look up BindingDB affinity records for compounds matching a name.

    Args:
        compound_name: Compound name to search for (e.g. "imatinib").
        limit: Maximum number of records to request.

    Returns:
        Parsed JSON response from the getAffinitiesByCompoundName endpoint.
    """
    query = {
        "compound_name": compound_name,
        "response": "json",
        "max_results": limit,
    }
    return bindingdb_query("getAffinitiesByCompoundName", query)
def search_by_smiles(smiles, similarity=100, limit=50):
    """
    Search BindingDB by SMILES string.

    Args:
        smiles: SMILES string of the compound
        similarity: Tanimoto similarity threshold (1-100, 100 = exact)
        limit: Maximum number of records to request.

    Returns:
        Parsed JSON response from the service.

    NOTE(review): the endpoint name "getAffinitiesByBEI" looks unrelated to a
    SMILES/similarity search — confirm against the BindingDB API docs.
    """
    query = {
        "SMILES": smiles,
        "similarity": similarity,
        "response": "json",
        "max_results": limit,
    }
    return bindingdb_query("getAffinitiesByBEI", query)
# Example: Search for imatinib binding data
result = search_by_name("imatinib")
```
### 4. Download-Based Analysis (Recommended for Large Queries)
For comprehensive analyses, download BindingDB data directly:
```python
import pandas as pd
def load_bindingdb(filepath="BindingDB_All.tsv"):
    """
    Load a BindingDB TSV export into a DataFrame, keeping only key columns.
    Download from: https://www.bindingdb.org/bind/chemsearch/marvin/Download.jsp

    Args:
        filepath: Path to the BindingDB tab-separated export.

    Returns:
        DataFrame restricted to the key columns below (those present in the
        file), with affinity columns coerced to numeric (invalid values such
        as ">10000" become NaN).
    """
    # Key columns to retain from the (very wide) export
    usecols = {
        "BindingDB Reactant_set_id",
        "Ligand SMILES",
        "Ligand InChI",
        "Ligand InChI Key",
        "BindingDB Target Chain Sequence",
        "PDB ID(s) for Ligand-Target Complex",
        "UniProt (SwissProt) Entry Name of Target Chain",
        "UniProt (SwissProt) Primary ID of Target Chain",
        "UniProt (TrEMBL) Primary ID of Target Chain",
        "Ki (nM)",
        "IC50 (nM)",
        "Kd (nM)",
        "EC50 (nM)",
        "kon (M-1-s-1)",
        "koff (s-1)",
        "Target Name",
        "Target Source Organism According to Curator or DataSource",
        "Number of Protein Chains in Target (>1 implies a multichain complex)",
        "PubChem CID",
        "PubChem SID",
        "ChEMBL ID of Ligand",
        "DrugBank ID of Ligand",
    }
    # BUG FIX: the previous list-based `usecols` made pandas raise ValueError
    # whenever a column was missing from a particular dump (and its `if c`
    # filter was dead code). A callable selects only columns actually present.
    df = pd.read_csv(filepath, sep="\t", usecols=lambda c: c in usecols,
                     low_memory=False, on_bad_lines='skip')
    # Convert affinity columns to numeric; qualified values (">", "<") -> NaN
    for col in ["Ki (nM)", "IC50 (nM)", "Kd (nM)", "EC50 (nM)"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df
def query_target_affinity(df, uniprot_id, affinity_types=None, max_nm=10000):
    """Return ligand rows for one UniProt target, filtered by affinity cutoff.

    Args:
        df: Loaded BindingDB DataFrame (see load_bindingdb).
        uniprot_id: SwissProt primary accession of the target.
        affinity_types: Affinity columns to consider (default Ki/IC50/Kd, nM).
        max_nm: Keep rows where any considered affinity is <= this value (nM).

    Returns:
        DataFrame with SMILES, the affinity columns, and compound IDs,
        sorted ascending by the first affinity column.
    """
    cols = ["Ki (nM)", "IC50 (nM)", "Kd (nM)"] if affinity_types is None else affinity_types
    # Restrict to the requested target
    target_df = df[df["UniProt (SwissProt) Primary ID of Target Chain"] == uniprot_id].copy()
    # A row qualifies when at least one affinity column is within the cutoff
    keep = pd.Series(False, index=target_df.index)
    for c in cols:
        if c in target_df.columns:
            keep = keep | (target_df[c] <= max_nm)
    selected = target_df.loc[keep, ["Ligand SMILES"] + cols +
                             ["PubChem CID", "ChEMBL ID of Ligand"]].dropna(how='all')
    return selected.sort_values(cols[0])
```
### 5. SAR Analysis
```python
import pandas as pd
def sar_analysis(df, target_uniprot, affinity_col="IC50 (nM)"):
    """
    Structure-activity relationship analysis for a target.
    Retrieves all compounds with affinity data and ranks by potency.

    Args:
        df: Loaded BindingDB DataFrame.
        target_uniprot: UniProt accession of the target.
        affinity_col: Affinity column used for ranking (nM units).

    Returns:
        DataFrame of compounds with a `pAffinity` column (-log10 of molar
        affinity), sorted most-potent first; empty if no data was found.
    """
    import math  # replaces the previous __import__('math') hack
    target_data = query_target_affinity(df, target_uniprot, [affinity_col])
    if target_data.empty:
        return target_data
    # Add pAffinity (negative log of affinity in molar)
    if affinity_col in target_data.columns:
        target_data = target_data[target_data[affinity_col].notna()].copy()
        # Clamp at 1e-12 M so a reported zero value cannot raise ValueError
        # from log10(0) (consistent with prepare_ml_dataset).
        target_data["pAffinity"] = target_data[affinity_col].apply(
            lambda nm: -math.log10(max(nm * 1e-9, 1e-12))
        )
        target_data = target_data.sort_values("pAffinity", ascending=False)
    return target_data
# Most potent compounds against EGFR (P00533)
# sar = sar_analysis(df, "P00533", "IC50 (nM)")
# print(sar.head(20))
```
### 6. Polypharmacology Profile
```python
def polypharmacology_profile(df, ligand_smiles_or_name, affinity_cutoff_nM=1000):
    """
    Find all targets a compound binds to.
    Uses PubChem CID or SMILES for matching.

    Args:
        df: Loaded BindingDB DataFrame.
        ligand_smiles_or_name: Exact SMILES string to match against the data.
        affinity_cutoff_nM: Keep targets with any affinity <= this value (nM).

    Returns:
        DataFrame of target names/accessions with Ki/IC50/Kd, sorted by Ki.
    """
    affinity_columns = ["Ki (nM)", "IC50 (nM)", "Kd (nM)"]
    # Exact-match rows for this ligand SMILES
    hits = df[df["Ligand SMILES"] == ligand_smiles_or_name].copy()
    # Keep a row when at least one affinity measurement passes the cutoff
    passes = pd.Series(False, index=hits.index)
    for column in affinity_columns:
        if column in hits.columns:
            passes = passes | (hits[column] <= affinity_cutoff_nM)
    profile = hits.loc[passes,
        ["Target Name", "UniProt (SwissProt) Primary ID of Target Chain"] + affinity_columns
    ].dropna(how='all')
    return profile.sort_values("Ki (nM)")
```
## Query Workflows
### Workflow 1: Find Best Inhibitors for a Target
```python
import pandas as pd
def find_best_inhibitors(uniprot_id, affinity_type="IC50 (nM)", top_n=20, df=None):
    """Find the most potent inhibitors for a target in BindingDB.

    Args:
        uniprot_id: UniProt accession of the target.
        affinity_type: Affinity column to rank by (nM units).
        top_n: Number of top compounds to report.
        df: Optional pre-loaded BindingDB DataFrame. Pass this for repeated
            queries — the previous version re-read the multi-GB TSV on every
            call despite its own "Load once and reuse" comment.

    Returns:
        DataFrame of the top_n most potent compounds (may be empty).
    """
    if df is None:
        df = load_bindingdb("BindingDB_All.tsv")  # fallback: load on demand
    result = query_target_affinity(df, uniprot_id, [affinity_type])
    if result.empty:
        print(f"No data found for {uniprot_id}")
        return result
    result = result.sort_values(affinity_type).head(top_n)
    print(f"Top {top_n} inhibitors for {uniprot_id} by {affinity_type}:")
    for _, row in result.iterrows():
        print(f" {row['PubChem CID']}: {row[affinity_type]:.1f} nM | SMILES: {row['Ligand SMILES'][:40]}...")
    return result
```
### Workflow 2: Selectivity Profiling
1. Get all affinity data for your compound across all targets
2. Compare affinity ratios between on-target and off-targets
3. Identify selectivity cliffs (structural changes that improve selectivity)
4. Cross-reference with ChEMBL for additional selectivity data
### Workflow 3: Machine Learning Dataset Preparation
```python
def prepare_ml_dataset(df, uniprot_ids, affinity_col="IC50 (nM)",
                       max_affinity_nM=100000, min_count=50):
    """Prepare BindingDB data for ML model training.

    Args:
        df: Loaded BindingDB DataFrame.
        uniprot_ids: Iterable of target accessions to include.
        affinity_col: Affinity column used as the regression label (nM).
        max_affinity_nM: Upper affinity cutoff passed to the target query.
        min_count: Skip targets with fewer than this many measurements.

    Returns:
        DataFrame with columns [Ligand SMILES, target, pAffinity, affinity_col];
        empty DataFrame when no target meets min_count.
    """
    import math  # replaces the previous __import__('math') hack
    records = []
    for uid in uniprot_ids:
        target_df = query_target_affinity(df, uid, [affinity_col], max_affinity_nM)
        if len(target_df) >= min_count:
            target_df = target_df.copy()
            target_df["target"] = uid
            records.append(target_df)
    if not records:
        return pd.DataFrame()
    combined = pd.concat(records)
    # pAffinity = -log10(molar affinity); clamp at 1e-12 M to avoid log10(0)
    combined["pAffinity"] = combined[affinity_col].apply(
        lambda nm: -math.log10(max(nm * 1e-9, 1e-12))
    )
    return combined[["Ligand SMILES", "target", "pAffinity", affinity_col]].dropna()
```
## Key Data Fields
| Field | Description |
|-------|-------------|
| `Ligand SMILES` | 2D structure of the compound |
| `Ligand InChI Key` | Unique chemical identifier |
| `Ki (nM)` | Inhibition constant (equilibrium, functional) |
| `Kd (nM)` | Dissociation constant (thermodynamic, binding) |
| `IC50 (nM)` | Half-maximal inhibitory concentration |
| `EC50 (nM)` | Half-maximal effective concentration |
| `kon (M-1-s-1)` | Association rate constant |
| `koff (s-1)` | Dissociation rate constant |
| `UniProt (SwissProt) Primary ID` | Target UniProt accession |
| `Target Name` | Protein name |
| `PDB ID(s) for Ligand-Target Complex` | Crystal structures |
| `PubChem CID` | PubChem compound ID |
| `ChEMBL ID of Ligand` | ChEMBL compound ID |
## Affinity Interpretation
| Affinity | Classification | Drug-likeness |
|----------|---------------|---------------|
| < 1 nM | Sub-nanomolar | Very potent (picomolar range) |
| 1–10 nM | Nanomolar | Potent, typical for approved drugs |
| 10–100 nM | Moderate | Common lead compounds |
| 100–1000 nM | Weak | Fragment/starting point |
| > 1000 nM | Very weak | Generally below drug-relevance threshold |
## Best Practices
- **Use Ki for direct binding**: Ki reflects true binding affinity independent of enzymatic mechanism
- **IC50 context-dependency**: IC50 values depend on substrate concentration (Cheng-Prusoff equation)
- **Normalize units**: BindingDB reports in nM; verify units when comparing across studies
- **Filter by target organism**: Use `Target Source Organism` to ensure human protein data
- **Handle missing values**: Not all compounds have all measurement types
- **Cross-reference with ChEMBL**: ChEMBL has more curated activity data for medicinal chemistry
## Additional Resources
- **BindingDB website**: https://www.bindingdb.org/
- **Data downloads**: https://www.bindingdb.org/bind/chemsearch/marvin/Download.jsp
- **API documentation**: https://www.bindingdb.org/bind/BindingDBRESTfulAPI.jsp
- **Citation**: Gilson MK et al. (2016) Nucleic Acids Research. PMID: 26481362
- **Related resources**: ChEMBL (https://www.ebi.ac.uk/chembl/), PubChem BioAssay

View File

@@ -0,0 +1,178 @@
# BindingDB Affinity Query Reference
## Affinity Measurement Types
### Ki (Inhibition Constant)
- **Definition**: Equilibrium constant for inhibitor-enzyme complex dissociation
- **Equation**: Ki = [E][I]/[EI]
- **Usage**: Enzyme inhibition; preferred for mechanistic studies
- **Note**: Independent of substrate concentration (unlike IC50)
### Kd (Dissociation Constant)
- **Definition**: Thermodynamic binding equilibrium constant
- **Equation**: Kd = [A][B]/[AB]
- **Usage**: Direct binding assays (SPR, ITC, fluorescence anisotropy)
- **Note**: True measure of binding strength; lower = tighter binding
### IC50 (Half-Maximal Inhibitory Concentration)
- **Definition**: Concentration of inhibitor that reduces target activity by 50%
- **Usage**: Most common in drug discovery; assay-dependent
- **Conversion to Ki**: Cheng-Prusoff equation: Ki = IC50 / (1 + [S]/Km)
- **Note**: Depends on substrate concentration and assay conditions
### EC50 (Half-Maximal Effective Concentration)
- **Definition**: Concentration that produces 50% of maximal effect
- **Usage**: Cell-based assays, agonist studies
### Kinetics Parameters
- **kon**: Association rate constant (M⁻¹s⁻¹); describes how fast complex forms
- **koff**: Dissociation rate constant (s⁻¹); describes how fast complex dissociates
- **Residence time**: τ = 1/koff; longer residence = more sustained effect
- **Kd from kinetics**: Kd = koff/kon
## Common API Query Patterns
### By UniProt ID (REST API)
```python
import requests
def query_by_uniprot(uniprot_id, affinity_type="Ki"):
    """
    REST API query for BindingDB affinities by UniProt target ID.

    Args:
        uniprot_id: UniProt accession of the target.
        affinity_type: "Ki", "Kd", "IC50", or "EC50".

    Returns:
        Parsed JSON response from the getLigandsByUniprotID endpoint.
    """
    endpoint = "https://www.bindingdb.org/axis2/services/BDBService/getLigandsByUniprotID"
    query = {
        "uniprot_id": uniprot_id,
        "cutoff": "10000",  # nM threshold
        "affinity_type": affinity_type,
        "response": "json",
    }
    return requests.get(endpoint, params=query).json()
# Important targets
COMMON_TARGETS = {
    "ABL1": "P00519",  # Imatinib, dasatinib target
    "EGFR": "P00533",  # Erlotinib, gefitinib target
    "BRAF": "P15056",  # Vemurafenib, dabrafenib target
    "CDK2": "P24941",  # Cell cycle kinase
    "HDAC1": "Q13547",  # Histone deacetylase
    "BRD4": "O60885",  # BET bromodomain reader
    "MDM2": "Q00987",  # p53 negative regulator
    "BCL2": "P10415",  # Antiapoptotic protein
    "PCSK9": "Q8NBP7",  # Cholesterol regulator
    "JAK2": "O60674",  # Cytokine signaling kinase
}
```
### By PubChem CID (REST API)
```python
def query_by_pubchem_cid(pubchem_cid):
    """Fetch every BindingDB affinity record for a compound, by PubChem CID.

    Args:
        pubchem_cid: Integer PubChem compound identifier.

    Returns:
        Parsed JSON response from the getAffinitiesByCID endpoint.
    """
    endpoint = "https://www.bindingdb.org/axis2/services/BDBService/getAffinitiesByCID"
    resp = requests.get(endpoint, params={"cid": pubchem_cid, "response": "json"})
    return resp.json()
# Example: Imatinib PubChem CID = 5291
imatinib_data = query_by_pubchem_cid(5291)
```
### By Target Name
```python
def query_by_target_name(target_name, affinity_cutoff=100):
    """Query BindingDB affinities by free-text target name.

    Args:
        target_name: Protein/target name to search for.
        affinity_cutoff: Affinity threshold (nM) passed to the service.

    Returns:
        Parsed JSON response from the getAffinitiesByTarget endpoint.
    """
    endpoint = "https://www.bindingdb.org/axis2/services/BDBService/getAffinitiesByTarget"
    query = {
        "target_name": target_name,
        "cutoff": affinity_cutoff,
        "response": "json",
    }
    resp = requests.get(endpoint, params=query)
    return resp.json()
```
## Dataset Download Guide
### Available Files
| File | Size | Contents |
|------|------|---------|
| `BindingDB_All.tsv.zip` | ~3.5 GB | All data: ~2.9M records |
| `BindingDB_All.sdf.zip` | ~7 GB | All data with 3D structures |
| `BindingDB_IC50.tsv` | ~1.5 GB | IC50 data only |
| `BindingDB_Ki.tsv` | ~0.8 GB | Ki data only |
| `BindingDB_Kd.tsv` | ~0.2 GB | Kd data only |
| `BindingDB_EC50.tsv` | ~0.5 GB | EC50 data only |
| `tdc_bindingdb_*` | Various | TDC-formatted subsets |
### Efficient Loading
```python
import pandas as pd
# For large files, use chunking
def load_bindingdb_chunked(filepath, uniprot_ids, affinity_col="Ki (nM)", chunk_size=100000):
    """Stream a BindingDB TSV in chunks, keeping rows for selected targets.

    Args:
        filepath: Path to the BindingDB TSV export.
        uniprot_ids: Collection of SwissProt accessions to retain.
        affinity_col: Currently unused; kept for call-site compatibility.
        chunk_size: Number of rows read into memory per chunk.

    Returns:
        DataFrame of matching rows, or an empty DataFrame when none match.
    """
    kept = []
    reader = pd.read_csv(filepath, sep="\t", chunksize=chunk_size,
                         low_memory=False, on_bad_lines='skip')
    for chunk in reader:
        # Retain only rows whose target accession is in the wanted set
        hits = chunk[chunk["UniProt (SwissProt) Primary ID of Target Chain"].isin(uniprot_ids)]
        if not hits.empty:
            kept.append(hits)
    return pd.concat(kept) if kept else pd.DataFrame()
```
## pKi / pIC50 Conversion
Converting raw affinity to logarithmic scale (common in ML):
```python
import numpy as np
def to_log_affinity(affinity_nM):
    """Convert nM affinity to pAffinity (negative log molar).

    Args:
        affinity_nM: Affinity in nanomolar (scalar or NumPy array).

    Returns:
        pAffinity = -log10(affinity in M), e.g. 1 nM -> 9.0.
    """
    return -np.log10(affinity_nM * 1e-9)
# Examples:
# 1 nM → pAffinity = 9.0
# 10 nM → pAffinity = 8.0
# 100 nM → pAffinity = 7.0
# 1 μM → pAffinity = 6.0
# 10 μM → pAffinity = 5.0
```
## Quality Filters
When using BindingDB data for ML or SAR:
```python
def filter_quality(df):
    """Apply quality filters to BindingDB data.

    Filters applied in order:
      1. non-empty SMILES;
      2. at least one of Ki / IC50 present;
      3. drop measurements above 1 mM (likely non-specific artifacts);
      4. restrict to human targets when the organism column exists.

    Returns:
        The filtered DataFrame.
    """
    # 1. Require valid SMILES
    smiles = df["Ligand SMILES"]
    df = df[smiles.notna() & (smiles != "")]
    # 2. Require valid affinity
    df = df[df["Ki (nM)"].notna() | df["IC50 (nM)"].notna()]
    # 3. Filter extreme values (artifacts): anything > 1 mM is non-specific
    for affinity_column in ("Ki (nM)", "IC50 (nM)", "Kd (nM)"):
        if affinity_column in df.columns:
            df = df[~(df[affinity_column] > 1e6)]
    # 4. Use only human targets
    organism_col = "Target Source Organism According to Curator or DataSource"
    if organism_col in df.columns:
        df = df[df[organism_col].str.contains("Homo sapiens", na=False)]
    return df
```

View File

@@ -0,0 +1,367 @@
---
name: cbioportal-database
description: Query cBioPortal for cancer genomics data including somatic mutations, copy number alterations, gene expression, and survival data across hundreds of cancer studies. Essential for cancer target validation, oncogene/tumor suppressor analysis, and patient-level genomic profiling.
license: LGPL-3.0
metadata:
skill-author: Kuan-lin Huang
---
# cBioPortal Database
## Overview
cBioPortal for Cancer Genomics (https://www.cbioportal.org/) is an open-access resource for exploring, visualizing, and analyzing multidimensional cancer genomics data. It hosts data from The Cancer Genome Atlas (TCGA), AACR Project GENIE, MSK-IMPACT, and hundreds of other cancer studies — covering mutations, copy number alterations (CNA), structural variants, mRNA/protein expression, methylation, and clinical data for thousands of cancer samples.
**Key resources:**
- cBioPortal website: https://www.cbioportal.org/
- REST API: https://www.cbioportal.org/api/
- API docs (Swagger): https://www.cbioportal.org/api/swagger-ui/index.html
- Python client: `bravado` or `requests`
- GitHub: https://github.com/cBioPortal/cbioportal
## When to Use This Skill
Use cBioPortal when:
- **Mutation landscape**: What fraction of a cancer type has mutations in a specific gene?
- **Oncogene/TSG validation**: Is a gene frequently mutated, amplified, or deleted in cancer?
- **Co-mutation patterns**: Are mutations in gene A and gene B mutually exclusive or co-occurring?
- **Survival analysis**: Do mutations in a gene associate with better or worse patient outcomes?
- **Alteration profiles**: What types of alterations (missense, truncating, amplification, deletion) affect a gene?
- **Pan-cancer analysis**: Compare alteration frequencies across cancer types
- **Clinical associations**: Link genomic alterations to clinical variables (stage, grade, treatment response)
- **TCGA/GENIE exploration**: Systematic access to TCGA and clinical sequencing datasets
## Core Capabilities
### 1. cBioPortal REST API
Base URL: `https://www.cbioportal.org/api`
The API is RESTful, returns JSON, and requires no API key for public data.
```python
import requests
BASE_URL = "https://www.cbioportal.org/api"
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
def cbioportal_get(endpoint, params=None):
    """GET a cBioPortal API endpoint and return the parsed JSON payload.

    Raises:
        requests.HTTPError: On a non-2xx response.
    """
    resp = requests.get(f"{BASE_URL}/{endpoint}", params=params, headers=HEADERS)
    resp.raise_for_status()
    return resp.json()
def cbioportal_post(endpoint, body):
    """POST a JSON body to a cBioPortal API endpoint; return parsed JSON.

    Raises:
        requests.HTTPError: On a non-2xx response.
    """
    resp = requests.post(f"{BASE_URL}/{endpoint}", json=body, headers=HEADERS)
    resp.raise_for_status()
    return resp.json()
```
### 2. Browse Studies
```python
def get_all_studies():
    """Return metadata records for every cancer study hosted on cBioPortal."""
    page = {"pageSize": 500}
    return cbioportal_get("studies", page)
# Each study has:
# studyId: unique identifier (e.g., "brca_tcga")
# name: human-readable name
# description: dataset description
# cancerTypeId: cancer type abbreviation
# referenceGenome: GRCh37 or GRCh38
# pmid: associated publication
studies = get_all_studies()
print(f"Total studies: {len(studies)}")
# Common TCGA study IDs:
# brca_tcga, luad_tcga, coadread_tcga, gbm_tcga, prad_tcga,
# skcm_tcga, blca_tcga, hnsc_tcga, lihc_tcga, stad_tcga
# Filter for TCGA studies
tcga_studies = [s for s in studies if "tcga" in s["studyId"]]
print([s["studyId"] for s in tcga_studies[:10]])
```
### 3. Molecular Profiles
Each study has multiple molecular profiles (mutation, CNA, expression, etc.):
```python
def get_molecular_profiles(study_id):
    """Get all molecular profiles for a study.

    Args:
        study_id: cBioPortal study identifier (e.g. "brca_tcga").

    Returns:
        List of profile dicts (molecularProfileId, name, alteration type, ...).
    """
    endpoint = f"studies/{study_id}/molecular-profiles"
    return cbioportal_get(endpoint)
profiles = get_molecular_profiles("brca_tcga")
for p in profiles:
    print(f" {p['molecularProfileId']}: {p['name']} ({p['molecularAlterationType']})")
# Alteration types:
# MUTATION_EXTENDED — somatic mutations
# COPY_NUMBER_ALTERATION — CNA (GISTIC)
# MRNA_EXPRESSION — mRNA expression
# PROTEIN_LEVEL — RPPA protein expression
# STRUCTURAL_VARIANT — fusions/rearrangements
```
### 4. Mutation Data
```python
def get_mutations(molecular_profile_id, entrez_gene_ids, sample_list_id=None):
    """Get mutations for specified genes in a molecular profile.

    Args:
        molecular_profile_id: e.g. "brca_tcga_mutations".
        entrez_gene_ids: List of Entrez gene IDs to fetch.
        sample_list_id: Sample list to query; when falsy, falls back to the
            study-wide "<study>_all" list derived from the profile ID.

    Returns:
        List of mutation record dicts.
    """
    payload = {
        "entrezGeneIds": entrez_gene_ids,
        "sampleListId": sample_list_id or molecular_profile_id.replace("_mutations", "_all"),
    }
    return cbioportal_post(
        f"molecular-profiles/{molecular_profile_id}/mutations/fetch",
        payload
    )
# BRCA1 Entrez ID is 672, TP53 is 7157, PTEN is 5728
mutations = get_mutations("brca_tcga_mutations", entrez_gene_ids=[7157])  # TP53
# Each mutation record contains:
# patientId, sampleId, entrezGeneId, gene.hugoGeneSymbol
# mutationType (Missense_Mutation, Nonsense_Mutation, Frame_Shift_Del, etc.)
# proteinChange (e.g., "R175H")
# variantClassification, variantType
# ncbiBuild, chr, startPosition, endPosition, referenceAllele, variantAllele
# mutationStatus (Somatic/Germline)
# alleleFreqT (tumor VAF)
import pandas as pd
df = pd.DataFrame(mutations)
print(df[["patientId", "mutationType", "proteinChange", "alleleFreqT"]].head())
print(f"\nMutation types:\n{df['mutationType'].value_counts()}")
```
### 5. Copy Number Alteration Data
```python
def get_cna(molecular_profile_id, entrez_gene_ids):
    """Get discrete CNA data (GISTIC: -2, -1, 0, 1, 2).

    Args:
        molecular_profile_id: e.g. "brca_tcga_gistic".
        entrez_gene_ids: Entrez IDs of the genes of interest.

    Returns:
        List of per-sample discrete copy-number records.
    """
    # Derive the study-wide sample list from either profile naming scheme
    sample_list = molecular_profile_id.replace("_gistic", "_all").replace("_cna", "_all")
    payload = {"entrezGeneIds": entrez_gene_ids, "sampleListId": sample_list}
    return cbioportal_post(
        f"molecular-profiles/{molecular_profile_id}/discrete-copy-number/fetch",
        payload
    )
# GISTIC values:
# -2 = Deep deletion (homozygous loss)
# -1 = Shallow deletion (heterozygous loss)
# 0 = Diploid (neutral)
# 1 = Low-level gain
# 2 = High-level amplification
cna_data = get_cna("brca_tcga_gistic", entrez_gene_ids=[1956])  # EGFR
df_cna = pd.DataFrame(cna_data)
print(df_cna["value"].value_counts())
```
### 6. Alteration Frequency (OncoPrint-style)
```python
def get_alteration_frequency(study_id, gene_symbols, alteration_types=None):
    """Compute alteration frequencies for genes across a cancer study.

    Args:
        study_id: cBioPortal study identifier (e.g. "brca_tcga").
        gene_symbols: HUGO gene symbols to evaluate.
        alteration_types: Currently unused; only mutations are counted.

    Returns:
        Dict mapping each gene symbol to the percentage of the study's
        samples carrying a mutation in that gene.
    """
    import requests, pandas as pd
    # Get sample list
    samples = requests.get(
        f"{BASE_URL}/studies/{study_id}/sample-lists",
        headers=HEADERS
    ).json()
    all_samples_id = next(
        (s["sampleListId"] for s in samples if s["category"] == "all_cases_in_study"), None
    )
    total_samples = len(requests.get(
        f"{BASE_URL}/sample-lists/{all_samples_id}/sample-ids",
        headers=HEADERS
    ).json())
    # Get gene Entrez IDs
    gene_data = requests.post(
        f"{BASE_URL}/genes/fetch",
        json=[{"hugoGeneSymbol": g} for g in gene_symbols],
        headers=HEADERS
    ).json()
    # NOTE(review): assumes /genes/fetch returns records in request order —
    # confirm, otherwise the zip below can mispair symbols and IDs.
    entrez_ids = [g["entrezGeneId"] for g in gene_data]
    # Get mutations
    mutation_profile = f"{study_id}_mutations"
    mutations = get_mutations(mutation_profile, entrez_ids, all_samples_id)
    freq = {}
    for g_symbol, e_id in zip(gene_symbols, entrez_ids):
        # NOTE(review): numerator counts unique patients while the denominator
        # counts samples — these differ for multi-sample patients; confirm.
        mutated = len(set(m["patientId"] for m in mutations if m["entrezGeneId"] == e_id))
        freq[g_symbol] = mutated / total_samples * 100
    return freq
# Example
freq = get_alteration_frequency("brca_tcga", ["TP53", "PIK3CA", "BRCA1", "BRCA2"])
for gene, pct in sorted(freq.items(), key=lambda x: -x[1]):
    print(f" {gene}: {pct:.1f}%")
```
### 7. Clinical Data
```python
def get_clinical_data(study_id, attribute_ids=None):
    """Get patient-level clinical data for a study.

    Args:
        study_id: cBioPortal study identifier.
        attribute_ids: NOTE(review): accepted but currently unused — every
            clinical attribute is fetched; confirm whether server-side
            filtering by attribute was intended here.

    Returns:
        List of {patientId, studyId, clinicalAttributeId, value} dicts.
        Attributes include OS_STATUS, OS_MONTHS, DFS_STATUS, DFS_MONTHS,
        TUMOR_STAGE, GRADE, AGE, SEX, RACE; study-specific ones vary.
    """
    params = {"studyId": study_id}
    # BUG FIX: the previous version assigned the response to a local variable
    # and fell off the end of the function, returning None to every caller.
    return cbioportal_get("clinical-data/fetch", params)
def get_clinical_attributes(study_id):
    """List all available clinical attributes for a study."""
    endpoint = f"studies/{study_id}/clinical-attributes"
    return cbioportal_get(endpoint)
```
## Query Workflows
### Workflow 1: Gene Alteration Profile in a Cancer Type
```python
import requests, pandas as pd
def alteration_profile(study_id, gene_symbol):
    """Full alteration profile for a gene in a cancer study.

    Combines mutation and discrete copy-number queries for one gene and
    summarizes counts of mutated patients, high-level amplifications (+2)
    and deep deletions (-2).

    Returns:
        Dict with keys "mutations", "amplifications", "deep_deletions".
    """
    # 1. Resolve the HUGO symbol to its Entrez gene ID
    gene_record = requests.post(
        f"{BASE_URL}/genes/fetch",
        json=[{"hugoGeneSymbol": gene_symbol}],
        headers=HEADERS
    ).json()[0]
    entrez = gene_record["entrezGeneId"]
    # 2. Somatic mutations
    mut_records = get_mutations(f"{study_id}_mutations", [entrez])
    mut_df = pd.DataFrame(mut_records) if mut_records else pd.DataFrame()
    # 3. Discrete copy-number calls (GISTIC)
    cna_records = get_cna(f"{study_id}_gistic", [entrez])
    cna_df = pd.DataFrame(cna_records) if cna_records else pd.DataFrame()
    # 4. Summarize
    mutated = 0 if mut_df.empty else len(set(mut_df["patientId"]))
    amplified = 0 if cna_df.empty else len(cna_df[cna_df["value"] == 2])
    deleted = 0 if cna_df.empty else len(cna_df[cna_df["value"] == -2])
    return {"mutations": mutated, "amplifications": amplified, "deep_deletions": deleted}
result = alteration_profile("brca_tcga", "PIK3CA")
print(result)
```
### Workflow 2: Pan-Cancer Gene Mutation Frequency
```python
import requests, pandas as pd
def pan_cancer_mutation_freq(gene_symbol, cancer_study_ids=None):
    """Mutation frequency of a gene across multiple cancer types.

    Args:
        gene_symbol: HUGO symbol to profile.
        cancer_study_ids: Optional whitelist of study IDs; otherwise all
            studies are considered (capped at 20 for demo purposes).

    Returns:
        DataFrame with columns [study, cancer, mutation_pct] sorted descending;
        an empty DataFrame (with those columns) if every study query failed.
    """
    studies = get_all_studies()
    if cancer_study_ids:
        studies = [s for s in studies if s["studyId"] in cancer_study_ids]
    results = []
    for study in studies[:20]:  # Limit for demo
        try:
            freq = get_alteration_frequency(study["studyId"], [gene_symbol])
            results.append({
                "study": study["studyId"],
                "cancer": study.get("cancerTypeId", ""),
                "mutation_pct": freq.get(gene_symbol, 0)
            })
        except Exception:
            # Best-effort: studies lacking a mutation profile are skipped
            pass
    # BUG FIX: sort_values on an empty, column-less DataFrame raised KeyError;
    # constructing with explicit columns keeps the empty case well-formed.
    df = pd.DataFrame(results, columns=["study", "cancer", "mutation_pct"])
    return df.sort_values("mutation_pct", ascending=False)
```
### Workflow 3: Survival Analysis by Mutation Status
```python
import requests, pandas as pd
def survival_by_mutation(study_id, gene_symbol):
    """Get survival data split by mutation status.

    Fetches OS_MONTHS / OS_STATUS clinical attributes for a study and flags
    each patient by whether they carry a mutation in `gene_symbol`.

    Returns:
        Wide DataFrame indexed by patientId with OS_MONTHS and OS_STATUS
        columns plus a boolean `mutated` column.
    """
    # This workflow fetches clinical and mutation data for downstream analysis
    gene_info = requests.post(
        f"{BASE_URL}/genes/fetch",
        json=[{"hugoGeneSymbol": gene_symbol}],
        headers=HEADERS
    ).json()[0]
    entrez_id = gene_info["entrezGeneId"]
    mutations = get_mutations(f"{study_id}_mutations", [entrez_id])
    mutated_patients = set(m["patientId"] for m in mutations)
    clinical = cbioportal_get("clinical-data/fetch", {"studyId": study_id})
    clinical_df = pd.DataFrame(clinical)
    # Keep only the overall-survival attributes
    os_data = clinical_df[clinical_df["clinicalAttributeId"].isin(["OS_MONTHS", "OS_STATUS"])]
    # NOTE(review): pivot raises on duplicate (patientId, attribute) pairs —
    # confirm the endpoint returns one record per patient per attribute.
    os_wide = os_data.pivot(index="patientId", columns="clinicalAttributeId", values="value")
    os_wide["mutated"] = os_wide.index.isin(mutated_patients)
    return os_wide
```
## Key API Endpoints Summary
| Endpoint | Description |
|----------|-------------|
| `GET /studies` | List all studies |
| `GET /studies/{studyId}/molecular-profiles` | Molecular profiles for a study |
| `POST /molecular-profiles/{profileId}/mutations/fetch` | Get mutation data |
| `POST /molecular-profiles/{profileId}/discrete-copy-number/fetch` | Get CNA data |
| `POST /molecular-profiles/{profileId}/molecular-data/fetch` | Get expression data |
| `GET /studies/{studyId}/clinical-attributes` | Available clinical variables |
| `GET /clinical-data/fetch` | Clinical data |
| `POST /genes/fetch` | Gene metadata by symbol or Entrez ID |
| `GET /studies/{studyId}/sample-lists` | Sample lists |
## Best Practices
- **Know your study IDs**: Use the Swagger UI or `GET /studies` to find the correct study ID
- **Use sample lists**: Each study has an `all` sample list and subsets; always specify the appropriate one
- **TCGA vs. GENIE**: TCGA data is comprehensive but older; GENIE has more recent clinical sequencing data
- **Entrez gene IDs**: The API uses Entrez IDs — use `/genes/fetch` to convert from symbols
- **Handle 404s**: Some molecular profiles may not exist for all studies
- **Rate limiting**: Add delays for bulk queries; consider downloading data files for large-scale analyses
## Data Downloads
For large-scale analyses, download study data directly:
```bash
# Download TCGA BRCA data
wget https://cbioportal-datahub.s3.amazonaws.com/brca_tcga.tar.gz
```
## Additional Resources
- **cBioPortal website**: https://www.cbioportal.org/
- **API Swagger UI**: https://www.cbioportal.org/api/swagger-ui/index.html
- **Documentation**: https://docs.cbioportal.org/
- **GitHub**: https://github.com/cBioPortal/cbioportal
- **Data hub**: https://datahub.cbioportal.org/
- **Citation**: Cerami E et al. (2012) Cancer Discovery. PMID: 22588877
- **API clients**: https://docs.cbioportal.org/web-api-and-clients/

View File

@@ -0,0 +1,128 @@
# cBioPortal Study Exploration Reference
## Major Study Collections
### TCGA (The Cancer Genome Atlas)
| Study ID | Cancer Type | Samples |
|----------|-------------|---------|
| `brca_tcga` | Breast Cancer | ~1,000 |
| `luad_tcga` | Lung Adenocarcinoma | ~500 |
| `lusc_tcga` | Lung Squamous Cell Carcinoma | ~500 |
| `coadread_tcga` | Colorectal Cancer | ~600 |
| `gbm_tcga` | Glioblastoma | ~600 |
| `prad_tcga` | Prostate Cancer | ~500 |
| `skcm_tcga` | Skin Cutaneous Melanoma | ~450 |
| `blca_tcga` | Bladder Urothelial Carcinoma | ~400 |
| `hnsc_tcga` | Head and Neck Squamous | ~500 |
| `lihc_tcga` | Liver Hepatocellular Carcinoma | ~370 |
| `stad_tcga` | Stomach Adenocarcinoma | ~440 |
| `ucec_tcga` | Uterine Endometrial Carcinoma | ~550 |
| `ov_tcga` | Ovarian Serous Carcinoma | ~580 |
| `kirc_tcga` | Kidney Renal Clear Cell Carcinoma | ~530 |
| `thca_tcga` | Thyroid Cancer | ~500 |
| `paad_tcga` | Pancreatic Adenocarcinoma | ~180 |
| `laml_tcga` | Acute Myeloid Leukemia | ~200 |
| `acc_tcga` | Adrenocortical Carcinoma | ~90 |
### TCGA Pan-Cancer
| Study ID | Description |
|----------|-------------|
| `tcga_pan_can_atlas_2018` | TCGA Pan-Cancer Atlas (32 cancer types, ~10K samples) |
### MSK-IMPACT (Memorial Sloan Kettering)
| Study ID | Description |
|----------|-------------|
| `msk_impact_2017` | MSK-IMPACT clinical sequencing |
| `mskcc_pd` | MSK pediatric solid tumors |
### AACR Project GENIE
| Study ID | Description |
|----------|-------------|
| `genie_14_1_public` | GENIE v14.1 (multi-center clinical sequencing) |
## Molecular Profile ID Naming Conventions
Molecular profile IDs are structured as `{studyId}_{type}`:
| Type Suffix | Alteration Type |
|-------------|----------------|
| `_mutations` | Somatic mutations (MAF) |
| `_gistic` | Copy number (GISTIC discrete: -2, -1, 0, 1, 2) |
| `_cna` | Copy number (continuous log2 ratio) |
| `_mrna` | mRNA expression (z-scores or log2) |
| `_rna_seq_v2_mrna` | RNA-seq (RSEM) |
| `_rna_seq_v2_mrna_median_Zscores` | RNA-seq z-scores relative to normals |
| `_rppa` | RPPA protein expression |
| `_rppa_Zscores` | RPPA z-scores |
| `_sv` | Structural variants/fusions |
| `_methylation_hm450` | DNA methylation (450K array) |
**Example:** For `brca_tcga`:
- `brca_tcga_mutations` — mutation data
- `brca_tcga_gistic` — CNA data
- `brca_tcga_rna_seq_v2_mrna` — RNA-seq expression
## Sample List Categories
Each study has sample lists of different subsets:
| Category | sampleListId Pattern | Contents |
|----------|---------------------|----------|
| `all_cases_in_study` | `{studyId}_all` | All samples |
| `all_cases_with_mutation_data` | `{studyId}_sequenced` | Sequenced samples only |
| `all_cases_with_cna_data` | `{studyId}_cna` | Samples with CNA data |
| `all_cases_with_mrna_data` | `{studyId}_mrna` | Samples with expression |
| `all_cases_with_rppa_data` | `{studyId}_rppa` | Samples with RPPA |
| `all_complete_cases` | `{studyId}_complete` | Complete multiplatform data |
## Common Gene Entrez IDs
| Gene | Entrez ID | Role |
|------|-----------|------|
| TP53 | 7157 | Tumor suppressor |
| PIK3CA | 5290 | Oncogene |
| KRAS | 3845 | Oncogene |
| BRCA1 | 672 | Tumor suppressor |
| BRCA2 | 675 | Tumor suppressor |
| PTEN | 5728 | Tumor suppressor |
| EGFR | 1956 | Oncogene |
| MYC | 4609 | Oncogene |
| RB1 | 5925 | Tumor suppressor |
| APC | 324 | Tumor suppressor |
| CDKN2A | 1029 | Tumor suppressor |
| IDH1 | 3417 | Oncogene (mutant) |
| BRAF | 673 | Oncogene |
| CDH1 | 999 | Tumor suppressor |
| VHL | 7428 | Tumor suppressor |
## Mutation Type Classifications
| mutationType | Description |
|-------------|-------------|
| `Missense_Mutation` | Amino acid change |
| `Nonsense_Mutation` | Premature stop codon |
| `Frame_Shift_Del` | Frameshift deletion |
| `Frame_Shift_Ins` | Frameshift insertion |
| `Splice_Site` | Splice site mutation |
| `In_Frame_Del` | In-frame deletion |
| `In_Frame_Ins` | In-frame insertion |
| `Translation_Start_Site` | Start codon mutation |
| `Nonstop_Mutation` | Stop codon mutation |
| `Silent` | Synonymous |
| `5'Flank` | 5' flanking |
| `3'UTR` | 3' UTR |
## OncoPrint Color Legend
cBioPortal uses consistent colors in OncoPrint:
- **Red**: Amplification
- **Blue (dark)**: Deep deletion
- **Green**: Missense mutation
- **Black**: Truncating mutation
- **Purple**: Fusion
- **Orange**: mRNA upregulation
- **Teal**: mRNA downregulation

View File

@@ -0,0 +1,300 @@
---
name: depmap
description: Query the Cancer Dependency Map (DepMap) for cancer cell line gene dependency scores (CRISPR Chronos), drug sensitivity data, and gene effect profiles. Use for identifying cancer-specific vulnerabilities, synthetic lethal interactions, and validating oncology drug targets.
license: CC-BY-4.0
metadata:
skill-author: Kuan-lin Huang
---
# DepMap — Cancer Dependency Map
## Overview
The Cancer Dependency Map (DepMap) project, run by the Broad Institute, systematically characterizes genetic dependencies across hundreds of cancer cell lines using genome-wide CRISPR knockout screens (DepMap CRISPR), RNA interference (RNAi), and compound sensitivity assays (PRISM). DepMap data is essential for:
- Identifying which genes are essential for specific cancer types
- Finding cancer-selective dependencies (therapeutic targets)
- Validating oncology drug targets
- Discovering synthetic lethal interactions
**Key resources:**
- DepMap Portal: https://depmap.org/portal/
- DepMap data downloads: https://depmap.org/portal/download/all/
- Python package: `depmap` (or access via API/downloads)
- API: https://depmap.org/portal/api/
## When to Use This Skill
Use DepMap when:
- **Target validation**: Is a gene essential for survival in cancer cell lines with a specific mutation (e.g., KRAS-mutant)?
- **Biomarker discovery**: What genomic features predict sensitivity to knockout of a gene?
- **Synthetic lethality**: Find genes that are selectively essential when another gene is mutated/deleted
- **Drug sensitivity**: What cell line features predict response to a compound?
- **Pan-cancer essentiality**: Is a gene broadly essential across all cancer types (bad target) or selectively essential?
- **Correlation analysis**: Which pairs of genes have correlated dependency profiles (co-essentiality)?
## Core Concepts
### Dependency Scores
| Score | Range | Meaning |
|-------|-------|---------|
| **Chronos** (CRISPR) | ~ -3 to 0+ | More negative = more essential. Common essential threshold: -1. Pan-essential genes ~ -1 to -2 |
| **RNAi DEMETER2** | ~ -3 to 0+ | Similar scale to Chronos |
| **Gene Effect** | normalized | Normalized Chronos; -1 = median effect of common essential genes |
**Key thresholds:**
- Chronos ≤ -0.5: likely dependent
- Chronos ≤ -1: strongly dependent (common essential range)
### Cell Line Annotations
Each cell line has:
- `DepMap_ID`: unique identifier (e.g., `ACH-000001`)
- `cell_line_name`: human-readable name
- `primary_disease`: cancer type
- `lineage`: broad tissue lineage
- `lineage_subtype`: specific subtype
## Core Capabilities
### 1. DepMap API
```python
import requests
import pandas as pd
BASE_URL = "https://depmap.org/portal/api"
def depmap_get(endpoint, params=None):
    """GET a DepMap portal API endpoint and return the decoded JSON payload.

    Args:
        endpoint: Path appended to BASE_URL (no leading slash).
        params: Optional dict of query-string parameters.
    Raises:
        requests.HTTPError: for non-2xx responses.
    """
    resp = requests.get(f"{BASE_URL}/{endpoint}", params=params)
    resp.raise_for_status()
    return resp.json()
```
### 2. Gene Dependency Scores
```python
def get_gene_dependency(gene_symbol, dataset="Chronos_Combined"):
    """Fetch CRISPR dependency scores for one gene across all cell lines.

    NOTE(review): the HTTP status is not checked here; a failed request
    surfaces as a JSON decode error rather than an HTTPError.
    """
    resp = requests.get(
        f"{BASE_URL}/gene",
        params={"gene_id": gene_symbol, "dataset": dataset},
    )
    return resp.json()

# Alternative: pull a per-gene slice from a named dataset via /data.
def get_dependencies_slice(gene_symbol, dataset_name="CRISPRGeneEffect"):
    """Return a gene's dependency slice from a DepMap dataset."""
    resp = requests.get(
        f"{BASE_URL}/data/gene_dependency",
        params={"gene_name": gene_symbol, "dataset_name": dataset_name},
    )
    return resp.json()
```
### 3. Download-Based Analysis (Recommended for Large Queries)
For large-scale analysis, download DepMap data files and analyze locally:
```python
import pandas as pd
import requests, os
def download_depmap_data(url, output_path):
    """Stream a DepMap data file to disk.

    Args:
        url: Direct download URL of the data file.
        output_path: Local path to write the file to.
    Raises:
        requests.HTTPError: on a failed download, instead of silently
        writing an HTML error page to output_path (the original bug).
    """
    response = requests.get(url, stream=True)
    response.raise_for_status()  # fail loudly on 4xx/5xx
    # 'with' guarantees the file handle is closed even if a chunk fails
    with open(output_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
# DepMap 24Q4 data files (update version as needed)
FILES = {
"crispr_gene_effect": "https://figshare.com/ndownloader/files/...",
# OR download from: https://depmap.org/portal/download/all/
# Files available:
# CRISPRGeneEffect.csv - Chronos gene effect scores
# OmicsExpressionProteinCodingGenesTPMLogp1.csv - mRNA expression
# OmicsSomaticMutationsMatrixDamaging.csv - mutation binary matrix
# OmicsCNGene.csv - copy number
# sample_info.csv - cell line metadata
}
def load_depmap_gene_effect(filepath="CRISPRGeneEffect.csv"):
    """
    Read the DepMap CRISPR gene-effect matrix into a DataFrame.

    Rows are cell lines (DepMap_ID). Columns arrive as "SYMBOL (ENTREZ)"
    and are trimmed down to the bare gene symbol for convenient lookup.
    """
    matrix = pd.read_csv(filepath, index_col=0)
    matrix.columns = [name.split(" ")[0] for name in matrix.columns]
    return matrix
def load_cell_line_info(filepath="sample_info.csv"):
    """Load the DepMap cell-line annotation table (sample_info.csv)."""
    info = pd.read_csv(filepath)
    return info
```
### 4. Identifying Selective Dependencies
```python
import numpy as np
import pandas as pd
def find_selective_dependencies(gene_effect_df, cell_line_info, target_gene,
                                cancer_type=None, threshold=-0.5):
    """Return cell lines whose gene-effect score for *target_gene* is at or
    below *threshold*, annotated with cell-line metadata.

    Returns None when the gene is absent from the matrix; otherwise a
    DataFrame sorted most-dependent first, optionally filtered to lines
    whose primary_disease contains *cancer_type* (case-insensitive).
    """
    if target_gene not in gene_effect_df.columns:
        return None
    effects = gene_effect_df[target_gene].dropna()
    hits = effects[effects <= threshold]
    # Join dependency scores with cell-line annotations
    annotated = pd.DataFrame(
        {"DepMap_ID": hits.index, "gene_effect": hits.values}
    ).merge(
        cell_line_info[["DepMap_ID", "cell_line_name", "primary_disease", "lineage"]]
    )
    if cancer_type:
        mask = annotated["primary_disease"].str.contains(cancer_type, case=False, na=False)
        annotated = annotated[mask]
    return annotated.sort_values("gene_effect")
# Example usage (after loading data)
# df_effect = load_depmap_gene_effect("CRISPRGeneEffect.csv")
# cell_info = load_cell_line_info("sample_info.csv")
# deps = find_selective_dependencies(df_effect, cell_info, "KRAS", cancer_type="Lung")
```
### 5. Biomarker Analysis (Gene Effect vs. Mutation)
```python
import pandas as pd
from scipy import stats
def biomarker_analysis(gene_effect_df, mutation_df, target_gene, biomarker_gene):
    """
    Test whether mutation of biomarker_gene predicts stronger dependency
    on target_gene (one-sided Mann-Whitney U, mutant < wild-type scores).

    Args:
        gene_effect_df: CRISPR gene effect DataFrame (rows = cell lines)
        mutation_df: Binary mutation DataFrame (1 = mutated)
        target_gene: Gene whose dependency is assessed
        biomarker_gene: Gene whose mutation may predict dependency

    Returns:
        Dict of group sizes/means and the one-sided p-value, or None when
        either gene is missing or one of the groups is empty (the U test
        is undefined for an empty sample and previously raised).
    """
    if target_gene not in gene_effect_df.columns or biomarker_gene not in mutation_df.columns:
        return None
    # Restrict to cell lines present in both matrices
    common_lines = gene_effect_df.index.intersection(mutation_df.index)
    scores = gene_effect_df.loc[common_lines, target_gene].dropna()
    mutations = mutation_df.loc[scores.index, biomarker_gene]
    mutated = scores[mutations == 1]
    wt = scores[mutations == 0]
    # Guard: an empty group makes mannwhitneyu raise ValueError
    if len(mutated) == 0 or len(wt) == 0:
        return None
    # 'less': is the gene effect more negative (more essential) in mutants?
    stat, pval = stats.mannwhitneyu(mutated, wt, alternative='less')
    return {
        "target_gene": target_gene,
        "biomarker_gene": biomarker_gene,
        "n_mutated": len(mutated),
        "n_wt": len(wt),
        "mean_effect_mutated": mutated.mean(),
        "mean_effect_wt": wt.mean(),
        "pval": pval,
        "significant": pval < 0.05,  # nominal; apply FDR for genome-wide scans
    }
```
### 6. Co-Essentiality Analysis
```python
import pandas as pd
def co_essentiality(gene_effect_df, target_gene, top_n=20):
    """Rank genes by Pearson correlation of their dependency profile with
    target_gene; co-essential partners often share a complex or pathway.

    Genes sharing fewer than 50 scored cell lines with the target are
    skipped. Returns the top_n most positively correlated genes as a
    Series, or None if the target gene is absent from the matrix.
    """
    if target_gene not in gene_effect_df.columns:
        return None
    reference = gene_effect_df[target_gene].dropna()
    corr_by_gene = {}
    for other in gene_effect_df.columns:
        if other == target_gene:
            continue
        profile = gene_effect_df[other].dropna()
        shared = reference.index.intersection(profile.index)
        if len(shared) < 50:
            continue  # too few lines for a stable correlation
        r = reference[shared].corr(profile[shared])
        if not pd.isna(r):
            corr_by_gene[other] = r
    ranked = pd.Series(corr_by_gene).sort_values(ascending=False)
    return ranked.head(top_n)
# Co-essential genes often share biological complexes or pathways
```
## Query Workflows
### Workflow 1: Target Validation for a Cancer Type
1. Download `CRISPRGeneEffect.csv` and `sample_info.csv`
2. Filter cell lines by cancer type
3. Compute mean gene effect for target gene in cancer vs. all others
4. Calculate selectivity: how specific is the dependency to your cancer type?
5. Cross-reference with mutation, expression, or CNA data as biomarkers
### Workflow 2: Synthetic Lethality Screen
1. Identify cell lines with mutation/deletion in gene of interest (e.g., BRCA1-mutant)
2. Compute gene effect scores for all genes in mutant vs. WT lines
3. Identify genes significantly more essential in mutant lines (synthetic lethal partners)
4. Filter by selectivity and effect size
### Workflow 3: Compound Sensitivity Analysis
1. Download PRISM compound sensitivity data (`primary-screen-replicate-treatment-info.csv`)
2. Correlate compound AUC/log2(fold-change) with genomic features
3. Identify predictive biomarkers for compound sensitivity
## DepMap Data Files Reference
| File | Description |
|------|-------------|
| `CRISPRGeneEffect.csv` | CRISPR Chronos gene effect (primary dependency data) |
| `CRISPRGeneEffectUnscaled.csv` | Unscaled CRISPR scores |
| `RNAi_merged.csv` | DEMETER2 RNAi dependency |
| `sample_info.csv` | Cell line metadata (lineage, disease, etc.) |
| `OmicsExpressionProteinCodingGenesTPMLogp1.csv` | mRNA expression |
| `OmicsSomaticMutationsMatrixDamaging.csv` | Damaging somatic mutations (binary) |
| `OmicsCNGene.csv` | Copy number per gene |
| `PRISM_Repurposing_Primary_Screens_Data.csv` | Drug sensitivity (repurposing library) |
Download all files from: https://depmap.org/portal/download/all/
## Best Practices
- **Use Chronos scores** (not DEMETER2) for current CRISPR analyses — better controlled for cutting efficiency
- **Distinguish pan-essential from cancer-selective**: Target genes with low variance (essential in all lines) are poor drug targets
- **Validate with expression data**: A gene not expressed in a cell line will score as non-essential regardless of actual function
- **Use DepMap ID** for cell line identification — cell_line_name can be ambiguous
- **Account for copy number**: Amplified genes may appear essential due to DNA-cutting toxicity at high copy number, a known CRISPR screen artifact that Chronos partially corrects
- **Multiple testing correction**: When computing biomarker associations genome-wide, apply FDR correction
## Additional Resources
- **DepMap Portal**: https://depmap.org/portal/
- **Data downloads**: https://depmap.org/portal/download/all/
- **DepMap paper**: Behan FM et al. (2019) Nature. PMID: 30971826
- **Chronos paper**: Dempster JM et al. (2021) Nature Methods. PMID: 34349281
- **GitHub**: https://github.com/broadinstitute/depmap-portal
- **Figshare**: https://figshare.com/articles/dataset/DepMap_24Q4_Public/27993966

View File

@@ -0,0 +1,178 @@
# DepMap Dependency Analysis Guide
## Understanding Chronos Scores
Chronos is the current (v5+) algorithm for computing gene dependency scores from CRISPR screen data. It addresses systematic biases including:
- Copy number effects (high-copy genes appear essential due to DNA cutting)
- Guide RNA efficiency variation
- Cell line growth rates
### Score Interpretation
| Score Range | Interpretation |
|------------|----------------|
| > 0 | Likely growth-promoting when knocked out (some noise) |
| 0 to -0.3 | Non-essential: minimal fitness effect |
| -0.3 to -0.5 | Mild dependency |
| -0.5 to -1.0 | Significant dependency |
| < -1.0 | Strong dependency (common essential range) |
| ≈ -1.0 | Median of pan-essential genes (e.g., proteasome subunits) |
### Common Essential Genes (Controls)
Genes that are essential in nearly all cell lines (score ~ -1 to -2):
- Ribosomal proteins: RPL..., RPS...
- Proteasome: PSMA..., PSMB...
- Spliceosome: SNRPD1, SNRNP70
- DNA replication: MCM2, PCNA
- Transcription: POLR2A, TAF...
These can be used as positive controls for screen quality.
### Non-Essential Controls
Genes with negligible fitness effect (score ~ 0):
- Non-expressed genes (tissue-specific)
- Safe harbor loci
## Selectivity Assessment
To determine if a dependency is cancer-selective:
```python
import pandas as pd
import numpy as np
def compute_selectivity(gene_effect_df, target_gene, cancer_lineage):
    """Quantify how selectively *cancer_lineage* depends on *target_gene*.

    Selectivity = mean score in all other lineages minus mean score in the
    target lineage; positive means the target lineage is more dependent.
    """
    scores = gene_effect_df[target_gene].dropna()
    # Lineage annotations come from sample_info.csv via the project helper
    from depmap_utils import load_cell_line_info
    cell_info = load_cell_line_info()
    annotated = scores.reset_index()
    annotated.columns = ["DepMap_ID", "score"]
    annotated = annotated.merge(cell_info[["DepMap_ID", "lineage"]])
    in_lineage = annotated.loc[annotated["lineage"] == cancer_lineage, "score"]
    out_lineage = annotated.loc[annotated["lineage"] != cancer_lineage, "score"]
    return {
        "target_gene": target_gene,
        "cancer_lineage": cancer_lineage,
        "cancer_mean": in_lineage.mean(),
        "other_mean": out_lineage.mean(),
        "selectivity_score": out_lineage.mean() - in_lineage.mean(),
        "n_cancer": len(in_lineage),
        "fraction_dependent": (in_lineage < -0.5).mean(),
    }
```
## CRISPR Dataset Versions
| Dataset | Description | Recommended |
|---------|-------------|-------------|
| `CRISPRGeneEffect` | Chronos-corrected gene effect | Yes (current) |
| `Achilles_gene_effect` | Older CERES algorithm | Legacy only |
| `RNAi_merged` | DEMETER2 RNAi | For cross-validation |
## Quality Metrics
DepMap reports quality control metrics per screen:
- **Skewness**: Pan-essential genes should show negative skew
- **AUC**: Area under ROC for pan-essential vs non-essential controls
Good screens: skewness < -1 (pan-essentials pull the distribution left), AUC > 0.85
## Cancer Lineage Codes
Common values for `lineage` field in `sample_info.csv`:
| Lineage | Description |
|---------|-------------|
| `lung` | Lung cancer |
| `breast` | Breast cancer |
| `colorectal` | Colorectal cancer |
| `brain_cancer` | Brain cancer (GBM, etc.) |
| `leukemia` | Leukemia |
| `lymphoma` | Lymphoma |
| `prostate` | Prostate cancer |
| `ovarian` | Ovarian cancer |
| `pancreatic` | Pancreatic cancer |
| `skin` | Melanoma and other skin |
| `liver` | Liver cancer |
| `kidney` | Kidney cancer |
## Synthetic Lethality Analysis
```python
import pandas as pd
import numpy as np
from scipy import stats
def find_synthetic_lethal(gene_effect_df, mutation_df, biomarker_gene,
fdr_threshold=0.1):
"""
Find synthetic lethal partners for a loss-of-function mutation.
For each gene, tests if cell lines mutant in biomarker_gene
are more dependent on that gene vs. WT lines.
"""
if biomarker_gene not in mutation_df.columns:
return pd.DataFrame()
# Get mutant vs WT cell lines
common = gene_effect_df.index.intersection(mutation_df.index)
is_mutant = mutation_df.loc[common, biomarker_gene] == 1
mutant_lines = common[is_mutant]
wt_lines = common[~is_mutant]
results = []
for gene in gene_effect_df.columns:
mut_scores = gene_effect_df.loc[mutant_lines, gene].dropna()
wt_scores = gene_effect_df.loc[wt_lines, gene].dropna()
if len(mut_scores) < 5 or len(wt_scores) < 10:
continue
stat, pval = stats.mannwhitneyu(mut_scores, wt_scores, alternative='less')
results.append({
"gene": gene,
"mean_mutant": mut_scores.mean(),
"mean_wt": wt_scores.mean(),
"effect_size": wt_scores.mean() - mut_scores.mean(),
"pval": pval,
"n_mutant": len(mut_scores),
"n_wt": len(wt_scores)
})
df = pd.DataFrame(results)
# FDR correction
from scipy.stats import false_discovery_control
df["qval"] = false_discovery_control(df["pval"], method="bh")
df = df[df["qval"] < fdr_threshold].sort_values("effect_size", ascending=False)
return df
```
## Drug Sensitivity (PRISM)
DepMap also contains compound sensitivity data from the PRISM assay:
```python
import pandas as pd
def load_prism_data(filepath="primary-screen-replicate-collapsed-logfold-change.csv"):
    """
    Read PRISM drug-sensitivity data into a DataFrame.

    Rows are cell lines; columns are compounds (broad_id::name::dose);
    values are log2 fold changes (more negative = more sensitive).
    """
    sensitivity = pd.read_csv(filepath, index_col=0)
    return sensitivity
# Available datasets:
# primary-screen: 4,518 compounds at single dose
# secondary-screen: ~8,000 compounds at multiple doses (AUC available)
```

View File

@@ -0,0 +1,338 @@
---
name: glycoengineering
description: Analyze and engineer protein glycosylation. Scan sequences for N-glycosylation sequons (N-X-S/T), predict O-glycosylation hotspots, and access curated glycoengineering tools (NetOGlyc, GlycoShield, GlycoWorkbench). For glycoprotein engineering, therapeutic antibody optimization, and vaccine design.
license: Unknown
metadata:
skill-author: Kuan-lin Huang
---
# Glycoengineering
## Overview
Glycosylation is the most common and complex post-translational modification (PTM) of proteins, affecting over 50% of all human proteins. Glycans regulate protein folding, stability, immune recognition, receptor interactions, and pharmacokinetics of therapeutic proteins. Glycoengineering involves rational modification of glycosylation patterns for improved therapeutic efficacy, stability, or immune evasion.
**Two major glycosylation types:**
- **N-glycosylation**: Attached to asparagine (N) in the sequon N-X-[S/T] where X ≠ Proline; occurs in the ER/Golgi
- **O-glycosylation**: Attached to serine (S) or threonine (T); no strict consensus motif; primarily GalNAc initiation
## When to Use This Skill
Use this skill when:
- **Antibody engineering**: Optimize Fc glycosylation for enhanced ADCC, CDC, or reduced immunogenicity
- **Therapeutic protein design**: Identify glycosylation sites that affect half-life, stability, or immunogenicity
- **Vaccine antigen design**: Engineer glycan shields to focus immune responses on conserved epitopes
- **Biosimilar characterization**: Compare glycan patterns between reference and biosimilar
- **Drug target analysis**: Does glycosylation affect target engagement for a receptor?
- **Protein stability**: N-glycans often stabilize proteins; identify sites for stabilizing mutations
## N-Glycosylation Sequon Analysis
### Scanning for N-Glycosylation Sites
N-glycosylation occurs at the sequon **N-X-[S/T]** where X ≠ Proline.
```python
import re
from typing import List, Tuple
def find_n_glycosylation_sequons(sequence: str) -> List[dict]:
    """
    Scan a protein sequence for canonical N-linked glycosylation sequons.
    Motif: N-X-[S/T], where X ≠ Proline.

    Bug fix: the scan now advances one residue at a time after a hit.
    The previous version jumped three residues past each match, so
    overlapping sequons (e.g. both Asn in "NNST") were silently missed.

    Args:
        sequence: Single-letter amino acid sequence
    Returns:
        List of dicts with position (1-based), motif, context, sequon_type
    """
    seq = sequence.upper()
    results = []
    for i in range(len(seq) - 2):
        triplet = seq[i:i+3]
        if triplet[0] == 'N' and triplet[1] != 'P' and triplet[2] in {'S', 'T'}:
            results.append({
                'position': i + 1,  # 1-based residue number of the Asn
                'motif': triplet,
                'context': seq[max(0, i-3):i+6],  # ±3 residue context
                'sequon_type': 'NXS' if triplet[2] == 'S' else 'NXT'
            })
    return results
def summarize_glycosylation_sites(sequence: str, protein_name: str = "") -> str:
    """Format a research-log style report of N-glycosylation sequons."""
    sequons = find_n_glycosylation_sequons(sequence)
    report = [
        f"# N-Glycosylation Sequon Analysis: {protein_name or 'Protein'}",
        f"Sequence length: {len(sequence)}",
        f"Total N-glycosylation sequons: {len(sequons)}",
    ]
    if not sequons:
        report.append("No canonical N-glycosylation sequons detected.")
        return "\n".join(report)
    nxs = sum(1 for s in sequons if s['sequon_type'] == 'NXS')
    nxt = sum(1 for s in sequons if s['sequon_type'] == 'NXT')
    report.append(f"\nN-X-S sites: {nxs}")
    report.append(f"N-X-T sites: {nxt}")
    report.append(f"\nSite details:")
    for s in sequons:
        report.append(f"  Position {s['position']}: {s['motif']} (context: ...{s['context']}...)")
    return "\n".join(report)
# Example: IgG1 Fc region
fc_sequence = "APELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSREEMTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPGK"
print(summarize_glycosylation_sites(fc_sequence, "IgG1 Fc"))
```
### Mutating N-Glycosylation Sites
```python
def eliminate_glycosite(sequence: str, position: int, replacement: str = "Q") -> str:
    """
    Eliminate an N-glycosylation site by substituting the Asn (default
    N → Q: Gln is similarly sized and cannot be N-glycosylated).

    Args:
        sequence: Protein sequence
        position: 1-based position of the Asn to mutate
        replacement: Amino acid to substitute (default Q = Gln)
    Returns:
        Mutated sequence
    Raises:
        ValueError: if position is out of range or does not hold an Asn.
        (The original used `assert`, which is silently stripped under
        `python -O`, and had no bounds check at all.)
    """
    seq = list(sequence.upper())
    idx = position - 1
    if not 0 <= idx < len(seq):
        raise ValueError(f"Position {position} is outside the sequence (length {len(seq)})")
    if seq[idx] != 'N':
        raise ValueError(f"Position {position} is '{seq[idx]}', not 'N'")
    seq[idx] = replacement.upper()
    return ''.join(seq)
def add_glycosite(sequence: str, position: int, flanking_context: str = "S") -> str:
    """
    Introduce an N-glycosylation sequon (N-X-[S/T]) at *position*.

    The residue at *position* becomes Asn; a Pro at position+1 is swapped
    for Ala (Pro at X blocks glycosylation), and position+2 is forced to
    *flanking_context* when it is neither Ser nor Thr.

    Args:
        position: 1-based position to introduce Asn
        flanking_context: 'S' or 'T' to place at position+2 if needed
    """
    residues = list(sequence.upper())
    asn_idx = position - 1
    residues[asn_idx] = 'N'  # the glycosylated Asn
    x_idx, st_idx = asn_idx + 1, asn_idx + 2
    if x_idx < len(residues) and residues[x_idx] == 'P':
        residues[x_idx] = 'A'  # X must not be proline
    if st_idx < len(residues) and residues[st_idx] not in ('S', 'T'):
        residues[st_idx] = flanking_context
    return ''.join(residues)
```
## O-Glycosylation Analysis
### Heuristic O-Glycosylation Hotspot Prediction
```python
def predict_o_glycosylation_hotspots(
    sequence: str,
    window: int = 7,
    min_st_fraction: float = 0.4,
    disallow_proline_next: bool = True
) -> List[dict]:
    """
    Flag Ser/Thr residues sitting in locally S/T-rich segments as candidate
    O-GalNAc sites. A fast heuristic baseline — not a NetOGlyc substitute.

    Rules:
    - O-GalNAc glycosylation clusters on Ser/Thr-rich segments
    - An S/T immediately followed by Pro is skipped when
      disallow_proline_next is set (SP/TP motifs inhibit GalNAc-T)

    Args:
        window: Odd window size for local S/T density (even values fall
            back to 7)
        min_st_fraction: Minimum S/T fraction in the window to flag a site
    """
    if window % 2 == 0:
        window = 7  # keep the window symmetric around the residue
    seq = sequence.upper()
    half = window // 2
    hotspots = []
    for pos, residue in enumerate(seq):
        if residue != 'S' and residue != 'T':
            continue
        if disallow_proline_next and pos + 1 < len(seq) and seq[pos + 1] == 'P':
            continue
        lo = max(0, pos - half)
        hi = min(len(seq), pos + half + 1)
        segment = seq[lo:hi]
        density = sum(c in ('S', 'T') for c in segment) / len(segment)
        if density >= min_st_fraction:
            hotspots.append({
                'position': pos + 1,
                'residue': residue,
                'st_fraction': round(density, 3),
                'window': f"{lo+1}-{hi}",
                'segment': segment
            })
    return hotspots
```
## External Glycoengineering Tools
### 1. NetOGlyc 4.0 (O-glycosylation prediction)
Web service for high-accuracy O-GalNAc site prediction:
- **URL**: https://services.healthtech.dtu.dk/services/NetOGlyc-4.0/
- **Input**: FASTA protein sequence
- **Output**: Per-residue O-glycosylation probability scores
- **Method**: Neural network trained on experimentally verified O-GalNAc sites
```python
import requests
def submit_netoglycv4(fasta_sequence: str) -> str:
    """
    Point the user at the NetOGlyc 4.0 web service.

    Programmatic submission parameters vary between DTU web-service
    versions, so this helper just prints the interactive URL (results
    typically take ~1-5 min) and returns the submission endpoint.
    """
    endpoint = "https://services.healthtech.dtu.dk/cgi-bin/webface2.cgi"
    print("Submit sequence at: https://services.healthtech.dtu.dk/services/NetOGlyc-4.0/")
    return endpoint
# Also: NetNGlyc for N-glycosylation prediction
# URL: https://services.healthtech.dtu.dk/services/NetNGlyc-1.0/
```
### 2. GlycoShield-MD (Glycan Shielding Analysis)
GlycoShield-MD analyzes how glycans shield protein surfaces during MD simulations:
- **URL**: https://gitlab.mpcdf.mpg.de/dioscuri-biophysics/glycoshield-md/
- **Use**: Map glycan shielding on protein surface over MD trajectory
- **Output**: Per-residue shielding fraction, visualization
```bash
# Installation
pip install glycoshield
# Basic usage: analyze glycan shielding from glycosylated protein MD trajectory
glycoshield \
--topology glycoprotein.pdb \
--trajectory glycoprotein.xtc \
--glycan_resnames BGLCNA FUC \
--output shielding_analysis/
```
### 3. GlycoWorkbench (Glycan Structure Drawing/Analysis)
- **URL**: https://www.eurocarbdb.org/project/glycoworkbench
- **Use**: Draw glycan structures, calculate masses, annotate MS spectra
- **Format**: GlycoCT, IUPAC condensed glycan notation
### 4. GlyConnect (Glycan-Protein Database)
- **URL**: https://glyconnect.expasy.org/
- **Use**: Find experimentally verified glycoproteins and glycosylation sites
- **Query**: By protein (UniProt ID), glycan structure, or tissue
```python
import requests
def query_glyconnect(uniprot_id: str) -> dict:
    """Fetch GlyConnect glycosylation records for a UniProt accession.

    Returns the decoded JSON on HTTP 200, otherwise an empty dict.
    """
    endpoint = f"https://glyconnect.expasy.org/api/proteins/uniprot/{uniprot_id}"
    resp = requests.get(endpoint, headers={"Accept": "application/json"})
    return resp.json() if resp.status_code == 200 else {}
# Example: query EGFR glycosylation
egfr_glyco = query_glyconnect("P00533")
```
### 5. UniCarbKB (Glycan Structure Database)
- **URL**: https://unicarbkb.org/
- **Use**: Browse glycan structures, search by mass or composition
- **Format**: GlycoCT or IUPAC notation
## Key Glycoengineering Strategies
### For Therapeutic Antibodies
| Goal | Strategy | Notes |
|------|----------|-------|
| Enhance ADCC | Defucosylation at Fc Asn297 | Afucosylated IgG1 has ~50× better FcγRIIIa binding |
| Reduce immunogenicity | Remove non-human glycans | Eliminate α-Gal, NGNA epitopes |
| Improve PK half-life | Sialylation | Sialylated glycans extend half-life |
| Reduce inflammation | Hypersialylation | IVIG anti-inflammatory mechanism |
| Create glycan shield | Add N-glycosites to surface | Masks vulnerable epitopes (vaccine design) |
### Common Mutations Used
| Mutation | Effect |
|----------|--------|
| N297A/Q (IgG1) | Removes Fc glycosylation (aglycosyl) |
| N297D (IgG1) | Removes Fc glycosylation |
| S298A/E333A/K334A | Increases FcγRIIIa binding |
| F243L (IgG1) | Increases defucosylation |
| T299A | Removes Fc glycosylation |
## Glycan Notation
### IUPAC Condensed Notation (Monosaccharide abbreviations)
| Symbol | Full Name | Type |
|--------|-----------|------|
| Glc | Glucose | Hexose |
| GlcNAc | N-Acetylglucosamine | HexNAc |
| Man | Mannose | Hexose |
| Gal | Galactose | Hexose |
| Fuc | Fucose | Deoxyhexose |
| Neu5Ac | N-Acetylneuraminic acid (Sialic acid) | Sialic acid |
| GalNAc | N-Acetylgalactosamine | HexNAc |
### Complex N-Glycan Structure
```
Typical complex biantennary N-glycan:
Neu5Ac-Gal-GlcNAc-Man\
Man-GlcNAc-GlcNAc-[Asn]
Neu5Ac-Gal-GlcNAc-Man/
(±Core Fuc at innermost GlcNAc)
```
## Best Practices
- **Start with NetNGlyc/NetOGlyc** for computational prediction before experimental validation
- **Verify with mass spectrometry**: Glycoproteomics (Byonic, Mascot) for site-specific glycan profiling
- **Consider site context**: Not all predicted sequons are actually glycosylated (accessibility, cell type, protein conformation)
- **For antibodies**: Fc N297 glycan is critical — always characterize this site first
- **Use GlyConnect** to check if your protein of interest has experimentally verified glycosylation data
## Additional Resources
- **GlyTouCan** (glycan structure repository): https://glytoucan.org/
- **GlyConnect**: https://glyconnect.expasy.org/
- **CFG Functional Glycomics**: http://www.functionalglycomics.org/
- **DTU Health Tech servers** (NetNGlyc, NetOGlyc): https://services.healthtech.dtu.dk/
- **GlycoWorkbench**: https://glycoworkbench.software.informer.com/
- **Review**: Apweiler R et al. (1999) Biochim Biophys Acta. PMID: 10564035
- **Therapeutic glycoengineering review**: Jefferis R (2009) Nature Reviews Drug Discovery. PMID: 19448661

View File

@@ -0,0 +1,165 @@
# Glycan Databases and Resources Reference
## Primary Databases
### GlyTouCan
- **URL**: https://glytoucan.org/
- **Content**: Unique accession numbers (GTC IDs) for glycan structures
- **Use**: Standardized glycan identification across databases
- **Format**: GlycoCT, WURCS, IUPAC
```python
import requests
def lookup_glytoucan(glytoucan_id: str) -> dict:
    """Retrieve the record for a GlyTouCan accession (e.g. 'G00026MO').

    Returns the parsed JSON record, or an empty dict on any HTTP error.
    """
    endpoint = f"https://api.glytoucan.org/glycan/{glytoucan_id}"
    resp = requests.get(endpoint, headers={"Accept": "application/json"})
    if resp.ok:
        return resp.json()
    return {}
```
### GlyConnect
- **URL**: https://glyconnect.expasy.org/
- **Content**: Protein glycosylation database with site-specific glycan profiles
- **Integration**: Links UniProt proteins to experimentally verified glycosylation
- **Use**: Look up known glycosylation for your target protein
```python
import requests
def get_glycoprotein_info(uniprot_id: str) -> dict:
    """Fetch GlyConnect glycosylation data for a UniProt accession.

    Returns the parsed JSON payload, or an empty dict on HTTP error.
    """
    resp = requests.get(
        f"https://glyconnect.expasy.org/api/proteins/uniprot/{uniprot_id}"
    )
    if not resp.ok:
        return {}
    return resp.json()
def get_glycan_compositions(glyconnect_protein_id: int) -> list:
    """List glycan compositions attached to a GlyConnect protein entry.

    Returns the records under the response's "data" key; empty list on
    HTTP error or when no compositions are recorded.
    """
    resp = requests.get(
        f"https://glyconnect.expasy.org/api/compositions/protein/{glyconnect_protein_id}"
    )
    if not resp.ok:
        return []
    return resp.json().get("data", [])
```
### UniCarbKB
- **URL**: https://unicarbkb.org/
- **Content**: Curated glycan structures with biological context
- **Features**: Tissue/cell-type specific glycan data, mass spectrometry data
### KEGG Glycan
- **URL**: https://www.genome.jp/kegg/glycan/
- **Content**: Glycan structures in KEGG format, biosynthesis pathways
- **Integration**: Links to KEGG PATHWAY maps for glycan biosynthesis
### CAZy (Carbohydrate-Active Enzymes)
- **URL**: http://www.cazy.org/
- **Content**: Enzymes that build, break, and modify glycans
- **Use**: Identify enzymes for glycoengineering applications
## Prediction Servers
### NetNGlyc 1.0
- **URL**: https://services.healthtech.dtu.dk/services/NetNGlyc-1.0/
- **Method**: Neural network for N-glycosylation site prediction
- **Input**: Protein FASTA sequence
- **Output**: Per-asparagine probability score; threshold ~0.5
### NetOGlyc 4.0
- **URL**: https://services.healthtech.dtu.dk/services/NetOGlyc-4.0/
- **Method**: Neural network for O-GalNAc glycosylation prediction
- **Input**: Protein FASTA sequence
- **Output**: Per-serine/threonine probability; threshold ~0.5
### GlycoMine (Machine Learning)
- Machine learning predictor for N-, O- and C-glycosylation
- Multiple glycan types: N-GlcNAc, O-GalNAc, O-GlcNAc, O-Man, O-Fuc, O-Glc, C-Man
### SymLink (Glycosylation site & sequon predictor)
- Species-specific N-glycosylation prediction
- More specific than simple sequon scanning
## Mass Spectrometry Glycoproteomics Tools
### Byonic (Protein Metrics)
- De novo glycopeptide identification from MS2 spectra
- Comprehensive glycan database
- Site-specific glycoform assignment
### Mascot Glycan Analysis
- Glycan-specific search parameters
- Common for bottom-up glycoproteomics
### GlycoWorkbench
- **URL**: https://www.eurocarbdb.org/project/glycoworkbench
- Glycan structure drawing and mass calculation
- Annotation of MS/MS spectra with glycan fragment ions
### Skyline
- Targeted quantification of glycopeptides
- Integrates with glycan database
## Glycan Nomenclature Systems
### Oxford Notation (For N-glycans)
Codes complex N-glycans as text strings:
```
G0F = Core-fucosylated, biantennary, no galactose
G1F = Core-fucosylated, one galactose
G2F = Core-fucosylated, two galactoses
G2FS1 = Core-fucosylated, two galactoses, one sialic acid
G2FS2 = Core-fucosylated, two galactoses, two sialic acids
M5 = High mannose 5 (Man5GlcNAc2)
M9 = High mannose 9 (Man9GlcNAc2)
```
### Symbol Nomenclature for Glycans (SNFG)
Standard colored symbols for publications:
- Blue circle = Glucose
- Green circle = Mannose
- Yellow circle = Galactose
- Blue square = N-Acetylglucosamine
- Yellow square = N-Acetylgalactosamine
- Purple diamond = N-Acetylneuraminic acid (sialic acid)
- Red triangle = Fucose
## Therapeutic Glycoproteins and Key Glycosylation Sites
| Therapeutic | Target | Key Glycosylation | Function |
|-------------|--------|------------------|---------|
| IgG1 antibody | Various | N297 (Fc) | ADCC/CDC effector function |
| Erythropoietin | EPOR | N24, N38, N83, O-glycans | Pharmacokinetics |
| Etanercept | TNF | N420 (IgG1 Fc) | Half-life |
| tPA (alteplase) | Fibrin | N117, N184, N448 | Fibrin binding |
| Factor VIII | VWF | 25 N-glycosites | Clearance |
## Batch Analysis Example
```python
from glycoengineering_tools import find_n_glycosylation_sequons, predict_o_glycosylation_hotspots
import pandas as pd
def analyze_glycosylation_landscape(sequences_dict: dict) -> pd.DataFrame:
    """
    Batch analysis of glycosylation for multiple proteins.

    Args:
        sequences_dict: {protein_name: sequence}

    Returns:
        DataFrame with one row per protein: sequence length, N-glycosite
        count and positions, O-glycosylation hotspot count, and N-glycosite
        density (sites per 100 residues).
    """
    results = []
    for name, seq in sequences_dict.items():
        n_sites = find_n_glycosylation_sequons(seq)
        o_sites = predict_o_glycosylation_hotspots(seq)
        results.append({
            'protein': name,
            'length': len(seq),
            'n_glycosites': len(n_sites),
            'o_glyco_hotspots': len(o_sites),
            # Guard against empty sequences to avoid ZeroDivisionError.
            'n_glyco_density': (len(n_sites) / len(seq) * 100) if seq else 0.0,
            'n_glyco_positions': [s['position'] for s in n_sites]
        })
    return pd.DataFrame(results)
```

View File

@@ -0,0 +1,395 @@
---
name: gnomad-database
description: Query gnomAD (Genome Aggregation Database) for population allele frequencies, variant constraint scores (pLI, LOEUF), and loss-of-function intolerance. Essential for variant pathogenicity interpretation, rare disease genetics, and identifying loss-of-function intolerant genes.
license: CC0-1.0
metadata:
skill-author: Kuan-lin Huang
---
# gnomAD Database
## Overview
The Genome Aggregation Database (gnomAD) is the largest publicly available collection of human genetic variation, aggregated from large-scale sequencing projects. gnomAD v4 contains exome sequences from 730,947 individuals and genome sequences from 76,215 individuals across diverse ancestries. It provides population allele frequencies, variant consequence annotations, and gene-level constraint metrics that are essential for interpreting the clinical significance of genetic variants.
**Key resources:**
- gnomAD browser: https://gnomad.broadinstitute.org/
- GraphQL API: https://gnomad.broadinstitute.org/api
- Data downloads: https://gnomad.broadinstitute.org/downloads
- Documentation: https://gnomad.broadinstitute.org/help
## When to Use This Skill
Use gnomAD when:
- **Variant frequency lookup**: Checking if a variant is rare, common, or absent in the general population
- **Pathogenicity assessment**: Rare variants (MAF < 1%) are candidates for disease causation; gnomAD helps filter benign common variants
- **Loss-of-function intolerance**: Using pLI and LOEUF scores to assess whether a gene tolerates protein-truncating variants
- **Population-stratified frequencies**: Comparing allele frequencies across ancestries (African/African American, Admixed American, Ashkenazi Jewish, East Asian, Finnish, Middle Eastern, Non-Finnish European, South Asian)
- **ClinVar/ACMG variant classification**: gnomAD frequency data feeds into BA1/BS1 evidence codes for variant classification
- **Constraint analysis**: Identifying genes depleted of missense or loss-of-function variation (z-scores, pLI, LOEUF)
## Core Capabilities
### 1. gnomAD GraphQL API
gnomAD uses a GraphQL API accessible at `https://gnomad.broadinstitute.org/api`. Most queries fetch variants by gene or specific genomic position.
**Datasets available:**
- `gnomad_r4` — gnomAD v4 exomes (recommended default, GRCh38)
- `gnomad_r4_genomes` — gnomAD v4 genomes (GRCh38)
- `gnomad_r3` — gnomAD v3 genomes (GRCh38)
- `gnomad_r2_1` — gnomAD v2 exomes (GRCh37)
**Reference genomes:**
- `GRCh38` — default for v3/v4
- `GRCh37` — for v2
### 2. Querying Variants by Gene
```python
import requests
def query_gnomad_gene(gene_symbol, dataset="gnomad_r4", reference_genome="GRCh38"):
    """Fetch variants in a gene from gnomAD.

    Args:
        gene_symbol: HGNC gene symbol, e.g. "BRCA1".
        dataset: gnomAD dataset ID (default "gnomad_r4"; see dataset table).
        reference_genome: "GRCh38" for v3/v4 datasets, "GRCh37" for v2.

    Returns:
        Parsed GraphQL JSON response. Variant records are under
        result["data"]["gene"]["variants"]; "gene" may be null if the
        symbol is not recognized for the given reference genome.
    """
    url = "https://gnomad.broadinstitute.org/api"
    # GraphQL query: per-variant consequence, LOFTEE annotation, and
    # genome/exome allele counts (with per-population breakdown for genomes).
    query = """
    query GeneVariants($gene_symbol: String!, $dataset: DatasetId!, $reference_genome: ReferenceGenomeId!) {
      gene(gene_symbol: $gene_symbol, reference_genome: $reference_genome) {
        gene_id
        gene_symbol
        variants(dataset: $dataset) {
          variant_id
          pos
          ref
          alt
          consequence
          genome {
            af
            ac
            an
            ac_hom
            populations {
              id
              ac
              an
              af
            }
          }
          exome {
            af
            ac
            an
            ac_hom
          }
          lof
          lof_flags
          lof_filter
        }
      }
    }
    """
    variables = {
        "gene_symbol": gene_symbol,
        "dataset": dataset,
        "reference_genome": reference_genome
    }
    response = requests.post(url, json={"query": query, "variables": variables})
    return response.json()
# Example
result = query_gnomad_gene("BRCA1")
gene_data = result["data"]["gene"]
variants = gene_data["variants"]
# Filter to rare PTVs: high-confidence LoF ("HC" per LOFTEE) or a
# truncating consequence, AND genome AF < 0.1%.
# NOTE: the grouping parentheses matter — `and` binds tighter than `or`,
# so without them the AF filter would apply to only one branch.
# `genome` can be null for exome-only variants, hence the `or {}` guard.
rare_ptvs = [
    v for v in variants
    if (v.get("lof") == "HC" or v.get("consequence") in ["stop_gained", "frameshift_variant"])
    and (v.get("genome") or {}).get("af", 1) < 0.001
]
print(f"Found {len(rare_ptvs)} rare PTVs in {gene_data['gene_symbol']}")
```
### 3. Querying a Specific Variant
```python
import requests
def query_gnomad_variant(variant_id, dataset="gnomad_r4"):
    """Fetch details for a specific variant (e.g., '1-55516888-G-GA').

    Args:
        variant_id: gnomAD variant ID in "{chrom}-{pos}-{ref}-{alt}" form.
        dataset: gnomAD dataset ID (default "gnomad_r4").

    Returns:
        Parsed GraphQL JSON response. result["data"]["variant"] is null if
        the variant is not present in the chosen dataset; its "genome" or
        "exome" sub-objects may each be null when the variant was observed
        in only one of the two call sets.
    """
    url = "https://gnomad.broadinstitute.org/api"
    query = """
    query VariantDetails($variantId: String!, $dataset: DatasetId!) {
      variant(variantId: $variantId, dataset: $dataset) {
        variant_id
        chrom
        pos
        ref
        alt
        genome {
          af
          ac
          an
          ac_hom
          populations {
            id
            ac
            an
            af
          }
        }
        exome {
          af
          ac
          an
          ac_hom
          populations {
            id
            ac
            an
            af
          }
        }
        consequence
        lof
        rsids
        in_silico_predictors {
          id
          value
          flags
        }
        clinvar_variation_id
      }
    }
    """
    response = requests.post(
        url,
        json={"query": query, "variables": {"variantId": variant_id, "dataset": dataset}}
    )
    return response.json()
# Example: query a specific variant
result = query_gnomad_variant("17-43094692-G-A")  # BRCA1 missense
variant = result["data"]["variant"]
if variant:
    # "genome"/"exome" can each be null when the variant was called in only
    # one dataset; guard with `or {}` so .get() is never called on None.
    genome_af = (variant.get("genome") or {}).get("af", "N/A")
    exome_af = (variant.get("exome") or {}).get("af", "N/A")
    print(f"Variant: {variant['variant_id']}")
    print(f" Consequence: {variant['consequence']}")
    print(f" Genome AF: {genome_af}")
    print(f" Exome AF: {exome_af}")
    print(f" LoF: {variant.get('lof')}")
### 4. Gene Constraint Scores
gnomAD constraint scores assess how tolerant a gene is to variation relative to expectation:
```python
import requests
def query_gnomad_constraint(gene_symbol, reference_genome="GRCh38"):
    """Fetch constraint scores for a gene.

    Args:
        gene_symbol: HGNC gene symbol, e.g. "KCNQ2".
        reference_genome: "GRCh38" (default) or "GRCh37".

    Returns:
        Parsed GraphQL JSON response. Constraint metrics (pLI, LOEUF as
        oe_lof_upper, observed/expected counts, z-scores) are under
        result["data"]["gene"]["gnomad_constraint"], which may be null
        for genes without constraint estimates.
    """
    url = "https://gnomad.broadinstitute.org/api"
    query = """
    query GeneConstraint($gene_symbol: String!, $reference_genome: ReferenceGenomeId!) {
      gene(gene_symbol: $gene_symbol, reference_genome: $reference_genome) {
        gene_id
        gene_symbol
        gnomad_constraint {
          exp_lof
          exp_mis
          exp_syn
          obs_lof
          obs_mis
          obs_syn
          oe_lof
          oe_mis
          oe_syn
          oe_lof_lower
          oe_lof_upper
          lof_z
          mis_z
          syn_z
          pLI
        }
      }
    }
    """
    response = requests.post(
        url,
        json={"query": query, "variables": {"gene_symbol": gene_symbol, "reference_genome": reference_genome}}
    )
    return response.json()
# Example
result = query_gnomad_constraint("KCNQ2")
gene = result["data"]["gene"]
constraint = gene["gnomad_constraint"]
print(f"Gene: {gene['gene_symbol']}")
print(f" pLI: {constraint['pLI']:.3f} (>0.9 = LoF intolerant)")
print(f" LOEUF: {constraint['oe_lof_upper']:.3f} (<0.35 = highly constrained)")
print(f" Obs/Exp LoF: {constraint['oe_lof']:.3f}")
print(f" Missense Z: {constraint['mis_z']:.3f}")
```
**Constraint score interpretation:**
| Score | Range | Meaning |
|-------|-------|---------|
| `pLI` | 0–1 | Probability of LoF intolerance; >0.9 = highly intolerant |
| `LOEUF` | 0–∞ | LoF observed/expected upper bound; <0.35 = constrained |
| `oe_lof` | 0–∞ | Observed/expected ratio for LoF variants |
| `mis_z` | −∞ to ∞ | Missense constraint z-score; >3.09 = constrained |
| `syn_z` | −∞ to ∞ | Synonymous z-score (control; should be near 0) |
### 5. Population Frequency Analysis
```python
import requests
import pandas as pd
def get_population_frequencies(variant_id, dataset="gnomad_r4"):
    """Extract per-population allele frequencies for a variant.

    Args:
        variant_id: gnomAD variant ID, e.g. "17-43094692-G-A".
        dataset: gnomAD dataset ID (default "gnomad_r4").

    Returns:
        DataFrame with one row per population (id, ac, an, af, ac_hom),
        restricted to populations with an > 0 and sorted by allele
        frequency descending. Empty DataFrame if the variant is not found
        or has no genome-level data.
    """
    url = "https://gnomad.broadinstitute.org/api"
    query = """
    query PopFreqs($variantId: String!, $dataset: DatasetId!) {
      variant(variantId: $variantId, dataset: $dataset) {
        variant_id
        genome {
          populations {
            id
            ac
            an
            af
            ac_hom
          }
        }
      }
    }
    """
    response = requests.post(
        url,
        json={"query": query, "variables": {"variantId": variant_id, "dataset": dataset}}
    )
    data = response.json()
    # The API returns null for unknown variants, and "genome" is null for
    # exome-only variants — guard both to avoid TypeError on None.
    variant = (data.get("data") or {}).get("variant") or {}
    populations = (variant.get("genome") or {}).get("populations") or []
    if not populations:
        return pd.DataFrame(columns=["id", "ac", "an", "af", "ac_hom"])
    df = pd.DataFrame(populations)
    df = df[df["an"] > 0].copy()
    # Recompute AF from raw counts for consistency with the filtered rows.
    df["af"] = df["ac"] / df["an"]
    df = df.sort_values("af", ascending=False)
    return df
# Population IDs in gnomAD v4:
# afr = African/African American
# ami = Amish
# amr = Admixed American
# asj = Ashkenazi Jewish
# eas = East Asian
# fin = Finnish
# mid = Middle Eastern
# nfe = Non-Finnish European
# sas = South Asian
# remaining = Other
```
### 6. Structural Variants (gnomAD-SV)
gnomAD also contains a structural variant dataset:
```python
import requests
def query_gnomad_sv(gene_symbol):
    """Fetch gnomAD-SV structural variants overlapping a gene (GRCh38)."""
    query = """
    query SVsByGene($gene_symbol: String!) {
      gene(gene_symbol: $gene_symbol, reference_genome: GRCh38) {
        structural_variants {
          variant_id
          type
          chrom
          pos
          end
          af
          ac
          an
        }
      }
    }
    """
    resp = requests.post(
        "https://gnomad.broadinstitute.org/api",
        json={"query": query, "variables": {"gene_symbol": gene_symbol}},
    )
    return resp.json()
```
## Query Workflows
### Workflow 1: Variant Pathogenicity Assessment
1. **Check population frequency** — Is the variant rare enough to be pathogenic?
- Use gnomAD AF < 1% for recessive, < 0.1% for dominant conditions
- Check ancestry-specific frequencies (a variant rare overall may be common in one population)
2. **Assess functional impact** — LoF variants have highest prior probability
- Check `lof` field: `HC` = high-confidence LoF, `LC` = low-confidence
- Check `lof_flags` for issues like "NAGNAG_SITE", "PHYLOCSF_WEAK"
3. **Apply ACMG criteria:**
- BA1: AF > 5% → Benign Stand-Alone
- BS1: AF > disease prevalence threshold → Benign Supporting
- PM2: Absent or very rare in gnomAD → Pathogenic Moderate
### Workflow 2: Gene Prioritization in Rare Disease
1. Query constraint scores for candidate genes
2. Filter for pLI > 0.9 (haploinsufficient) or LOEUF < 0.35
3. Cross-reference with observed LoF variants in the gene
4. Integrate with ClinVar and disease databases
### Workflow 3: Population Genetics Research
1. Identify variant of interest from GWAS or clinical data
2. Query per-population frequencies
3. Compare frequency differences across ancestries
4. Test for enrichment in specific founder populations
## Best Practices
- **Use gnomAD v4 (gnomad_r4)** for the most current data; use v2 (gnomad_r2_1) only for GRCh37 compatibility
- **Handle null responses**: Variants not observed in gnomAD are not necessarily pathogenic — absence is informative
- **Distinguish exome vs. genome data**: Genome data has more uniform coverage; exome data is larger but may have coverage gaps
- **Rate limit GraphQL queries**: Add delays between requests; batch queries when possible
- **Homozygous counts** (`ac_hom`) are relevant for recessive disease analysis
- **LOEUF is preferred over pLI** for gene constraint (less sensitive to sample size)
## Data Access
- **Browser**: https://gnomad.broadinstitute.org/ — interactive variant and gene browsing
- **GraphQL API**: https://gnomad.broadinstitute.org/api — programmatic access
- **Downloads**: https://gnomad.broadinstitute.org/downloads — VCF, Hail tables, constraint tables
- **Google Cloud**: gs://gcp-public-data--gnomad/
## Additional Resources
- **gnomAD website**: https://gnomad.broadinstitute.org/
- **gnomAD blog**: https://gnomad.broadinstitute.org/news
- **Downloads**: https://gnomad.broadinstitute.org/downloads
- **API explorer**: https://gnomad.broadinstitute.org/api (interactive GraphiQL)
- **Constraint documentation**: https://gnomad.broadinstitute.org/help/constraint
- **Citation**: Karczewski KJ et al. (2020) Nature. PMID: 32461654; Chen S et al. (2024) Nature. PMID: 38057664
- **GitHub**: https://github.com/broadinstitute/gnomad-browser

View File

@@ -0,0 +1,219 @@
# gnomAD GraphQL Query Reference
## API Endpoint
```
POST https://gnomad.broadinstitute.org/api
Content-Type: application/json
Body: { "query": "<graphql_query>", "variables": { ... } }
```
## Dataset Identifiers
| ID | Description | Reference Genome |
|----|-------------|-----------------|
| `gnomad_r4` | gnomAD v4 exomes (730K individuals) | GRCh38 |
| `gnomad_r4_genomes` | gnomAD v4 genomes (76K individuals) | GRCh38 |
| `gnomad_r3` | gnomAD v3 genomes (76K individuals) | GRCh38 |
| `gnomad_r2_1` | gnomAD v2 exomes (125K individuals) | GRCh37 |
| `gnomad_r2_1_non_cancer` | v2 non-cancer subset | GRCh37 |
| `gnomad_cnv_r4` | Copy number variants | GRCh38 |
## Core Query Templates
### 1. Variants in a Gene
```graphql
query GeneVariants($gene_symbol: String!, $dataset: DatasetId!, $reference_genome: ReferenceGenomeId!) {
gene(gene_symbol: $gene_symbol, reference_genome: $reference_genome) {
gene_id
gene_symbol
chrom
start
stop
variants(dataset: $dataset) {
variant_id
pos
ref
alt
consequence
lof
lof_flags
lof_filter
genome {
af
ac
an
ac_hom
populations { id ac an af ac_hom }
}
exome {
af
ac
an
ac_hom
populations { id ac an af ac_hom }
}
rsids
clinvar_variation_id
in_silico_predictors { id value flags }
}
}
}
```
### 2. Single Variant Lookup
```graphql
query VariantDetails($variantId: String!, $dataset: DatasetId!) {
variant(variantId: $variantId, dataset: $dataset) {
variant_id
chrom
pos
ref
alt
consequence
lof
lof_flags
rsids
genome { af ac an ac_hom populations { id ac an af } }
exome { af ac an ac_hom populations { id ac an af } }
in_silico_predictors { id value flags }
clinvar_variation_id
}
}
```
**Variant ID format:** `{chrom}-{pos}-{ref}-{alt}` (e.g., `17-43094692-G-A`)
### 3. Gene Constraint
```graphql
query GeneConstraint($gene_symbol: String!, $reference_genome: ReferenceGenomeId!) {
gene(gene_symbol: $gene_symbol, reference_genome: $reference_genome) {
gene_id
gene_symbol
gnomad_constraint {
exp_lof exp_mis exp_syn
obs_lof obs_mis obs_syn
oe_lof oe_mis oe_syn
oe_lof_lower oe_lof_upper
oe_mis_lower oe_mis_upper
lof_z mis_z syn_z
pLI
flags
}
}
}
```
### 4. Region Query (by genomic position)
```graphql
query RegionVariants($chrom: String!, $start: Int!, $stop: Int!, $dataset: DatasetId!, $reference_genome: ReferenceGenomeId!) {
region(chrom: $chrom, start: $start, stop: $stop, reference_genome: $reference_genome) {
variants(dataset: $dataset) {
variant_id
pos
ref
alt
consequence
genome { af ac an }
exome { af ac an }
}
}
}
```
### 5. ClinVar Variants in Gene
```graphql
query ClinVarVariants($gene_symbol: String!, $reference_genome: ReferenceGenomeId!) {
gene(gene_symbol: $gene_symbol, reference_genome: $reference_genome) {
clinvar_variants {
variant_id
pos
ref
alt
clinical_significance
clinvar_variation_id
gold_stars
major_consequence
in_gnomad
gnomad_exomes { ac an af }
}
}
}
```
## Population IDs
| ID | Population |
|----|-----------|
| `afr` | African/African American |
| `ami` | Amish |
| `amr` | Admixed American |
| `asj` | Ashkenazi Jewish |
| `eas` | East Asian |
| `fin` | Finnish |
| `mid` | Middle Eastern |
| `nfe` | Non-Finnish European |
| `sas` | South Asian |
| `remaining` | Other/Unassigned |
| `XX` | Female (appended to above, e.g., `afr_XX`) |
| `XY` | Male |
## LoF Annotation Fields
| Field | Values | Meaning |
|-------|--------|---------|
| `lof` | `HC`, `LC`, `null` | High/low-confidence LoF, or not annotated as LoF |
| `lof_flags` | comma-separated strings | Quality flags (e.g., `NAGNAG_SITE`, `NON_CANONICAL_SPLICE_SITE`) |
| `lof_filter` | string or null | Reason for LC classification |
## In Silico Predictor IDs
Common values for `in_silico_predictors[].id`:
- `cadd` — CADD PHRED score
- `revel` — REVEL score
- `spliceai_ds_max` — SpliceAI max delta score
- `pangolin_largest_ds` — Pangolin splicing score
- `polyphen` — PolyPhen-2 prediction
- `sift` — SIFT prediction
## Python Helper
```python
import requests
import time
def gnomad_query(query: str, variables: dict, retries: int = 3) -> dict:
    """Execute a gnomAD GraphQL query with retry logic.

    Args:
        query: GraphQL query string.
        variables: Variables dict substituted into the query.
        retries: Number of attempts before giving up (default 3).

    Returns:
        Parsed JSON response. GraphQL-level errors are printed but the
        response is still returned so callers can inspect partial data.

    Raises:
        requests.exceptions.RequestException: if all attempts fail at the
        transport/HTTP level.
    """
    url = "https://gnomad.broadinstitute.org/api"
    headers = {"Content-Type": "application/json"}
    for attempt in range(retries):
        try:
            response = requests.post(
                url,
                json={"query": query, "variables": variables},
                headers=headers,
                timeout=60
            )
            response.raise_for_status()
            result = response.json()
            if "errors" in result:
                # Surface GraphQL errors, but still hand back the payload.
                print(f"GraphQL errors: {result['errors']}")
            return result
        except requests.exceptions.RequestException:
            if attempt < retries - 1:
                time.sleep(2 ** attempt)  # exponential backoff
            else:
                raise
    return {}  # unreachable (loop always returns or raises); keeps type checkers happy
```

View File

@@ -0,0 +1,85 @@
# gnomAD Variant Interpretation Guide
## Allele Frequency Thresholds for Disease Interpretation
### ACMG/AMP Criteria
| Criterion | AF threshold | Classification |
|-----------|-------------|----------------|
| BA1 | > 0.05 (5%) | Benign Stand-Alone |
| BS1 | > disease prevalence | Benign Supporting |
| PM2_Supporting | < 0.0001 (0.01%) for dominant; absent for recessive | Pathogenic Moderate → Supporting |
**Notes:**
- BA1 applies to most conditions; exceptions include autosomal dominant with high penetrance (e.g., LDLR for FH: BA1 threshold is ~0.1%)
- BS1 requires knowing disease prevalence; for rare diseases (1:10,000), BS1 if AF > 0.01%
- Homozygous counts (`ac_hom`) matter for recessive diseases
### Practical Thresholds
| Inheritance | Suggested max AF |
|-------------|-----------------|
| Autosomal Dominant (high penetrance) | < 0.001 (0.1%) |
| Autosomal Dominant (reduced penetrance) | < 0.01 (1%) |
| Autosomal Recessive | < 0.01 (1%) |
| X-linked recessive | < 0.001 in females |
## Absence in gnomAD
A variant **absent in gnomAD** (ac = 0) is evidence of rarity, but interpret carefully:
- gnomAD does not capture all rare variants (sequencing depth, coverage, calling thresholds)
- A variant absent in 730K exomes is very strong evidence of rarity for PM2
- Check coverage at the position: if < 10x, absence is less informative
## Loss-of-Function Variant Assessment
### LOFTEE Classification (lof field)
- **HC (High Confidence):** Predicted to truncate functional protein
- Stop-gained, splice site (±1,2), frameshift variants
- Passes all LOFTEE quality filters
- **LC (Low Confidence):** LoF annotation with quality concerns
- Check `lof_flags` for specific reason
- May still be pathogenic — requires manual review
### Common lof_flags
| Flag | Meaning |
|------|---------|
| `NAGNAG_SITE` | Splice site may be rescued by nearby alternative site |
| `NON_CANONICAL_SPLICE_SITE` | Not a canonical splice donor/acceptor |
| `PHYLOCSF_WEAK` | Weak phylogenetic conservation signal |
| `SMALL_INTRON` | Intron too small to affect splicing |
| `SINGLE_EXON` | Single-exon gene (no splicing) |
| `LAST_EXON` | In last exon (NMD may not apply) |
## Homozygous Observations
The `ac_hom` field counts homozygous (or hemizygous in males for chrX) observations.
**For recessive diseases:**
- If a variant is observed homozygous in healthy individuals in gnomAD, it is strong evidence against pathogenicity (BS2 criterion)
- Even a single homozygous observation can be informative
## Coverage at Position
Always check that gnomAD has adequate coverage at the variant position before concluding absence is meaningful. The gnomAD browser shows coverage tracks, and coverage data can be downloaded from:
- https://gnomad.broadinstitute.org/downloads#v4-coverage
## In Silico Predictor Scores
| Predictor | Score Range | Pathogenic Threshold |
|-----------|-------------|---------------------|
| CADD PHRED | 0–99 | > 20 deleterious; > 30 highly deleterious |
| REVEL | 0–1 | > 0.75 likely pathogenic (for missense) |
| SpliceAI max_ds | 0–1 | > 0.5 likely splice-altering |
| SIFT | 0–1 | < 0.05 deleterious |
| PolyPhen-2 | 0–1 | > 0.909 probably damaging |
## Ancestry-Specific Considerations
- A variant rare overall may be a common founder variant in a specific population
- Always check all ancestry-specific AFs, not just the total
- Finnish and Ashkenazi Jewish populations have high rates of founder variants
- Report ancestry-specific frequencies when relevant to patient ancestry

View File

@@ -0,0 +1,315 @@
---
name: gtex-database
description: Query GTEx (Genotype-Tissue Expression) portal for tissue-specific gene expression, eQTLs (expression quantitative trait loci), and sQTLs. Essential for linking GWAS variants to gene regulation, understanding tissue-specific expression, and interpreting non-coding variant effects.
license: CC-BY-4.0
metadata:
skill-author: Kuan-lin Huang
---
# GTEx Database
## Overview
The Genotype-Tissue Expression (GTEx) project provides a comprehensive resource for studying tissue-specific gene expression and genetic regulation across 54 non-diseased human tissues from nearly 1,000 individuals. GTEx v10 (the latest release) enables researchers to understand how genetic variants regulate gene expression (eQTLs) and splicing (sQTLs) in a tissue-specific manner, which is critical for interpreting GWAS loci and identifying regulatory mechanisms.
**Key resources:**
- GTEx Portal: https://gtexportal.org/
- GTEx API v2: https://gtexportal.org/api/v2/
- Data downloads: https://gtexportal.org/home/downloads/adult-gtex/
- Documentation: https://gtexportal.org/home/documentationPage
## When to Use This Skill
Use GTEx when:
- **GWAS locus interpretation**: Identifying which gene a non-coding GWAS variant regulates via eQTLs
- **Tissue-specific expression**: Comparing gene expression levels across 54 human tissues
- **eQTL colocalization**: Testing if a GWAS signal and an eQTL signal share the same causal variant
- **Multi-tissue eQTL analysis**: Finding variants that regulate expression in multiple tissues
- **Splicing QTLs (sQTLs)**: Identifying variants that affect splicing ratios
- **Tissue specificity analysis**: Determining which tissues express a gene of interest
- **Gene expression exploration**: Retrieving normalized expression levels (TPM) per tissue
## Core Capabilities
### 1. GTEx REST API v2
Base URL: `https://gtexportal.org/api/v2/`
The API returns JSON and does not require authentication. All endpoints support pagination.
```python
import requests
BASE_URL = "https://gtexportal.org/api/v2"
def gtex_get(endpoint, params=None):
    """GET a GTEx API v2 endpoint and return the parsed JSON.

    Raises requests.HTTPError for non-2xx responses.
    """
    resp = requests.get(
        f"{BASE_URL}/{endpoint}",
        params=params,
        headers={"Accept": "application/json"},
    )
    resp.raise_for_status()
    return resp.json()
```
### 2. Gene Expression by Tissue
```python
import requests
import pandas as pd
def get_gene_expression_by_tissue(gene_id_or_symbol, dataset_id="gtex_v10"):
    """Get median gene expression across all tissues.

    Args:
        gene_id_or_symbol: GENCODE ID (e.g. "ENSG00000130203.10"); some
            endpoints also accept gene symbols — verify for your use case.
        dataset_id: GTEx dataset ID (default "gtex_v10").

    Returns:
        DataFrame with columns tissueSiteDetailId, tissueSiteDetail,
        median, unit — sorted by median expression descending. Empty
        DataFrame if the gene is not found.
    """
    url = "https://gtexportal.org/api/v2/expression/medianGeneExpression"
    params = {
        "gencodeId": gene_id_or_symbol,
        "datasetId": dataset_id,
        "itemsPerPage": 100
    }
    response = requests.get(url, params=params)
    data = response.json()
    records = data.get("data", [])
    df = pd.DataFrame(records)
    # Column selection would raise KeyError on an empty frame, so guard.
    if not df.empty:
        df = df[["tissueSiteDetailId", "tissueSiteDetail", "median", "unit"]].sort_values(
            "median", ascending=False
        )
    return df
# Example: get expression of APOE across tissues
df = get_gene_expression_by_tissue("ENSG00000130203.10") # APOE GENCODE ID
# Or use gene symbol (some endpoints accept both)
print(df.head(10))
# Output: tissue name, median TPM, sorted by highest expression
```
### 3. eQTL Lookup
```python
import requests
import pandas as pd
def query_eqtl(gene_id, tissue_id=None, dataset_id="gtex_v10"):
    """Query significant eQTLs for a gene, optionally filtered by tissue.

    Args:
        gene_id: GENCODE gene ID (e.g. "ENSG00000169174.14").
        tissue_id: Optional GTEx tissue ID (e.g. "Whole_Blood").
        dataset_id: GTEx dataset ID (default "gtex_v10").

    Returns:
        DataFrame of eQTL association records, sorted by p-value
        ascending. Empty DataFrame if no significant eQTLs are found.
    """
    url = "https://gtexportal.org/api/v2/association/singleTissueEqtl"
    params = {
        "gencodeId": gene_id,
        "datasetId": dataset_id,
        "itemsPerPage": 250
    }
    if tissue_id:
        params["tissueSiteDetailId"] = tissue_id
    # Paginate: keep fetching until a page is empty or short (last page).
    all_results = []
    page = 0
    while True:
        params["page"] = page
        response = requests.get(url, params=params)
        data = response.json()
        results = data.get("data", [])
        if not results:
            break
        all_results.extend(results)
        if len(results) < params["itemsPerPage"]:
            break
        page += 1
    df = pd.DataFrame(all_results)
    # sort_values would raise on an empty frame without the "pval" column.
    if not df.empty:
        df = df.sort_values("pval", ascending=True)
    return df
# Example: Find eQTLs for PCSK9
df = query_eqtl("ENSG00000169174.14")
print(df[["snpId", "tissueSiteDetailId", "slope", "pval", "gencodeId"]].head(20))
```
### 4. Single-Tissue eQTL by Variant
```python
import requests
def query_variant_eqtl(variant_id, tissue_id=None, dataset_id="gtex_v10"):
    """Get all eQTL associations for a specific variant.

    GTEx variant IDs look like "chr1_55516888_G_GA_b38".
    """
    request_params = {
        "variantId": variant_id,
        "datasetId": dataset_id,
        "itemsPerPage": 250,
    }
    if tissue_id:
        request_params["tissueSiteDetailId"] = tissue_id
    resp = requests.get(
        "https://gtexportal.org/api/v2/association/singleTissueEqtl",
        params=request_params,
    )
    return resp.json()
# GTEx variant ID format: chr{chrom}_{pos}_{ref}_{alt}_b38
# Example: "chr17_43094692_G_A_b38"
```
### 5. Multi-Tissue eQTL (eGenes)
```python
import requests
def get_egenes(tissue_id, dataset_id="gtex_v10"):
    """Get all eGenes (genes with at least one significant eQTL) in a tissue.

    Paginates through the /association/egene endpoint until an empty or
    short page signals the end of the result set.
    """
    url = "https://gtexportal.org/api/v2/association/egene"
    page_size = 500
    params = {
        "tissueSiteDetailId": tissue_id,
        "datasetId": dataset_id,
        "itemsPerPage": page_size,
    }
    collected = []
    page_number = 0
    while True:
        params["page"] = page_number
        batch = requests.get(url, params=params).json().get("data", [])
        if not batch:
            break
        collected.extend(batch)
        if len(batch) < page_size:
            break
        page_number += 1
    return collected
# Example: all eGenes in whole blood
egenes = get_egenes("Whole_Blood")
print(f"Found {len(egenes)} eGenes in Whole Blood")
```
### 6. Tissue List
```python
import requests
def get_tissues(dataset_id="gtex_v10"):
    """Get all available tissues with metadata."""
    resp = requests.get(
        "https://gtexportal.org/api/v2/dataset/tissueSiteDetail",
        params={"datasetId": dataset_id, "itemsPerPage": 100},
    )
    return resp.json()["data"]
tissues = get_tissues()
# Key fields: tissueSiteDetailId, tissueSiteDetail, colorHex, samplingSite
# Common tissue IDs:
# Whole_Blood, Brain_Cortex, Liver, Kidney_Cortex, Heart_Left_Ventricle,
# Lung, Muscle_Skeletal, Adipose_Subcutaneous, Colon_Transverse, ...
```
### 7. sQTL (Splicing QTLs)
```python
import requests
def query_sqtl(gene_id, tissue_id=None, dataset_id="gtex_v10"):
    """Query significant splicing QTLs (sQTLs) for a gene, optionally per tissue."""
    request_params = {
        "gencodeId": gene_id,
        "datasetId": dataset_id,
        "itemsPerPage": 250,
    }
    if tissue_id:
        request_params["tissueSiteDetailId"] = tissue_id
    resp = requests.get(
        "https://gtexportal.org/api/v2/association/singleTissueSqtl",
        params=request_params,
    )
    return resp.json()
```
## Query Workflows
### Workflow 1: Interpreting a GWAS Variant via eQTLs
1. **Identify the GWAS variant** (rs ID or chromosome position)
2. **Convert to GTEx variant ID format** (`chr{chrom}_{pos}_{ref}_{alt}_b38`)
3. **Query all eQTL associations** for that variant across tissues
4. **Check effect direction**: is the GWAS risk allele the same as the eQTL effect allele?
5. **Prioritize tissues**: select tissues biologically relevant to the disease
6. **Consider colocalization** using `coloc` (R package) with full summary statistics
```python
import requests, pandas as pd
def interpret_gwas_variant(variant_id, dataset_id="gtex_v10"):
    """Find all genes regulated by a GWAS variant (eQTL hits across tissues).

    Returns a DataFrame sorted by p-value, or an empty DataFrame when the
    variant has no significant eQTL associations.
    """
    reply = requests.get(
        "https://gtexportal.org/api/v2/association/singleTissueEqtl",
        params={"variantId": variant_id, "datasetId": dataset_id, "itemsPerPage": 500},
    )
    records = reply.json().get("data", [])
    df = pd.DataFrame(records)
    if df.empty:
        return df
    wanted = ["geneSymbol", "tissueSiteDetailId", "slope", "pval", "maf"]
    return df[wanted].sort_values("pval")
# Example
results = interpret_gwas_variant("chr1_154453788_A_T_b38")
print(results.groupby("geneSymbol")["tissueSiteDetailId"].count().sort_values(ascending=False))
```
### Workflow 2: Gene Expression Atlas
1. Get median expression for a gene across all tissues
2. Identify the primary expression site(s)
3. Compare with disease-relevant tissues
4. Download raw data for statistical comparisons
### Workflow 3: Tissue-Specific eQTL Analysis
1. Select tissues relevant to your disease
2. Query all eGenes in that tissue
3. Cross-reference with GWAS-significant loci
4. Identify co-localized signals
## Key API Endpoints
| Endpoint | Description |
|----------|-------------|
| `/expression/medianGeneExpression` | Median TPM by tissue for a gene |
| `/expression/geneExpression` | Full distribution of expression per tissue |
| `/association/singleTissueEqtl` | Significant eQTL associations |
| `/association/singleTissueSqtl` | Significant sQTL associations |
| `/association/egene` | eGenes in a tissue |
| `/dataset/tissueSiteDetail` | Available tissues with metadata |
| `/reference/gene` | Gene metadata (GENCODE IDs, coordinates) |
| `/variant/variantPage` | Variant lookup by rsID or position |
## Datasets Available
| ID | Description |
|----|-------------|
| `gtex_v10` | GTEx v10 (current; ~960 donors, 54 tissues) |
| `gtex_v8` | GTEx v8 (838 donors, 49 tissues) — older but widely cited |
## Best Practices
- **Use GENCODE IDs** (e.g., `ENSG00000130203.10`) for gene queries; the `.version` suffix matters for some endpoints
- **GTEx variant IDs** use the format `chr{chrom}_{pos}_{ref}_{alt}_b38` (GRCh38) — different from rs IDs
- **Handle pagination**: Large queries (e.g., all eGenes) require iterating through pages
- **Tissue nomenclature**: Use `tissueSiteDetailId` (e.g., `Whole_Blood`) not display names for API calls
- **FDR correction**: GTEx uses FDR < 0.05 (q-value) as the significance threshold for eQTLs
- **Effect alleles**: The `slope` field is the effect of the alternative allele; positive = higher expression with alt allele
## Data Downloads (for large-scale analysis)
For genome-wide analyses, download full summary statistics rather than using the API:
```bash
# All significant eQTLs (v10)
wget https://storage.googleapis.com/adult-gtex/bulk-qtl/v10/single-tissue-cis-qtl/GTEx_Analysis_v10_eQTL.tar
# Normalized expression matrices
wget https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/GTEx_Analysis_v10_RNASeQCv2.4.2_gene_reads.gct.gz
```
## Additional Resources
- **GTEx Portal**: https://gtexportal.org/
- **API documentation**: https://gtexportal.org/api/v2/
- **Data downloads**: https://gtexportal.org/home/downloads/adult-gtex/
- **GitHub**: https://github.com/broadinstitute/gtex-pipeline
- **Citation**: GTEx Consortium (2020) Science. PMID: 32913098

View File

@@ -0,0 +1,191 @@
# GTEx API v2 Reference
## Base URL
```
https://gtexportal.org/api/v2/
```
All endpoints accept GET requests. Responses are JSON. No authentication required.
## Common Query Parameters
| Parameter | Description | Example |
|-----------|-------------|---------|
| `gencodeId` | GENCODE gene ID with version | `ENSG00000130203.10` |
| `geneSymbol` | Gene symbol | `APOE` |
| `variantId` | GTEx variant ID | `chr17_45413693_C_T_b38` |
| `tissueSiteDetailId` | Tissue identifier | `Whole_Blood` |
| `datasetId` | Dataset version | `gtex_v10` |
| `itemsPerPage` | Results per page | `250` |
| `page` | Page number (0-indexed) | `0` |
## Endpoint Reference
### Expression Endpoints
#### `GET /expression/medianGeneExpression`
Median TPM expression for a gene across tissues.
**Parameters:** `gencodeId`, `datasetId`, `itemsPerPage`
**Response fields:**
```json
{
"data": [
{
"gencodeId": "ENSG00000130203.10",
"geneSymbol": "APOE",
"tissueSiteDetailId": "Liver",
"tissueSiteDetail": "Liver",
"median": 2847.9,
"unit": "TPM",
"datasetId": "gtex_v10"
}
]
}
```
#### `GET /expression/geneExpression`
Full expression distribution (box plot data) per tissue.
**Parameters:** `gencodeId`, `tissueSiteDetailId`, `datasetId`
**Response fields:**
- `data[].tissueExpressionData.data`: array of TPM values per sample
### Association (QTL) Endpoints
#### `GET /association/singleTissueEqtl`
Significant single-tissue cis-eQTLs.
**Parameters:** `gencodeId` OR `variantId`, `tissueSiteDetailId` (optional), `datasetId`
**Response fields:**
```json
{
"data": [
{
"gencodeId": "ENSG00000169174.14",
"geneSymbol": "PCSK9",
"variantId": "chr1_55516888_G_GA_b38",
"snpId": "rs72646508",
"tissueSiteDetailId": "Liver",
"slope": -0.342,
"slopeStandardError": 0.051,
"pval": 3.2e-11,
"qval": 2.1e-8,
"maf": 0.089,
"datasetId": "gtex_v10"
}
]
}
```
**Key fields:**
- `slope`: effect of alt allele on expression (log2 scale after rank normalization)
- `pval`: nominal p-value
- `qval`: FDR-adjusted q-value
- `maf`: minor allele frequency in the GTEx cohort
#### `GET /association/singleTissueSqtl`
Significant single-tissue sQTLs (splicing).
**Parameters:** Same as eQTL endpoint
#### `GET /association/egene`
All eGenes (genes with ≥1 significant eQTL) in a tissue.
**Parameters:** `tissueSiteDetailId`, `datasetId`
**Response fields:** gene ID, gene symbol, best eQTL variant, p-value, q-value
### Dataset/Metadata Endpoints
#### `GET /dataset/tissueSiteDetail`
List of all available tissues.
**Parameters:** `datasetId`, `itemsPerPage`
**Response fields:**
- `tissueSiteDetailId`: API identifier (use this in queries)
- `tissueSiteDetail`: Display name
- `colorHex`: Color for visualization
- `samplingSite`: Anatomical location
#### `GET /reference/gene`
Gene metadata from GENCODE.
**Parameters:** `geneSymbol` OR `gencodeId`, `referenceGenomeId` (GRCh38)
### Variant Endpoints
#### `GET /variant/variantPage`
Variant metadata and lookup.
**Parameters:** `snpId` (rsID) OR `variantId`
## Tissue IDs Reference (Common Tissues)
| ID | Display Name |
|----|-------------|
| `Whole_Blood` | Whole Blood |
| `Brain_Cortex` | Brain - Cortex |
| `Brain_Hippocampus` | Brain - Hippocampus |
| `Brain_Frontal_Cortex_BA9` | Brain - Frontal Cortex (BA9) |
| `Liver` | Liver |
| `Kidney_Cortex` | Kidney - Cortex |
| `Heart_Left_Ventricle` | Heart - Left Ventricle |
| `Lung` | Lung |
| `Muscle_Skeletal` | Muscle - Skeletal |
| `Adipose_Subcutaneous` | Adipose - Subcutaneous |
| `Colon_Transverse` | Colon - Transverse |
| `Small_Intestine_Terminal_Ileum` | Small Intestine - Terminal Ileum |
| `Skin_Sun_Exposed_Lower_leg` | Skin - Sun Exposed (Lower leg) |
| `Thyroid` | Thyroid |
| `Nerve_Tibial` | Nerve - Tibial |
| `Artery_Coronary` | Artery - Coronary |
| `Artery_Aorta` | Artery - Aorta |
| `Pancreas` | Pancreas |
| `Pituitary` | Pituitary |
| `Spleen` | Spleen |
| `Prostate` | Prostate |
| `Ovary` | Ovary |
| `Uterus` | Uterus |
| `Testis` | Testis |
## Error Handling
```python
import requests
from requests.exceptions import HTTPError, Timeout
def safe_gtex_query(endpoint, params):
    """GET a GTEx v2 endpoint; return parsed JSON, or None on any failure."""
    target = f"https://gtexportal.org/api/v2/{endpoint}"
    try:
        reply = requests.get(target, params=params, timeout=30)
        reply.raise_for_status()
        return reply.json()  # JSON decode errors fall through to the generic handler
    except HTTPError as e:
        print(f"HTTP error {e.response.status_code}: {e.response.text}")
    except Timeout:
        print("Request timed out")
    except Exception as e:
        print(f"Error: {e}")
    return None
```
## Rate Limiting
GTEx API does not publish explicit rate limits but:
- Add 0.51s delays between bulk queries
- Use data downloads for genome-wide analyses instead of API
- Cache results locally for repeated queries

View File

@@ -0,0 +1,305 @@
---
name: interpro-database
description: Query InterPro for protein family, domain, and functional site annotations. Integrates Pfam, PANTHER, PRINTS, SMART, SUPERFAMILY, and 11 other member databases. Use for protein function prediction, domain architecture analysis, evolutionary classification, and GO term mapping.
license: CC0-1.0
metadata:
skill-author: Kuan-lin Huang
---
# InterPro Database
## Overview
InterPro (https://www.ebi.ac.uk/interpro/) is a comprehensive resource for protein family and domain classification maintained by EMBL-EBI. It integrates signatures from 13 member databases including Pfam, PANTHER, PRINTS, ProSite, SMART, TIGRFAM, SUPERFAMILY, CDD, and others, providing a unified view of protein functional annotations for over 100 million protein sequences.
InterPro classifies proteins into:
- **Families**: Groups of proteins sharing common ancestry and function
- **Domains**: Independently folding structural/functional units
- **Homologous superfamilies**: Structurally similar protein regions
- **Repeats**: Short tandem sequences
- **Sites**: Functional sites (active, binding, PTM)
**Key resources:**
- InterPro website: https://www.ebi.ac.uk/interpro/
- REST API: https://www.ebi.ac.uk/interpro/api/
- API documentation: https://github.com/ProteinsWebTeam/interpro7-api/blob/master/docs/
- Python client: via `requests`
## When to Use This Skill
Use InterPro when:
- **Protein function prediction**: What function(s) does an uncharacterized protein likely have?
- **Domain architecture**: What domains make up a protein, and in what order?
- **Protein family classification**: Which family/superfamily does a protein belong to?
- **GO term annotation**: Map protein sequences to Gene Ontology terms via InterPro
- **Evolutionary analysis**: Are two proteins in the same homologous superfamily?
- **Structure prediction context**: What domains should a new protein structure be compared against?
- **Pipeline annotation**: Batch-annotate proteomes or novel sequences
## Core Capabilities
### 1. InterPro REST API
Base URL: `https://www.ebi.ac.uk/interpro/api/`
```python
import requests
BASE_URL = "https://www.ebi.ac.uk/interpro/api"
def interpro_get(endpoint, params=None):
    """GET an InterPro REST endpoint and return the parsed JSON payload.

    Raises requests.HTTPError on non-2xx responses.
    """
    reply = requests.get(
        f"{BASE_URL}/{endpoint}",
        params=params,
        headers={"Accept": "application/json"},
    )
    reply.raise_for_status()
    return reply.json()
```
### 2. Look Up a Protein
```python
def get_protein_entries(uniprot_id):
    """Return every InterPro entry matched to a UniProt protein."""
    return interpro_get(f"protein/UniProt/{uniprot_id}/entry/InterPro/")
# Example: Human p53 (TP53)
result = get_protein_entries("P04637")
entries = result.get("results", [])
for entry in entries:
meta = entry["metadata"]
print(f" {meta['accession']} ({meta['type']}): {meta['name']}")
# e.g., IPR011615 (domain): p53, tetramerisation domain
# IPR010991 (domain): p53, DNA-binding domain
# IPR013872 (family): p53 family
```
### 3. Get Specific InterPro Entry
```python
def get_entry(interpro_id):
    """Fetch the complete record for a single InterPro entry."""
    path = f"entry/InterPro/{interpro_id}/"
    return interpro_get(path)
# Example: Get Pfam domain PF00397 (WW domain)
ww_entry = get_entry("IPR001202")
print(f"Name: {ww_entry['metadata']['name']}")
print(f"Type: {ww_entry['metadata']['type']}")
# Also supports member database IDs:
def get_pfam_entry(pfam_id):
    """Fetch a Pfam signature record via its member-database endpoint."""
    path = f"entry/Pfam/{pfam_id}/"
    return interpro_get(path)
pfam = get_pfam_entry("PF00397")
```
### 4. Search Proteins by InterPro Entry
```python
def get_proteins_for_entry(interpro_id, database="UniProt", page_size=25):
    """List proteins annotated with a given InterPro entry."""
    endpoint = f"entry/InterPro/{interpro_id}/protein/{database}/"
    return interpro_get(endpoint, {"page_size": page_size})
# Example: Find all human kinase-domain proteins
kinase_proteins = get_proteins_for_entry("IPR000719") # Protein kinase domain
print(f"Total proteins: {kinase_proteins['count']}")
```
### 5. Domain Architecture
```python
def get_domain_architecture(uniprot_id):
    """Fetch the full annotation record for a protein.

    The response includes the sequence locations of every matching entry,
    i.e. the protein's domain architecture.
    """
    return interpro_get(f"protein/UniProt/{uniprot_id}/")
# Example: Get full domain architecture for EGFR
egfr = get_domain_architecture("P00533")
# The response includes locations of all matching entries on the sequence
for entry in egfr.get("entries", []):
for fragment in entry.get("entry_protein_locations", []):
for loc in fragment.get("fragments", []):
print(f" {entry['accession']}: {loc['start']}-{loc['end']}")
```
### 6. GO Term Mapping
```python
def get_go_terms_for_protein(uniprot_id):
    """Collect the unique GO terms attached to a protein's InterPro entries.

    First occurrence wins; order of first appearance is preserved.
    """
    record = interpro_get(f"protein/UniProt/{uniprot_id}/")
    # GO terms are embedded in each entry's metadata; dedupe by identifier
    unique = {}
    for entry in record.get("entries", []):
        for term in entry.get("metadata", {}).get("go_terms", []):
            unique.setdefault(term["identifier"], term)
    return list(unique.values())
# GO terms include:
# {"identifier": "GO:0004672", "name": "protein kinase activity", "category": {"code": "F", "name": "Molecular Function"}}
```
### 7. Batch Protein Lookup
```python
def batch_lookup_proteins(uniprot_ids, database="UniProt"):
    """Look up several proteins and collect their InterPro entries.

    Returns a dict keyed by accession; on failure the value is
    {"error": <message>} instead of a list of entries.
    """
    import time
    collected = {}
    for accession in uniprot_ids:
        try:
            reply = interpro_get(f"protein/{database}/{accession}/entry/InterPro/")
            collected[accession] = [
                {
                    "accession": hit["metadata"]["accession"],
                    "name": hit["metadata"]["name"],
                    "type": hit["metadata"]["type"],
                }
                for hit in reply.get("results", [])
            ]
        except Exception as exc:
            collected[accession] = {"error": str(exc)}
        time.sleep(0.3)  # Rate limiting
    return collected
# Example
proteins = ["P04637", "P00533", "P38398", "Q9Y6I9"]
domain_info = batch_lookup_proteins(proteins)
for uid, entries in domain_info.items():
print(f"\n{uid}:")
for e in entries[:3]:
print(f" - {e['accession']} ({e['type']}): {e['name']}")
```
### 8. Search by Text or Taxonomy
```python
def search_entries(query, entry_type=None, taxonomy_id=None):
    """Text-search InterPro entries, optionally filtered by type and taxon."""
    params = {"search": query, "page_size": 20}
    if entry_type:
        params["type"] = entry_type  # family, domain, homologous_superfamily, etc.
    if taxonomy_id:
        endpoint = f"entry/InterPro/taxonomy/UniProt/{taxonomy_id}/"
    else:
        endpoint = "entry/InterPro/"
    return interpro_get(endpoint, params)
# Search for kinase-related entries
kinase_entries = search_entries("kinase", entry_type="domain")
```
## Query Workflows
### Workflow 1: Characterize an Unknown Protein
1. **Run InterProScan** locally or via the web (https://www.ebi.ac.uk/interpro/search/sequence/) to scan a protein sequence
2. **Parse results** to identify domain architecture
3. **Look up each InterPro entry** for biological context
4. **Get GO terms** from associated InterPro entries for functional inference
```python
# After running InterProScan and getting a UniProt ID:
def characterize_protein(uniprot_id):
    """Summarize a protein: families, domains, superfamilies, and GO terms."""
    # Gather all InterPro annotations, then bucket them by entry type
    annotations = get_protein_entries(uniprot_id)
    grouped = {}
    for hit in annotations.get("results", []):
        meta = hit["metadata"]
        grouped.setdefault(meta["type"], []).append(
            {"accession": meta["accession"], "name": meta["name"]}
        )
    return {
        "families": grouped.get("family", []),
        "domains": grouped.get("domain", []),
        "superfamilies": grouped.get("homologous_superfamily", []),
        "go_terms": get_go_terms_for_protein(uniprot_id),
    }
```
### Workflow 2: Find All Members of a Protein Family
1. Identify the InterPro family entry ID (e.g., IPR000719 for protein kinases)
2. Query all UniProt proteins annotated with that entry
3. Filter by organism/taxonomy if needed
4. Download FASTA sequences for phylogenetic analysis
### Workflow 3: Comparative Domain Analysis
1. Collect proteins of interest (e.g., all paralogs)
2. Get domain architecture for each protein
3. Compare domain compositions and orders
4. Identify domain gain/loss events
## API Endpoint Summary
| Endpoint | Description |
|----------|-------------|
| `/protein/UniProt/{id}/` | Full annotation for a protein |
| `/protein/UniProt/{id}/entry/InterPro/` | InterPro entries for a protein |
| `/entry/InterPro/{id}/` | Details of an InterPro entry |
| `/entry/Pfam/{id}/` | Pfam entry details |
| `/entry/InterPro/{id}/protein/UniProt/` | Proteins with an entry |
| `/entry/InterPro/` | Search/list InterPro entries |
| `/taxonomy/UniProt/{tax_id}/` | Proteins from a taxon |
| `/structure/PDB/{pdb_id}/` | Structures mapped to InterPro |
## Member Databases
| Database | Focus |
|----------|-------|
| Pfam | Protein domains (HMM profiles) |
| PANTHER | Protein families and subfamilies |
| PRINTS | Protein fingerprints |
| ProSitePatterns | Amino acid patterns |
| ProSiteProfiles | Protein profile patterns |
| SMART | Protein domain analysis |
| TIGRFAM | JCVI curated protein families |
| SUPERFAMILY | Structural classification |
| CDD | Conserved Domain Database (NCBI) |
| HAMAP | Microbial protein families |
| NCBIfam | NCBI curated TIGRFAMs |
| Gene3D | CATH structural classification |
| PIRSR | PIR site rules |
## Best Practices
- **Use UniProt accession numbers** (not gene names) for the most reliable lookups
- **Distinguish types**: `family` gives broad classification; `domain` gives specific structural/functional units
- **InterProScan is faster for novel sequences**: For sequences not in UniProt, submit to the web service
- **Handle pagination**: Large result sets require iterating through pages
- **Combine with UniProt data**: InterPro entries often include links to UniProt, PDB, and GO
## Additional Resources
- **InterPro website**: https://www.ebi.ac.uk/interpro/
- **InterProScan** (run locally): https://github.com/ebi-pf-team/interproscan
- **API documentation**: https://github.com/ProteinsWebTeam/interpro7-api/blob/master/docs/
- **Pfam**: https://www.ebi.ac.uk/interpro/entry/pfam/
- **Citation**: Paysan-Lafosse T et al. (2023) Nucleic Acids Research. PMID: 36350672

View File

@@ -0,0 +1,159 @@
# InterPro Domain Analysis Reference
## Entry Types
| Type | Description | Example |
|------|-------------|---------|
| `family` | Group of related proteins sharing common evolutionary origin | IPR013872: p53 family |
| `domain` | Distinct structural/functional unit that can exist independently | IPR011615: p53 tetramerisation domain |
| `homologous_superfamily` | Proteins related by structure but not necessarily sequence | IPR009003: Peptidase, aspartic |
| `repeat` | Short sequence unit that occurs in multiple copies | IPR000822: Ankyrin repeat |
| `site` | Residues important for function | IPR018060: Metalloprotease active site |
| `conserved_site` | Conserved sequence motif (functional) | IPR016152: PTB/PI domain binding site |
| `active_site` | Catalytic residues | IPR000743: RING domain |
| `binding_site` | Residues involved in binding | — |
| `ptm` | Post-translational modification site | — |
## Common Domain Accessions
### Signaling Domains
| Accession | Name | Function |
|-----------|------|---------|
| IPR000719 | Protein kinase domain | ATP-dependent phosphorylation |
| IPR001245 | Serine-threonine/tyrosine-protein kinase | Kinase catalytic domain |
| IPR000980 | SH2 domain | Phosphotyrosine binding |
| IPR001452 | SH3 domain | Proline-rich sequence binding |
| IPR011993 | PH domain | Phosphoinositide binding |
| IPR000048 | IQ motif | Calmodulin binding |
| IPR000008 | C2 domain | Ca2+/phospholipid binding |
| IPR001849 | PH domain | Pleckstrin homology |
### DNA Binding Domains
| Accession | Name | Function |
|-----------|------|---------|
| IPR013087 | Zinc finger, C2H2 | DNA binding |
| IPR017456 | CCCH zinc finger | RNA binding |
| IPR011991 | Winged helix-turn-helix | Transcription factor DNA binding |
| IPR011607 | MH1 domain | SMAD DNA binding |
| IPR003313 | ARID domain | AT-rich DNA binding |
| IPR014756 | E1-E2 ATPase, nucleotide-binding | — |
### Structural Domains
| Accession | Name | Function |
|-----------|------|---------|
| IPR001357 | BRCT domain | DNA repair protein interaction |
| IPR000536 | Nuclear hormone receptor, ligand-binding | Hormone binding |
| IPR001628 | Zinc finger, nuclear hormone receptor | DNA binding (NHR) |
| IPR003961 | Fibronectin type III | Cell adhesion |
| IPR000742 | EGF-like domain | Receptor-ligand interaction |
## Domain Architecture Patterns
Common multi-domain architectures and their biological meanings:
### Receptor Tyrosine Kinases
```
[EGF domain]... - [TM] - [Kinase domain]
e.g., EGFR: IPR000742 (EGF) + IPR000719 (kinase)
```
### Adapter Proteins
```
[SH3] - [SH2] - [SH3]
e.g., Grb2, Crk — signaling adapters
```
### Nuclear Receptors
```
[DBD/C2H2 zinc finger] - [Ligand binding domain]
e.g., ERα (ESR1)
```
### Kinases
```
[N-lobe] - [Activation loop] - [C-lobe]
Standard protein kinase fold (IPR000719)
```
## GO Term Categories
InterPro GO annotations use three ontologies:
| Code | Ontology | Examples |
|------|----------|---------|
| P | Biological Process | GO:0006468 (protein phosphorylation) |
| F | Molecular Function | GO:0004672 (protein kinase activity) |
| C | Cellular Component | GO:0005886 (plasma membrane) |
## InterProScan for Novel Sequences
For protein sequences not in UniProt (novel/predicted sequences), run InterProScan:
```bash
# Command-line (install InterProScan locally)
./interproscan.sh -i my_proteins.fasta -f tsv,json -dp
# Options:
# -i: input FASTA
# -f: output formats (tsv, json, xml, gff3, html)
# -dp: disable precalculation lookup (use for non-UniProt sequences)
# --goterms: include GO term mappings
# --pathways: include pathway mappings
# Or use the web service:
# https://www.ebi.ac.uk/interpro/search/sequence/
```
**Output fields (TSV):**
1. Protein accession
2. Sequence MD5
3. Sequence length
4. Analysis (e.g., Pfam, SMART)
5. Signature accession (e.g., PF00397)
6. Signature description
7. Start
8. Stop
9. Score
10. Status (T = true)
11. Date
12. InterPro accession (if integrated)
13. InterPro description
## Useful Entry ID Collections
### Human Disease-Relevant Domains
```python
DISEASE_DOMAINS = {
# Cancer
"IPR011615": "p53 tetramerization",
"IPR012346": "p53/p63/p73, tetramerization domain",
"IPR000719": "Protein kinase domain",
"IPR004827": "Basic-leucine zipper (bZIP) TF",
# Neurodegenerative
"IPR003527": "MAP kinase, ERK1/2",
"IPR016024": "ARM-type fold",
# Metabolic
"IPR001764": "Glycoside hydrolase, family 13 (amylase)",
"IPR006047": "Glycoside hydrolase superfamily",
}
```
### Commonly Referenced Pfam IDs
| Pfam ID | Domain Name |
|---------|-------------|
| PF00069 | Pkinase (protein kinase) |
| PF00076 | RRM_1 (RNA recognition motif) |
| PF00096 | zf-C2H2 (zinc finger) |
| PF00397 | WW domain |
| PF00400 | WD40 repeat |
| PF00415 | RasGEF domain |
| PF00018 | SH3 domain |
| PF00017 | SH2 domain |
| PF02196 | zf-C3HC4 (RING finger) |

View File

@@ -0,0 +1,351 @@
---
name: jaspar-database
description: Query JASPAR for transcription factor binding site (TFBS) profiles (PWMs/PFMs). Search by TF name, species, or class; scan DNA sequences for TF binding sites; compare matrices; essential for regulatory genomics, motif analysis, and GWAS regulatory variant interpretation.
license: CC0-1.0
metadata:
skill-author: Kuan-lin Huang
---
# JASPAR Database
## Overview
JASPAR (https://jaspar.elixir.no/) is the gold-standard open-access database of curated, non-redundant transcription factor (TF) binding profiles stored as position frequency matrices (PFMs). JASPAR 2024 contains 1,210 non-redundant TF binding profiles for 164 eukaryotic species. Each profile is experimentally derived (ChIP-seq, SELEX, HT-SELEX, protein binding microarray, etc.) and rigorously validated.
**Key resources:**
- JASPAR portal: https://jaspar.elixir.no/
- REST API: https://jaspar.elixir.no/api/v1/
- API docs: https://jaspar.elixir.no/api/v1/docs/
- Python package: `jaspar` (via Biopython) or direct API
## When to Use This Skill
Use JASPAR when:
- **TF binding site prediction**: Scan a DNA sequence for potential binding sites of a TF
- **Regulatory variant interpretation**: Does a GWAS/eQTL variant disrupt a TF binding motif?
- **Promoter/enhancer analysis**: What TFs are predicted to bind to a regulatory element?
- **Gene regulatory network construction**: Link TFs to their target genes via motif scanning
- **TF family analysis**: Compare binding profiles across a TF family (e.g., all homeobox factors)
- **ChIP-seq analysis**: Find known TF motifs enriched in ChIP-seq peaks
- **ENCODE/ATAC-seq interpretation**: Match open chromatin regions to TF binding profiles
## Core Capabilities
### 1. JASPAR REST API
Base URL: `https://jaspar.elixir.no/api/v1/`
```python
import requests
BASE_URL = "https://jaspar.elixir.no/api/v1"
def jaspar_get(endpoint, params=None):
    """GET a JASPAR REST endpoint and return the parsed JSON payload.

    Raises requests.HTTPError on non-2xx responses.
    """
    reply = requests.get(
        f"{BASE_URL}/{endpoint}",
        params=params,
        headers={"Accept": "application/json"},
    )
    reply.raise_for_status()
    return reply.json()
```
### 2. Search for TF Profiles
```python
def search_jaspar(
    tf_name=None,
    species=None,
    collection="CORE",
    tf_class=None,
    tf_family=None,
    page=1,
    page_size=25
):
    """Search JASPAR binding profiles by TF name, species, class, or family."""
    params = {
        "collection": collection,
        "page": page,
        "page_size": page_size,
        "format": "json",
    }
    # Optional filters are included only when given a truthy value
    optional = {
        "name": tf_name,
        "species": species,  # taxonomy ID or name, e.g. "9606" for human
        "tf_class": tf_class,
        "tf_family": tf_family,
    }
    params.update({key: value for key, value in optional.items() if value})
    return jaspar_get("matrix", params)
# Examples:
# Search for human CTCF profile
ctcf = search_jaspar("CTCF", species="9606")
print(f"Found {ctcf['count']} CTCF profiles")
# Search for all homeobox TFs in human
hox_tfs = search_jaspar(tf_class="Homeodomain", species="9606")
# Search for a TF family
nfkb = search_jaspar(tf_family="NF-kappaB")
```
### 3. Fetch a Specific Matrix (PFM/PWM)
```python
def get_matrix(matrix_id):
    """Fetch one JASPAR matrix record by its ID (e.g. 'MA0139.1' for CTCF)."""
    path = f"matrix/{matrix_id}/"
    return jaspar_get(path)
# Example: Get CTCF matrix
ctcf_matrix = get_matrix("MA0139.1")
# Matrix structure:
# {
# "matrix_id": "MA0139.1",
# "name": "CTCF",
# "collection": "CORE",
# "tax_group": "vertebrates",
# "pfm": { "A": [...], "C": [...], "G": [...], "T": [...] },
# "consensus": "CCGCGNGGNGGCAG",
# "length": 19,
# "species": [{"tax_id": 9606, "name": "Homo sapiens"}],
# "class": ["C2H2 zinc finger factors"],
# "family": ["BEN domain factors"],
# "type": "ChIP-seq",
# "uniprot_ids": ["P49711"]
# }
```
### 4. Download PFM/PWM as Matrix
```python
import numpy as np
def get_pwm(matrix_id, pseudocount=0.8):
    """
    Fetch a PFM from JASPAR and convert it to a log-odds PWM.
    Returns (pwm, tf_name) where pwm is a numpy array of shape (4, L)
    in A, C, G, T row order.
    """
    record = get_matrix(matrix_id)
    counts = record["pfm"]
    # Stack counts row-wise (A, C, G, T) and soften zero counts with a pseudocount
    freq = np.array([counts[base] for base in "ACGT"], dtype=float) + pseudocount
    # Column-normalize to probabilities, then log-odds against a uniform background
    probs = freq / freq.sum(axis=0, keepdims=True)
    pwm = np.log2(probs / 0.25)
    return pwm, record["name"]
# Example
pwm, name = get_pwm("MA0139.1") # CTCF
print(f"PWM for {name}: shape {pwm.shape}")
max_score = pwm.max(axis=0).sum()
print(f"Maximum possible score: {max_score:.2f} bits")
```
### 5. Scan a DNA Sequence for TF Binding Sites
```python
import numpy as np
from typing import List, Tuple
NUCLEOTIDE_MAP = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
                  'a': 0, 'c': 1, 'g': 2, 't': 3}
def scan_sequence(sequence: str, pwm: np.ndarray, threshold_pct: float = 0.8) -> List[dict]:
    """
    Slide a PWM along a DNA sequence and report windows scoring above a
    relative threshold.

    Args:
        sequence: DNA sequence string
        pwm: PWM array (4 x L) in ACGT row order
        threshold_pct: fraction of the min-to-max score range used as cutoff (0-1)

    Returns:
        List of hit dicts with 1-based position, raw score, relative score,
        matched subsequence, and strand ("+").
    """
    width = pwm.shape[1]
    best = pwm.max(axis=0).sum()
    worst = pwm.min(axis=0).sum()
    cutoff = worst + threshold_pct * (best - worst)
    upper = sequence.upper()
    matches = []
    for start in range(len(upper) - width + 1):
        window = upper[start:start + width]
        # Windows containing ambiguous (non-ACGT) bases are skipped entirely
        if any(base not in NUCLEOTIDE_MAP for base in window):
            continue
        total = sum(pwm[NUCLEOTIDE_MAP[base], col] for col, base in enumerate(window))
        if total >= cutoff:
            matches.append({
                "position": start + 1,  # 1-based
                "score": total,
                "relative_score": (total - worst) / (best - worst),
                "sequence": window,
                "strand": "+",
            })
    return matches
# Example: Scan a promoter sequence for CTCF binding sites
promoter = "AGCCCGCGAGGNGGCAGTTGCCTGGAGCAGGATCAGCAGATC"
pwm, name = get_pwm("MA0139.1")
hits = scan_sequence(promoter, pwm, threshold_pct=0.75)
for hit in hits:
print(f" Position {hit['position']}: {hit['sequence']} (score: {hit['score']:.2f}, {hit['relative_score']:.0%})")
```
### 6. Scan Both Strands
```python
def reverse_complement(seq: str) -> str:
    """Return the reverse complement of a DNA sequence.

    Input is uppercased first; any base outside A/C/G/T/N becomes 'N'.
    """
    pairs = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    out = []
    for base in reversed(seq.upper()):
        out.append(pairs.get(base, 'N'))
    return ''.join(out)
def scan_both_strands(sequence: str, pwm: np.ndarray, threshold_pct: float = 0.8):
    """Scan a sequence and its reverse complement for PWM hits.

    Reverse-strand hits are reported in forward-strand coordinates; all hits
    are returned sorted by position.
    """
    hits = scan_sequence(sequence, pwm, threshold_pct)
    for hit in hits:
        hit["strand"] = "+"
    n = len(sequence)
    for hit in scan_sequence(reverse_complement(sequence), pwm, threshold_pct):
        hit["strand"] = "-"
        # Map a 1-based start on the reverse strand back onto the forward strand
        hit["position"] = n - hit["position"] - len(hit["sequence"]) + 2
        hits.append(hit)
    return sorted(hits, key=lambda h: h["position"])
```
### 7. Variant Impact on TF Binding
```python
def variant_tfbs_impact(ref_seq: str, alt_seq: str, pwm: np.ndarray,
                        tf_name: str, threshold_pct: float = 0.7):
    """
    Assess impact of a SNP on TF binding by comparing ref vs alt sequences.
    Both sequences should be centered on the variant with flanking context.

    Returns a dict with the best ref/alt hit scores, site presence flags, and
    an 'effect' call: 'gained', 'disrupted', or 'no_site' (plus 'score_change'
    when both alleles have a site).
    """
    ref_hits = scan_both_strands(ref_seq, pwm, threshold_pct)
    alt_hits = scan_both_strands(alt_seq, pwm, threshold_pct)
    max_ref = max((h["score"] for h in ref_hits), default=None)
    max_alt = max((h["score"] for h in alt_hits), default=None)
    result = {
        "tf": tf_name,
        "ref_max_score": max_ref,
        "alt_max_score": max_alt,
        "ref_has_site": len(ref_hits) > 0,
        "alt_has_site": len(alt_hits) > 0,
    }
    # BUGFIX: compare against None explicitly. A legitimate PWM hit can score
    # exactly 0.0 (log-odds scores may be zero or negative), which the former
    # truthiness test ("if max_ref and max_alt") misread as "no site".
    if max_ref is not None and max_alt is not None:
        result["score_change"] = max_alt - max_ref
        result["effect"] = "gained" if max_alt > max_ref else "disrupted"
    elif max_ref is not None:
        result["effect"] = "disrupted"
    elif max_alt is not None:
        result["effect"] = "gained"
    else:
        result["effect"] = "no_site"
    return result
```
## Query Workflows
### Workflow 1: Find All TF Binding Sites in a Promoter
```python
import requests, numpy as np
# 1. Get relevant TF matrices (e.g., all human TFs in CORE collection)
response = requests.get(
    "https://jaspar.elixir.no/api/v1/matrix/",
    params={"species": "9606", "collection": "CORE", "page_size": 500, "page": 1}
)
matrices = response.json()["results"]
# 2. For each matrix, compute PWM and scan promoter
promoter = "CCCGCCCGCCCGCCGCCCGCAGTTAATGAGCCCAGCGTGCC"  # Example
all_hits = []
for m in matrices[:10]:  # Limit for demo
    # The list endpoint omits the counts, so fetch the full per-matrix record.
    # (Fixed: the response was stored as `pwm_data` but read as `pfm_data`.)
    pfm_data = requests.get(f"https://jaspar.elixir.no/api/v1/matrix/{m['matrix_id']}/").json()
    pfm = pfm_data["pfm"]
    pfm_arr = np.array([pfm["A"], pfm["C"], pfm["G"], pfm["T"]], dtype=float) + 0.8  # pseudocount
    ppm = pfm_arr / pfm_arr.sum(axis=0)
    pwm = np.log2(ppm / 0.25)  # log-odds vs. uniform background
    hits = scan_sequence(promoter, pwm, threshold_pct=0.8)
    for h in hits:
        h["tf_name"] = m["name"]
        h["matrix_id"] = m["matrix_id"]
    all_hits.extend(hits)
print(f"Found {len(all_hits)} TF binding sites")
for h in sorted(all_hits, key=lambda x: -x["score"])[:5]:
    print(f"  {h['tf_name']} ({h['matrix_id']}): pos {h['position']}, score {h['score']:.2f}")
```
### Workflow 2: SNP Impact on TF Binding (Regulatory Variant Analysis)
1. Retrieve the genomic sequence flanking the SNP (±20 bp each side)
2. Construct ref and alt sequences
3. Scan with all relevant TF PWMs
4. Report TFs whose binding is created or destroyed by the SNP
### Workflow 3: Motif Enrichment Analysis
1. Identify a set of peak sequences (e.g., from ChIP-seq or ATAC-seq)
2. Scan all peaks with JASPAR PWMs
3. Compare hit rates in peaks vs. background sequences
4. Report significantly enriched motifs (Fisher's exact test or FIMO-style scoring)
## Collections Available
| Collection | Description | Profiles |
|------------|-------------|----------|
| `CORE` | Non-redundant, high-quality profiles | ~1,210 |
| `UNVALIDATED` | Experimentally derived but not validated | ~500 |
| `PHYLOFACTS` | Phylogenetically conserved sites | ~50 |
| `CNE` | Conserved non-coding elements | ~30 |
| `POLII` | RNA Pol II binding profiles | ~20 |
| `FAM` | TF family representative profiles | ~170 |
| `SPLICE` | Splice factor profiles | ~20 |
## Best Practices
- **Use CORE collection** for most analyses — best validated and non-redundant
- **Threshold selection**: 80% of max score is common for de novo prediction; 90% for high-confidence
- **Always scan both strands** — TFs can bind in either orientation
- **Provide flanking context** for variant analysis: at least (motif_length - 1) bp on each side
- **Consider background**: PWM scores relative to uniform (0.25) background; adjust for actual GC content
- **Cross-validate with ChIP-seq data** when available — motif scanning has many false positives
- **Use Biopython's motifs module** for full-featured scanning: `from Bio import motifs`
## Additional Resources
- **JASPAR portal**: https://jaspar.elixir.no/
- **API documentation**: https://jaspar.elixir.no/api/v1/docs/
- **JASPAR 2024 paper**: Castro-Mondragon et al. (2022) Nucleic Acids Research. PMID: 34875674
- **Biopython motifs**: https://biopython.org/docs/latest/Tutorial/chapter_motifs.html
- **FIMO tool** (for large-scale scanning): https://meme-suite.org/meme/tools/fimo
- **HOMER** (motif enrichment): http://homer.ucsd.edu/homer/
- **GitHub**: https://github.com/wassermanlab/JASPAR-UCSC-tracks

View File

@@ -0,0 +1,182 @@
# JASPAR API v1 Reference
## Base URL
```
https://jaspar.elixir.no/api/v1/
```
No authentication required. Responses are JSON.
## Core Endpoints
### `GET /matrix/`
Search and list TF binding profiles.
**Parameters:**
| Parameter | Type | Description | Example |
|-----------|------|-------------|---------|
| `name` | string | TF name (partial match) | `CTCF` |
| `matrix_id` | string | Exact matrix ID | `MA0139.1` |
| `collection` | string | Collection name | `CORE` |
| `tax_group` | string | Taxonomic group | `vertebrates` |
| `species` | string | Species name or tax ID | `9606`, `Homo sapiens` |
| `tf_class` | string | TF structural class | `C2H2 zinc finger factors` |
| `tf_family` | string | TF family | `BEN domain factors` |
| `type` | string | Experimental method | `ChIP-seq`, `SELEX` |
| `version` | string | `latest` or specific version | `latest` |
| `page` | int | Page number | `1` |
| `page_size` | int | Results per page (max 500) | `25` |
**Response:**
```json
{
"count": 1210,
"next": "https://jaspar.elixir.no/api/v1/matrix/?page=2",
"previous": null,
"results": [
{
"matrix_id": "MA0139.1",
"name": "CTCF",
"collection": "CORE",
"tax_group": "vertebrates"
}
]
}
```
### `GET /matrix/{id}/`
Fetch a specific matrix with full details.
**Response:**
```json
{
"matrix_id": "MA0139.1",
"name": "CTCF",
"collection": "CORE",
"tax_group": "vertebrates",
"pfm": {
"A": [87, 167, 281, ...],
"C": [291, 145, 49, ...],
"G": [76, 414, 504, ...],
"T": [205, 114, 27, ...]
},
"consensus": "CCGCGNGGNGGCAG",
"length": 19,
"species": [{"tax_id": 9606, "name": "Homo sapiens"}],
"class": ["C2H2 zinc finger factors"],
"family": ["BEN domain factors"],
"type": "ChIP-seq",
"uniprot_ids": ["P49711"],
"pubmed_ids": ["19172222"],
"version": 1,
"latest": true
}
```
### `GET /matrix/{id}/logo/`
Returns SVG/PNG logo for the matrix.
**Parameters:** `format` = `svg` (default) or `png`
### `GET /taxon/`
List available species/taxa.
### `GET /tf_class/`
List available TF structural classes.
### `GET /tf_family/`
List available TF families.
### `GET /collection/`
List available collections.
## Matrix ID Format
```
MA{number}.{version}
```
- `MA` prefix = Manual curation
- `PB` prefix = Published (automatic)
- `UN` prefix = Unvalidated
- Version increments when profile is updated
## Common Matrix IDs
| Matrix ID | TF | Species | Method |
|-----------|-----|---------|--------|
| `MA0139.1` | CTCF | Human | ChIP-seq |
| `MA0098.3` | ETS1 | Human | SELEX |
| `MA0107.1` | RELA (NF-kB p65) | Human | SELEX |
| `MA0048.2` | NHLH1 | Human | SELEX |
| `MA0079.4` | SP1 | Human | SELEX |
| `MA0080.4` | SPI1 (PU.1) | Human | ChIP-seq |
| `MA0025.2` | NFIL3 | Human | SELEX |
| `MA0002.2` | RUNX1 | Human | SELEX |
| `MA0004.1` | Arnt | Mouse | SELEX |
| `MA0009.2` | TAL1::GATA1 | Human | SELEX |
## TF Classes (partial list)
- `C2H2 zinc finger factors`
- `Basic leucine zipper factors (bZIP)`
- `Basic helix-loop-helix factors (bHLH)`
- `Homeodomain factors`
- `Forkhead box (FOX) factors`
- `ETS-domain factors`
- `Nuclear hormone receptors`
- `Tryptophan cluster factors`
- `p53-like transcription factors`
- `STAT factors`
- `MADS box factors`
- `T-box factors`
## Python Example: Batch Download
```python
import requests, json, time
def download_all_human_profiles(output_file="jaspar_human_profiles.json"):
    """Download all human TF profiles from the JASPAR CORE collection.

    Pages through the matrix list endpoint, then fetches the full record
    for every profile and dumps them all to a single JSON file.
    """
    base = "https://jaspar.elixir.no/api/v1/matrix/"
    query = {
        "collection": "CORE",
        "species": "9606",
        "version": "latest",
        "page_size": 500,
        "page": 1,
    }
    summaries = []
    while True:
        payload = requests.get(base, params=query).json()
        summaries.extend(payload["results"])
        if not payload["next"]:
            break
        query["page"] += 1
        time.sleep(0.5)
    # Fetch full details for each profile
    full_profiles = []
    for summary in summaries:
        record = requests.get(f"{base}{summary['matrix_id']}/").json()
        full_profiles.append(record)
        time.sleep(0.1)  # Be respectful of the public API
    with open(output_file, "w") as f:
        json.dump(full_profiles, f)
    return full_profiles
```

View File

@@ -0,0 +1,457 @@
---
name: molecular-dynamics
description: Run and analyze molecular dynamics simulations with OpenMM and MDAnalysis. Set up protein/small molecule systems, define force fields, run energy minimization and production MD, analyze trajectories (RMSD, RMSF, contact maps, free energy surfaces). For structural biology, drug binding, and biophysics.
license: MIT
metadata:
skill-author: Kuan-lin Huang
---
# Molecular Dynamics
## Overview
Molecular dynamics (MD) simulation computationally models the time evolution of molecular systems by integrating Newton's equations of motion. This skill covers two complementary tools:
- **OpenMM** (https://openmm.org/): High-performance MD simulation engine with GPU support, Python API, and flexible force field support
- **MDAnalysis** (https://mdanalysis.org/): Python library for reading, writing, and analyzing MD trajectories from all major simulation packages
**Installation:**
```bash
conda install -c conda-forge openmm mdanalysis nglview
# or
pip install openmm mdanalysis
```
## When to Use This Skill
Use molecular dynamics when:
- **Protein stability analysis**: How does a mutation affect protein dynamics?
- **Drug binding simulations**: Characterize binding mode and residence time of a ligand
- **Conformational sampling**: Explore protein flexibility and conformational changes
- **Protein-protein interaction**: Model interface dynamics and binding energetics
- **RMSD/RMSF analysis**: Quantify structural fluctuations from a reference structure
- **Free energy estimation**: Compute binding free energy or conformational free energy
- **Membrane simulations**: Model proteins in lipid bilayers
- **Intrinsically disordered proteins**: Study IDR conformational ensembles
## Core Workflow: OpenMM Simulation
### 1. System Preparation
```python
from openmm.app import *
from openmm import *
from openmm.unit import *
import sys
def prepare_system_from_pdb(pdb_file, forcefield_name="amber14-all.xml",
                            water_model="amber14/tip3pfb.xml"):
    """
    Prepare a solvated OpenMM system from a PDB file.

    Args:
        pdb_file: Path to cleaned PDB file (use PDBFixer for raw PDB files)
        forcefield_name: Force field XML file
        water_model: Water model XML file

    Returns:
        (modeller, system): the solvated Modeller (topology + positions)
        and the OpenMM System built from it.
    """
    # Load PDB
    pdb = PDBFile(pdb_file)
    # Load force field
    forcefield = ForceField(forcefield_name, water_model)
    # Add hydrogens and solvate
    modeller = Modeller(pdb.topology, pdb.positions)
    modeller.addHydrogens(forcefield)
    # Add solvent box (10 Å padding, 150 mM NaCl)
    # NOTE(review): model='tip3p' selects the 3-site water geometry; the
    # TIP3P-FB parameters come from the water_model XML — confirm this is
    # the intended pairing for your force field.
    modeller.addSolvent(
        forcefield,
        model='tip3p',
        padding=10*angstroms,
        ionicStrength=0.15*molar
    )
    print(f"System: {modeller.topology.getNumAtoms()} atoms, "
          f"{modeller.topology.getNumResidues()} residues")
    # Create system
    system = forcefield.createSystem(
        modeller.topology,
        nonbondedMethod=PME,  # Particle Mesh Ewald for long-range electrostatics
        nonbondedCutoff=1.0*nanometer,
        constraints=HBonds,  # Constrain hydrogen bonds (allows 2 fs timestep)
        rigidWater=True,
        ewaldErrorTolerance=0.0005
    )
    return modeller, system
```
### 2. Energy Minimization
```python
from openmm.app import *
from openmm import *
from openmm.unit import *
def minimize_energy(modeller, system, output_pdb="minimized.pdb",
                    max_iterations=1000, tolerance=10.0):
    """
    Relax the system with local energy minimization to remove steric clashes.

    Args:
        modeller: Modeller object with topology and positions
        system: OpenMM System
        output_pdb: Path to save minimized structure
        max_iterations: Maximum minimization steps
        tolerance: Convergence criterion in kJ/mol/nm

    Returns:
        Simulation object whose context holds the minimized positions.
    """
    # The integrator choice does not affect minimization, but a Simulation
    # cannot be constructed without one.
    integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.004*picoseconds)
    # Pick the fastest available platform: CUDA, then OpenCL, then CPU.
    platform = None
    properties = {}
    for plat_name, plat_props in (('CUDA', {'DeviceIndex': '0', 'Precision': 'mixed'}),
                                  ('OpenCL', {})):
        try:
            platform = Platform.getPlatformByName(plat_name)
            properties = plat_props
            break
        except Exception:
            continue
    if platform is None:
        platform = Platform.getPlatformByName('CPU')
    simulation = Simulation(modeller.topology, system, integrator,
                            platform, properties)
    simulation.context.setPositions(modeller.positions)
    # Report the pre-minimization energy for comparison.
    state = simulation.context.getState(getEnergy=True)
    print(f"Initial energy: {state.getPotentialEnergy()}")
    simulation.minimizeEnergy(
        tolerance=tolerance*kilojoules_per_mole/nanometer,
        maxIterations=max_iterations
    )
    state = simulation.context.getState(getEnergy=True, getPositions=True)
    print(f"Minimized energy: {state.getPotentialEnergy()}")
    # Write the relaxed coordinates so they can be inspected or reused.
    with open(output_pdb, 'w') as f:
        PDBFile.writeFile(simulation.topology, state.getPositions(), f)
    return simulation
```
### 3. NVT Equilibration
```python
from openmm.app import *
from openmm import *
from openmm.unit import *
def run_nvt_equilibration(simulation, n_steps=50000, temperature=300,
                          report_interval=1000, output_prefix="nvt"):
    """
    NVT equilibration: constant N, V, T.
    Equilibrate velocities to target temperature.

    Args:
        simulation: OpenMM Simulation (after minimization)
        n_steps: Number of MD steps (50000 × 2fs = 100 ps)
        temperature: Temperature in Kelvin
        report_interval: Steps between data reports
        output_prefix: File prefix for trajectory and log

    Returns:
        The same Simulation object, advanced by n_steps.

    Note:
        The "× 2fs" arithmetic assumes a 2 fs timestep; the integrator built
        in minimize_energy above uses 0.004 ps (4 fs) — confirm which is
        intended before quoting simulated times.
    """
    # Add position restraints for backbone during NVT
    # (Optional: restraint heavy atoms)
    # Set temperature
    # Draws initial velocities from the Maxwell-Boltzmann distribution.
    simulation.context.setVelocitiesToTemperature(temperature*kelvin)
    # Add reporters
    # NOTE: this discards any reporters previously attached to the simulation.
    simulation.reporters = []
    # Log file
    simulation.reporters.append(
        StateDataReporter(
            f"{output_prefix}_log.txt",
            report_interval,
            step=True,
            potentialEnergy=True,
            kineticEnergy=True,
            temperature=True,
            volume=True,
            speed=True
        )
    )
    # DCD trajectory (compact binary format)
    simulation.reporters.append(
        DCDReporter(f"{output_prefix}_traj.dcd", report_interval)
    )
    print(f"Running NVT equilibration: {n_steps} steps ({n_steps*2/1000:.1f} ps)")
    simulation.step(n_steps)
    print("NVT equilibration complete")
    return simulation
```
### 4. NPT Equilibration and Production
```python
def run_npt_production(simulation, n_steps=500000, temperature=300, pressure=1.0,
                       report_interval=5000, output_prefix="npt"):
    """
    NPT production run: constant N, P, T.

    Args:
        simulation: OpenMM Simulation (after NVT equilibration)
        n_steps: Production steps (500000 × 2fs = 1 ns)
        temperature: Temperature in Kelvin
        pressure: Pressure in bar
        report_interval: Steps between reports

    Returns:
        The same Simulation object, advanced by n_steps.

    Note:
        The "× 2fs" arithmetic assumes a 2 fs timestep; the integrator built
        in minimize_energy uses 4 fs — confirm for your setup.
    """
    # Add Monte Carlo barostat for pressure control
    system = simulation.context.getSystem()
    system.addForce(MonteCarloBarostat(pressure*bar, temperature*kelvin, 25))
    # A force added to an already-bound System takes effect only after the
    # context is reinitialized; preserveState keeps positions and velocities.
    simulation.context.reinitialize(preserveState=True)
    # Update reporters
    # NOTE: replaces any reporters attached during equilibration.
    simulation.reporters = []
    simulation.reporters.append(
        StateDataReporter(
            f"{output_prefix}_log.txt",
            report_interval,
            step=True,
            potentialEnergy=True,
            temperature=True,
            density=True,
            speed=True
        )
    )
    simulation.reporters.append(
        DCDReporter(f"{output_prefix}_traj.dcd", report_interval)
    )
    # Save checkpoints
    # A checkpoint every 50000 steps allows restarting a crashed run.
    simulation.reporters.append(
        CheckpointReporter(f"{output_prefix}_checkpoint.chk", 50000)
    )
    print(f"Running NPT production: {n_steps} steps ({n_steps*2/1000000:.2f} ns)")
    simulation.step(n_steps)
    print("Production MD complete")
    return simulation
```
## Trajectory Analysis with MDAnalysis
### 1. Load Trajectory
```python
import MDAnalysis as mda
from MDAnalysis.analysis import rms, align, contacts
import numpy as np
import matplotlib.pyplot as plt
def load_trajectory(topology_file, trajectory_file):
    """
    Open an MD trajectory as an MDAnalysis Universe and print a summary.

    Args:
        topology_file: PDB, PSF, or other topology file
        trajectory_file: DCD, XTC, TRR, or other trajectory

    Returns:
        The loaded MDAnalysis Universe.
    """
    universe = mda.Universe(topology_file, trajectory_file)
    n_atoms = universe.atoms.n_atoms
    n_frames = universe.trajectory.n_frames
    print(f"Universe: {n_atoms} atoms, {n_frames} frames")
    print(f"Time range: 0 to {universe.trajectory.totaltime:.0f} ps")
    return universe
```
### 2. RMSD Analysis
```python
def compute_rmsd(u, selection="backbone", reference_frame=0):
    """
    Compute RMSD of selected atoms relative to reference frame.

    Args:
        u: MDAnalysis Universe
        selection: Atom selection string (MDAnalysis syntax)
        reference_frame: Frame index for reference structure

    Returns:
        numpy array of shape (n_frames, 3); columns are frame, time (ps),
        and RMSD (Å).

    Note:
        in_memory=True loads and overwrites coordinates in memory, so this
        call modifies the Universe's trajectory in place.
    """
    # Align trajectory to minimize RMSD
    aligner = align.AlignTraj(u, u, select=selection, in_memory=True)
    aligner.run()
    # Compute RMSD
    R = rms.RMSD(u, select=selection, ref_frame=reference_frame)
    R.run()
    rmsd_data = R.results.rmsd  # columns: frame, time, RMSD
    return rmsd_data
def plot_rmsd(rmsd_data, title="RMSD over time", output_file="rmsd.png"):
    """Plot RMSD versus simulation time (with mean line) and save to file."""
    times_ns = rmsd_data[:, 1] / 1000  # time column is in ps
    rmsd_vals = rmsd_data[:, 2]
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(times_ns, rmsd_vals, 'b-', linewidth=0.5)
    ax.set_xlabel("Time (ns)")
    ax.set_ylabel("RMSD (Å)")
    ax.set_title(title)
    mean_rmsd = rmsd_vals.mean()
    ax.axhline(mean_rmsd, color='r', linestyle='--',
               label=f'Mean: {mean_rmsd:.2f} Å')
    ax.legend()
    plt.tight_layout()
    plt.savefig(output_file, dpi=150)
    return fig
```
### 3. RMSF Analysis (Per-Residue Flexibility)
```python
def compute_rmsf(u, selection="backbone", start_frame=0):
    """
    Compute per-residue RMSF (flexibility).

    Args:
        u: MDAnalysis Universe (trajectory should already be aligned)
        selection: Atom selection string for the atoms to analyze
        start_frame: First frame to include (skip equilibration)

    Returns:
        (resids, rmsf_values) numpy arrays, one entry per residue with at
        least one selected atom; RMSF averaged over that residue's atoms.
    """
    # Select atoms
    atoms = u.select_atoms(selection)
    # Compute RMSF
    R = rms.RMSF(atoms)
    R.run(start=start_frame)
    # R.results.rmsf is ordered by position WITHIN `atoms`, not by global
    # universe index, so map each atom's universe index to its position
    # before indexing (indexing with res_atoms.indices would be wrong).
    pos_in_sel = {ix: i for i, ix in enumerate(atoms.indices)}
    resids = []
    rmsf_per_res = []
    for res in atoms.residues:
        positions = [pos_in_sel[ix] for ix in res.atoms.indices if ix in pos_in_sel]
        if positions:
            resids.append(res.resid)
            rmsf_per_res.append(R.results.rmsf[positions].mean())
    return np.array(resids), np.array(rmsf_per_res)
```
### 4. Protein-Ligand Contacts
```python
def analyze_contacts(u, protein_sel="protein", ligand_sel="resname LIG",
                     radius=4.5, start_frame=0):
    """
    Track protein-ligand contacts over trajectory.

    Args:
        u: MDAnalysis Universe
        protein_sel: selection string for the protein atoms
        ligand_sel: selection string for the ligand atoms
        radius: Contact distance cutoff in Angstroms
        start_frame: first frame to analyze

    Returns:
        List (one entry per analyzed frame) of sets of protein resids with
        at least one atom within `radius` of any ligand atom.

    Note:
        Distances are computed directly with numpy broadcasting;
        contacts.contact_matrix expects a precomputed distance array, not
        two coordinate arrays. Periodic boundaries are NOT considered —
        wrap/center the trajectory first if the ligand crosses the box edge.
    """
    protein = u.select_atoms(protein_sel)
    ligand = u.select_atoms(ligand_sel)
    contact_frames = []
    for ts in u.trajectory[start_frame:]:
        # Pairwise protein-ligand distances (Å), shape (n_protein, n_ligand)
        diff = protein.positions[:, None, :] - ligand.positions[None, :, :]
        dists = np.sqrt((diff ** 2).sum(axis=-1))
        within = (dists < radius).any(axis=1)
        contact_residues = set(protein.atoms[within].resids)
        contact_frames.append(contact_residues)
    return contact_frames
```
## Force Field Selection Guide
| System | Recommended Force Field | Water Model |
|--------|------------------------|-------------|
| Standard proteins | AMBER14 (`amber14-all.xml`) | TIP3P-FB |
| Proteins + small molecules | AMBER14 + GAFF2 | TIP3P-FB |
| Membrane proteins | CHARMM36m | TIP3P |
| Nucleic acids | AMBER99-bsc1 or AMBER14 | TIP3P |
| Disordered proteins | ff19SB or CHARMM36m | TIP3P |
## System Preparation Tools
### PDBFixer (for raw PDB files)
```python
from pdbfixer import PDBFixer
from openmm.app import PDBFile
def fix_pdb(input_pdb, output_pdb, ph=7.0):
    """Repair common PDB problems and write a cleaned structure.

    Fills in missing residues and atoms, replaces nonstandard residues,
    strips heterogens, and adds hydrogens for the requested pH.
    Returns the output path.
    """
    fixer = PDBFixer(filename=input_pdb)
    fixer.findMissingResidues()
    fixer.findNonstandardResidues()
    fixer.replaceNonstandardResidues()
    fixer.removeHeterogens(True)  # keepWater=True: drops ligands/ions but KEEPS crystal waters
    fixer.findMissingAtoms()
    fixer.addMissingAtoms()
    fixer.addMissingHydrogens(ph)
    with open(output_pdb, 'w') as handle:
        PDBFile.writeFile(fixer.topology, fixer.positions, handle)
    return output_pdb
```
### GAFF2 for Small Molecules (via OpenFF Toolkit)
```python
# For ligand parameterization, use OpenFF toolkit or ACPYPE
# pip install openff-toolkit
from openff.toolkit import Molecule, ForceField as OFFForceField
from openff.interchange import Interchange
def parameterize_ligand(smiles, ff_name="openff-2.0.0.offxml"):
    """Generate OpenFF (SMIRNOFF) parameters for a small molecule.

    Args:
        smiles: SMILES string of the ligand
        ff_name: OpenFF force field release to apply

    Returns:
        An openff.interchange Interchange holding the parameterized system.

    NOTE(review): despite the surrounding GAFF2 heading, this assigns
    SMIRNOFF (openff-2.x) parameters, not GAFF2 atom types — use ACPYPE/
    antechamber if GAFF2 specifically is required.
    """
    mol = Molecule.from_smiles(smiles)
    mol.generate_conformers(n_conformers=1)
    off_ff = OFFForceField(ff_name)
    interchange = off_ff.create_interchange(mol.to_topology())
    return interchange
```
## Best Practices
- **Always minimize before MD**: Raw PDB structures have steric clashes
- **Equilibrate before production**: NVT (50–100 ps) → NPT (100–500 ps) → Production
- **Use GPU**: Simulations are 10–100× faster on GPU (CUDA/OpenCL)
- **2 fs timestep with HBonds constraints**: Standard; use 4 fs with HMR (hydrogen mass repartitioning)
- **Analyze only equilibrated trajectory**: Discard first 20–50% as equilibration
- **Save checkpoints**: MD runs can fail; checkpoints allow restart
- **Periodic boundary conditions**: Required for solvated systems
- **PME for electrostatics**: More accurate than cutoff methods for charged systems
## Additional Resources
- **OpenMM documentation**: https://openmm.org/documentation.html
- **MDAnalysis user guide**: https://docs.mdanalysis.org/
- **GROMACS** (alternative MD engine): https://manual.gromacs.org/
- **NAMD** (alternative): https://www.ks.uiuc.edu/Research/namd/
- **CHARMM-GUI** (web-based system builder): https://charmm-gui.org/
- **AmberTools** (free Amber tools): https://ambermd.org/AmberTools.php
- **OpenMM paper**: Eastman P et al. (2017) PLOS Computational Biology. PMID: 28278240
- **MDAnalysis paper**: Michaud-Agrawal N et al. (2011) J Computational Chemistry. PMID: 21500218

View File

@@ -0,0 +1,208 @@
# MDAnalysis Analysis Reference
## MDAnalysis Universe and AtomGroup
```python
import MDAnalysis as mda
# Load Universe
u = mda.Universe("topology.pdb", "trajectory.dcd")
# or for single structure:
u = mda.Universe("structure.pdb")
# Key attributes
print(u.atoms.n_atoms) # Total atoms
print(u.residues.n_residues) # Total residues
print(u.trajectory.n_frames) # Number of frames
print(u.trajectory.dt) # Time step in ps
print(u.trajectory.totaltime) # Total simulation time in ps
```
## Atom Selection Language
MDAnalysis uses a rich selection language:
```python
# Basic selections
protein = u.select_atoms("protein")
backbone = u.select_atoms("backbone") # CA, N, C, O
calpha = u.select_atoms("name CA")
water = u.select_atoms("resname WAT or resname HOH or resname TIP3")
ligand = u.select_atoms("resname LIG")
# By residue number
region = u.select_atoms("resid 10:50")
specific = u.select_atoms("resid 45 and name CA")
# By proximity
near_ligand = u.select_atoms("protein and around 5.0 resname LIG")
# By property
charged = u.select_atoms("resname ARG LYS ASP GLU")
hydrophobic = u.select_atoms("resname ALA VAL LEU ILE PRO PHE TRP MET")
# Boolean combinations
active_site = u.select_atoms("(resid 100 102 145 200) and protein")
# Inverse
not_water = u.select_atoms("not (resname WAT HOH)")
```
## Common Analysis Modules
### RMSD and RMSF
```python
from MDAnalysis.analysis import rms, align
# Align trajectory to first frame
align.AlignTraj(u, u, select='backbone', in_memory=True).run()
# RMSD
R = rms.RMSD(u, u, select='backbone', groupselections=['name CA'])
R.run()
# R.results.rmsd: shape (n_frames, 3) = [frame, time, RMSD]
# RMSF (per-atom fluctuations)
from MDAnalysis.analysis.rms import RMSF
rmsf = RMSF(u.select_atoms('backbone')).run()
# rmsf.results.rmsf: per-atom RMSF values in Angstroms
```
### Radius of Gyration
```python
rg = []
for ts in u.trajectory:
rg.append(u.select_atoms("protein").radius_of_gyration())
import numpy as np
print(f"Mean Rg: {np.mean(rg):.2f} Å")
```
### Secondary Structure Analysis
```python
from MDAnalysis.analysis.dssp import DSSP
# DSSP secondary structure assignment per frame
dssp = DSSP(u).run()
# dssp.results.dssp: per-residue per-frame secondary structure codes
# H = alpha-helix, E = beta-strand, C = coil
```
### Hydrogen Bonds
```python
from MDAnalysis.analysis.hydrogenbonds import HydrogenBondAnalysis
hbonds = HydrogenBondAnalysis(
u,
donors_sel="protein and name N",
acceptors_sel="protein and name O",
d_h_cutoff=1.2, # donor-H distance (Å)
d_a_cutoff=3.0, # donor-acceptor distance (Å)
d_h_a_angle_cutoff=150 # D-H-A angle (degrees)
)
hbonds.run()
# Count H-bonds per frame
import pandas as pd
df = pd.DataFrame(hbonds.results.hbonds,
columns=['frame', 'donor_ix', 'hydrogen_ix', 'acceptor_ix',
'DA_dist', 'DHA_angle'])
```
### Principal Component Analysis (PCA)
```python
from MDAnalysis.analysis import pca
pca_analysis = pca.PCA(u, select='backbone', align=True).run()
# PC variances
print(pca_analysis.results.variance[:5]) # % variance of first 5 PCs
# Project trajectory onto PCs
projected = pca_analysis.transform(u.select_atoms('backbone'), n_components=3)
# Shape: (n_frames, n_components)
```
### Free Energy Surface (FES)
```python
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
def plot_free_energy_surface(x, y, bins=50, T=300, xlabel="PC1", ylabel="PC2",
                             output="fes.png"):
    """
    Compute 2D free energy surface from two order parameters.
    FES = -kT * ln(P(x,y))

    Args:
        x, y: 1D arrays of the two order parameters (one value per frame)
        bins: number of histogram bins along each axis
        T: temperature in Kelvin
        xlabel, ylabel: axis labels
        output: path for the saved figure

    Returns:
        The matplotlib Figure.
    """
    kB = 0.0083144621  # kJ/mol/K
    kT = kB * T
    # 2D histogram
    H, xedges, yedges = np.histogram2d(x, y, bins=bins, density=True)
    H = H.T  # transpose so rows correspond to y, as contourf expects
    # Free energy; empty bins become NaN instead of -inf
    H_safe = np.where(H > 0, H, np.nan)
    fes = -kT * np.log(H_safe)
    fes -= np.nanmin(fes)  # Shift minimum to 0
    # Plot against bin CENTERS: the edge arrays have one extra element and
    # are shifted half a bin relative to the histogram values.
    xcenters = 0.5 * (xedges[:-1] + xedges[1:])
    ycenters = 0.5 * (yedges[:-1] + yedges[1:])
    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.contourf(xcenters, ycenters, fes, levels=20, cmap='RdYlBu_r')
    plt.colorbar(im, ax=ax, label='Free Energy (kJ/mol)')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.savefig(output, dpi=150, bbox_inches='tight')
    return fig
```
## Trajectory Formats Supported
| Format | Extension | Notes |
|--------|-----------|-------|
| DCD | `.dcd` | CHARMM/NAMD binary; widely used |
| XTC | `.xtc` | GROMACS compressed |
| TRR | `.trr` | GROMACS full precision |
| NetCDF | `.nc` | AMBER format |
| LAMMPS | `.lammpstrj` | LAMMPS dump |
| HDF5 | `.h5md` | H5MD standard |
| PDB | `.pdb` | Multi-model PDB |
## MDAnalysis Interoperability
```python
# Convert to numpy
positions = u.atoms.positions # Current frame: shape (N, 3)
# Write to PDB
with mda.Writer("frame_10.pdb", u.atoms.n_atoms) as W:
u.trajectory[10] # Move to frame 10
W.write(u.atoms)
# Write trajectory subset
with mda.Writer("protein_traj.dcd", u.select_atoms("protein").n_atoms) as W:
for ts in u.trajectory:
W.write(u.select_atoms("protein"))
# Convert to MDTraj (for compatibility)
# import mdtraj as md
# traj = md.load("trajectory.dcd", top="topology.pdb")
```
## Performance Tips
- **Use `in_memory=True`** for AlignTraj when RAM allows (much faster iteration)
- **Select minimal atoms** before analysis to reduce memory/compute
- **Use multiprocessing** for independent frame analyses
- **Process in chunks** for very long trajectories using `start`/`stop`/`step` parameters:
```python
# Analyze every 10th frame from frame 100 to 1000
R.run(start=100, stop=1000, step=10)
```

View File

@@ -0,0 +1,372 @@
---
name: monarch-database
description: Query the Monarch Initiative knowledge graph for disease-gene-phenotype associations across species. Integrates OMIM, ORPHANET, HPO, ClinVar, and model organism databases. Use for rare disease gene discovery, phenotype-to-gene mapping, cross-species disease modeling, and HPO term lookup.
license: CC0-1.0
metadata:
skill-author: Kuan-lin Huang
---
# Monarch Initiative Database
## Overview
The Monarch Initiative (https://monarchinitiative.org/) is a multi-species integrated knowledgebase that links genes, diseases, and phenotypes across humans and model organisms. It integrates data from over 40 sources including OMIM, ORPHANET, HPO (Human Phenotype Ontology), ClinVar, MGI (Mouse Genome Informatics), ZFIN (Zebrafish), RGD (Rat), FlyBase, and WormBase.
Monarch enables:
- Mapping phenotypes across species to identify candidate disease genes
- Finding all genes associated with a disease or phenotype
- Discovering model organisms for human diseases
- Navigating the HPO hierarchy for phenotype ontology queries
**Key resources:**
- Monarch portal: https://monarchinitiative.org/
- Monarch API v3: https://api-v3.monarchinitiative.org/v3/
- API docs: https://api-v3.monarchinitiative.org/v3/docs
- HPO browser: https://hpo.jax.org/
## When to Use This Skill
Use Monarch when:
- **Rare disease gene discovery**: What genes are associated with my patient's phenotypes (HPO terms)?
- **Phenotype similarity**: Are two diseases similar based on their phenotypic profiles?
- **Cross-species modeling**: Are there mouse/zebrafish models for my disease of interest?
- **HPO term lookup**: Retrieve HPO term names, definitions, and ontology hierarchy
- **Disease-phenotype mapping**: List all HPO terms associated with a specific disease
- **Gene-phenotype associations**: What phenotypes are caused by variants in a gene?
- **Ortholog-phenotype mapping**: Use animal model phenotypes to infer human gene function
## Core Capabilities
### 1. Monarch API v3
Base URL: `https://api-v3.monarchinitiative.org/v3/`
```python
import requests
BASE_URL = "https://api-v3.monarchinitiative.org/v3"
def monarch_get(endpoint, params=None):
    """GET a Monarch API endpoint and return the decoded JSON body.

    Raises requests.HTTPError (via raise_for_status) on non-2xx responses.
    """
    resp = requests.get(
        f"{BASE_URL}/{endpoint}",
        params=params,
        headers={"Accept": "application/json"},
    )
    resp.raise_for_status()
    return resp.json()
```
### 2. Phenotype-to-Gene Association (Pheno2Gene)
```python
def get_genes_for_phenotypes(hpo_ids, limit=50, offset=0):
    """
    Find genes associated with a list of HPO phenotype terms.
    Core use case: rare disease differential diagnosis.

    Args:
        hpo_ids: List of HPO term IDs (e.g., ["HP:0001250", "HP:0004322"])
        limit: Maximum number of results
        offset: Pagination offset into the results

    Returns:
        Decoded JSON response from the semantic-similarity endpoint.

    NOTE(review): this issues a GET to semsim/termset-pairwise-similarity/
    analyze; verify against the current Monarch API docs whether this
    endpoint expects POST. For direct phenotype->gene association lookups
    see phenotype_to_gene.
    """
    params = {
        "terms": hpo_ids,
        "limit": limit,
        "offset": offset
    }
    return monarch_get("semsim/termset-pairwise-similarity/analyze", params)
def phenotype_to_gene(hpo_ids):
    """
    Return genes directly associated with the given HPO phenotype terms.

    Args:
        hpo_ids: list of HPO CURIEs (e.g., ["HP:0001250"])

    Returns:
        List of dicts with phenotype_id, gene_id, gene_name, and evidence.
    """
    # Use the /association endpoint for direct phenotype-gene links
    all_genes = []
    for hpo_id in hpo_ids:
        # In GeneToPhenotypicFeatureAssociation records the gene is the
        # association *subject* and the phenotype the *object* (same
        # orientation as the rare-disease workflow below), so the HPO term
        # must be passed as `object` and the gene read from `subject`.
        data = monarch_get("association/all", {
            "object": hpo_id,
            "subject_category": "biolink:Gene",
            "category": "biolink:GeneToPhenotypicFeatureAssociation",
            "limit": 50
        })
        for assoc in data.get("items", []):
            all_genes.append({
                "phenotype_id": hpo_id,
                "gene_id": assoc.get("subject", {}).get("id"),
                "gene_name": assoc.get("subject", {}).get("name"),
                "evidence": assoc.get("evidence_type")
            })
    return all_genes
# Example: Find genes associated with seizures and short stature
hpo_terms = ["HP:0001250", "HP:0004322"] # Seizures, Short stature
genes = phenotype_to_gene(hpo_terms)
```
### 3. Disease-to-Gene Associations
```python
def get_genes_for_disease(disease_id, limit=100):
    """
    Get all gene associations for a disease.
    Disease IDs: OMIM:146300, MONDO:0007739, ORPHANET:558, etc.

    Args:
        disease_id: disease CURIE (MONDO IDs preferred)
        limit: maximum number of associations to return

    Returns:
        Decoded JSON response (associations under "items").
    """
    # Query gene -> disease associations: the gene is the association
    # subject and the disease is the object. (The previous version built
    # two unused parameter dicts and queried the disease as the object of
    # has_phenotype, which returns no gene links.)
    data = monarch_get("association/all", {
        "subject_category": "biolink:Gene",
        "object": disease_id,
        "limit": limit
    })
    return data
def get_disease_genes(disease_id, limit=100):
    """Get genes causally linked to a disease.

    Filters associations where a Gene is the subject, the disease is the
    object, and the predicate is biolink:causes.
    NOTE(review): many curated gene-disease links use broader predicates
    (e.g. biolink:gene_associated_with_condition); confirm this predicate
    covers the associations you need.
    """
    data = monarch_get("association/all", {
        "subject_category": "biolink:Gene",
        "object": disease_id,
        "predicate": "biolink:causes",
        "limit": limit
    })
    return data.get("items", [])
# MONDO disease IDs (preferred over OMIM for cross-ontology queries)
# MONDO:0007739 - Huntington disease
# MONDO:0009061 - Cystic fibrosis
# OMIM:104300 - Alzheimer disease, susceptibility to, type 1
```
### 4. Gene-to-Phenotype and Disease
```python
def get_phenotypes_for_gene(gene_id, limit=100):
    """
    List phenotype associations for a gene.
    Gene IDs: HGNC:7884, NCBIGene:4137, etc.
    """
    query = {
        "subject": gene_id,
        "predicate": "biolink:has_phenotype",
        "limit": limit,
    }
    payload = monarch_get("association/all", query)
    return payload.get("items", [])
def get_diseases_for_gene(gene_id, limit=100):
    """Return associations linking the given gene (as subject) to diseases."""
    query = {
        "subject": gene_id,
        "object_category": "biolink:Disease",
        "limit": limit,
    }
    payload = monarch_get("association/all", query)
    return payload.get("items", [])
# Example: What diseases does BRCA1 cause?
brca1_diseases = get_diseases_for_gene("HGNC:1100")
for assoc in brca1_diseases:
print(f" {assoc.get('object', {}).get('name')} ({assoc.get('object', {}).get('id')})")
```
### 5. HPO Term Lookup
```python
def get_hpo_term(hpo_id):
    """Fetch the Monarch entity record (name, description, ...) for an HPO term ID such as "HP:0001250"."""
    return monarch_get(f"entity/{hpo_id}")
def search_hpo_terms(query, limit=20):
    """Full-text search of Monarch for phenotype (HPO) terms matching a name."""
    search_params = {
        "q": query,
        "category": "biolink:PhenotypicFeature",
        "limit": limit,
    }
    return monarch_get("search", search_params)
# Example: look up the HPO term for seizures
seizure_term = get_hpo_term("HP:0001250")
print(f"Name: {seizure_term.get('name')}")
print(f"Definition: {seizure_term.get('description')}")
# Search for related terms
epilepsy_terms = search_hpo_terms("epilepsy")
for term in epilepsy_terms.get("items", [])[:5]:
print(f" {term['id']}: {term['name']}")
```
### 6. Semantic Similarity (Disease Comparison)
```python
def compare_disease_phenotypes(disease_id_1, disease_id_2):
    """
    Compare two diseases by semantic similarity of their phenotype profiles.
    Returns similarity score using HPO hierarchy.

    Args:
        disease_id_1: First disease CURIE (e.g. "MONDO:0100135").
        disease_id_2: Second disease CURIE.

    Returns:
        Raw response from the Monarch semsim/compare endpoint.
    """
    # NOTE(review): subjects/objects are sent as single-element lists via
    # monarch_get — confirm the v3 semsim endpoint accepts list-valued
    # query parameters in this form.
    params = {
        "subjects": [disease_id_1],
        "objects": [disease_id_2],
        "metric": "ancestor_information_content"
    }
    return monarch_get("semsim/compare", params)
# Example: Compare Dravet syndrome with CDKL5-deficiency disorder
similarity = compare_disease_phenotypes("MONDO:0100135", "MONDO:0014917")
```
### 7. Cross-Species Orthologs
```python
def get_orthologs(gene_id, species=None):
    """
    Get orthologs of a human gene in model organisms.
    Useful for finding animal models of human diseases.

    Args:
        gene_id: Query gene CURIE (e.g. "HGNC:1100").
        species: Optional NCBI taxon filter (see IDs listed below).

    Returns:
        List of orthology association records.
    """
    params = {
        "subject": gene_id,
        "predicate": "biolink:orthologous_to",
        "limit": 50,
    }
    # BUG FIX: the taxon filter was previously written into a separate
    # dict that was never passed to the API, so `species` had no effect.
    if species:
        params["subject_taxon"] = species
    data = monarch_get("association/all", params)
    return data.get("items", [])
# NCBI Taxonomy IDs for common model organisms:
# Mouse: 10090 (Mus musculus)
# Zebrafish: 7955 (Danio rerio)
# Fruit fly: 7227 (Drosophila melanogaster)
# C. elegans: 6239
# Rat: 10116 (Rattus norvegicus)
```
### 8. Full Workflow: Rare Disease Gene Prioritization
```python
import requests
import pandas as pd
def rare_disease_gene_finder(patient_hpo_terms, candidate_gene_ids=None, top_n=20):
    """
    Find genes that match a patient's HPO phenotype profile.

    Args:
        patient_hpo_terms: List of HPO IDs from clinical assessment
        candidate_gene_ids: Optional list to restrict search
        top_n: Number of top candidates to return

    Returns:
        pandas DataFrame ranked by number of matching phenotypes
        (descending).
    """
    BASE_URL = "https://api-v3.monarchinitiative.org/v3"
    # 1. Find genes associated with each phenotype
    gene_phenotype_counts = {}
    for hpo_id in patient_hpo_terms:
        # timeout guards against an unresponsive API hanging the loop
        data = requests.get(
            f"{BASE_URL}/association/all",
            params={
                "object": hpo_id,
                "subject_category": "biolink:Gene",
                "limit": 100
            },
            timeout=30,
        ).json()
        for item in data.get("items", []):
            gene_id = item.get("subject", {}).get("id")
            gene_name = item.get("subject", {}).get("name")
            if not gene_id:
                continue
            # BUG FIX: candidate_gene_ids was documented but previously
            # ignored; honor the restriction list when provided.
            if candidate_gene_ids and gene_id not in candidate_gene_ids:
                continue
            entry = gene_phenotype_counts.setdefault(
                gene_id, {"name": gene_name, "count": 0, "phenotypes": []})
            entry["count"] += 1
            entry["phenotypes"].append(hpo_id)
    # 2. Rank by number of matching phenotypes
    ranked = sorted(gene_phenotype_counts.items(),
                    key=lambda x: -x[1]["count"])[:top_n]
    n_patient = len(patient_hpo_terms)
    results = []
    for gene_id, info in ranked:
        results.append({
            "gene_id": gene_id,
            "gene_name": info["name"],
            "matching_phenotypes": info["count"],
            "total_patient_phenotypes": n_patient,
            # guard against division by zero for an empty phenotype list
            "phenotype_overlap": info["count"] / n_patient if n_patient else 0.0,
            "matching_hpo_terms": info["phenotypes"]
        })
    return pd.DataFrame(results)
# Example usage
patient_phenotypes = [
"HP:0001250", # Seizures
"HP:0004322", # Short stature
"HP:0001252", # Hypotonia
"HP:0000252", # Microcephaly
"HP:0001263", # Global developmental delay
]
candidates = rare_disease_gene_finder(patient_phenotypes)
print(candidates[["gene_name", "matching_phenotypes", "phenotype_overlap"]].to_string())
```
## Query Workflows
### Workflow 1: HPO-Based Differential Diagnosis
1. Extract HPO terms from clinical notes or genetics consultation
2. Run phenotype-to-gene query against Monarch
3. Rank candidate genes by number of matching phenotypes
4. Cross-reference with gnomAD (constraint scores) and ClinVar (variant evidence)
5. Prioritize genes with high pLI and known pathogenic variants
### Workflow 2: Disease Model Discovery
1. Identify gene or disease of interest
2. Query Monarch for cross-species orthologs
3. Find phenotype associations in model organism databases
4. Identify experimental models that recapitulate human disease features
### Workflow 3: Phenotype Annotation of Novel Genes
1. For a gene with unknown function, query all known phenotype associations
2. Map to HPO hierarchy to understand affected body systems
3. Cross-reference with OMIM and ORPHANET for disease links
## Common Identifier Prefixes
| Prefix | Namespace | Example |
|--------|-----------|---------|
| `HP:` | Human Phenotype Ontology | HP:0001250 (Seizures) |
| `MONDO:` | Monarch Disease Ontology | MONDO:0007739 |
| `OMIM:` | OMIM disease | OMIM:104300 |
| `ORPHANET:` | Orphanet rare disease | ORPHANET:558 |
| `HGNC:` | HGNC gene symbol | HGNC:7884 |
| `NCBIGene:` | NCBI gene ID | NCBIGene:4137 |
| `ENSEMBL:` | Ensembl gene | ENSEMBL:ENSG... |
| `MGI:` | Mouse gene | MGI:1338833 |
| `ZFIN:` | Zebrafish gene | ZFIN:ZDB-GENE... |
## Best Practices
- **Use MONDO IDs** for diseases — they unify OMIM/ORPHANET/MESH identifiers
- **Use HPO IDs** for phenotypes — the standard for clinical phenotype description
- **Handle pagination**: Large queries may require iterating with offset parameter
- **Semantic similarity is better than exact match**: Ancestor HPO terms catch related phenotypes
- **Cross-validate with ClinVar and OMIM**: Monarch aggregates many sources; quality varies
- **Use HGNC IDs for genes**: More stable than gene symbols across database versions
## Additional Resources
- **Monarch portal**: https://monarchinitiative.org/
- **API v3 docs**: https://api-v3.monarchinitiative.org/v3/docs
- **HPO browser**: https://hpo.jax.org/
- **MONDO ontology**: https://mondo.monarchinitiative.org/
- **Citation**: Shefchek KA et al. (2020) Nucleic Acids Research. PMID: 31701156
- **Phenomizer** (HPO-based diagnosis): https://compbio.charite.de/phenomizer/

View File

@@ -0,0 +1,160 @@
# HPO and Disease Ontology Reference for Monarch
## Human Phenotype Ontology (HPO)
### HPO Structure
HPO is organized hierarchically:
- **Root**: HP:0000001 (All)
- HP:0000118 (Phenotypic abnormality)
- HP:0000478 (Abnormality of the eye)
- HP:0000707 (Abnormality of the nervous system)
- HP:0001507 (Growth abnormality)
- HP:0001626 (Abnormality of the cardiovascular system)
- etc.
### Top-Level HPO Categories
| HPO ID | Name |
|--------|------|
| HP:0000924 | Abnormality of the skeletal system |
| HP:0000707 | Abnormality of the nervous system |
| HP:0000478 | Abnormality of the eye |
| HP:0000598 | Abnormality of the ear |
| HP:0001507 | Growth abnormality |
| HP:0001626 | Abnormality of the cardiovascular system |
| HP:0002086 | Abnormality of the respiratory system |
| HP:0001939 | Abnormality of metabolism/homeostasis |
| HP:0002664 | Neoplasm |
| HP:0000818 | Abnormality of the endocrine system |
| HP:0000119 | Abnormality of the genitourinary system |
| HP:0001197 | Abnormality of prenatal development/birth |
### Common HPO Terms in Rare Disease Genetics
#### Neurological
| HPO ID | Term |
|--------|------|
| HP:0001250 | Seizures |
| HP:0001251 | Ataxia |
| HP:0001252 | Muscular hypotonia |
| HP:0001263 | Global developmental delay |
| HP:0001270 | Motor delay |
| HP:0002167 | Neurological speech impairment |
| HP:0000716 | Depressivity |
| HP:0000729 | Autistic behavior |
| HP:0001332 | Dystonia |
| HP:0002071 | Abnormality of extrapyramidal motor function |
#### Growth/Morphology
| HPO ID | Term |
|--------|------|
| HP:0004322 | Short stature |
| HP:0001508 | Failure to thrive |
| HP:0000252 | Microcephaly |
| HP:0000256 | Macrocephaly |
| HP:0001511 | Intrauterine growth retardation |
#### Facial Features
| HPO ID | Term |
|--------|------|
| HP:0000324 | Facial asymmetry |
| HP:0001249 | Intellectual disability |
| HP:0000219 | Thin upper lip vermilion |
| HP:0000303 | Mandibular prognathia |
| HP:0000463 | Anteverted nares |
#### Metabolic
| HPO ID | Term |
|--------|------|
| HP:0001943 | Hypoglycemia |
| HP:0001944 | Hyperglycemia (Diabetes mellitus) |
| HP:0000822 | Hypertension |
| HP:0001712 | Left ventricular hypertrophy |
## MONDO Disease Ontology
MONDO integrates disease classifications from multiple sources:
- OMIM (Mendelian diseases)
- ORPHANET (rare diseases)
- MeSH (medical subject headings)
- SNOMED CT
- DOID (Disease Ontology)
- EFO (Experimental Factor Ontology)
### Key MONDO IDs for Common Rare Diseases
| MONDO ID | Disease | OMIM |
|----------|---------|------|
| MONDO:0007739 | Huntington disease | OMIM:143100 |
| MONDO:0009061 | Cystic fibrosis | OMIM:219700 |
| MONDO:0008608 | Down syndrome | OMIM:190685 |
| MONDO:0019391 | Fragile X syndrome | OMIM:300624 |
| MONDO:0010726 | Rett syndrome | OMIM:312750 |
| MONDO:0014517 | Dravet syndrome | OMIM:607208 |
| MONDO:0024522 | SCN1A-related epilepsy | — |
| MONDO:0014817 | CHARGE syndrome | OMIM:214800 |
| MONDO:0009764 | Marfan syndrome | OMIM:154700 |
| MONDO:0013282 | Alpha-1-antitrypsin deficiency | OMIM:613490 |
### OMIM ID Patterns
- **Phenotype only**: OMIM number alone (e.g., OMIM:104300)
- **Gene and phenotype**: Same gene, multiple phenotype entries
- **Phenotype series**: Grouped phenotypes at a locus
```python
import requests
def omim_to_mondo(omim_id):
    """Convert an OMIM ID to its Monarch entity record (incl. MONDO xrefs).

    Args:
        omim_id: Bare OMIM number ("104300") or full CURIE ("OMIM:104300").

    Returns:
        Parsed JSON entity record from the Monarch API.
    """
    search_id = omim_id if omim_id.startswith("OMIM:") else f"OMIM:{omim_id}"
    # timeout added so a stalled API call cannot hang the caller forever
    data = requests.get(
        f"https://api-v3.monarchinitiative.org/v3/entity/{search_id}",
        timeout=30,
    ).json()
    # Check for same_as/equivalent_id links to MONDO
    return data
```
## Association Evidence Codes
Monarch associations include evidence types:
| Code | Evidence Type |
|------|--------------|
| `IEA` | Inferred from electronic annotation |
| `TAS` | Traceable author statement |
| `IMP` | Inferred from mutant phenotype |
| `IGI` | Inferred from genetic interaction |
| `IDA` | Inferred from direct assay |
| `ISS` | Inferred from sequence or structural similarity |
| `IBA` | Inferred from biological aspect of ancestor |
Higher-quality evidence: IDA > TAS > IMP > IEA
## Semantic Similarity Metrics
Monarch supports multiple similarity metrics:
| Metric | Description | Use case |
|--------|-------------|---------|
| `ancestor_information_content` | IC of most informative common ancestor (MICA) | Disease similarity |
| `jaccard_similarity` | Overlap coefficient | Simple set comparison |
| `cosine` | Cosine similarity of IC vectors | Large-scale comparisons |
| `phenodigm` | Combined MICA + Jaccard | Model organism matching |
```python
import requests
def compute_disease_similarity(disease_ids_1, disease_ids_2, metric="ancestor_information_content"):
    """Compute semantic similarity between two sets of disease phenotypes.

    Args:
        disease_ids_1: List of subject CURIEs.
        disease_ids_2: List of object CURIEs.
        metric: Similarity metric name accepted by the endpoint.

    Returns:
        Parsed JSON response from the semsim/compare endpoint.
    """
    # Get phenotype sets for each disease
    url = "https://api-v3.monarchinitiative.org/v3/semsim/compare"
    params = {
        "subjects": disease_ids_1,
        "objects": disease_ids_2,
        "metric": metric
    }
    # timeout prevents an unresponsive API from hanging the caller
    response = requests.get(url, params=params, timeout=30)
    return response.json()
```

View File

@@ -0,0 +1,404 @@
---
name: phylogenetics
description: Build and analyze phylogenetic trees using MAFFT (multiple alignment), IQ-TREE 2 (maximum likelihood), and FastTree (fast NJ/ML). Visualize with ETE3 or FigTree. For evolutionary analysis, microbial genomics, viral phylodynamics, protein family analysis, and molecular clock studies.
license: Unknown
metadata:
skill-author: Kuan-lin Huang
---
# Phylogenetics
## Overview
Phylogenetic analysis reconstructs the evolutionary history of biological sequences (genes, proteins, genomes) by inferring the branching pattern of descent. This skill covers the standard pipeline:
1. **MAFFT** — Multiple sequence alignment
2. **IQ-TREE 2** — Maximum likelihood tree inference with model selection
3. **FastTree** — Fast approximate maximum likelihood (for large datasets)
4. **ETE3** — Python library for tree manipulation and visualization
**Installation:**
```bash
# Conda (recommended for CLI tools)
conda install -c bioconda mafft iqtree fasttree
pip install ete3
```
## When to Use This Skill
Use phylogenetics when:
- **Evolutionary relationships**: Which organism/gene is most closely related to my sequence?
- **Viral phylodynamics**: Trace outbreak spread and estimate transmission dates
- **Protein family analysis**: Infer evolutionary relationships within a gene family
- **Horizontal gene transfer detection**: Identify genes with discordant species/gene trees
- **Ancestral sequence reconstruction**: Infer ancestral protein sequences
- **Molecular clock analysis**: Estimate divergence dates using temporal sampling
- **GWAS companion**: Place variants in evolutionary context (e.g., SARS-CoV-2 variants)
- **Microbiology**: Species phylogeny from 16S rRNA or core genome phylogeny
## Standard Workflow
### 1. Multiple Sequence Alignment with MAFFT
```python
import subprocess
import os
def run_mafft(input_fasta: str, output_fasta: str, method: str = "auto",
              n_threads: int = 4) -> str:
    """
    Align sequences with MAFFT.

    Args:
        input_fasta: Path to unaligned FASTA file
        output_fasta: Path for aligned output
        method: 'auto' (auto-select), 'einsi' (accurate), 'linsi' (accurate, slow),
                'fftnsi' (medium), 'fftns' (fast), 'retree2' (fast)
        n_threads: Number of CPU threads

    Returns:
        Path to aligned FASTA file
    """
    base_flags = {
        "auto": ["--auto"],
        "einsi": ["--genafpair", "--maxiterate", "1000"],
        "linsi": ["--localpair", "--maxiterate", "1000"],
        "fftnsi": ["--fftnsi"],
        "fftns": ["--fftns"],
        "retree2": ["--retree", "2"],
    }
    # Unknown method names silently fall back to MAFFT's auto mode.
    flags = base_flags.get(method, base_flags["auto"])
    cmd = ["mafft", *flags, "--thread", str(n_threads), "--inputorder", input_fasta]
    with open(output_fasta, 'w') as out:
        result = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"MAFFT failed:\n{result.stderr}")
    # Count aligned sequences
    with open(output_fasta) as handle:
        n_seqs = sum(1 for line in handle if line.startswith('>'))
    print(f"MAFFT: aligned {n_seqs} sequences → {output_fasta}")
    return output_fasta
# MAFFT method selection guide:
# Few sequences (<200), accurate: linsi or einsi
# Many sequences (<1000), moderate: fftnsi
# Large datasets (>1000): fftns or auto
# Ultra-fast (>10000): mafft --retree 1
```
### 2. Trim Alignment (Optional but Recommended)
```python
def trim_alignment_trimal(aligned_fasta: str, output_fasta: str,
                          method: str = "automated1") -> str:
    """
    Trim poorly aligned columns with TrimAl.

    Methods:
        - 'automated1': Automatic heuristic (recommended)
        - 'gappyout': Remove gappy columns
        - 'strict': Strict gap threshold
    """
    cmd = ["trimal", f"-{method}", "-in", aligned_fasta, "-out", output_fasta, "-fasta"]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        return output_fasta
    # TrimAl failed: warn and fall back to the untrimmed alignment so the
    # downstream pipeline can still run.
    print(f"TrimAl warning: {result.stderr}")
    import shutil
    shutil.copy(aligned_fasta, output_fasta)
    return output_fasta
```
### 3. IQ-TREE 2 — Maximum Likelihood Tree
```python
def run_iqtree(aligned_fasta: str, output_prefix: str,
               model: str = "TEST", bootstrap: int = 1000,
               n_threads: int = 4, extra_args: list = None) -> dict:
    """
    Build a maximum likelihood tree with IQ-TREE 2.

    Args:
        aligned_fasta: Aligned FASTA file
        output_prefix: Prefix for output files
        model: 'TEST' for automatic model selection, or specify (e.g., 'GTR+G' for DNA,
               'LG+G4' for proteins, 'JTT+G' for proteins)
        bootstrap: Number of ultrafast bootstrap replicates (1000 recommended)
        n_threads: Number of threads ('AUTO' to auto-detect)
        extra_args: Additional IQ-TREE arguments

    Returns:
        Dict with paths to output files

    Raises:
        RuntimeError: If the iqtree2 process exits non-zero.
    """
    cmd = [
        "iqtree2",
        "-s", aligned_fasta,
        "--prefix", output_prefix,
        "-m", model,
        "-B", str(bootstrap),  # Ultrafast bootstrap
        "-T", str(n_threads),
        "--redo"  # Overwrite existing results
    ]
    if extra_args:
        cmd.extend(extra_args)
    # stderr is captured so failures can be surfaced in the exception.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"IQ-TREE failed:\n{result.stderr}")
    # Print model selection result
    log_file = f"{output_prefix}.log"
    if os.path.exists(log_file):
        with open(log_file) as f:
            for line in f:
                if "Best-fit model" in line:
                    print(f"IQ-TREE: {line.strip()}")
    output_files = {
        "tree": f"{output_prefix}.treefile",
        "log": f"{output_prefix}.log",
        "iqtree": f"{output_prefix}.iqtree",  # Full report
        "model": f"{output_prefix}.model.gz",
    }
    print(f"IQ-TREE: Tree saved to {output_files['tree']}")
    return output_files
# IQ-TREE model selection guide:
# DNA: TEST → GTR+G, HKY+G, TrN+G
# Protein: TEST → LG+G4, WAG+G, JTT+G, Q.pfam+G
# Codon: TEST → MG+F3X4
# For temporal (molecular clock) analysis, add:
# extra_args = ["--date", "dates.txt", "--clock-test", "--date-CI", "95"]
```
### 4. FastTree — Fast Approximate ML
For large datasets (>1000 sequences) where IQ-TREE is too slow:
```python
def run_fasttree(aligned_fasta: str, output_tree: str,
                 sequence_type: str = "nt", model: str = "gtr",
                 n_threads: int = 4) -> str:
    """
    Build a fast approximate ML tree with FastTree.

    Args:
        aligned_fasta: Aligned FASTA input.
        output_tree: Path for the Newick output tree.
        sequence_type: 'nt' for nucleotide or 'aa' for amino acid
        model: For nt: 'gtr' (recommended) or 'jc'; for aa: 'lg', 'wag', 'jtt'
        n_threads: Unused by the FastTree binary itself; kept for
            interface symmetry with run_iqtree.

    Raises:
        RuntimeError: If the FastTree process exits non-zero.
    """
    if sequence_type == "nt":
        cmd = ["FastTree", "-nt"]
        # BUG FIX: `model` was previously ignored for nucleotides (always
        # -gtr). 'jc' is FastTree's nucleotide default and needs no flag.
        if model == "gtr":
            cmd.append("-gtr")
    else:
        cmd = ["FastTree", f"-{model}"]
    cmd += [aligned_fasta]
    with open(output_tree, 'w') as out:
        result = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"FastTree failed:\n{result.stderr}")
    print(f"FastTree: Tree saved to {output_tree}")
    return output_tree
```
### 5. Tree Analysis and Visualization with ETE3
```python
from ete3 import Tree, TreeStyle, NodeStyle, TextFace, PhyloTree
import matplotlib.pyplot as plt
def load_tree(tree_file: str) -> Tree:
    """Load a Newick tree file."""
    tree = Tree(tree_file)
    n_leaves = len(tree)
    n_nodes = len(list(tree.traverse()))
    print(f"Tree: {n_leaves} leaves, {n_nodes} nodes")
    return tree
def basic_tree_stats(t: Tree) -> dict:
    """Compute basic tree statistics.

    Pairwise leaf distances are sampled from at most 50 leaves to keep
    the O(n^2) distance computation bounded on large trees.
    """
    leaves = t.get_leaves()
    all_nodes = list(t.traverse())
    sample = leaves[:min(50, len(leaves))]
    distances = [t.get_distance(l1, l2) for l1 in sample
                 for l2 in sample if l1 != l2]
    stats = {
        "n_leaves": len(leaves),
        # BUG FIX: in ete3, len(tree) returns the LEAF count, so the old
        # `len(t) - len(leaves)` was always 0. Count internal nodes from
        # a full traversal instead.
        "n_internal_nodes": len(all_nodes) - len(leaves),
        "total_branch_length": sum(n.dist for n in all_nodes),
        "max_leaf_distance": max(distances) if distances else 0,
        "mean_leaf_distance": sum(distances) / len(distances) if distances else 0,
    }
    return stats
def find_mrca(t: Tree, leaf_names: list) -> Tree:
    """Find the most recent common ancestor of a set of leaves.

    Args:
        t: Tree to search.
        leaf_names: Leaf names, unpacked as positional arguments to
            ete3's get_common_ancestor.
    """
    return t.get_common_ancestor(*leaf_names)
def visualize_tree(t: Tree, output_file: str = "tree.png",
                   show_branch_support: bool = True,
                   color_groups: dict = None,
                   width: int = 800) -> None:
    """
    Render phylogenetic tree to image.

    Args:
        t: ETE3 Tree object
        output_file: Output image path
        show_branch_support: Show bootstrap values
        color_groups: Dict mapping leaf_name → color (for coloring taxa)
        width: Image width in pixels
    """
    ts = TreeStyle()
    ts.show_leaf_name = True
    ts.show_branch_support = show_branch_support
    ts.mode = "r"  # 'r' = rectangular, 'c' = circular
    if color_groups:
        # Only leaves listed in color_groups get a style; others keep defaults.
        for node in t.traverse():
            if node.is_leaf() and node.name in color_groups:
                nstyle = NodeStyle()
                nstyle["fgcolor"] = color_groups[node.name]
                nstyle["size"] = 8
                node.set_style(nstyle)
    t.render(output_file, tree_style=ts, w=width, units="px")
    print(f"Tree saved to: {output_file}")
def midpoint_root(t: Tree) -> Tree:
    """Root tree at midpoint (use when outgroup unknown).

    Mutates and returns the same Tree object.
    """
    t.set_outgroup(t.get_midpoint_outgroup())
    return t
def prune_tree(t: Tree, keep_leaves: list) -> Tree:
    """Prune tree in place to keep only specified leaves.

    preserve_branch_length=True asks ete3 to keep branch lengths
    consistent after removing the other leaves.
    """
    t.prune(keep_leaves, preserve_branch_length=True)
    return t
```
### 6. Complete Analysis Script
```python
import subprocess, os
from ete3 import Tree
def full_phylogenetic_analysis(
        input_fasta: str,
        output_dir: str = "phylo_results",
        sequence_type: str = "nt",
        n_threads: int = 4,
        bootstrap: int = 1000,
        use_fasttree: bool = False
) -> dict:
    """
    Complete phylogenetic pipeline: align → trim → tree → visualize.

    Args:
        input_fasta: Unaligned FASTA
        output_dir: Directory for all outputs (created if absent)
        sequence_type: 'nt' (nucleotide) or 'aa' (amino acid/protein)
        n_threads: CPU threads for MAFFT/IQ-TREE
        bootstrap: Ultrafast bootstrap replicates for IQ-TREE
        use_fasttree: Use FastTree instead of IQ-TREE (faster for large datasets)

    Returns:
        Dict with paths to alignment, tree files, visualization, and stats.
    """
    os.makedirs(output_dir, exist_ok=True)
    prefix = os.path.join(output_dir, "phylo")
    print("=" * 50)
    print("Step 1: Multiple Sequence Alignment (MAFFT)")
    aligned = run_mafft(input_fasta, f"{prefix}_aligned.fasta",
                        method="auto", n_threads=n_threads)
    print("\nStep 2: Tree Inference")
    if use_fasttree:
        tree_file = run_fasttree(
            aligned, f"{prefix}.tree",
            sequence_type=sequence_type,
            model="gtr" if sequence_type == "nt" else "lg"
        )
    else:
        # BUG FIX: this was a no-op conditional ("TEST" if nt else "TEST").
        # IQ-TREE's TEST keyword performs model selection for both
        # nucleotide and protein alignments.
        iqtree_files = run_iqtree(
            aligned, prefix,
            model="TEST",
            bootstrap=bootstrap,
            n_threads=n_threads
        )
        tree_file = iqtree_files["tree"]
    print("\nStep 3: Tree Analysis")
    t = Tree(tree_file)
    t = midpoint_root(t)
    stats = basic_tree_stats(t)
    print(f"Tree statistics: {stats}")
    print("\nStep 4: Visualization")
    visualize_tree(t, f"{prefix}_tree.png", show_branch_support=True)
    # Save rooted tree
    rooted_tree_file = f"{prefix}_rooted.nwk"
    t.write(format=1, outfile=rooted_tree_file)
    results = {
        "aligned_fasta": aligned,
        "tree_file": tree_file,
        "rooted_tree": rooted_tree_file,
        "visualization": f"{prefix}_tree.png",
        "stats": stats
    }
    print("\n" + "=" * 50)
    print("Phylogenetic analysis complete!")
    print(f"Results in: {output_dir}/")
    return results
```
## IQ-TREE Model Guide
### DNA Models
| Model | Description | Use case |
|-------|-------------|---------|
| `GTR+G4` | General Time Reversible + Gamma | Most flexible DNA model |
| `HKY+G4` | Hasegawa-Kishino-Yano + Gamma | Two-rate model (common) |
| `TrN+G4` | Tamura-Nei | Unequal transitions |
| `JC` | Jukes-Cantor | Simplest; all rates equal |
### Protein Models
| Model | Description | Use case |
|-------|-------------|---------|
| `LG+G4` | Le-Gascuel + Gamma | Best average protein model |
| `WAG+G4` | Whelan-Goldman | Widely used |
| `JTT+G4` | Jones-Taylor-Thornton | Classical model |
| `Q.pfam+G4` | pfam-trained | For Pfam-like protein families |
| `Q.bird+G4` | Bird-specific | Vertebrate proteins |
**Tip:** Use `-m TEST` to let IQ-TREE automatically select the best model.
## Best Practices
- **Alignment quality first**: Poor alignment → unreliable trees; check alignment manually
- **Use `linsi` for small (<200 seq), `fftns` or `auto` for large alignments**
- **Model selection**: Always use `-m TEST` for IQ-TREE unless you have a specific reason
- **Bootstrap**: Use ≥1000 ultrafast bootstraps (`-B 1000`) for branch support
- **Root the tree**: Unrooted trees can be misleading; use outgroup or midpoint rooting
- **FastTree for >5000 sequences**: IQ-TREE becomes slow; FastTree is 10–100× faster
- **Trim long alignments**: TrimAl removes unreliable columns; improves tree accuracy
- **Check for recombination** in viral/bacterial sequences before building trees (`RDP4`, `GARD`)
## Additional Resources
- **MAFFT**: https://mafft.cbrc.jp/alignment/software/
- **IQ-TREE 2**: http://www.iqtree.org/ | Tutorial: https://www.iqtree.org/workshop/molevol2022
- **FastTree**: http://www.microbesonline.org/fasttree/
- **ETE3**: http://etetoolkit.org/
- **FigTree** (GUI visualization): https://tree.bio.ed.ac.uk/software/figtree/
- **iTOL** (web visualization): https://itol.embl.de/
- **MUSCLE** (alternative aligner): https://www.drive5.com/muscle/
- **TrimAl** (alignment trimming): https://vicfero.github.io/trimal/

View File

@@ -0,0 +1,181 @@
# IQ-TREE 2 Phylogenetic Inference Reference
## Basic Command Syntax
```bash
iqtree2 -s alignment.fasta --prefix output -m TEST -B 1000 -T AUTO --redo
```
## Key Parameters
| Flag | Description | Default |
|------|-------------|---------|
| `-s` | Input alignment file | Required |
| `--prefix` | Output file prefix | alignment name |
| `-m` | Substitution model (or TEST) | GTR+G |
| `-B` | Ultrafast bootstrap replicates | Off |
| `-b` | Standard bootstrap replicates (slow) | Off |
| `-T` | Number of threads (or AUTO) | 1 |
| `-o` | Outgroup taxa name(s) | None (unrooted) |
| `--redo` | Overwrite existing results | Off |
| `-alrt` | SH-aLRT test replicates | Off |
## Model Selection
```bash
# Full model testing (automatically selects best model)
iqtree2 -s alignment.fasta -m TEST --prefix test_run -B 1000 -T 4
# Specify model explicitly
iqtree2 -s alignment.fasta -m GTR+G4 --prefix gtr_run -B 1000
# Protein sequences
iqtree2 -s protein.fasta -m TEST --prefix prot_tree -B 1000
# Codon-based analysis
iqtree2 -s codon.fasta -m GY --prefix codon_tree -B 1000
```
## Bootstrapping Methods
### Ultrafast Bootstrap (UFBoot, recommended)
```bash
iqtree2 -s alignment.fasta -B 1000 # 1000 replicates
# Values ≥95 are reliable
# ~10× faster than standard bootstrap
```
### Standard Bootstrap
```bash
iqtree2 -s alignment.fasta -b 100 # 100 replicates (very slow)
```
### SH-aLRT Test (fast alternative)
```bash
iqtree2 -s alignment.fasta -alrt 1000 -B 1000 # Both SH-aLRT and UFBoot
# SH-aLRT ≥80 AND UFBoot ≥95 = well-supported branch
```
## Branch Support Interpretation
| Bootstrap Value | Interpretation |
|----------------|----------------|
| ≥ 95 | Well-supported (strongly supported) |
| 70–94 | Moderately supported |
| 50–69 | Weakly supported |
| < 50 | Unreliable (not supported) |
## Output Files
| File | Description |
|------|-------------|
| `{prefix}.treefile` | Best ML tree in Newick format |
| `{prefix}.iqtree` | Full analysis report |
| `{prefix}.log` | Computation log |
| `{prefix}.contree` | Consensus tree from bootstrap |
| `{prefix}.splits.nex` | Network splits |
| `{prefix}.bionj` | BioNJ starting tree |
| `{prefix}.model.gz` | Saved model parameters |
## Advanced Analyses
### Molecular Clock (Dating)
```bash
# Temporal analysis with sampling dates
iqtree2 -s alignment.fasta -m GTR+G \
--date dates.tsv \ # Tab-separated: taxon_name YYYY-MM-DD
--clock-test \ # Test for clock-like evolution
--date-CI 95 \ # 95% CI for node dates
--prefix dated_tree
```
### Concordance Factors
```bash
# Gene concordance factor (gCF) - requires multiple gene alignments
iqtree2 --gcf gene_trees.nwk \
--tree main_tree.treefile \
--cf-verbose \
--prefix cf_analysis
```
### Ancestral Sequence Reconstruction
```bash
iqtree2 -s alignment.fasta -m LG+G4 \
-asr \ # Marginal ancestral state reconstruction
--prefix anc_tree
# Output: {prefix}.state (ancestral sequences per node)
```
### Partition Model (Multi-Gene)
```bash
# Create partition file (partitions.txt):
# DNA, gene1 = 1-500
# DNA, gene2 = 501-1000
iqtree2 -s concat_alignment.fasta \
-p partitions.txt \
-m TEST \
-B 1000 \
--prefix partition_tree
```
## IQ-TREE Log Parsing
```python
def parse_iqtree_log(log_file: str) -> dict:
    """Extract key results from an IQ-TREE log/report file.

    Args:
        log_file: Path to the {prefix}.log or {prefix}.iqtree file.

    Returns:
        Dict with any of: best_model, log_likelihood, free_params, AIC,
        BIC, cpu_time — only the keys whose lines were found.
    """
    def _first_number(text: str) -> str:
        # IQ-TREE writes e.g. "Log-likelihood of the tree: -1234.56 (s.e. 23.45)".
        # BUG FIX: take only the first whitespace-separated token so
        # float()/int() do not choke on trailing annotations like "(s.e. ...)".
        return text.strip().split()[0]

    results = {}
    with open(log_file) as f:
        for line in f:
            if "Best-fit model" in line:
                results["best_model"] = line.split(":")[1].strip()
            elif "Log-likelihood of the tree:" in line:
                results["log_likelihood"] = float(_first_number(line.split(":")[1]))
            elif "Number of free parameters" in line:
                results["free_params"] = int(_first_number(line.split(":")[1]))
            elif "Akaike information criterion" in line:
                results["AIC"] = float(_first_number(line.split(":")[1]))
            elif "Bayesian information criterion" in line:
                results["BIC"] = float(_first_number(line.split(":")[1]))
            elif "Total CPU time used" in line:
                results["cpu_time"] = line.split(":")[1].strip()
    return results
# Example:
# results = parse_iqtree_log("output.log")
# print(f"Best model: {results['best_model']}")
# print(f"Log-likelihood: {results['log_likelihood']:.2f}")
```
## Common Issues and Solutions
| Issue | Likely Cause | Solution |
|-------|-------------|---------|
| All bootstrap values = 0 | Too few taxa | Need ≥4 taxa for bootstrap |
| Very long branches | Alignment artifacts | Re-trim alignment; check for outliers |
| Memory error | Too many sequences | Use FastTree; or reduce `-T` to 1 |
| Poor model fit | Wrong alphabet | Check nucleotide vs. protein specification |
| Identical sequences | Duplicate sequences | Remove duplicates before alignment |
## MAFFT Alignment Guide
```bash
# Accurate (< 200 sequences)
mafft --localpair --maxiterate 1000 input.fasta > aligned.fasta
# Medium (200-1000 sequences)
mafft --auto input.fasta > aligned.fasta
# Fast (> 1000 sequences)
mafft --fftns input.fasta > aligned.fasta
# Very large (> 10000 sequences)
mafft --retree 1 input.fasta > aligned.fasta
# Using multiple threads
mafft --thread 8 --auto input.fasta > aligned.fasta
```

View File

@@ -0,0 +1,270 @@
"""
Phylogenetic Analysis Pipeline
===============================
Complete workflow: MAFFT alignment → IQ-TREE tree → ETE3 visualization.
Requirements:
conda install -c bioconda mafft iqtree
pip install ete3
Usage:
python phylogenetic_analysis.py sequences.fasta --type nt --threads 4
python phylogenetic_analysis.py proteins.fasta --type aa --fasttree
"""
import argparse
import os
import shutil
import subprocess
import sys
from pathlib import Path
def check_dependencies():
    """Verify required external tools are on PATH; exit with status 1 if not.

    Uses shutil.which instead of spawning `which`, which avoids a
    subprocess per tool and is portable to platforms without a `which`
    binary (e.g. Windows).
    """
    tools = {
        "mafft": "conda install -c bioconda mafft",
        "iqtree2": "conda install -c bioconda iqtree",
    }
    missing = [f"  {tool}: {install_cmd}"
               for tool, install_cmd in tools.items()
               if shutil.which(tool) is None]
    if missing:
        print("Missing dependencies:")
        for m in missing:
            print(m)
        sys.exit(1)
    print("All dependencies found.")
def count_sequences(fasta_file: str) -> int:
    """Return the number of sequence records (header lines) in a FASTA file."""
    n_headers = 0
    with open(fasta_file) as handle:
        for line in handle:
            if line.startswith('>'):
                n_headers += 1
    return n_headers
def run_mafft(input_fasta: str, output_fasta: str, n_threads: int = 4,
              method: str = "auto") -> str:
    """Run MAFFT multiple sequence alignment."""
    n_seqs = count_sequences(input_fasta)
    print(f"MAFFT: Aligning {n_seqs} sequences...")
    common = ["--thread", str(n_threads), "--inputorder", input_fasta]
    if method == "auto":
        # Size-based strategy: accurate L-INS-i for small sets, MAFFT's
        # own auto-selection for medium sets, fast FFT-NS for large sets.
        if n_seqs <= 200:
            cmd = ["mafft", "--localpair", "--maxiterate", "1000"] + common
        elif n_seqs <= 1000:
            cmd = ["mafft", "--auto"] + common
        else:
            cmd = ["mafft", "--fftns"] + common
    else:
        cmd = ["mafft", f"--{method}"] + common
    with open(output_fasta, 'w') as out:
        proc = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"MAFFT failed:\n{proc.stderr[:500]}")
    print(f"  Alignment complete → {output_fasta}")
    return output_fasta
def run_iqtree(aligned_fasta: str, prefix: str, seq_type: str = "nt",
               bootstrap: int = 1000, n_threads: int = 4,
               outgroup: str = None) -> str:
    """Run IQ-TREE 2 phylogenetic inference.

    Args:
        aligned_fasta: Aligned FASTA input.
        prefix: Output file prefix (produces {prefix}.treefile etc.).
        seq_type: NOTE(review): currently unused in the function body —
            "-m TEST" auto-detects the alphabet; confirm before removing.
        bootstrap: Ultrafast bootstrap replicates ("-B").
        n_threads: Threads passed via "-T".
        outgroup: Optional taxon name for rooting ("-o").

    Returns:
        Path to the {prefix}.treefile Newick tree.

    Raises:
        RuntimeError: If iqtree2 exits non-zero (first 500 chars of stderr).
    """
    print(f"IQ-TREE 2: Building maximum likelihood tree...")
    cmd = [
        "iqtree2",
        "-s", aligned_fasta,
        "--prefix", prefix,
        "-m", "TEST",  # Auto model selection
        "-B", str(bootstrap),  # Ultrafast bootstrap
        "-T", str(n_threads),
        "--redo",
        "-alrt", "1000",  # SH-aLRT test
    ]
    if outgroup:
        cmd += ["-o", outgroup]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"IQ-TREE failed:\n{result.stderr[:500]}")
    tree_file = f"{prefix}.treefile"
    # Extract best model from log
    log_file = f"{prefix}.log"
    if os.path.exists(log_file):
        with open(log_file) as f:
            for line in f:
                if "Best-fit model" in line:
                    print(f"  {line.strip()}")
    print(f"  Tree saved → {tree_file}")
    return tree_file
def run_fasttree(aligned_fasta: str, output_tree: str, seq_type: str = "nt") -> str:
    """Run FastTree (faster alternative for large datasets)."""
    print("FastTree: Building approximate ML tree (faster)...")
    if seq_type == "nt":
        flags = ["-nt", "-gtr", "-gamma"]
    else:
        flags = ["-lg", "-gamma"]
    cmd = ["FastTree", *flags, aligned_fasta]
    with open(output_tree, 'w') as handle:
        proc = subprocess.run(cmd, stdout=handle, stderr=subprocess.PIPE, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"FastTree failed:\n{proc.stderr[:500]}")
    print(f"  Tree saved → {output_tree}")
    return output_tree
def visualize_tree(tree_file: str, output_png: str, outgroup: str = None) -> None:
    """Visualize the phylogenetic tree with ETE3.

    Args:
        tree_file: Newick tree file to render.
        output_png: Output image path; on render failure a rooted Newick
            file is written next to it instead.
        outgroup: Optional leaf name to root at; falls back to midpoint
            rooting when absent or not found among the leaves.
    """
    try:
        from ete3 import Tree, TreeStyle, NodeStyle
    except ImportError:
        # ete3 is optional: visualization is skipped, not fatal.
        print("ETE3 not installed. Skipping visualization.")
        print("  Install: pip install ete3")
        return
    t = Tree(tree_file)
    # Root the tree
    if outgroup and outgroup in [leaf.name for leaf in t.get_leaves()]:
        t.set_outgroup(outgroup)
        print(f"  Rooted at outgroup: {outgroup}")
    else:
        # Midpoint rooting
        t.set_outgroup(t.get_midpoint_outgroup())
        print("  Applied midpoint rooting")
    # Style
    ts = TreeStyle()
    ts.show_leaf_name = True
    ts.show_branch_support = True
    ts.mode = "r"  # rectangular
    try:
        t.render(output_png, tree_style=ts, w=800, units="px")
        print(f"  Visualization saved → {output_png}")
    except Exception as e:
        # Rendering needs a working graphics backend (Qt/X11); on headless
        # machines fall back to writing the rooted Newick tree.
        print(f"  Visualization failed (display issue?): {e}")
        # Save tree in Newick format as fallback
        rooted_nwk = output_png.replace(".png", "_rooted.nwk")
        t.write(format=1, outfile=rooted_nwk)
        print(f"  Rooted tree saved → {rooted_nwk}")
def tree_summary(tree_file: str) -> dict:
    """Print and return basic statistics for a Newick tree.

    Midpoint-roots the tree, then reports taxon count plus total, mean,
    and maximum positive branch lengths. Returns an empty dict when ETE3
    is unavailable or the tree cannot be parsed.
    """
    try:
        from ete3 import Tree
        tree = Tree(tree_file)
        tree.set_outgroup(tree.get_midpoint_outgroup())
        lengths = [node.dist for node in tree.traverse() if node.dist > 0]
        n_lengths = len(lengths)
        stats = {
            "n_taxa": len(tree.get_leaves()),
            "total_branch_length": sum(lengths),
            "mean_branch_length": sum(lengths) / n_lengths if n_lengths else 0,
            "max_branch_length": max(lengths) if n_lengths else 0,
        }
        print("\nTree Summary:")
        for key, value in stats.items():
            # Floats get fixed 6-decimal formatting; ints print as-is.
            rendered = f"{value:.6f}" if isinstance(value, float) else f"{value}"
            print(f" {key}: {rendered}")
        return stats
    except Exception as e:
        print(f"Could not compute tree stats: {e}")
        return {}
def main():
    """Command-line entry point: align sequences, infer a tree, visualize it."""
    ap = argparse.ArgumentParser(description="Phylogenetic analysis pipeline")
    ap.add_argument("input", help="Input FASTA file (unaligned)")
    ap.add_argument("--type", choices=["nt", "aa"], default="nt",
                    help="Sequence type: nt (nucleotide) or aa (amino acid)")
    ap.add_argument("--threads", type=int, default=4, help="Number of threads")
    ap.add_argument("--bootstrap", type=int, default=1000,
                    help="Bootstrap replicates for IQ-TREE")
    ap.add_argument("--fasttree", action="store_true",
                    help="Use FastTree instead of IQ-TREE (faster, less accurate)")
    ap.add_argument("--outgroup", help="Outgroup taxon name for rooting")
    ap.add_argument("--mafft-method", default="auto",
                    choices=["auto", "linsi", "einsi", "fftnsi", "fftns"],
                    help="MAFFT alignment method")
    ap.add_argument("--output-dir", default="phylo_results",
                    help="Output directory")
    args = ap.parse_args()

    # All artifacts share the input file's stem as a common path prefix.
    os.makedirs(args.output_dir, exist_ok=True)
    prefix = os.path.join(args.output_dir, Path(args.input).stem)

    banner = "=" * 60
    print(banner)
    print("Phylogenetic Analysis Pipeline")
    print(banner)
    print(f"Input: {args.input}")
    print(f"Sequence type: {args.type}")
    print(f"Output dir: {args.output_dir}")

    # Step 1: multiple sequence alignment via MAFFT.
    print("\n[Step 1/3] Multiple Sequence Alignment (MAFFT)")
    aligned = run_mafft(
        args.input,
        f"{prefix}_aligned.fasta",
        n_threads=args.threads,
        method=args.mafft_method,
    )

    # Step 2: tree inference — FastTree for speed, IQ-TREE for accuracy.
    print("\n[Step 2/3] Tree Inference")
    if args.fasttree:
        tree_file = run_fasttree(aligned, f"{prefix}.tree", seq_type=args.type)
    else:
        tree_file = run_iqtree(
            aligned, prefix,
            seq_type=args.type,
            bootstrap=args.bootstrap,
            n_threads=args.threads,
            outgroup=args.outgroup,
        )

    # Step 3: render the tree and report summary statistics.
    print("\n[Step 3/3] Visualization (ETE3)")
    visualize_tree(tree_file, f"{prefix}_tree.png", outgroup=args.outgroup)
    tree_summary(tree_file)

    print("\n" + banner)
    print("Analysis complete!")
    print("Key outputs:")
    print(f" Aligned sequences: {aligned}")
    print(f" Tree file: {tree_file}")
    print(f" Visualization: {prefix}_tree.png")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,321 @@
---
name: scvelo
description: RNA velocity analysis with scVelo. Estimate cell state transitions from unspliced/spliced mRNA dynamics, infer trajectory directions, compute latent time, and identify driver genes in single-cell RNA-seq data. Complements Scanpy/scVI-tools for trajectory inference.
license: BSD-3-Clause
metadata:
skill-author: Kuan-lin Huang
---
# scVelo — RNA Velocity Analysis
## Overview
scVelo is the leading Python package for RNA velocity analysis in single-cell RNA-seq data. It infers cell state transitions by modeling the kinetics of mRNA splicing — using the ratio of unspliced (pre-mRNA) to spliced (mature mRNA) abundances to determine whether a gene is being upregulated or downregulated in each cell. This allows reconstruction of developmental trajectories and identification of cell fate decisions without requiring time-course data.
**Installation:** `pip install scvelo`
**Key resources:**
- Documentation: https://scvelo.readthedocs.io/
- GitHub: https://github.com/theislab/scvelo
- Paper: Bergen et al. (2020) Nature Biotechnology. PMID: 32747759
## When to Use This Skill
Use scVelo when:
- **Trajectory inference from snapshot data**: Determine which direction cells are differentiating
- **Cell fate prediction**: Identify progenitor cells and their downstream fates
- **Driver gene identification**: Find genes whose dynamics best explain observed trajectories
- **Developmental biology**: Model hematopoiesis, neurogenesis, epithelial-to-mesenchymal transitions
- **Latent time estimation**: Order cells along a pseudotime derived from splicing dynamics
- **Complement to Scanpy**: Add directional information to UMAP embeddings
## Prerequisites
scVelo requires count matrices for both **unspliced** and **spliced** RNA. These are generated by:
1. **STARsolo** or **kallisto|bustools** with `lamanno` mode
2. **velocyto** CLI: `velocyto run10x` / `velocyto run`
3. **alevin-fry** / **simpleaf** with spliced/unspliced output
Data is stored in an `AnnData` object with `layers["spliced"]` and `layers["unspliced"]`.
## Standard RNA Velocity Workflow
### 1. Setup and Data Loading
```python
import scvelo as scv
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
# Configure settings
scv.settings.verbosity = 3 # Show computation steps
scv.settings.presenter_view = True
scv.settings.set_figure_params('scvelo')
# Load data (AnnData with spliced/unspliced layers)
# Option A: Load from loom (velocyto output)
adata = scv.read("cellranger_output.loom", cache=True)
# Option B: Merge velocyto loom with Scanpy-processed AnnData
adata_processed = sc.read_h5ad("processed.h5ad") # Has UMAP, clusters
adata_velocity = scv.read("velocyto.loom")
adata = scv.utils.merge(adata_processed, adata_velocity)
# Verify layers
print(adata)
# obs × var: N × G
# layers: 'spliced', 'unspliced' (required)
# obsm['X_umap'] (required for visualization)
```
### 2. Preprocessing
```python
# Filter and normalize (follows Scanpy conventions)
scv.pp.filter_and_normalize(
adata,
min_shared_counts=20, # Minimum counts in spliced+unspliced
n_top_genes=2000 # Top highly variable genes
)
# Compute first and second order moments (means and variances)
# knn_connectivities must be computed first
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
scv.pp.moments(
adata,
n_pcs=30,
n_neighbors=30
)
```
### 3. Velocity Estimation — Stochastic Model
The stochastic model is fast and suitable for exploratory analysis:
```python
# Stochastic velocity (faster, less accurate)
scv.tl.velocity(adata, mode='stochastic')
scv.tl.velocity_graph(adata)
# Visualize
scv.pl.velocity_embedding_stream(
adata,
basis='umap',
color='leiden',
title="RNA Velocity (Stochastic)"
)
```
### 4. Velocity Estimation — Dynamical Model (Recommended)
The dynamical model fits the full splicing kinetics and is more accurate:
```python
# Recover dynamics (computationally intensive; ~10-30 min for 10K cells)
scv.tl.recover_dynamics(adata, n_jobs=4)
# Compute velocity from dynamical model
scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata)
```
### 5. Latent Time
The dynamical model enables computation of a shared latent time (pseudotime):
```python
# Compute latent time
scv.tl.latent_time(adata)
# Visualize latent time on UMAP
scv.pl.scatter(
adata,
color='latent_time',
color_map='gnuplot',
size=80,
title='Latent time'
)
# Identify top genes ordered by latent time
top_genes = adata.var['fit_likelihood'].sort_values(ascending=False).index[:300]
scv.pl.heatmap(
adata,
var_names=top_genes,
sortby='latent_time',
col_color='leiden',
n_convolve=100
)
```
### 6. Driver Gene Analysis
```python
# Identify genes with highest velocity fit
scv.tl.rank_velocity_genes(adata, groupby='leiden', min_corr=0.3)
df = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])
print(df.head(10))
# Speed and coherence
scv.tl.velocity_confidence(adata)
scv.pl.scatter(
adata,
c=['velocity_length', 'velocity_confidence'],
cmap='coolwarm',
perc=[5, 95]
)
# Phase portraits for specific genes
scv.pl.velocity(adata, ['Cpe', 'Gnao1', 'Ins2'],
ncols=3, figsize=(16, 4))
```
### 7. Velocity Arrows and Pseudotime
```python
# Arrow plot on UMAP
scv.pl.velocity_embedding(
adata,
arrow_length=3,
arrow_size=2,
color='leiden',
basis='umap'
)
# Stream plot (cleaner visualization)
scv.pl.velocity_embedding_stream(
adata,
basis='umap',
color='leiden',
smooth=0.8,
min_mass=4
)
# Velocity pseudotime (alternative to latent time)
scv.tl.velocity_pseudotime(adata)
scv.pl.scatter(adata, color='velocity_pseudotime', cmap='gnuplot')
```
### 8. PAGA Trajectory Graph
```python
# PAGA graph with velocity-informed transitions
scv.tl.paga(adata, groups='leiden')
df = scv.get_df(adata, 'paga/transitions_confidence', precision=2).T
df.style.background_gradient(cmap='Blues').format('{:.2g}')
# Plot PAGA with velocity
scv.pl.paga(
adata,
basis='umap',
size=50,
alpha=0.1,
min_edge_width=2,
node_size_scale=1.5
)
```
## Complete Workflow Script
```python
import scvelo as scv
import scanpy as sc
def run_rna_velocity(adata, n_top_genes=2000, mode='dynamical', n_jobs=4):
"""
Complete RNA velocity workflow.
Args:
adata: AnnData with 'spliced' and 'unspliced' layers, UMAP in obsm
n_top_genes: Number of top HVGs for velocity
mode: 'stochastic' (fast) or 'dynamical' (accurate)
n_jobs: Parallel jobs for dynamical model
Returns:
Processed AnnData with velocity information
"""
scv.settings.verbosity = 2
# 1. Preprocessing
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=n_top_genes)
if 'neighbors' not in adata.uns:
sc.pp.neighbors(adata, n_neighbors=30)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
# 2. Velocity estimation
if mode == 'dynamical':
scv.tl.recover_dynamics(adata, n_jobs=n_jobs)
scv.tl.velocity(adata, mode=mode)
scv.tl.velocity_graph(adata)
# 3. Downstream analyses
if mode == 'dynamical':
scv.tl.latent_time(adata)
scv.tl.rank_velocity_genes(adata, groupby='leiden', min_corr=0.3)
scv.tl.velocity_confidence(adata)
scv.tl.velocity_pseudotime(adata)
return adata
```
## Key Output Fields in AnnData
After running the workflow, the following fields are added:
| Location | Key | Description |
|----------|-----|-------------|
| `adata.layers` | `velocity` | RNA velocity per gene per cell |
| `adata.layers` | `fit_t` | Fitted latent time per gene per cell |
| `adata.obsm` | `velocity_umap` | 2D velocity vectors on UMAP |
| `adata.obs` | `velocity_pseudotime` | Pseudotime from velocity |
| `adata.obs` | `latent_time` | Latent time from dynamical model |
| `adata.obs` | `velocity_length` | Speed of each cell |
| `adata.obs` | `velocity_confidence` | Confidence score per cell |
| `adata.var` | `fit_likelihood` | Gene-level model fit quality |
| `adata.var` | `fit_alpha` | Transcription rate |
| `adata.var` | `fit_beta` | Splicing rate |
| `adata.var` | `fit_gamma` | Degradation rate |
| `adata.uns` | `velocity_graph` | Cell-cell transition probability matrix |
## Velocity Models Comparison
| Model | Speed | Accuracy | When to Use |
|-------|-------|----------|-------------|
| `stochastic` | Fast | Moderate | Exploratory; large datasets |
| `deterministic` | Medium | Moderate | Simple linear kinetics |
| `dynamical` | Slow | High | Publication-quality; identifies driver genes |
## Best Practices
- **Start with stochastic mode** for exploration; switch to dynamical for final analysis
- **Need good coverage of unspliced reads**: Short reads (< 100 bp) may miss intron coverage
- **Minimum 2,000 cells**: RNA velocity is noisy with fewer cells
- **Velocity should be coherent**: Arrows should follow known biology; randomness indicates issues
- **k-NN bandwidth matters**: Too few neighbors → noisy velocity; too many → oversmoothed
- **Sanity check**: Root cells (progenitors) should have high unspliced/spliced ratios for marker genes
- **Dynamical model requires distinct kinetic states**: Works best for clear differentiation processes
## Troubleshooting
| Problem | Solution |
|---------|---------|
| Missing unspliced layer | Re-run velocyto or use STARsolo with `--soloFeatures Gene Velocyto` |
| Very few velocity genes | Lower `min_shared_counts`; check sequencing depth |
| Random-looking arrows | Try different `n_neighbors` or velocity model |
| Memory error with dynamical | Set `n_jobs=1`; reduce `n_top_genes` |
| Negative velocity everywhere | Check that spliced/unspliced layers are not swapped |
## Additional Resources
- **scVelo documentation**: https://scvelo.readthedocs.io/
- **Tutorial notebooks**: https://scvelo.readthedocs.io/tutorials/
- **GitHub**: https://github.com/theislab/scvelo
- **Paper**: Bergen V et al. (2020) Nature Biotechnology. PMID: 32747759
- **velocyto** (preprocessing): http://velocyto.org/
- **CellRank** (fate prediction, extends scVelo): https://cellrank.readthedocs.io/
- **dynamo** (metabolic labeling alternative): https://dynamo-release.readthedocs.io/

View File

@@ -0,0 +1,168 @@
# scVelo Velocity Models Reference
## Mathematical Framework
RNA velocity is based on the kinetic model of transcription:
```
dx_s/dt = β·x_u - γ·x_s (spliced dynamics)
dx_u/dt = α(t) - β·x_u (unspliced dynamics)
```
Where:
- `x_s`: spliced mRNA abundance
- `x_u`: unspliced (pre-mRNA) abundance
- `α(t)`: transcription rate (varies over time)
- `β`: splicing rate
- `γ`: degradation rate
**Velocity** is defined as: `v = dx_s/dt = β·x_u - γ·x_s`
- **v > 0**: Gene is being upregulated (more unspliced than expected at steady state)
- **v < 0**: Gene is being downregulated (less unspliced than expected)
## Model Comparison
### Steady-State (Velocyto, original)
- Assumes constant α (transcription rate)
- Fits γ using linear regression on steady-state cells
- **Limitation**: Requires identifiable steady states; assumes constant transcription
```python
# Use with scVelo for backward compatibility
scv.tl.velocity(adata, mode='steady_state')
```
### Stochastic Model (scVelo v1)
- Extends steady-state with variance/covariance terms
- Models cell-to-cell variability in mRNA counts
- More robust to noise than steady-state
```python
scv.tl.velocity(adata, mode='stochastic')
```
### Dynamical Model (scVelo v2, recommended)
- Jointly estimates all kinetic rates (α, β, γ) and cell-specific latent time
- Does not assume steady state
- Identifies induction vs. repression phases
- Computes fit_likelihood per gene (quality measure)
```python
scv.tl.recover_dynamics(adata, n_jobs=4)
scv.tl.velocity(adata, mode='dynamical')
```
**Kinetic states identified by dynamical model:**
| State | Description |
|-------|-------------|
| Induction | α > 0, x_u increasing |
| Steady-state on | α > 0, constant high expression |
| Repression | α = 0, x_u decreasing |
| Steady-state off | α = 0, constant low expression |
## Velocity Graph
The velocity graph connects cells based on their velocity similarity to neighboring cells' states:
```python
scv.tl.velocity_graph(adata)
# Stored in adata.uns['velocity_graph']
# Entry [i,j] = probability that cell i transitions to cell j
```
**Parameters:**
- `n_neighbors`: Number of neighbors considered
- `sqrt_transform`: Apply sqrt transform to data (default: False for spliced)
- `approx`: Use approximate nearest neighbor search (faster for large datasets)
## Latent Time Interpretation
Latent time τ ∈ [0, 1] for each gene represents:
- τ = 0: Gene is at onset of induction
- τ = 0.5: Gene is at peak of induction (for a complete cycle)
- τ = 1: Gene has returned to steady-state off
**Shared latent time** is computed by taking the average over all velocity genes, weighted by fit_likelihood.
## Quality Metrics
### Gene-level
- `fit_likelihood`: Goodness-of-fit of dynamical model (0-1; higher = better)
- Use for filtering driver genes: `adata.var[adata.var['fit_likelihood'] > 0.1]`
- `fit_alpha`: Transcription rate during induction
- `fit_gamma`: mRNA degradation rate
- `fit_r2`: R² of kinetic fit
### Cell-level
- `velocity_length`: Magnitude of velocity vector (cell speed)
- `velocity_confidence`: Coherence of velocity with neighboring cells (0-1)
### Dataset-level
```python
# Check overall velocity quality
scv.pl.proportions(adata) # Ratio of spliced/unspliced per cell
scv.pl.velocity_confidence(adata, groupby='leiden')
```
## Parameter Tuning Guide
| Parameter | Function | Default | When to Change |
|-----------|----------|---------|----------------|
| `min_shared_counts` | Filter genes | 20 | Increase for deep sequencing; decrease for shallow |
| `n_top_genes` | HVG selection | 2000 | Increase for complex datasets |
| `n_neighbors` | kNN graph | 30 | Decrease for small datasets; increase for noisy |
| `n_pcs` | PCA dimensions | 30 | Match to elbow in scree plot |
| `t_max_rank` | Latent time constraint | None | Set if known developmental direction |
## Integration with Other Tools
### CellRank (Fate Prediction)
```python
import cellrank as cr
from cellrank.kernels import VelocityKernel, ConnectivityKernel
# Combine velocity and connectivity kernels
vk = VelocityKernel(adata).compute_transition_matrix()
ck = ConnectivityKernel(adata).compute_transition_matrix()
combined = 0.8 * vk + 0.2 * ck
# Compute macrostates (terminal and initial states)
g = cr.estimators.GPCCA(combined)
g.compute_macrostates(n_states=4, cluster_key='leiden')
g.plot_macrostates(which="all")
# Compute fate probabilities
g.compute_fate_probabilities()
g.plot_fate_probabilities()
```
### Scanpy Integration
scVelo works natively with Scanpy's AnnData:
```python
import scanpy as sc
import scvelo as scv
# Run standard Scanpy pipeline first
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata)
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata)
# Then add velocity on top
scv.pp.moments(adata)
scv.tl.recover_dynamics(adata)
scv.tl.velocity(adata, mode='dynamical')
scv.tl.velocity_graph(adata)
scv.tl.latent_time(adata)
```

View File

@@ -0,0 +1,232 @@
"""
RNA Velocity Analysis Workflow using scVelo
===========================================
Complete pipeline from raw data to velocity visualization.
Usage:
python rna_velocity_workflow.py
Or import and use run_velocity_analysis() with your AnnData object.
"""
import scvelo as scv
import scanpy as sc
import numpy as np
import matplotlib
matplotlib.use('Agg') # Non-interactive backend
import matplotlib.pyplot as plt
import os
def run_velocity_analysis(
    adata,
    groupby="leiden",
    n_top_genes=2000,
    n_neighbors=30,
    mode="dynamical",
    n_jobs=4,
    output_dir="velocity_results",
):
    """
    Complete RNA velocity analysis workflow.

    Parameters
    ----------
    adata : AnnData
        AnnData object with 'spliced' and 'unspliced' layers.
        Should already have UMAP and cluster annotations.
    groupby : str
        Column in adata.obs for cell type labels.
    n_top_genes : int
        Number of top highly variable genes.
    n_neighbors : int
        Number of neighbors for moment computation.
    mode : str
        Velocity model: 'stochastic' (fast) or 'dynamical' (accurate).
    n_jobs : int
        Parallel jobs for dynamical model fitting.
    output_dir : str
        Directory for saving output figures and the result h5ad.

    Returns
    -------
    AnnData with velocity annotations.

    Raises
    ------
    ValueError
        If the 'spliced' or 'unspliced' layer is missing from adata.
    """
    os.makedirs(output_dir, exist_ok=True)
    # ── Settings ──────────────────────────────────────────────────────────────
    scv.settings.verbosity = 2
    # scvelo prepends settings.figdir to relative `save` paths, so all plot
    # calls below pass bare filenames. Passing f"{output_dir}/..." together
    # with figdir would nest the directory twice (output_dir/output_dir/...).
    scv.settings.figdir = output_dir
    # ── Step 1: Check layers ───────────────────────────────────────────────────
    # Raise instead of assert: assertions are stripped under `python -O`.
    if "spliced" not in adata.layers:
        raise ValueError("Missing 'spliced' layer. Run velocyto first.")
    if "unspliced" not in adata.layers:
        raise ValueError("Missing 'unspliced' layer. Run velocyto first.")
    print(f"Input: {adata.n_obs} cells × {adata.n_vars} genes")
    # ── Step 2: Preprocessing ─────────────────────────────────────────────────
    print("Step 1/5: Preprocessing...")
    scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=n_top_genes)
    if "neighbors" not in adata.uns:
        sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=30)
    scv.pp.moments(adata, n_pcs=30, n_neighbors=n_neighbors)
    print(f" {adata.n_vars} velocity genes selected")
    # ── Step 3: Velocity estimation ────────────────────────────────────────────
    print(f"Step 2/5: Fitting velocity model ({mode})...")
    if mode == "dynamical":
        # Full kinetic-rate recovery; this is the expensive step.
        scv.tl.recover_dynamics(adata, n_jobs=n_jobs)
    scv.tl.velocity(adata, mode=mode)
    scv.tl.velocity_graph(adata)
    print(" Velocity graph computed")
    # ── Step 4: Downstream analyses ────────────────────────────────────────────
    print("Step 3/5: Computing latent time and confidence...")
    scv.tl.velocity_confidence(adata)
    scv.tl.velocity_pseudotime(adata)
    if mode == "dynamical":
        # Latent time is only defined for the dynamical model.
        scv.tl.latent_time(adata)
    if groupby in adata.obs.columns:
        scv.tl.rank_velocity_genes(adata, groupby=groupby, min_corr=0.3)
    # ── Step 5: Visualization ─────────────────────────────────────────────────
    # All `save` arguments are bare filenames; scvelo writes them under figdir.
    print("Step 4/5: Generating figures...")
    # Stream plot
    scv.pl.velocity_embedding_stream(
        adata,
        basis="umap",
        color=groupby,
        title="RNA Velocity",
        save="velocity_stream.png",
    )
    # Arrow plot
    scv.pl.velocity_embedding(
        adata,
        arrow_length=3,
        arrow_size=2,
        color=groupby,
        basis="umap",
        save="velocity_arrows.png",
    )
    # Pseudotime
    scv.pl.scatter(
        adata,
        color="velocity_pseudotime",
        cmap="gnuplot",
        title="Velocity Pseudotime",
        save="pseudotime.png",
    )
    if mode == "dynamical" and "latent_time" in adata.obs:
        scv.pl.scatter(
            adata,
            color="latent_time",
            color_map="gnuplot",
            title="Latent Time",
            save="latent_time.png",
        )
    # Speed and coherence
    scv.pl.scatter(
        adata,
        c=["velocity_length", "velocity_confidence"],
        cmap="coolwarm",
        perc=[5, 95],
        save="velocity_quality.png",
    )
    # Top driver genes heatmap (dynamical only)
    if mode == "dynamical" and "fit_likelihood" in adata.var:
        top_genes = adata.var["fit_likelihood"].sort_values(ascending=False).index[:50]
        scv.pl.heatmap(
            adata,
            var_names=top_genes,
            sortby="latent_time",
            col_color=groupby,
            n_convolve=50,
            save="driver_gene_heatmap.png",
        )
    # ── Step 6: Save results ───────────────────────────────────────────────────
    print("Step 5/5: Saving results...")
    output_h5ad = os.path.join(output_dir, "adata_velocity.h5ad")
    adata.write_h5ad(output_h5ad)
    print(f" Saved to {output_h5ad}")
    # Summary statistics
    confidence = adata.obs["velocity_confidence"].dropna()
    print("\nSummary:")
    print(f" Velocity model: {mode}")
    print(f" Cells: {adata.n_obs}")
    print(f" Velocity genes: {adata.n_vars}")
    print(f" Mean velocity confidence: {confidence.mean():.3f}")
    print(f" High-confidence cells (>0.7): {(confidence > 0.7).sum()} ({(confidence > 0.7).mean():.1%})")
    if mode == "dynamical" and "fit_likelihood" in adata.var:
        good_genes = (adata.var["fit_likelihood"] > 0.1).sum()
        print(f" Well-fit genes (likelihood>0.1): {good_genes}")
    print(f"\nOutput files saved to: {output_dir}/")
    return adata
def load_from_loom(loom_path, processed_h5ad=None):
    """
    Build an AnnData ready for velocity analysis from a velocyto loom file.

    Parameters
    ----------
    loom_path : str
        Path to the velocyto output loom file (spliced/unspliced layers).
    processed_h5ad : str, optional
        Pre-processed Scanpy h5ad whose annotations are kept; the loom's
        velocity layers are merged into it. If omitted, the loom is used
        directly.
    """
    velocity_adata = scv.read(loom_path, cache=True)
    if processed_h5ad:
        processed = sc.read_h5ad(processed_h5ad)
        # Keep the processed object's metadata; attach velocity layers.
        adata = scv.utils.merge(processed, velocity_adata)
    else:
        adata = velocity_adata
    # Minimal Scanpy pipeline: normalize → embed → cluster.
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=3000)
    sc.pp.pca(adata)
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=0.5)
    return adata
def _demo():
    """Run the full workflow on scVelo's bundled pancreas dataset (demo/smoke test)."""
    print("scVelo RNA Velocity Workflow - Demo Mode")
    print("=" * 50)
    # Download (first run) or load the cached example dataset.
    demo_adata = scv.datasets.pancreas()
    print(f"Loaded pancreas dataset: {demo_adata}")
    # End-to-end analysis with the dynamical model.
    demo_adata = run_velocity_analysis(
        demo_adata,
        groupby="clusters",
        n_top_genes=2000,
        mode="dynamical",
        n_jobs=2,
        output_dir="pancreas_velocity",
    )
    print("\nAnalysis complete!")
    print("Key results:")
    print(" adata.layers['velocity']: velocity per gene per cell")
    print(" adata.obs['latent_time']: pseudotime from dynamics")
    print(" adata.obs['velocity_confidence']: per-cell confidence")
    if "rank_velocity_genes" in demo_adata.uns:
        print(" adata.uns['rank_velocity_genes']: driver genes per cluster")


if __name__ == "__main__":
    _demo()