Add more databases

This commit is contained in:
Timothy Kassis
2025-10-19 19:16:45 -07:00
parent 56a8312fc9
commit 9f4154a9ed
13 changed files with 4739 additions and 5 deletions

View File

@@ -7,7 +7,7 @@
},
"metadata": {
"description": "Claude scientific skills from K-Dense Inc",
"version": "1.12.0"
"version": "1.15.0"
},
"plugins": [
{
@@ -61,15 +61,20 @@
"skills": [
"./scientific-databases/alphafold-database",
"./scientific-databases/chembl-database",
"./scientific-databases/clinpgx-database",
"./scientific-databases/clinvar-database",
"./scientific-databases/cosmic-database",
"./scientific-databases/ena-database",
"./scientific-databases/ensembl-database",
"./scientific-databases/gene-database",
"./scientific-databases/geo-database",
"./scientific-databases/hmdb-database",
"./scientific-databases/kegg-database",
"./scientific-databases/metabolomics-workbench-database",
"./scientific-databases/pdb-database",
"./scientific-databases/pubchem-database",
"./scientific-databases/pubmed-database",
"./scientific-databases/reactome-database",
"./scientific-databases/string-database",
"./scientific-databases/uniprot-database",
"./scientific-databases/zinc-database"

View File

@@ -8,16 +8,20 @@ A comprehensive collection of ready-to-use scientific skills for Claude, curated
- **AlphaFold DB** - AI-predicted protein structure database with 200M+ predictions, confidence metrics (pLDDT, PAE), and Google Cloud bulk access
- **ChEMBL** - Bioactive molecule database with drug-like properties (2M+ compounds, 19M+ activities, 13K+ targets)
- **ClinPGx** - Clinical pharmacogenomics database (successor to PharmGKB) providing gene-drug interactions, CPIC clinical guidelines, allele functions, drug labels, and pharmacogenomic annotations for precision medicine and personalized pharmacotherapy (consolidates PharmGKB, CPIC, and PharmCAT resources)
- **ClinVar** - NCBI's public archive of genomic variants and their clinical significance with standardized classifications (pathogenic, benign, VUS), E-utilities API access, and bulk FTP downloads for variant interpretation and precision medicine research
- **COSMIC** - Catalogue of Somatic Mutations in Cancer, the world's largest database of somatic cancer mutations (millions of mutations across thousands of cancer types, Cancer Gene Census, mutational signatures, structural variants, and drug resistance data)
- **ENA (European Nucleotide Archive)** - Comprehensive public repository for nucleotide sequence data and metadata with REST APIs for accessing sequences, assemblies, samples, studies, and reads; supports advanced search, taxonomy lookups, and bulk downloads via FTP/Aspera (rate limit: 50 req/sec)
- **Ensembl** - Genome browser and bioinformatics database providing genomic annotations, sequences, variants, and comparative genomics data for 250+ vertebrate species (Release 115, 2025) with comprehensive REST API for gene lookups, sequence retrieval, variant effect prediction (VEP), ortholog finding, assembly mapping (GRCh37/GRCh38), and region analysis
- **GEO (Gene Expression Omnibus)** - High-throughput gene expression and functional genomics data repository (264K+ studies, 8M+ samples) with microarray, RNA-seq, and expression profile access
- **HMDB (Human Metabolome Database)** - Comprehensive metabolomics resource with 220K+ metabolite entries, detailed chemical/biological data, concentration ranges, disease associations, pathways, and spectral data for metabolite identification and biomarker discovery
- **KEGG** - Kyoto Encyclopedia of Genes and Genomes for biological pathway analysis, gene-to-pathway mapping, compound searches, and molecular interaction networks (pathway enrichment, metabolic pathways, gene annotations, drug-drug interactions, ID conversion)
- **Metabolomics Workbench** - NIH Common Fund metabolomics data repository with 4,200+ processed studies, standardized nomenclature (RefMet), mass spectrometry searches, and comprehensive REST API for accessing metabolite structures, study metadata, experimental results, and gene/protein-metabolite associations
- **NCBI Gene** - Work with NCBI Gene database to search, retrieve, and analyze gene information including nomenclature, sequences, variations, phenotypes, and pathways using E-utilities and Datasets API
- **Protein Data Bank (PDB)** - Access 3D structural data of proteins, nucleic acids, and biological macromolecules (200K+ structures) with search, retrieval, and analysis capabilities
- **PubChem** - Access chemical compound data from the world's largest free chemical database (110M+ compounds, 270M+ bioactivities)
- **PubMed** - Access to PubMed literature database with advanced search capabilities
- **Reactome** - Curated pathway database for biological processes and molecular interactions (2,825+ human pathways, 16K+ reactions, 11K+ proteins) with pathway enrichment analysis, expression data analysis, and species comparison using Content Service and Analysis Service APIs
- **STRING** - Protein-protein interaction network database (5000+ genomes, 59.3M proteins, 20B+ interactions) with functional enrichment analysis, interaction partner discovery, and network visualization from experimental data, computational prediction, and text-mining
- **UniProt** - Universal Protein Resource for protein sequences, annotations, and functional information (UniProtKB/Swiss-Prot reviewed entries, TrEMBL unreviewed entries) with REST API access for search, retrieval, ID mapping, and batch operations across 200+ databases
- **ZINC** - Free database of commercially-available compounds for virtual screening and drug discovery (230M+ purchasable compounds in ready-to-dock 3D formats)
@@ -125,16 +129,12 @@ You can use Anthropic's pre-built skills, and upload custom skills, via the Clau
- **DAVID** - Database for Annotation, Visualization and Integrated Discovery for functional enrichment analysis
- **dbSNP** - NCBI's database of single nucleotide polymorphisms and short genetic variations
- **DrugBank** - Comprehensive drug and drug target database with pharmacological and pharmaceutical data
- **Ensembl** - Genome browser with annotation, comparative genomics, and variant data
- **GenBank** - NIH genetic sequence database (part of NCBI but with specific access patterns)
- **GWAS Catalog** - NHGRI-EBI catalog of published genome-wide association studies
- **InterPro** - Protein sequence analysis and classification with functional annotations
- **MetaboLights** - EMBL-EBI metabolomics database with experimental data and metadata
- **Metabolomics Workbench** - NIH Common Fund metabolomics data repository
- **OMIM** - Online Mendelian Inheritance in Man for genetic disorders and genes
- **Pfam** - Protein families database with multiple sequence alignments and HMMs
- **PharmGKB** - Pharmacogenomics Knowledge Base linking genetic variation to drug response
- **Reactome** - Curated pathway database with biological processes and molecular interactions
- **RefSeq** - NCBI's non-redundant reference sequence database
- **TCGA** - The Cancer Genome Atlas with multi-omic cancer genomics data
- **UCSC Genome Browser** - Genomic data visualization and custom track integration

View File

@@ -0,0 +1,632 @@
---
name: clinpgx-database
description: Toolkit for accessing ClinPGx, a clinical pharmacogenomics database providing information on how genetic variation affects drug response. Use this skill when working with pharmacogenomics data, querying gene-drug interactions, accessing CPIC clinical guidelines, retrieving allele function and frequency information, exploring PharmGKB annotations, or conducting research on personalized medicine and precision pharmacotherapy. ClinPGx consolidates PharmGKB, CPIC, and PharmCAT resources.
---
# ClinPGx Database
## Overview
Facilitate access to and querying of ClinPGx (Clinical Pharmacogenomics Database), a comprehensive resource for clinical pharmacogenomics information. ClinPGx, officially launched in July 2025, is the successor to PharmGKB and consolidates data from PharmGKB, CPIC (Clinical Pharmacogenetics Implementation Consortium), and PharmCAT (Pharmacogenomics Clinical Annotation Tool). The database provides curated information on how human genetic variation affects medication response, including gene-drug pairs, clinical guidelines, allele functions, and drug labels. ClinPGx is managed at Stanford University under a ClinGen (Clinical Genome Resource) affiliate grant.
## When to Use This Skill
Use this skill when queries involve:
- **Gene-drug interactions**: Querying how genetic variants affect drug metabolism, efficacy, or toxicity
- **CPIC guidelines**: Accessing evidence-based clinical practice guidelines for pharmacogenetics
- **Allele information**: Retrieving allele function, frequency, and phenotype data
- **Drug labels**: Exploring FDA and other regulatory pharmacogenomic drug labeling
- **Pharmacogenomic annotations**: Accessing curated literature on gene-drug-disease relationships
- **Clinical decision support**: Using PharmDOG tool for phenoconversion and custom genotype interpretation
- **Precision medicine**: Implementing pharmacogenomic testing in clinical practice
- **Drug metabolism**: Understanding CYP450 and other pharmacogene functions
- **Personalized dosing**: Finding genotype-guided dosing recommendations
- **Adverse drug reactions**: Identifying genetic risk factors for drug toxicity
## Installation and Setup
### Python API Access
The ClinPGx REST API provides programmatic access to all database resources. Basic setup:
```bash
pip install requests
```
### API Endpoint
```python
BASE_URL = "https://api.clinpgx.org/v1/"
```
**Rate Limits**:
- 2 requests per second maximum
- Excessive requests will result in HTTP 429 (Too Many Requests) response
**Authentication**: Not required for basic access
**Data License**: Creative Commons Attribution-ShareAlike 4.0 International License
For substantial API use, notify the ClinPGx team at api@clinpgx.org
## Core Capabilities
### 1. Gene Queries
**Retrieve gene information** including function, clinical annotations, and pharmacogenomic significance:
```python
import requests
# Get gene details
response = requests.get("https://api.clinpgx.org/v1/gene/CYP2D6")
gene_data = response.json()
# Search for genes by name
response = requests.get("https://api.clinpgx.org/v1/gene",
params={"q": "CYP"})
genes = response.json()
```
**Key pharmacogenes**:
- **CYP450 enzymes**: CYP2D6, CYP2C19, CYP2C9, CYP3A4, CYP3A5
- **Transporters**: SLCO1B1, ABCB1, ABCG2
- **Other metabolizers**: TPMT, DPYD, NUDT15, UGT1A1
- **Receptors**: OPRM1, HTR2A, ADRB1
- **HLA genes**: HLA-B, HLA-A
### 2. Drug and Chemical Queries
**Retrieve drug information** including pharmacogenomic annotations and mechanisms:
```python
# Get drug details
response = requests.get("https://api.clinpgx.org/v1/chemical/PA448515") # Warfarin
drug_data = response.json()
# Search drugs by name
response = requests.get("https://api.clinpgx.org/v1/chemical",
params={"name": "warfarin"})
drugs = response.json()
```
**Drug categories with pharmacogenomic significance**:
- Anticoagulants and antiplatelets (warfarin, clopidogrel)
- Antidepressants (SSRIs, TCAs)
- Immunosuppressants (tacrolimus, azathioprine)
- Oncology drugs (5-fluorouracil, irinotecan, tamoxifen)
- Cardiovascular drugs (statins, beta-blockers)
- Pain medications (codeine, tramadol)
- Antivirals (abacavir)
### 3. Gene-Drug Pair Queries
**Access curated gene-drug relationships** with clinical annotations:
```python
# Get gene-drug pair information
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
params={"gene": "CYP2D6", "drug": "codeine"})
pair_data = response.json()
# Get all pairs for a gene
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
params={"gene": "CYP2C19"})
all_pairs = response.json()
```
**Clinical annotation sources**:
- CPIC (Clinical Pharmacogenetics Implementation Consortium)
- DPWG (Dutch Pharmacogenetics Working Group)
- FDA (Food and Drug Administration) labels
- Peer-reviewed literature summary annotations
### 4. CPIC Guidelines
**Access evidence-based clinical practice guidelines**:
```python
# Get CPIC guideline
response = requests.get("https://api.clinpgx.org/v1/guideline/PA166104939")
guideline = response.json()
# List all CPIC guidelines
response = requests.get("https://api.clinpgx.org/v1/guideline",
params={"source": "CPIC"})
guidelines = response.json()
```
**CPIC guideline components**:
- Gene-drug pairs covered
- Clinical recommendations by phenotype
- Evidence levels and strength ratings
- Supporting literature
- Downloadable PDFs and supplementary materials
- Implementation considerations
**Example guidelines**:
- CYP2D6-codeine (avoid in ultra-rapid metabolizers)
- CYP2C19-clopidogrel (alternative therapy for poor metabolizers)
- TPMT-azathioprine (dose reduction for intermediate/poor metabolizers)
- DPYD-fluoropyrimidines (dose adjustment based on activity)
- HLA-B*57:01-abacavir (avoid if positive)
### 5. Allele and Variant Information
**Query allele function and frequency data**:
```python
# Get allele information
response = requests.get("https://api.clinpgx.org/v1/allele/CYP2D6*4")
allele_data = response.json()
# Get all alleles for a gene
response = requests.get("https://api.clinpgx.org/v1/allele",
params={"gene": "CYP2D6"})
alleles = response.json()
```
**Allele information includes**:
- Functional status (normal, decreased, no function, increased, uncertain)
- Population frequencies across ethnic groups
- Defining variants (SNPs, indels, CNVs)
- Phenotype assignment
- References to PharmVar and other nomenclature systems
**Phenotype categories**:
- **Ultra-rapid metabolizer** (UM): Increased enzyme activity
- **Normal metabolizer** (NM): Normal enzyme activity
- **Intermediate metabolizer** (IM): Reduced enzyme activity
- **Poor metabolizer** (PM): Little to no enzyme activity
### 6. Variant Annotations
**Access clinical annotations for specific genetic variants**:
```python
# Get variant information
response = requests.get("https://api.clinpgx.org/v1/variant/rs4244285")
variant_data = response.json()
# Search variants by position (if supported)
response = requests.get("https://api.clinpgx.org/v1/variant",
params={"chromosome": "10", "position": "94781859"})
variants = response.json()
```
**Variant data includes**:
- rsID and genomic coordinates
- Gene and functional consequence
- Allele associations
- Clinical significance
- Population frequencies
- Literature references
### 7. Clinical Annotations
**Retrieve curated literature annotations** (formerly PharmGKB clinical annotations):
```python
# Get clinical annotations
response = requests.get("https://api.clinpgx.org/v1/clinicalAnnotation",
params={"gene": "CYP2D6"})
annotations = response.json()
# Filter by evidence level
response = requests.get("https://api.clinpgx.org/v1/clinicalAnnotation",
params={"evidenceLevel": "1A"})
high_evidence = response.json()
```
**Evidence levels** (from highest to lowest):
- **Level 1A**: High-quality evidence, CPIC/FDA/DPWG guidelines
- **Level 1B**: High-quality evidence, not yet guideline
- **Level 2A**: Moderate evidence from well-designed studies
- **Level 2B**: Moderate evidence with some limitations
- **Level 3**: Limited or conflicting evidence
- **Level 4**: Case reports or weak evidence
### 8. Drug Labels
**Access pharmacogenomic information from drug labels**:
```python
# Get drug labels with PGx information
response = requests.get("https://api.clinpgx.org/v1/drugLabel",
params={"drug": "warfarin"})
labels = response.json()
# Filter by regulatory source
response = requests.get("https://api.clinpgx.org/v1/drugLabel",
params={"source": "FDA"})
fda_labels = response.json()
```
**Label information includes**:
- Testing recommendations
- Dosing guidance by genotype
- Warnings and precautions
- Biomarker information
- Regulatory source (FDA, EMA, PMDA, etc.)
### 9. Pathways
**Explore pharmacokinetic and pharmacodynamic pathways**:
```python
# Get pathway information
response = requests.get("https://api.clinpgx.org/v1/pathway/PA146123006") # Warfarin pathway
pathway_data = response.json()
# Search pathways by drug
response = requests.get("https://api.clinpgx.org/v1/pathway",
params={"drug": "warfarin"})
pathways = response.json()
```
**Pathway diagrams** show:
- Drug metabolism steps
- Enzymes and transporters involved
- Gene variants affecting each step
- Downstream effects on efficacy/toxicity
- Interactions with other pathways
## Query Workflow
### Workflow 1: Clinical Decision Support for Drug Prescription
1. **Identify patient genotype** for relevant pharmacogenes:
```python
# Example: Patient is CYP2C19 *1/*2 (intermediate metabolizer)
response = requests.get("https://api.clinpgx.org/v1/allele/CYP2C19*2")
allele_function = response.json()
```
2. **Query gene-drug pairs** for medication of interest:
```python
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
params={"gene": "CYP2C19", "drug": "clopidogrel"})
pair_info = response.json()
```
3. **Retrieve CPIC guideline** for dosing recommendations:
```python
response = requests.get("https://api.clinpgx.org/v1/guideline",
params={"gene": "CYP2C19", "drug": "clopidogrel"})
guideline = response.json()
# Recommendation: Alternative antiplatelet therapy for IM/PM
```
4. **Check drug label** for regulatory guidance:
```python
response = requests.get("https://api.clinpgx.org/v1/drugLabel",
params={"drug": "clopidogrel"})
label = response.json()
```
### Workflow 2: Gene Panel Analysis
1. **Get list of pharmacogenes** in clinical panel:
```python
pgx_panel = ["CYP2C19", "CYP2D6", "CYP2C9", "TPMT", "DPYD", "SLCO1B1"]
```
2. **For each gene, retrieve all drug interactions**:
```python
import time

# Collect every gene-drug pair for each gene on the panel, keyed by gene symbol.
all_interactions = {}
for gene in pgx_panel:
    response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
                            params={"gene": gene})
    all_interactions[gene] = response.json()
    time.sleep(0.5)  # Stay under the documented 2 requests/second limit
```
3. **Filter for CPIC guideline-level evidence**:
```python
for gene, pairs in all_interactions.items():
for pair in pairs:
if pair.get('cpicLevel'): # Has CPIC guideline
print(f"{gene} - {pair['drug']}: {pair['cpicLevel']}")
```
4. **Generate patient report** with actionable pharmacogenomic findings.
### Workflow 3: Drug Safety Assessment
1. **Query drug for PGx associations**:
```python
response = requests.get("https://api.clinpgx.org/v1/chemical",
params={"name": "abacavir"})
drug_id = response.json()[0]['id']
```
2. **Get clinical annotations**:
```python
response = requests.get("https://api.clinpgx.org/v1/clinicalAnnotation",
params={"drug": drug_id})
annotations = response.json()
```
3. **Check for HLA associations** and toxicity risk:
```python
for annotation in annotations:
    # Gene symbols look like "HLA-B" or "HLA-A", so match on the prefix;
    # testing for a list element equal to "HLA" would never succeed.
    # NOTE(review): assumes 'genes' holds gene-symbol strings - confirm
    # against the actual API response schema.
    genes = annotation.get('genes', [])
    if isinstance(genes, str):
        genes = [genes]  # Tolerate a single symbol instead of a list
    if any(gene.startswith('HLA') for gene in genes):
        print(f"Toxicity risk: {annotation['phenotype']}")
        print(f"Evidence level: {annotation['evidenceLevel']}")
```
4. **Retrieve screening recommendations** from guidelines and labels.
### Workflow 4: Research Analysis - Population Pharmacogenomics
1. **Get allele frequencies** for population comparison:
```python
response = requests.get("https://api.clinpgx.org/v1/allele",
params={"gene": "CYP2D6"})
alleles = response.json()
```
2. **Extract population-specific frequencies**:
```python
populations = ['European', 'African', 'East Asian', 'Latino']
frequency_data = {}
for allele in alleles:
allele_name = allele['name']
frequency_data[allele_name] = {
pop: allele.get(f'{pop}_frequency', 'N/A')
for pop in populations
}
```
3. **Calculate phenotype distributions** by population:
```python
# Combine allele frequencies with function to predict phenotypes
phenotype_dist = calculate_phenotype_frequencies(frequency_data)
```
4. **Analyze implications** for drug dosing in diverse populations.
### Workflow 5: Literature Evidence Review
1. **Search for gene-drug pair**:
```python
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
params={"gene": "TPMT", "drug": "azathioprine"})
pair = response.json()
```
2. **Retrieve all clinical annotations**:
```python
response = requests.get("https://api.clinpgx.org/v1/clinicalAnnotation",
params={"gene": "TPMT", "drug": "azathioprine"})
annotations = response.json()
```
3. **Filter by evidence level** (apply a publication-date filter separately if needed):
```python
high_quality = [a for a in annotations
if a['evidenceLevel'] in ['1A', '1B', '2A']]
```
4. **Extract PMIDs** and retrieve full references:
```python
pmids = [a['pmid'] for a in high_quality if 'pmid' in a]
# Use PubMed skill to retrieve full citations
```
## Rate Limiting and Best Practices
### Rate Limit Compliance
```python
import time
def rate_limited_request(url, params=None, delay=0.5, timeout=10):
    """Make a GET request, then pause so looped callers stay under 2 req/sec.

    Args:
        url: Full endpoint URL to request.
        params: Optional dict of query-string parameters.
        delay: Seconds to sleep after the request; the 0.5 s default matches
            the documented limit of 2 requests per second.
        timeout: Seconds to wait for the server before requests raises a
            timeout error instead of blocking indefinitely.

    Returns:
        The requests.Response object; call .json() on it to decode the body.
    """
    response = requests.get(url, params=params, timeout=timeout)
    time.sleep(delay)  # Pause after every call so back-to-back calls are throttled
    return response
# Use in loops
genes = ["CYP2D6", "CYP2C19", "CYP2C9"]
for gene in genes:
response = rate_limited_request(
"https://api.clinpgx.org/v1/gene/" + gene
)
data = response.json()
```
### Error Handling
```python
def safe_api_call(url, params=None, max_retries=3):
    """Call the API with a timeout, retrying on rate limits and request errors.

    Args:
        url: Full endpoint URL to request.
        params: Optional dict of query-string parameters.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The decoded JSON body of the first HTTP 200 response.

    Raises:
        requests.exceptions.RequestException: If the final attempt fails, or
            if no attempt produced an HTTP 200 response (raised as the
            RetryError subclass) - the function never returns None silently.
    """
    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            response = requests.get(url, params=params, timeout=10)
            if response.status_code == 200:
                return response.json()
            if response.status_code == 429 and not last_attempt:
                # Rate limit exceeded: back off exponentially, then retry
                wait_time = 2 ** attempt
                print(f"Rate limit hit. Waiting {wait_time}s...")
                time.sleep(wait_time)
                continue
            # Raises HTTPError for 4xx/5xx, including a 429 on the last
            # attempt, so exhausted retries surface as an exception.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if last_attempt:
                raise
            time.sleep(1)
    # Reached only when every attempt returned a non-200, non-error status
    # (or max_retries was 0); fail loudly instead of returning None.
    raise requests.exceptions.RetryError(
        f"No successful response from {url} after {max_retries} attempts"
    )
```
### Caching Results
```python
import json
from pathlib import Path
def cached_query(cache_file, api_func, *args, **kwargs):
    """Return cached JSON for a query, calling the API only on a cache miss.

    Args:
        cache_file: Path of the JSON file used as the cache.
        api_func: Callable invoked as api_func(*args, **kwargs) on a miss;
            it must return JSON-serializable data (e.g. a decoded response
            body, not a Response object).
        *args, **kwargs: Forwarded unchanged to api_func.

    Returns:
        The cached data if cache_file exists, otherwise the fresh result.

    Raises:
        TypeError: If the fresh result is not JSON-serializable. No cache
            file is created in that case.
    """
    cache_path = Path(cache_file)
    if cache_path.exists():
        with open(cache_path, encoding="utf-8") as f:
            return json.load(f)
    result = api_func(*args, **kwargs)
    # Serialize before opening the file: opening with 'w' creates/truncates
    # it immediately, so a serialization failure would otherwise leave an
    # empty cache file that breaks every later call with a decode error.
    payload = json.dumps(result, indent=2)
    with open(cache_path, 'w', encoding="utf-8") as f:
        f.write(payload)
    return result
# Usage
# rate_limited_request returns a Response object, which cannot be written to
# the JSON cache; decode it with .json() so the cached value is plain data.
gene_data = cached_query(
    'cyp2d6_cache.json',
    lambda url: rate_limited_request(url).json(),
    "https://api.clinpgx.org/v1/gene/CYP2D6"
)
```
## PharmDOG Tool
PharmDOG (formerly DDRx) is ClinPGx's clinical decision support tool for interpreting pharmacogenomic test results:
**Key features**:
- **Phenoconversion calculator**: Adjusts phenotype predictions for drug-drug interactions affecting CYP2D6
- **Custom genotypes**: Input patient genotypes to get phenotype predictions
- **QR code sharing**: Generate shareable patient reports
- **Flexible guidance sources**: Select which guidelines to apply (CPIC, DPWG, FDA)
- **Multi-drug analysis**: Assess multiple medications simultaneously
**Access**: Available at https://www.clinpgx.org/pharmacogenomic-decision-support
**Use cases**:
- Clinical interpretation of PGx panel results
- Medication review for patients with known genotypes
- Patient education materials
- Point-of-care decision support
## Resources
### scripts/query_clinpgx.py
Python script with ready-to-use functions for common ClinPGx queries:
- `get_gene_info(gene_symbol)` - Retrieve gene details
- `get_drug_info(drug_name)` - Get drug information
- `get_gene_drug_pairs(gene, drug)` - Query gene-drug interactions
- `get_cpic_guidelines(gene, drug)` - Retrieve CPIC guidelines
- `get_alleles(gene)` - Get all alleles for a gene
- `get_clinical_annotations(gene, drug, evidence_level)` - Query literature annotations
- `get_drug_labels(drug)` - Retrieve pharmacogenomic drug labels
- `search_variants(rsid)` - Search by variant rsID
- `export_to_dataframe(data)` - Convert results to pandas DataFrame
Consult this script for implementation examples with proper rate limiting and error handling.
### references/api_reference.md
Comprehensive API documentation including:
- Complete endpoint listing with parameters
- Request/response format specifications
- Example queries for each endpoint
- Filter operators and search patterns
- Data schema definitions
- Rate limiting details
- Authentication requirements (if any)
- Troubleshooting common errors
Refer to this document when detailed API information is needed or when constructing complex queries.
## Important Notes
### Data Sources and Integration
ClinPGx consolidates multiple authoritative sources:
- **PharmGKB**: Curated pharmacogenomics knowledge base (now part of ClinPGx)
- **CPIC**: Evidence-based clinical implementation guidelines
- **PharmCAT**: Allele calling and phenotype interpretation tool
- **DPWG**: Dutch pharmacogenetics guidelines
- **FDA/EMA labels**: Regulatory pharmacogenomic information
As of July 2025, all PharmGKB URLs redirect to corresponding ClinPGx pages.
### Clinical Implementation Considerations
- **Evidence levels**: Always check evidence strength before clinical application
- **Population differences**: Allele frequencies vary significantly across populations
- **Phenoconversion**: Consider drug-drug interactions that affect enzyme activity
- **Multi-gene effects**: Some drugs affected by multiple pharmacogenes
- **Non-genetic factors**: Age, organ function, drug interactions also affect response
- **Testing limitations**: Not all clinically relevant alleles detected by all assays
### Data Updates
- ClinPGx continuously updates with new evidence and guidelines
- Check publication dates for clinical annotations
- Monitor ClinPGx Blog (https://blog.clinpgx.org/) for announcements
- CPIC guidelines updated as new evidence emerges
- PharmVar provides nomenclature updates for allele definitions
### API Stability
- API endpoints are relatively stable but may change during development
- Parameters and response formats subject to modification
- Monitor API changelog and ClinPGx blog for updates
- Consider version pinning for production applications
- Test API changes in development before production deployment
## Common Use Cases
### Pre-emptive Pharmacogenomic Testing
Query all clinically actionable gene-drug pairs to guide panel selection:
```python
# Get all CPIC guideline pairs
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
params={"cpicLevel": "A"}) # Level A recommendations
actionable_pairs = response.json()
```
### Medication Therapy Management
Review patient medications against known genotypes:
```python
import time

patient_genes = {"CYP2C19": "*1/*2", "CYP2D6": "*1/*1", "SLCO1B1": "*1/*5"}
medications = ["clopidogrel", "simvastatin", "escitalopram"]
# Query every medication against every genotyped gene (3 x 3 = 9 requests).
for med in medications:
    for gene in patient_genes:
        response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
                                params={"gene": gene, "drug": med})
        # Check for interactions and dosing guidance
        time.sleep(0.5)  # Stay under the documented 2 requests/second limit
```
### Clinical Trial Eligibility
Screen for pharmacogenomic contraindications:
```python
# Check for HLA-B*57:01 before abacavir trial
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
params={"gene": "HLA-B", "drug": "abacavir"})
pair_info = response.json()
# CPIC: Do not use if HLA-B*57:01 positive
```
## Additional Resources
- **ClinPGx website**: https://www.clinpgx.org/
- **ClinPGx Blog**: https://blog.clinpgx.org/
- **API documentation**: https://api.clinpgx.org/
- **CPIC website**: https://cpicpgx.org/
- **PharmCAT**: https://pharmcat.clinpgx.org/
- **ClinGen**: https://clinicalgenome.org/
- **Contact**: api@clinpgx.org (for substantial API use)

View File

@@ -0,0 +1,757 @@
# ClinPGx API Reference
Complete reference documentation for the ClinPGx REST API.
## Base URL
```
https://api.clinpgx.org/v1/
```
## Rate Limiting
- **Maximum rate**: 2 requests per second
- **Enforcement**: Requests exceeding the limit will receive HTTP 429 (Too Many Requests)
- **Best practice**: Implement 500ms delay between requests (0.5 seconds)
- **Recommendation**: For substantial API use, contact api@clinpgx.org
## Authentication
No authentication is required for basic API access. All endpoints are publicly accessible.
## Data License
All data accessed through the API is subject to:
- Creative Commons Attribution-ShareAlike 4.0 International License
- ClinPGx Data Usage Policy
## Response Format
Responses are returned as JSON with standard HTTP status codes:
- `200 OK`: Successful request
- `404 Not Found`: Resource does not exist
- `429 Too Many Requests`: Rate limit exceeded
- `500 Internal Server Error`: Server error
## Core Endpoints
### 1. Gene Endpoint
Retrieve pharmacogene information including function, variants, and clinical significance.
#### Get Gene by Symbol
```http
GET /v1/gene/{gene_symbol}
```
**Parameters:**
- `gene_symbol` (path, required): Gene symbol (e.g., CYP2D6, TPMT, DPYD)
**Example Request:**
```bash
curl "https://api.clinpgx.org/v1/gene/CYP2D6"
```
**Example Response:**
```json
{
"id": "PA126",
"symbol": "CYP2D6",
"name": "cytochrome P450 family 2 subfamily D member 6",
"chromosome": "22",
"chromosomeLocation": "22q13.2",
"function": "Drug metabolism",
"description": "Highly polymorphic gene encoding enzyme...",
"clinicalAnnotations": [...],
"relatedDrugs": [...]
}
```
#### Search Genes
```http
GET /v1/gene?q={search_term}
```
**Parameters:**
- `q` (query, optional): Search term for gene name or symbol
**Example:**
```bash
curl "https://api.clinpgx.org/v1/gene?q=CYP"
```
### 2. Chemical/Drug Endpoint
Access drug and chemical compound information including pharmacogenomic annotations.
#### Get Drug by ID
```http
GET /v1/chemical/{drug_id}
```
**Parameters:**
- `drug_id` (path, required): ClinPGx drug identifier (e.g., PA448515)
**Example Request:**
```bash
curl "https://api.clinpgx.org/v1/chemical/PA448515"
```
#### Search Drugs by Name
```http
GET /v1/chemical?name={drug_name}
```
**Parameters:**
- `name` (query, optional): Drug name or synonym
**Example:**
```bash
curl "https://api.clinpgx.org/v1/chemical?name=warfarin"
```
**Example Response:**
```json
[
{
"id": "PA448515",
"name": "warfarin",
"genericNames": ["warfarin sodium"],
"tradeNames": ["Coumadin", "Jantoven"],
"drugClasses": ["Anticoagulants"],
"indication": "Prevention of thrombosis",
"relatedGenes": ["CYP2C9", "VKORC1", "CYP4F2"]
}
]
```
### 3. Gene-Drug Pair Endpoint
Query curated gene-drug interaction relationships with clinical annotations.
#### Get Gene-Drug Pairs
```http
GET /v1/geneDrugPair?gene={gene}&drug={drug}
```
**Parameters:**
- `gene` (query, optional): Gene symbol
- `drug` (query, optional): Drug name
- `cpicLevel` (query, optional): Filter by CPIC recommendation level (A, B, C, D)
**Example Requests:**
```bash
# Get all pairs for a gene
curl "https://api.clinpgx.org/v1/geneDrugPair?gene=CYP2D6"
# Get specific gene-drug pair
curl "https://api.clinpgx.org/v1/geneDrugPair?gene=CYP2D6&drug=codeine"
# Get all CPIC Level A pairs
curl "https://api.clinpgx.org/v1/geneDrugPair?cpicLevel=A"
```
**Example Response:**
```json
[
{
"gene": "CYP2D6",
"drug": "codeine",
"sources": ["CPIC", "FDA", "DPWG"],
"cpicLevel": "A",
"evidenceLevel": "1A",
"clinicalAnnotationCount": 45,
"hasGuideline": true,
"guidelineUrl": "https://www.clinpgx.org/guideline/..."
}
]
```
### 4. Guideline Endpoint
Access clinical practice guidelines from CPIC, DPWG, and other sources.
#### Get Guidelines
```http
GET /v1/guideline?source={source}&gene={gene}&drug={drug}
```
**Parameters:**
- `source` (query, optional): Guideline source (CPIC, DPWG, FDA)
- `gene` (query, optional): Gene symbol
- `drug` (query, optional): Drug name
**Example Requests:**
```bash
# Get all CPIC guidelines
curl "https://api.clinpgx.org/v1/guideline?source=CPIC"
# Get guideline for specific gene-drug
curl "https://api.clinpgx.org/v1/guideline?gene=CYP2C19&drug=clopidogrel"
```
#### Get Guideline by ID
```http
GET /v1/guideline/{guideline_id}
```
**Example:**
```bash
curl "https://api.clinpgx.org/v1/guideline/PA166104939"
```
**Example Response:**
```json
{
"id": "PA166104939",
"name": "CPIC Guideline for CYP2C19 and Clopidogrel",
"source": "CPIC",
"genes": ["CYP2C19"],
"drugs": ["clopidogrel"],
"recommendationLevel": "A",
"lastUpdated": "2023-08-01",
"summary": "Alternative antiplatelet therapy recommended for...",
"recommendations": [...],
"pdfUrl": "https://www.clinpgx.org/...",
"pmid": "23400754"
}
```
### 5. Allele Endpoint
Query allele definitions, functions, and population frequencies.
#### Get All Alleles for a Gene
```http
GET /v1/allele?gene={gene_symbol}
```
**Parameters:**
- `gene` (query, required): Gene symbol
**Example Request:**
```bash
curl "https://api.clinpgx.org/v1/allele?gene=CYP2D6"
```
**Example Response:**
```json
[
{
"name": "CYP2D6*1",
"gene": "CYP2D6",
"function": "Normal function",
"activityScore": 1.0,
"frequencies": {
"European": 0.42,
"African": 0.37,
"East Asian": 0.50,
"Latino": 0.44
},
"definingVariants": ["Reference allele"],
"pharmVarId": "PV00001"
},
{
"name": "CYP2D6*4",
"gene": "CYP2D6",
"function": "No function",
"activityScore": 0.0,
"frequencies": {
"European": 0.20,
"African": 0.05,
"East Asian": 0.01,
"Latino": 0.10
},
"definingVariants": ["rs3892097"],
"pharmVarId": "PV00004"
}
]
```
#### Get Specific Allele
```http
GET /v1/allele/{allele_name}
```
**Parameters:**
- `allele_name` (path, required): Allele name with star nomenclature (e.g., CYP2D6*4)
**Example:**
```bash
curl "https://api.clinpgx.org/v1/allele/CYP2D6*4"
```
### 6. Variant Endpoint
Search for genetic variants and their pharmacogenomic annotations.
#### Get Variant by rsID
```http
GET /v1/variant/{rsid}
```
**Parameters:**
- `rsid` (path, required): dbSNP reference SNP ID
**Example Request:**
```bash
curl "https://api.clinpgx.org/v1/variant/rs4244285"
```
**Example Response:**
```json
{
"rsid": "rs4244285",
"chromosome": "10",
"position": 94781859,
"gene": "CYP2C19",
"alleles": ["CYP2C19*2"],
"consequence": "Splice site variant",
"clinicalSignificance": "Pathogenic - reduced enzyme activity",
"frequencies": {
"European": 0.15,
"African": 0.18,
"East Asian": 0.29,
"Latino": 0.12
},
"references": [...]
}
```
#### Search Variants by Position
```http
GET /v1/variant?chromosome={chr}&position={pos}
```
**Parameters:**
- `chromosome` (query, optional): Chromosome number (1-22, X, Y)
- `position` (query, optional): Genomic position (GRCh38)
**Example:**
```bash
curl "https://api.clinpgx.org/v1/variant?chromosome=10&position=94781859"
```
### 7. Clinical Annotation Endpoint
Access curated literature annotations for gene-drug-phenotype relationships.
#### Get Clinical Annotations
```http
GET /v1/clinicalAnnotation?gene={gene}&drug={drug}&evidenceLevel={level}
```
**Parameters:**
- `gene` (query, optional): Gene symbol
- `drug` (query, optional): Drug name
- `evidenceLevel` (query, optional): Evidence level (1A, 1B, 2A, 2B, 3, 4)
- `phenotype` (query, optional): Phenotype or outcome
**Example Requests:**
```bash
# Get all annotations for a gene
curl "https://api.clinpgx.org/v1/clinicalAnnotation?gene=CYP2D6"
# Get high-quality evidence only
curl "https://api.clinpgx.org/v1/clinicalAnnotation?evidenceLevel=1A"
# Get annotations for specific gene-drug pair
curl "https://api.clinpgx.org/v1/clinicalAnnotation?gene=TPMT&drug=azathioprine"
```
**Example Response:**
```json
[
{
"id": "PA166153683",
"gene": "CYP2D6",
"drug": "codeine",
"phenotype": "Reduced analgesic effect",
"evidenceLevel": "1A",
"annotation": "Poor metabolizers have reduced conversion...",
"pmid": "24618998",
"studyType": "Clinical trial",
"population": "European",
"sources": ["CPIC"]
}
]
```
**Evidence Levels:**
- **1A**: High-quality evidence from guidelines (CPIC, FDA, DPWG)
- **1B**: High-quality evidence not yet incorporated into a guideline
- **2A**: Moderate evidence from well-designed studies
- **2B**: Moderate evidence with some limitations
- **3**: Limited or conflicting evidence
- **4**: Case reports or weak evidence
### 8. Drug Label Endpoint
Retrieve regulatory drug label information with pharmacogenomic content.
#### Get Drug Labels
```http
GET /v1/drugLabel?drug={drug_name}&source={source}
```
**Parameters:**
- `drug` (query, required): Drug name
- `source` (query, optional): Regulatory source (FDA, EMA, PMDA, Health Canada)
**Example Requests:**
```bash
# Get all labels for warfarin
curl "https://api.clinpgx.org/v1/drugLabel?drug=warfarin"
# Get only FDA labels
curl "https://api.clinpgx.org/v1/drugLabel?drug=warfarin&source=FDA"
```
**Example Response:**
```json
[
{
"id": "DL001234",
"drug": "warfarin",
"source": "FDA",
"sections": {
"testing": "Consider CYP2C9 and VKORC1 genotyping...",
"dosing": "Dose adjustment based on genotype...",
"warnings": "Risk of bleeding in certain genotypes"
},
"biomarkers": ["CYP2C9", "VKORC1"],
"testingRecommended": true,
"labelUrl": "https://dailymed.nlm.nih.gov/...",
"lastUpdated": "2024-01-15"
}
]
```
### 9. Pathway Endpoint
Access pharmacokinetic and pharmacodynamic pathway diagrams and information.
#### Get Pathway by ID
```http
GET /v1/pathway/{pathway_id}
```
**Parameters:**
- `pathway_id` (path, required): ClinPGx pathway identifier
**Example:**
```bash
curl "https://api.clinpgx.org/v1/pathway/PA146123006"
```
#### Search Pathways
```http
GET /v1/pathway?drug={drug_name}&gene={gene}
```
**Parameters:**
- `drug` (query, optional): Drug name
- `gene` (query, optional): Gene symbol
**Example:**
```bash
curl "https://api.clinpgx.org/v1/pathway?drug=warfarin"
```
**Example Response:**
```json
{
"id": "PA146123006",
"name": "Warfarin Pharmacokinetics and Pharmacodynamics",
"drugs": ["warfarin"],
"genes": ["CYP2C9", "VKORC1", "CYP4F2", "GGCX"],
"description": "Warfarin is metabolized primarily by CYP2C9...",
"diagramUrl": "https://www.clinpgx.org/pathway/...",
"steps": [
{
"step": 1,
"process": "Absorption",
"genes": []
},
{
"step": 2,
"process": "Metabolism",
"genes": ["CYP2C9", "CYP2C19"]
},
{
"step": 3,
"process": "Target interaction",
"genes": ["VKORC1"]
}
]
}
```
## Query Patterns and Examples
### Common Query Patterns
#### 1. Patient Medication Review
Query all gene-drug pairs for a patient's medications:
```python
import requests
patient_meds = ["clopidogrel", "simvastatin", "codeine"]
patient_genes = {"CYP2C19": "*1/*2", "CYP2D6": "*1/*1", "SLCO1B1": "*1/*5"}
for med in patient_meds:
for gene in patient_genes:
response = requests.get(
"https://api.clinpgx.org/v1/geneDrugPair",
params={"gene": gene, "drug": med}
)
pairs = response.json()
# Check for interactions
```
#### 2. Actionable Gene Panel
Find all genes with CPIC Level A recommendations:
```python
response = requests.get(
"https://api.clinpgx.org/v1/geneDrugPair",
params={"cpicLevel": "A"}
)
actionable_pairs = response.json()
genes = set(pair['gene'] for pair in actionable_pairs)
print(f"Panel should include: {sorted(genes)}")
```
#### 3. Population Frequency Analysis
Compare allele frequencies across populations:
```python
alleles = requests.get(
"https://api.clinpgx.org/v1/allele",
params={"gene": "CYP2D6"}
).json()
# Calculate phenotype frequencies
pm_freq = {} # Poor metabolizer frequencies
for allele in alleles:
if allele['function'] == 'No function':
for pop, freq in allele['frequencies'].items():
pm_freq[pop] = pm_freq.get(pop, 0) + freq
```
#### 4. Drug Safety Screen
Check for high-risk gene-drug associations:
```python
# Screen for HLA-B*57:01 before abacavir
response = requests.get(
    "https://api.clinpgx.org/v1/geneDrugPair",
    params={"gene": "HLA-B", "drug": "abacavir"},
    timeout=10,  # never block indefinitely on a stalled connection
)
response.raise_for_status()  # surface 4xx/5xx instead of indexing an error body
pairs = response.json()
# The endpoint returns a list that may be empty, so guard before indexing.
if pairs and pairs[0].get("cpicLevel") == "A":
    print("CRITICAL: Do not use if HLA-B*57:01 positive")
```
## Error Handling
### Common Error Responses
#### 404 Not Found
```json
{
"error": "Resource not found",
"message": "Gene 'INVALID' does not exist"
}
```
#### 429 Too Many Requests
```json
{
"error": "Rate limit exceeded",
"message": "Maximum 2 requests per second allowed"
}
```
### Recommended Error Handling Pattern
```python
import requests
import time
def safe_query(url, params=None, max_retries=3):
    """GET ``url`` and return its decoded JSON, retrying on rate limits and errors.

    Args:
        url: Endpoint URL to request.
        params: Optional query-string parameters.
        max_retries: Maximum number of attempts.

    Returns:
        The decoded JSON body on HTTP 200, or None on HTTP 404 or when all
        attempts end in HTTP 429.

    Raises:
        requests.RequestException: If the final attempt fails with a
            request-level error.
    """
    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            response = requests.get(url, params=params, timeout=10)
            status = response.status_code
            if status == 404:
                print("Resource not found")
                return None
            if status == 429:
                # Exponential backoff: 1s, 2s, 4s, ...
                backoff = 2 ** attempt
                print(f"Rate limited. Waiting {backoff}s...")
                time.sleep(backoff)
                continue
            if status != 200:
                response.raise_for_status()
                continue
            time.sleep(0.5)  # Rate limiting
            return response.json()
        except requests.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt == final_attempt:
                raise
    return None
```
## Best Practices
### Rate Limiting
- Implement 500ms delay between requests (2 requests/second maximum)
- Use exponential backoff for rate limit errors
- Consider caching results for frequently accessed data
- For bulk operations, contact api@clinpgx.org
### Caching Strategy
```python
import json
from pathlib import Path
def cached_query(cache_file, query_func, *args, **kwargs):
    """Return JSON stored in ``cache_file``, or run ``query_func`` and cache it.

    A truthy result from ``query_func(*args, **kwargs)`` is written to
    ``cache_file`` as JSON; a falsy result is returned without being cached.
    """
    cache_path = Path(cache_file)
    if not cache_path.exists():
        fresh = query_func(*args, **kwargs)
        if fresh:
            with open(cache_path, 'w') as sink:
                json.dump(fresh, sink)
        return fresh
    with open(cache_path) as source:
        return json.load(source)
```
### Batch Processing
```python
import time
import requests
def batch_gene_query(genes, delay=0.5):
    """Fetch each gene record in turn, pausing ``delay`` seconds after every request.

    Args:
        genes: Iterable of gene symbols.
        delay: Seconds to sleep after each request (0.5s keeps to 2 requests/second).

    Returns:
        Dict mapping each gene symbol that returned HTTP 200 to its decoded JSON.
    """
    results = {}
    for gene in genes:
        response = requests.get(
            f"https://api.clinpgx.org/v1/gene/{gene}",
            timeout=10,  # do not let one stalled request hang the whole batch
        )
        if response.status_code == 200:
            results[gene] = response.json()
        time.sleep(delay)
    return results
```
## Data Schema Definitions
### Gene Object
```typescript
{
id: string; // ClinPGx gene ID
symbol: string; // HGNC gene symbol
name: string; // Full gene name
chromosome: string; // Chromosome location
function: string; // Pharmacogenomic function
clinicalAnnotations: number; // Count of annotations
relatedDrugs: string[]; // Associated drugs
}
```
### Drug Object
```typescript
{
id: string; // ClinPGx drug ID
name: string; // Generic name
tradeNames: string[]; // Brand names
drugClasses: string[]; // Therapeutic classes
indication: string; // Primary indication
relatedGenes: string[]; // Pharmacogenes
}
```
### Gene-Drug Pair Object
```typescript
{
gene: string; // Gene symbol
drug: string; // Drug name
sources: string[]; // CPIC, FDA, DPWG, etc.
cpicLevel: string; // A, B, C, D
evidenceLevel: string; // 1A, 1B, 2A, 2B, 3, 4
hasGuideline: boolean; // Has clinical guideline
}
```
### Allele Object
```typescript
{
name: string; // Allele name (e.g., CYP2D6*4)
gene: string; // Gene symbol
function: string; // Normal/decreased/no/increased/uncertain
activityScore: number; // 0.0 to 2.0+
frequencies: { // Population frequencies
[population: string]: number;
};
definingVariants: string[]; // rsIDs or descriptions
}
```
## API Stability and Versioning
### Current Status
- API version: v1
- Stability: Beta - endpoints stable, parameters may change
- Monitor: https://blog.clinpgx.org/ for updates
### Migration from PharmGKB
As of July 2025, PharmGKB URLs redirect to ClinPGx. Update references:
- Old: `https://api.pharmgkb.org/`
- New: `https://api.clinpgx.org/`
### Future Changes
- Watch for API v2 announcements
- Breaking changes will be announced on ClinPGx Blog
- Consider version pinning for production applications
## Support and Contact
- **API Issues**: api@clinpgx.org
- **Documentation**: https://api.clinpgx.org/
- **General Questions**: https://www.clinpgx.org/page/faqs
- **Blog**: https://blog.clinpgx.org/
- **CPIC Guidelines**: https://cpicpgx.org/
## Related Resources
- **PharmCAT**: Pharmacogenomic variant calling and annotation tool
- **PharmVar**: Pharmacogene allele nomenclature database
- **CPIC**: Clinical Pharmacogenetics Implementation Consortium
- **DPWG**: Dutch Pharmacogenetics Working Group
- **ClinGen**: Clinical Genome Resource

View File

@@ -0,0 +1,518 @@
#!/usr/bin/env python3
"""
ClinPGx API Query Helper Script
Provides ready-to-use functions for querying the ClinPGx database API.
Includes rate limiting, error handling, and caching functionality.
ClinPGx API: https://api.clinpgx.org/
Rate limit: 2 requests per second
License: Creative Commons Attribution-ShareAlike 4.0 International
"""
import requests
import time
import json
from pathlib import Path
from typing import Dict, List, Optional, Any
# API Configuration
BASE_URL = "https://api.clinpgx.org/v1/"
RATE_LIMIT_DELAY = 0.5 # 500ms delay = 2 requests/second
def rate_limited_request(
    url: str,
    params: Optional[Dict] = None,
    delay: float = RATE_LIMIT_DELAY,
    timeout: float = 10,
) -> requests.Response:
    """
    Make a GET request, then pause to stay within the API rate limit.

    Args:
        url: API endpoint URL
        params: Query parameters
        delay: Seconds to sleep after the request (default 0.5s for 2 req/sec)
        timeout: Seconds to wait for the server before giving up (default 10,
            matching safe_api_call)

    Returns:
        Response object (status is not checked here)

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, params=params, timeout=timeout)
    time.sleep(delay)
    return response
def safe_api_call(url: str, params: Optional[Dict] = None, max_retries: int = 3) -> Optional[Any]:
    """
    Make API call with error handling and exponential backoff retry.

    Args:
        url: API endpoint URL
        params: Query parameters
        max_retries: Maximum number of attempts

    Returns:
        Decoded JSON response (dict or list, depending on the endpoint),
        or None on failure (404, repeated rate limiting, unexpected status,
        or request errors on every attempt)
    """
    last_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            response = requests.get(url, params=params, timeout=10)
            status = response.status_code

            if status == 200:
                # Pause after a successful call so back-to-back callers
                # stay within the 2 requests/second limit.
                time.sleep(RATE_LIMIT_DELAY)
                return response.json()
            elif status == 429:
                # Rate limit exceeded
                if attempt == last_attempt:
                    # No retry will follow, so waiting out the backoff is pointless.
                    print(f"Failed after {max_retries} attempts")
                    return None
                wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s
                print(f"Rate limit exceeded. Waiting {wait_time}s before retry...")
                time.sleep(wait_time)
            elif status == 404:
                print(f"Resource not found: {url}")
                return None
            else:
                # Raises for 4xx/5xx, which the handler below retries.
                response.raise_for_status()
                # A non-200 status that is not an error (e.g. 204) has no
                # usable body; retrying immediately would just re-hit the API.
                print(f"Unexpected status {status}: {url}")
                return None

        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
            if attempt == last_attempt:
                print(f"Failed after {max_retries} attempts")
                return None
            time.sleep(1)

    return None
def cached_query(cache_file: str, query_func, *args, **kwargs) -> Any:
    """
    Cache API results to avoid repeated queries.

    A cache file that exists but cannot be decoded (truncated, corrupt, or
    not valid JSON) is treated as a cache miss and overwritten with a fresh
    result instead of raising.

    Args:
        cache_file: Path to cache file
        query_func: Function to call if cache miss
        *args, **kwargs: Arguments to pass to query_func

    Returns:
        Cached or freshly queried data (None results are never cached)
    """
    cache_path = Path(cache_file)

    if cache_path.exists():
        print(f"Loading from cache: {cache_file}")
        try:
            with open(cache_path, encoding="utf-8") as f:
                return json.load(f)
        except ValueError as e:
            # json.JSONDecodeError and UnicodeDecodeError both subclass
            # ValueError; fall through and rebuild the cache.
            print(f"Cache unreadable ({e}). Querying API...")
    else:
        print("Cache miss. Querying API...")

    result = query_func(*args, **kwargs)

    if result is not None:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_path, 'w', encoding="utf-8") as f:
            json.dump(result, f, indent=2)
        print(f"Cached to: {cache_file}")

    return result
# Core Query Functions
def get_gene_info(gene_symbol: str) -> Optional[Dict]:
    """
    Fetch the record for one pharmacogene from the ``gene/{symbol}`` endpoint.

    Args:
        gene_symbol: Gene symbol (e.g., "CYP2D6", "TPMT")

    Returns:
        Whatever safe_api_call returns for the gene URL: the decoded JSON
        record, or None on failure

    Example:
        >>> gene_data = get_gene_info("CYP2D6")
        >>> print(gene_data['symbol'], gene_data['name'])
    """
    gene_url = "{}gene/{}".format(BASE_URL, gene_symbol)
    return safe_api_call(gene_url)
def get_drug_info(drug_name: str) -> Optional[List[Dict]]:
    """
    Look up drugs/chemicals by name via the ``chemical`` endpoint.

    Args:
        drug_name: Drug name (e.g., "warfarin", "codeine")

    Returns:
        Result of safe_api_call for the ``name`` query: the decoded JSON
        matches, or None on failure

    Example:
        >>> drugs = get_drug_info("warfarin")
        >>> for drug in drugs:
        >>>     print(drug['name'], drug['id'])
    """
    return safe_api_call(f"{BASE_URL}chemical", {"name": drug_name})
def get_gene_drug_pairs(gene: Optional[str] = None, drug: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Query the ``geneDrugPair`` endpoint, optionally filtered by gene and/or drug.

    Args:
        gene: Gene symbol (optional; omitted from the query when falsy)
        drug: Drug name (optional; omitted from the query when falsy)

    Returns:
        Result of safe_api_call: the decoded JSON pairs, or None on failure

    Example:
        >>> # Get all pairs for CYP2D6
        >>> pairs = get_gene_drug_pairs(gene="CYP2D6")
        >>>
        >>> # Get specific gene-drug pair
        >>> pair = get_gene_drug_pairs(gene="CYP2D6", drug="codeine")
    """
    candidates = (("gene", gene), ("drug", drug))
    # Only send the filters the caller actually supplied.
    query = {key: value for key, value in candidates if value}
    return safe_api_call(f"{BASE_URL}geneDrugPair", query)
def get_cpic_guidelines(gene: Optional[str] = None, drug: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Query the ``guideline`` endpoint with ``source`` fixed to "CPIC".

    Args:
        gene: Gene symbol (optional; omitted from the query when falsy)
        drug: Drug name (optional; omitted from the query when falsy)

    Returns:
        Result of safe_api_call: the decoded JSON guidelines, or None on failure

    Example:
        >>> # Get all CPIC guidelines
        >>> guidelines = get_cpic_guidelines()
        >>>
        >>> # Get guideline for specific gene-drug
        >>> guideline = get_cpic_guidelines(gene="CYP2C19", drug="clopidogrel")
    """
    query = {"source": "CPIC"}
    for key, value in (("gene", gene), ("drug", drug)):
        if value:
            query[key] = value
    return safe_api_call(f"{BASE_URL}guideline", query)
def get_alleles(gene: str) -> Optional[List[Dict]]:
    """
    Query the ``allele`` endpoint for every allele of one gene.

    Args:
        gene: Gene symbol (e.g., "CYP2D6")

    Returns:
        Result of safe_api_call: the decoded JSON alleles, or None on failure

    Example:
        >>> alleles = get_alleles("CYP2D6")
        >>> for allele in alleles:
        >>>     print(f"{allele['name']}: {allele['function']}")
    """
    return safe_api_call(f"{BASE_URL}allele", {"gene": gene})
def get_allele_info(allele_name: str) -> Optional[Dict]:
    """
    Fetch a single allele record from the ``allele/{name}`` endpoint.

    Args:
        allele_name: Allele name (e.g., "CYP2D6*4"); placed in the URL path as-is

    Returns:
        Result of safe_api_call: the decoded JSON record, or None on failure

    Example:
        >>> allele = get_allele_info("CYP2D6*4")
        >>> print(allele['function'], allele['frequencies'])
    """
    allele_url = "{}allele/{}".format(BASE_URL, allele_name)
    return safe_api_call(allele_url)
def get_clinical_annotations(
    gene: Optional[str] = None,
    drug: Optional[str] = None,
    evidence_level: Optional[str] = None
) -> Optional[List[Dict]]:
    """
    Query the ``clinicalAnnotation`` endpoint with optional filters.

    Args:
        gene: Gene symbol (optional)
        drug: Drug name (optional)
        evidence_level: Evidence level filter (1A, 1B, 2A, 2B, 3, 4); sent
            as the ``evidenceLevel`` query parameter

    Returns:
        Result of safe_api_call: the decoded JSON annotations, or None on failure

    Example:
        >>> # Get all annotations for CYP2D6
        >>> annotations = get_clinical_annotations(gene="CYP2D6")
        >>>
        >>> # Get high-quality evidence only
        >>> high_quality = get_clinical_annotations(evidence_level="1A")
    """
    candidates = (
        ("gene", gene),
        ("drug", drug),
        ("evidenceLevel", evidence_level),
    )
    # Falsy filters are left out of the query string entirely.
    query = {key: value for key, value in candidates if value}
    return safe_api_call(f"{BASE_URL}clinicalAnnotation", query)
def get_drug_labels(drug: str, source: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Query the ``drugLabel`` endpoint for one drug, optionally for one source.

    Args:
        drug: Drug name (always sent)
        source: Regulatory source (e.g., "FDA", "EMA"); omitted when falsy

    Returns:
        Result of safe_api_call: the decoded JSON labels, or None on failure

    Example:
        >>> # Get all labels for warfarin
        >>> labels = get_drug_labels("warfarin")
        >>>
        >>> # Get only FDA labels
        >>> fda_labels = get_drug_labels("warfarin", source="FDA")
    """
    query = {"drug": drug, "source": source} if source else {"drug": drug}
    return safe_api_call(f"{BASE_URL}drugLabel", query)
def search_variants(rsid: Optional[str] = None, chromosome: Optional[str] = None,
                    position: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Look up variants by rsID, or search by chromosome and/or position.

    When ``rsid`` is given it takes precedence: ``variant/{rsid}`` is
    requested and ``chromosome``/``position`` are ignored.

    Args:
        rsid: dbSNP rsID (e.g., "rs4244285")
        chromosome: Chromosome number
        position: Genomic position

    Returns:
        Result of safe_api_call: the decoded JSON, or None on failure

    Example:
        >>> # Search by rsID
        >>> variant = search_variants(rsid="rs4244285")
        >>>
        >>> # Search by position
        >>> variants = search_variants(chromosome="10", position="94781859")
    """
    if rsid:
        return safe_api_call(f"{BASE_URL}variant/{rsid}")

    candidates = (("chromosome", chromosome), ("position", position))
    query = {key: value for key, value in candidates if value}
    return safe_api_call(f"{BASE_URL}variant", query)
def get_pathway_info(pathway_id: Optional[str] = None, drug: Optional[str] = None) -> Optional[Any]:
    """
    Fetch one pathway by ID, or search pathways by drug.

    When ``pathway_id`` is given it takes precedence: ``pathway/{id}`` is
    requested and ``drug`` is ignored.

    Args:
        pathway_id: ClinPGx pathway ID (optional)
        drug: Drug name (optional; omitted from the query when falsy)

    Returns:
        Result of safe_api_call: the decoded JSON, or None on failure

    Example:
        >>> # Get specific pathway
        >>> pathway = get_pathway_info(pathway_id="PA146123006")
        >>>
        >>> # Get all pathways for a drug
        >>> pathways = get_pathway_info(drug="warfarin")
    """
    if not pathway_id:
        query = {"drug": drug} if drug else {}
        return safe_api_call(f"{BASE_URL}pathway", query)
    return safe_api_call(f"{BASE_URL}pathway/{pathway_id}")
# Utility Functions
def export_to_dataframe(data: List[Dict], output_file: Optional[str] = None):
    """
    Build a pandas DataFrame from API records and optionally save it as CSV.

    Args:
        data: List of dictionaries from API
        output_file: Optional CSV output path; when given, the frame is
            written without the index column

    Returns:
        pandas DataFrame, or None when pandas is not installed

    Example:
        >>> pairs = get_gene_drug_pairs(gene="CYP2D6")
        >>> df = export_to_dataframe(pairs, "cyp2d6_pairs.csv")
        >>> print(df.head())
    """
    # pandas is optional, so it is imported lazily here rather than at module load.
    try:
        import pandas as pd
    except ImportError:
        print("pandas not installed. Install with: pip install pandas")
        return None

    frame = pd.DataFrame(data)
    if not output_file:
        return frame

    frame.to_csv(output_file, index=False)
    print(f"Data exported to: {output_file}")
    return frame
def batch_gene_query(gene_list: List[str], delay: float = 0.5) -> Dict[str, Dict]:
    """
    Query multiple genes in batch with rate limiting.

    Each gene is fetched with get_gene_info. ``delay`` is an extra pause
    between consecutive genes, on top of any pause get_gene_info's own
    request helper applies; no pause is added after the final gene.

    Args:
        gene_list: List of gene symbols
        delay: Delay between requests (default 0.5s)

    Returns:
        Dictionary mapping gene symbols to gene data; genes whose lookup
        returned nothing are omitted

    Example:
        >>> genes = ["CYP2D6", "CYP2C19", "CYP2C9", "TPMT"]
        >>> results = batch_gene_query(genes)
        >>> for gene, data in results.items():
        >>>     print(f"{gene}: {data['name']}")
    """
    results = {}

    print(f"Querying {len(gene_list)} genes with {delay}s delay between requests...")

    last_index = len(gene_list) - 1
    for index, gene in enumerate(gene_list):
        print(f"Fetching: {gene}")
        data = get_gene_info(gene)
        if data:
            results[gene] = data
        # Sleeping after the last gene only delays the caller.
        if index < last_index:
            time.sleep(delay)

    print(f"Completed: {len(results)}/{len(gene_list)} successful")
    return results
def find_actionable_gene_drug_pairs(cpic_level: str = "A") -> Optional[List[Dict]]:
    """
    Query the ``geneDrugPair`` endpoint filtered by CPIC level.

    Args:
        cpic_level: CPIC recommendation level (A, B, C, D); sent as the
            ``cpicLevel`` query parameter

    Returns:
        Result of safe_api_call: the decoded JSON pairs, or None on failure

    Example:
        >>> # Get all Level A recommendations
        >>> actionable = find_actionable_gene_drug_pairs(cpic_level="A")
        >>> for pair in actionable:
        >>>     print(f"{pair['gene']} - {pair['drug']}")
    """
    return safe_api_call(f"{BASE_URL}geneDrugPair", {"cpicLevel": cpic_level})
# Example Usage
if __name__ == "__main__":
    # Demo run: exercises five query helpers against the live API and
    # prints a short summary of each result.
    RULE = "=" * 60

    def _section(title: str) -> None:
        """Print a title framed above and below by a 60-character rule."""
        print(RULE)
        print(title)
        print(RULE)

    print("ClinPGx API Query Examples\n")

    # Example 1: Get gene information
    _section("Example 1: Get CYP2D6 gene information")
    gene_record = get_gene_info("CYP2D6")
    if gene_record:
        print(f"Gene: {gene_record.get('symbol')}")
        print(f"Name: {gene_record.get('name')}")
    print()

    # Example 2: Search for a drug
    _section("Example 2: Search for warfarin")
    drug_matches = get_drug_info("warfarin")
    if drug_matches:
        for match in drug_matches[:1]:  # Show first result
            print(f"Drug: {match.get('name')}")
            print(f"ID: {match.get('id')}")
    print()

    # Example 3: Get gene-drug pairs
    _section("Example 3: Get CYP2C19-clopidogrel pair")
    pairs = get_gene_drug_pairs(gene="CYP2C19", drug="clopidogrel")
    if pairs:
        # A truthy result is non-empty, so the first element is safe to read.
        print(f"Found {len(pairs)} gene-drug pair(s)")
        print(f"Annotations: {pairs[0].get('sources', [])}")
    print()

    # Example 4: Get CPIC guidelines
    _section("Example 4: Get CPIC guidelines for CYP2C19")
    cpic_guidelines = get_cpic_guidelines(gene="CYP2C19")
    if cpic_guidelines:
        print(f"Found {len(cpic_guidelines)} guideline(s)")
        for guideline in cpic_guidelines[:2]:  # Show first 2
            print(f"  - {guideline.get('name')}")
    print()

    # Example 5: Get alleles for a gene
    _section("Example 5: Get CYP2D6 alleles")
    allele_records = get_alleles("CYP2D6")
    if allele_records:
        print(f"Found {len(allele_records)} allele(s)")
        for record in allele_records[:3]:  # Show first 3
            print(f"  - {record.get('name')}: {record.get('function')}")
    print()

    _section("Examples completed!")

View File

@@ -0,0 +1,292 @@
---
name: ensembl-database
description: Work with the Ensembl genome database to query genomic data, retrieve sequences, analyze variants, and perform comparative genomics. This skill should be used when working with vertebrate genomic data, gene annotations, variant analysis, ortholog identification, or when users need to query the Ensembl REST API for genomic information across multiple species.
---
# Ensembl Database
## Overview
Access and query the Ensembl genome database, a comprehensive resource for vertebrate genomic data maintained by EMBL-EBI. The database provides gene annotations, sequences, variants, regulatory information, and comparative genomics data for over 250 species. Current release is 115 (September 2025).
## Core Capabilities
### 1. Gene Information Retrieval
Query gene data by symbol, Ensembl ID, or external database identifiers.
**Common operations:**
- Look up gene information by symbol (e.g., "BRCA2", "TP53")
- Retrieve transcript and protein information
- Get gene coordinates and chromosomal locations
- Access cross-references to external databases (UniProt, RefSeq, etc.)
**Using the ensembl_rest package:**
```python
from ensembl_rest import EnsemblClient
client = EnsemblClient()
# Look up gene by symbol
gene_data = client.symbol_lookup(
species='human',
symbol='BRCA2'
)
# Get detailed gene information
gene_info = client.lookup_id(
id='ENSG00000139618', # BRCA2 Ensembl ID
expand=True
)
```
**Direct REST API (no package):**
```python
import requests
server = "https://rest.ensembl.org"
# Symbol lookup
response = requests.get(
f"{server}/lookup/symbol/homo_sapiens/BRCA2",
headers={"Content-Type": "application/json"}
)
gene_data = response.json()
```
### 2. Sequence Retrieval
Fetch genomic, transcript, or protein sequences in various formats (JSON, FASTA, plain text).
**Operations:**
- Get DNA sequences for genes or genomic regions
- Retrieve transcript sequences (cDNA)
- Access protein sequences
- Extract sequences with flanking regions or modifications
**Example:**
```python
# Using ensembl_rest package
sequence = client.sequence_id(
id='ENSG00000139618', # Gene ID
content_type='application/json'
)
# Get sequence for a genomic region
region_seq = client.sequence_region(
species='human',
region='7:140424943-140624564' # chromosome:start-end
)
```
### 3. Variant Analysis
Query genetic variation data and predict variant consequences using the Variant Effect Predictor (VEP).
**Capabilities:**
- Look up variants by rsID or genomic coordinates
- Predict functional consequences of variants
- Access population frequency data
- Retrieve phenotype associations
**VEP example:**
```python
# Predict variant consequences
vep_result = client.vep_hgvs(
species='human',
hgvs_notation='ENST00000380152.7:c.803C>T'
)
# Query variant by rsID
variant = client.variation_id(
species='human',
id='rs699'
)
```
### 4. Comparative Genomics
Perform cross-species comparisons to identify orthologs, paralogs, and evolutionary relationships.
**Operations:**
- Find orthologs (same gene in different species)
- Identify paralogs (related genes in same species)
- Access gene trees showing evolutionary relationships
- Retrieve gene family information
**Example:**
```python
# Find orthologs for a human gene
orthologs = client.homology_ensemblgene(
id='ENSG00000139618', # Human BRCA2
target_species='mouse'
)
# Get gene tree
gene_tree = client.genetree_member_symbol(
species='human',
symbol='BRCA2'
)
```
### 5. Genomic Region Analysis
Find all genomic features (genes, transcripts, regulatory elements) in a specific region.
**Use cases:**
- Identify all genes in a chromosomal region
- Find regulatory features (promoters, enhancers)
- Locate variants within a region
- Retrieve structural features
**Example:**
```python
# Find all features in a region
features = client.overlap_region(
species='human',
region='7:140424943-140624564',
feature='gene'
)
```
### 6. Assembly Mapping
Convert coordinates between different genome assemblies (e.g., GRCh37 to GRCh38).
**Important:** Use `https://grch37.rest.ensembl.org` for GRCh37/hg19 queries and `https://rest.ensembl.org` for current assemblies.
**Example:**
```python
from ensembl_rest import AssemblyMapper
# Map coordinates from GRCh37 to GRCh38
mapper = AssemblyMapper(
species='human',
asm_from='GRCh37',
asm_to='GRCh38'
)
mapped = mapper.map(chrom='7', start=140453136, end=140453136)
```
## API Best Practices
### Rate Limiting
The Ensembl REST API has rate limits. Follow these practices:
1. **Respect rate limits:** Maximum 15 requests per second for anonymous users
2. **Handle 429 responses:** When rate-limited, check the `Retry-After` header and wait
3. **Use batch endpoints:** When querying multiple items, use batch endpoints where available
4. **Cache results:** Store frequently accessed data to reduce API calls
### Error Handling
Always implement proper error handling:
```python
import requests
import time
def query_ensembl(endpoint, params=None, max_retries=3):
    """Query an Ensembl REST endpoint and return the decoded JSON response.

    Args:
        endpoint: Endpoint path starting with '/', e.g. '/lookup/id/ENSG00000139618'
        params: Optional dictionary of query-string parameters
        max_retries: Maximum number of attempts before giving up

    Returns:
        Parsed JSON response

    Raises:
        requests.exceptions.HTTPError: For non-retryable HTTP error responses
        Exception: If every attempt was rate limited
    """
    server = "https://rest.ensembl.org"
    headers = {"Content-Type": "application/json"}

    for _ in range(max_retries):
        response = requests.get(
            f"{server}{endpoint}",
            headers=headers,
            params=params,
            timeout=30,  # never block forever on a stalled connection
        )

        if response.status_code == 200:
            return response.json()
        elif response.status_code == 429:
            # Rate limited - wait and retry. Retry-After may carry fractional
            # seconds (e.g. "40.145651"), which int() would reject.
            retry_after = float(response.headers.get('Retry-After', 1))
            time.sleep(retry_after)
        else:
            response.raise_for_status()

    raise Exception(f"Failed after {max_retries} attempts")
```
## Installation
### Python Package (Recommended)
```bash
pip install ensembl_rest
```
The `ensembl_rest` package provides a Pythonic interface to all Ensembl REST API endpoints.
### Direct REST API
No installation needed - use standard HTTP libraries like `requests`:
```bash
pip install requests
```
## Resources
### references/
- `api_endpoints.md`: Comprehensive documentation of all 16 API endpoint categories with examples and parameters
### scripts/
- `ensembl_query.py`: Reusable Python script for common Ensembl queries with built-in rate limiting and error handling
## Common Workflows
### Workflow 1: Gene Annotation Pipeline
1. Look up gene by symbol to get Ensembl ID
2. Retrieve transcript information
3. Get protein sequences for all transcripts
4. Find orthologs in other species
5. Export results
### Workflow 2: Variant Analysis
1. Query variant by rsID or coordinates
2. Use VEP to predict functional consequences
3. Check population frequencies
4. Retrieve phenotype associations
5. Generate report
### Workflow 3: Comparative Analysis
1. Start with gene of interest in reference species
2. Find orthologs in target species
3. Retrieve sequences for all orthologs
4. Compare gene structures and features
5. Analyze evolutionary conservation
## Species and Assembly Information
To query available species and assemblies:
```python
# List all available species
species_list = client.info_species()
# Get assembly information for a species
assembly_info = client.info_assembly(species='human')
```
Common species identifiers:
- Human: `homo_sapiens` or `human`
- Mouse: `mus_musculus` or `mouse`
- Zebrafish: `danio_rerio` or `zebrafish`
- Fruit fly: `drosophila_melanogaster`
## Additional Resources
- **Official Documentation:** https://rest.ensembl.org/documentation
- **Python Package Docs:** https://ensemblrest.readthedocs.io
- **EBI Training:** https://www.ebi.ac.uk/training/online/courses/ensembl-rest-api/
- **Ensembl Browser:** https://useast.ensembl.org
- **GitHub Examples:** https://github.com/Ensembl/ensembl-rest/wiki

View File

@@ -0,0 +1,346 @@
# Ensembl REST API Endpoints Reference
Comprehensive documentation of all 16 API endpoint categories available in the Ensembl REST API (Release 115, September 2025).
**Base URLs:**
- Current assemblies: `https://rest.ensembl.org`
- GRCh37/hg19 (human): `https://grch37.rest.ensembl.org`
**Rate Limits:**
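- 55,000 requests/hour per client (an average of about 15 requests/second)
- Exceeding the limit returns HTTP 429 with a `Retry-After` header giving the number of seconds to wait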
## 1. Archive
Retrieve historical information about retired Ensembl identifiers.
**GET /archive/id/:id**
- Retrieve archived entries for a retired identifier
- Example: `/archive/id/ENSG00000157764` (returns the latest version of the identifier and whether it is still current)
## 2. Comparative Genomics
Access gene trees, genomic alignments, and homology data across species.
**GET /alignment/region/:species/:region**
- Get genomic alignments for a region
- Example: `/alignment/region/human/2:106040000-106040050:1?species_set_group=mammals`
**GET /genetree/id/:id**
- Retrieve gene tree for a gene family
- Example: `/genetree/id/ENSGT00390000003602`
**GET /genetree/member/id/:id**
- Get gene tree by member gene ID
- Example: `/genetree/member/id/ENSG00000139618`
**GET /homology/id/:id**
- Find orthologs and paralogs for a gene
- Parameters: `target_species`, `type` (orthologues, paralogues, all)
- Example: `/homology/id/ENSG00000139618?target_species=mouse`
**GET /homology/symbol/:species/:symbol**
- Find homologs by gene symbol
- Example: `/homology/symbol/human/BRCA2?target_species=mouse`
## 3. Cross References
Link external database identifiers to Ensembl objects.
**GET /xrefs/id/:id**
- Get external references for Ensembl ID
- Example: `/xrefs/id/ENSG00000139618`
**GET /xrefs/symbol/:species/:symbol**
- Get cross-references by gene symbol
- Example: `/xrefs/symbol/human/BRCA2`
**GET /xrefs/name/:species/:name**
- Search for objects by external name
- Example: `/xrefs/name/human/NP_000050`
## 4. Information
Query metadata about species, assemblies, biotypes, and database versions.
**GET /info/species**
- List all available species
- Returns species names, assemblies, taxonomy IDs
**GET /info/assembly/:species**
- Get assembly information for a species
- Example: `/info/assembly/human` (returns GRCh38.p14)
**GET /info/assembly/:species/:region**
- Get detailed information about a chromosomal region
- Example: `/info/assembly/human/X`
**GET /info/biotypes/:species**
- List all available biotypes (gene types)
- Example: `/info/biotypes/human`
**GET /info/analysis/:species**
- List available analysis types
- Example: `/info/analysis/human`
**GET /info/data**
- Get general information about the current Ensembl release
## 5. Linkage Disequilibrium (LD)
Calculate linkage disequilibrium between variants.
**GET /ld/:species/:id/:population_name**
- Calculate LD for a variant
- Example: `/ld/human/rs1042522/1000GENOMES:phase_3:KHV`
**GET /ld/pairwise/:species/:id1/:id2**
- Calculate LD between two variants
- Example: `/ld/pairwise/human/rs1042522/rs11540652`
## 6. Lookup
Identify species and database information for identifiers.
**GET /lookup/id/:id**
- Look up object by Ensembl ID
- Parameter: `expand` (include child objects)
- Example: `/lookup/id/ENSG00000139618?expand=1`
**POST /lookup/id**
- Batch lookup multiple IDs
- Submit JSON array of IDs
- Example: `{"ids": ["ENSG00000139618", "ENSG00000157764"]}`
**GET /lookup/symbol/:species/:symbol**
- Look up gene by symbol
- Parameter: `expand` (include transcripts)
- Example: `/lookup/symbol/human/BRCA2?expand=1`
## 7. Mapping
Convert coordinates between assemblies, cDNA, CDS, and protein positions.
**GET /map/cdna/:id/:region**
- Map cDNA coordinates to genomic
- Example: `/map/cdna/ENST00000288602/100..300`
**GET /map/cds/:id/:region**
- Map CDS coordinates to genomic
- Example: `/map/cds/ENST00000288602/1..300`
**GET /map/translation/:id/:region**
- Map protein coordinates to genomic
- Example: `/map/translation/ENSP00000288602/1..100`
**GET /map/:species/:asm_one/:region/:asm_two**
- Map coordinates between assemblies
- Example: `/map/human/GRCh37/7:140453136..140453136/GRCh38`
**POST /map/:species/:asm_one/:asm_two**
- Batch assembly mapping
- Submit JSON array of regions
## 8. Ontologies and Taxonomy
Search biological ontologies and taxonomic classifications.
**GET /ontology/id/:id**
- Get ontology term information
- Example: `/ontology/id/GO:0005515`
**GET /ontology/name/:name**
- Search ontology by term name
- Example: `/ontology/name/protein%20binding`
**GET /taxonomy/classification/:id**
- Get taxonomic classification
- Example: `/taxonomy/classification/9606` (human)
**GET /taxonomy/id/:id**
- Get taxonomy information by ID
- Example: `/taxonomy/id/9606`
## 9. Overlap
Find genomic features overlapping a region.
**GET /overlap/id/:id**
- Get features overlapping a gene/transcript
- Parameters: `feature` (gene, transcript, cds, exon, repeat, etc.)
- Example: `/overlap/id/ENSG00000139618?feature=transcript`
**GET /overlap/region/:species/:region**
- Get all features in a genomic region
- Parameters: `feature` (gene, transcript, variation, regulatory, etc.)
- Example: `/overlap/region/human/7:140424943..140624564?feature=gene`
**GET /overlap/translation/:id**
- Get protein features
- Example: `/overlap/translation/ENSP00000288602`
## 10. Phenotype Annotations
Retrieve disease and trait associations.
**GET /phenotype/accession/:species/:accession**
- Get phenotypes by ontology accession
- Example: `/phenotype/accession/human/EFO:0003767`
**GET /phenotype/gene/:species/:gene**
- Get phenotype associations for a gene
- Example: `/phenotype/gene/human/ENSG00000139618`
**GET /phenotype/region/:species/:region**
- Get phenotypes in genomic region
- Example: `/phenotype/region/human/7:140424943-140624564`
**GET /phenotype/term/:species/:term**
- Search phenotypes by term
- Example: `/phenotype/term/human/cancer`
## 11. Regulation
Access regulatory feature and binding motif data.
**GET /regulatory/species/:species/microarray/:microarray/:probe**
- Get microarray probe information
- Example: `/regulatory/species/human/microarray/HumanWG_6_V2/ILMN_1773626`
**GET /species/:species/binding_matrix/:binding_matrix_id**
- Get transcription factor binding matrix
- Example: `/species/human/binding_matrix/ENSPFM0001`
## 12. Sequence
Retrieve genomic, transcript, and protein sequences.
**GET /sequence/id/:id**
- Get sequence by ID
- Parameters: `type` (genomic, cds, cdna, protein), `format` (json, fasta, text)
- Example: `/sequence/id/ENSG00000139618?type=genomic`
**POST /sequence/id**
- Batch sequence retrieval
- Example: `{"ids": ["ENSG00000139618", "ENSG00000157764"]}`
**GET /sequence/region/:species/:region**
- Get genomic sequence for region
- Parameters: `coord_system`, `format`
- Example: `/sequence/region/human/7:140424943..140624564?format=fasta`
**POST /sequence/region/:species**
- Batch region sequence retrieval
## 13. Transcript Haplotypes
Compute transcript haplotypes from phased genotypes.
**GET /transcript_haplotypes/:species/:id**
- Get transcript haplotypes
- Example: `/transcript_haplotypes/human/ENST00000288602`
## 14. Variant Effect Predictor (VEP)
Predict functional consequences of variants.
**GET /vep/:species/hgvs/:hgvs_notation**
- Predict variant effects using HGVS notation
- Parameters: numerous VEP options
- Example: `/vep/human/hgvs/ENST00000288602:c.803C>T`
**POST /vep/:species/hgvs**
- Batch VEP analysis with HGVS
- Example: `{"hgvs_notations": ["ENST00000288602:c.803C>T"]}`
**GET /vep/:species/id/:id**
- Predict effects for variant ID
- Example: `/vep/human/id/rs699`
**POST /vep/:species/id**
- Batch VEP by variant IDs
**GET /vep/:species/region/:region/:allele**
- Predict effects for region and allele
- Example: `/vep/human/region/7:140453136-140453136:1/T` (region is `chromosome:start-end:strand`, followed by the variant allele)
**POST /vep/:species/region**
- Batch VEP by regions
## 15. Variation
Query genetic variation data and associated publications.
**GET /variation/:species/:id**
- Get variant information by ID
- Parameters: `pops` (include population frequencies), `genotypes`
- Example: `/variation/human/rs699?pops=1`
**POST /variation/:species**
- Batch variant queries
- Example: `{"ids": ["rs699", "rs6025"]}`
**GET /variation/:species/pmcid/:pmcid**
- Get variants from PubMed Central article
- Example: `/variation/human/pmcid/PMC5002951`
**GET /variation/:species/pmid/:pmid**
- Get variants from PubMed article
- Example: `/variation/human/pmid/26318936`
## 16. Variation GA4GH
Access genomic variation data using GA4GH standards.
**POST /ga4gh/beacon**
- Query beacon for variant presence
**GET /ga4gh/features/:id**
- Get feature by ID in GA4GH format
**POST /ga4gh/features/search**
- Search features using GA4GH protocol
**POST /ga4gh/variants/search**
- Search variants using GA4GH protocol
## Response Formats
Most endpoints support multiple response formats:
- **JSON** (default): `Content-Type: application/json`
- **FASTA**: For sequence data
- **XML**: Some endpoints support XML
- **Text**: Plain text output
Specify format using:
1. `Content-Type` header
2. URL parameter: `content-type=text/x-fasta`
3. File extension: `/sequence/id/ENSG00000139618.fasta`
## Common Parameters
Many endpoints share these parameters:
- **expand**: Include child objects (transcripts, proteins)
- **format**: Output format (json, xml, fasta)
- **db_type**: Database type (core, otherfeatures, variation)
- **object_type**: Type of object to return
- **species**: Species name (can be common or scientific)
## Error Codes
- **200**: Success
- **400**: Bad request (invalid parameters)
- **404**: Not found (ID doesn't exist)
- **429**: Rate limit exceeded
- **500**: Internal server error
## Best Practices
1. **Use batch endpoints** for multiple queries (more efficient)
2. **Cache responses** to minimize API calls
3. **Check rate limit headers** in responses
4. **Handle 429 errors** by respecting `Retry-After` header
5. **Use appropriate content types** for sequence data
6. **Specify assembly** when querying older genome versions
7. **Enable expand parameter** when you need full object details

View File

@@ -0,0 +1,427 @@
#!/usr/bin/env python3
"""
Ensembl REST API Query Script
Reusable functions for common Ensembl database queries with built-in rate limiting and error handling.
Usage:
python ensembl_query.py --gene BRCA2 --species human
python ensembl_query.py --variant rs699 --species human
python ensembl_query.py --region "7:140424943-140624564" --species human
"""
import requests
import time
import json
import argparse
from typing import Dict, List, Optional, Any
class EnsemblAPIClient:
    """Client for querying the Ensembl REST API with rate limiting and error handling.

    Every public method goes through ``_make_request``, so JSON, FASTA and
    plain-text requests all share the same throttling, retry and error handling.
    """

    # Media types for the non-JSON ``format`` values of the sequence methods.
    # Any other ``format`` value falls back to JSON.
    _TEXT_CONTENT_TYPES = {"fasta": "text/x-fasta", "text": "text/plain"}

    def __init__(
        self,
        server: str = "https://rest.ensembl.org",
        rate_limit: int = 15,
        timeout: float = 30.0
    ):
        """
        Initialize the Ensembl API client.

        Args:
            server: Base URL for the Ensembl REST API
            rate_limit: Maximum requests per one-second window (default 15)
            timeout: Seconds to wait for a response before a request is
                treated as failed (and retried)
        """
        self.server = server
        self.rate_limit = rate_limit
        self.timeout = timeout
        self.request_count = 0
        # Start time of the current one-second accounting window.
        self.last_request_time = 0

    def _rate_limit_check(self):
        """Enforce rate limiting before making requests.

        Requests are counted in one-second windows. When the current window
        already holds ``rate_limit`` requests, sleep until it ends and then
        open a new window.
        """
        now = time.time()
        elapsed = now - self.last_request_time

        if elapsed >= 1.0:
            # The previous window has expired: start a new one.
            self.request_count = 0
            self.last_request_time = now
        elif self.request_count >= self.rate_limit:
            time.sleep(1.0 - elapsed)
            self.request_count = 0
            self.last_request_time = time.time()

    @staticmethod
    def _parse_retry_after(value: Any, default: float = 1.0) -> float:
        """
        Convert a Retry-After header value into a number of seconds to sleep.

        The header may carry fractional seconds (e.g. '40.145651'), so it is
        parsed as a float rather than an int.

        Args:
            value: Raw header value, or None when the header is missing
            default: Delay used when the value is missing, unparsable,
                negative or not finite

        Returns:
            Non-negative, finite delay in seconds
        """
        try:
            seconds = float(value)
        except (TypeError, ValueError):
            return default
        # NaN fails both comparisons, so it falls back to the default too.
        if not (0.0 <= seconds < float("inf")):
            return default
        return seconds

    def _sequence_content_type(self, format: str) -> str:
        """Return the media type to request for a sequence ``format`` value."""
        return self._TEXT_CONTENT_TYPES.get(format, "application/json")

    def _make_request(
        self,
        endpoint: str,
        params: Optional[Dict] = None,
        max_retries: int = 3,
        method: str = "GET",
        data: Optional[Dict] = None,
        content_type: str = "application/json",
        timeout: Optional[float] = None
    ) -> Any:
        """
        Make an API request with error handling and retries.

        Connection errors, timeouts and unexpected (e.g. 5xx) statuses are
        retried with exponential backoff, and 429 responses are retried after
        the delay given by the server. Other 4xx responses are raised
        immediately because repeating the same request cannot succeed.

        Args:
            endpoint: API endpoint path
            params: Query parameters
            max_retries: Maximum number of attempts
            method: HTTP method (GET or POST)
            data: JSON data for POST requests
            content_type: Media type to request for the response
            timeout: Per-request timeout in seconds (defaults to the
                client-wide timeout)

        Returns:
            Parsed JSON for JSON responses, otherwise the response text

        Raises:
            Exception: If the resource is not found, the request is rejected,
                or the request still fails after max_retries attempts
        """
        url = f"{self.server}{endpoint}"
        if timeout is None:
            timeout = self.timeout
        wants_json = content_type == "application/json"

        if method == "POST":
            # The POST body is always JSON; Accept selects the response format.
            headers = {"Content-Type": "application/json", "Accept": content_type}
        else:
            headers = {"Content-Type": content_type}

        for attempt in range(max_retries):
            is_last_attempt = attempt == max_retries - 1
            self._rate_limit_check()
            self.request_count += 1

            try:
                if method == "POST":
                    response = requests.post(
                        url, headers=headers, params=params, json=data, timeout=timeout
                    )
                else:
                    response = requests.get(
                        url, headers=headers, params=params, timeout=timeout
                    )
            except requests.exceptions.RequestException as e:
                if is_last_attempt:
                    raise Exception(
                        f"Request failed after {max_retries} attempts: {e}"
                    ) from e
                time.sleep(2 ** attempt)  # Exponential backoff
                continue

            status = response.status_code
            if status == 200:
                return response.json() if wants_json else response.text
            if status == 404:
                raise Exception(f"Resource not found: {endpoint}")
            if status == 429:
                # Rate limited - wait and retry (no point sleeping after the
                # final attempt).
                if not is_last_attempt:
                    retry_after = self._parse_retry_after(
                        response.headers.get('Retry-After')
                    )
                    print(f"Rate limited. Waiting {retry_after} seconds...")
                    time.sleep(retry_after)
                continue
            if 400 <= status < 500:
                raise Exception(
                    f"Request failed with status {status}: {response.text.strip()}"
                )

            # Server-side or unexpected status: back off and retry.
            if is_last_attempt:
                raise Exception(
                    f"Request failed after {max_retries} attempts: HTTP {status}"
                )
            time.sleep(2 ** attempt)

        raise Exception(f"Failed after {max_retries} attempts")

    def lookup_gene_by_symbol(self, species: str, symbol: str, expand: bool = True) -> Dict:
        """
        Look up gene information by symbol.

        Args:
            species: Species name (e.g., 'human', 'mouse')
            symbol: Gene symbol (e.g., 'BRCA2', 'TP53')
            expand: Include transcript information

        Returns:
            Gene information dictionary
        """
        endpoint = f"/lookup/symbol/{species}/{symbol}"
        params = {"expand": 1} if expand else {}
        return self._make_request(endpoint, params=params)

    def lookup_by_id(self, ensembl_id: str, expand: bool = False) -> Dict:
        """
        Look up object by Ensembl ID.

        Args:
            ensembl_id: Ensembl identifier (e.g., 'ENSG00000139618')
            expand: Include child objects

        Returns:
            Object information dictionary
        """
        endpoint = f"/lookup/id/{ensembl_id}"
        params = {"expand": 1} if expand else {}
        return self._make_request(endpoint, params=params)

    def get_sequence(
        self,
        ensembl_id: str,
        seq_type: str = "genomic",
        format: str = "json"
    ) -> Any:
        """
        Retrieve sequence by Ensembl ID.

        Args:
            ensembl_id: Ensembl identifier
            seq_type: Sequence type ('genomic', 'cds', 'cdna', 'protein')
            format: Output format ('json', 'fasta', 'text'); any other value
                is treated as 'json'

        Returns:
            Parsed JSON for 'json', otherwise the sequence as a string
        """
        endpoint = f"/sequence/id/{ensembl_id}"
        params = {"type": seq_type}
        return self._make_request(
            endpoint,
            params=params,
            content_type=self._sequence_content_type(format)
        )

    def get_region_sequence(
        self,
        species: str,
        region: str,
        format: str = "json"
    ) -> Any:
        """
        Get genomic sequence for a region.

        Args:
            species: Species name
            region: Region string (e.g., '7:140424943-140624564')
            format: Output format ('json', 'fasta', 'text'); any other value
                is treated as 'json'

        Returns:
            Parsed JSON for 'json', otherwise the sequence as a string
        """
        endpoint = f"/sequence/region/{species}/{region}"
        return self._make_request(
            endpoint,
            content_type=self._sequence_content_type(format)
        )

    def get_variant(self, species: str, variant_id: str, include_pops: bool = True) -> Dict:
        """
        Get variant information by ID.

        Args:
            species: Species name
            variant_id: Variant identifier (e.g., 'rs699')
            include_pops: Include population frequencies

        Returns:
            Variant information dictionary
        """
        endpoint = f"/variation/{species}/{variant_id}"
        params = {"pops": 1} if include_pops else {}
        return self._make_request(endpoint, params=params)

    def predict_variant_effect(
        self,
        species: str,
        hgvs_notation: str
    ) -> List[Dict]:
        """
        Predict variant consequences using VEP.

        Args:
            species: Species name
            hgvs_notation: HGVS notation (e.g., 'ENST00000288602:c.803C>T')

        Returns:
            List of predicted consequences
        """
        endpoint = f"/vep/{species}/hgvs/{hgvs_notation}"
        return self._make_request(endpoint)

    def find_orthologs(
        self,
        ensembl_id: str,
        target_species: Optional[str] = None
    ) -> Dict:
        """
        Find orthologs for a gene.

        Args:
            ensembl_id: Source gene Ensembl ID
            target_species: Target species (optional, returns all if not specified)

        Returns:
            Homology information dictionary
        """
        endpoint = f"/homology/id/{ensembl_id}"
        params = {}
        if target_species:
            params["target_species"] = target_species
        return self._make_request(endpoint, params=params)

    def get_region_features(
        self,
        species: str,
        region: str,
        feature_type: str = "gene"
    ) -> List[Dict]:
        """
        Get genomic features in a region.

        Args:
            species: Species name
            region: Region string (e.g., '7:140424943-140624564')
            feature_type: Feature type ('gene', 'transcript', 'variation', etc.)

        Returns:
            List of features
        """
        endpoint = f"/overlap/region/{species}/{region}"
        params = {"feature": feature_type}
        return self._make_request(endpoint, params=params)

    def get_species_info(self) -> List[Dict]:
        """
        Get information about all available species.

        Returns:
            List of species information dictionaries (empty if the response
            has no 'species' key)
        """
        endpoint = "/info/species"
        result = self._make_request(endpoint)
        return result.get("species", [])

    def get_assembly_info(self, species: str) -> Dict:
        """
        Get assembly information for a species.

        Args:
            species: Species name

        Returns:
            Assembly information dictionary
        """
        endpoint = f"/info/assembly/{species}"
        return self._make_request(endpoint)

    def map_coordinates(
        self,
        species: str,
        asm_from: str,
        region: str,
        asm_to: str
    ) -> Dict:
        """
        Map coordinates between genome assemblies.

        Args:
            species: Species name
            asm_from: Source assembly (e.g., 'GRCh37')
            region: Region string (e.g., '7:140453136-140453136')
            asm_to: Target assembly (e.g., 'GRCh38')

        Returns:
            Mapped coordinates
        """
        endpoint = f"/map/{species}/{asm_from}/{region}/{asm_to}"
        return self._make_request(endpoint)
def main():
    """Command-line interface for common Ensembl queries.

    Exactly one query is run, chosen in this order of precedence: --gene,
    --ensembl-id, --variant, --region, --orthologs. With none of them given,
    the help text is printed.

    Returns:
        0 on success (or when only the help text is printed), 1 if the query
        raised an error.
    """
    parser = argparse.ArgumentParser(
        description="Query the Ensembl database via REST API"
    )

    parser.add_argument("--gene", help="Gene symbol to look up")
    parser.add_argument("--ensembl-id", help="Ensembl ID to look up")
    parser.add_argument("--variant", help="Variant ID (e.g., rs699)")
    parser.add_argument("--region", help="Genomic region (chr:start-end)")
    parser.add_argument(
        "--species",
        default="human",
        help="Species name (default: human)"
    )
    parser.add_argument(
        "--orthologs",
        help="Find orthologs for gene (provide Ensembl ID)"
    )
    parser.add_argument(
        "--target-species",
        help="Target species for ortholog search"
    )
    parser.add_argument(
        "--sequence",
        action="store_true",
        help="Retrieve sequence (requires --gene or --ensembl-id or --region)"
    )
    parser.add_argument(
        "--format",
        choices=["json", "fasta"],
        default="json",
        help="Output format (default: json)"
    )
    # The default must be the current assembly: defaulting to GRCh37 would
    # silently send every query to the GRCh37 archive server.
    parser.add_argument(
        "--assembly",
        default="GRCh38",
        help=(
            "Genome assembly: GRCh37 queries grch37.rest.ensembl.org, "
            "any other value queries rest.ensembl.org (default: %(default)s)"
        )
    )

    args = parser.parse_args()

    # Select appropriate server
    server = "https://rest.ensembl.org"
    if args.assembly.lower() == "grch37":
        server = "https://grch37.rest.ensembl.org"

    client = EnsemblAPIClient(server=server)

    def show_sequence(sequence):
        # JSON results are pretty-printed; FASTA text is printed verbatim.
        print(json.dumps(sequence, indent=2) if args.format == "json" else sequence)

    try:
        if args.gene:
            print(f"Looking up gene: {args.gene}")
            result = client.lookup_gene_by_symbol(args.species, args.gene)
            if args.sequence:
                print(f"\nRetrieving sequence for {result['id']}...")
                show_sequence(client.get_sequence(result['id'], format=args.format))
            else:
                print(json.dumps(result, indent=2))

        elif args.ensembl_id:
            print(f"Looking up ID: {args.ensembl_id}")
            result = client.lookup_by_id(args.ensembl_id, expand=True)
            if args.sequence:
                print("\nRetrieving sequence...")
                show_sequence(client.get_sequence(args.ensembl_id, format=args.format))
            else:
                print(json.dumps(result, indent=2))

        elif args.variant:
            print(f"Looking up variant: {args.variant}")
            result = client.get_variant(args.species, args.variant)
            print(json.dumps(result, indent=2))

        elif args.region:
            if args.sequence:
                print(f"Retrieving sequence for region: {args.region}")
                show_sequence(
                    client.get_region_sequence(
                        args.species,
                        args.region,
                        format=args.format
                    )
                )
            else:
                print(f"Finding features in region: {args.region}")
                result = client.get_region_features(args.species, args.region)
                print(json.dumps(result, indent=2))

        elif args.orthologs:
            print(f"Finding orthologs for: {args.orthologs}")
            result = client.find_orthologs(
                args.orthologs,
                target_species=args.target_species
            )
            print(json.dumps(result, indent=2))

        else:
            parser.print_help()

    except Exception as e:
        # Top-level CLI boundary: report the failure and signal it via the
        # exit status instead of dumping a traceback.
        print(f"Error: {e}")
        return 1

    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. The bare
    # exit() helper is injected by the site module for interactive use and is
    # missing under ``python -S``, so raise SystemExit directly.
    raise SystemExit(main())

View File

@@ -0,0 +1,251 @@
---
name: metabolomics-workbench-database
description: Toolkit for accessing and querying the Metabolomics Workbench, an NIH-sponsored repository containing 4,200+ metabolomics studies with standardized nomenclature (RefMet), study metadata, experimental results, and comprehensive metabolite databases. Use this skill when working with metabolomics data, querying metabolite structures, accessing study results, standardizing metabolite names, performing mass spectrometry searches, or retrieving gene/protein associations with metabolites.
---
# Metabolomics Workbench Database
## Overview
The Metabolomics Workbench is a comprehensive NIH Common Fund-sponsored platform hosted at UCSD that serves as the primary repository for metabolomics research data. It provides programmatic access to over 4,200 processed studies (3,790+ publicly available), standardized metabolite nomenclature through RefMet, and powerful search capabilities across multiple analytical platforms (GC-MS, LC-MS, NMR).
This skill enables efficient interaction with the Metabolomics Workbench REST API to query metabolite structures, access study data, standardize nomenclature, perform mass spectrometry searches, and retrieve gene/protein-metabolite associations.
## Core Capabilities
### 1. Querying Metabolite Structures and Data
Access comprehensive metabolite information including structures, identifiers, and cross-references to external databases.
**Key operations:**
- Retrieve compound data by various identifiers (PubChem CID, InChI Key, KEGG ID, HMDB ID, etc.)
- Download molecular structures as MOL files or PNG images
- Access standardized compound classifications
- Cross-reference between different metabolite databases
**Example queries:**
```python
import requests
# Get compound information by PubChem CID
response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/pubchem_cid/5281365/all/json')
# Download molecular structure as PNG
response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/regno/11/png')
# Get compound name by registry number
response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/regno/11/name/json')
```
### 2. Accessing Study Metadata and Experimental Results
Query metabolomics studies by various criteria and retrieve complete experimental datasets.
**Key operations:**
- Search studies by metabolite, institute, investigator, or title
- Access study summaries, experimental factors, and analysis details
- Retrieve complete experimental data in various formats
- Download mwTab format files for complete study information
- Query untargeted metabolomics data
**Example queries:**
```python
# List all available public studies
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST/available/json')
# Get study summary
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/summary/json')
# Retrieve experimental data
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/data/json')
# Find studies containing a specific metabolite
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/refmet_name/Tyrosine/summary/json')
```
### 3. Standardizing Metabolite Nomenclature with RefMet
Use the RefMet database to standardize metabolite names and access systematic classification across four structural resolution levels.
**Key operations:**
- Match common metabolite names to standardized RefMet names
- Query by chemical formula, exact mass, or InChI Key
- Access hierarchical classification (super class, main class, sub class)
- Retrieve all RefMet entries or filter by classification
**Example queries:**
```python
# Standardize a metabolite name
response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/match/citrate/name/json')
# Query by molecular formula
response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/formula/C12H24O2/all/json')
# Get all metabolites in a specific class
response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/main_class/Fatty%20Acids/all/json')
# Retrieve complete RefMet database
response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/all/json')
```
### 4. Performing Mass Spectrometry Searches
Search for compounds by mass-to-charge ratio (m/z) with specified ion adducts and tolerance levels.
**Key operations:**
- Search precursor ion masses across multiple databases (Metabolomics Workbench, LIPIDS, RefMet)
- Specify ion adduct types (M+H, M-H, M+Na, M+NH4, M+2H, etc.)
- Calculate exact masses for known metabolites with specific adducts
- Set mass tolerance for flexible matching
**Example queries:**
```python
# Search by m/z value with M+H adduct
response = requests.get('https://www.metabolomicsworkbench.org/rest/moverz/MB/635.52/M+H/0.5/json')
# Calculate exact mass for a metabolite with specific adduct
response = requests.get('https://www.metabolomicsworkbench.org/rest/moverz/exactmass/PC(34:1)/M+H/json')
# Search across RefMet database
response = requests.get('https://www.metabolomicsworkbench.org/rest/moverz/REFMET/200.15/M-H/0.3/json')
```
### 5. Filtering Studies by Analytical and Biological Parameters
Use the MetStat context to find studies matching specific experimental conditions.
**Key operations:**
- Filter by analytical method (LCMS, GCMS, NMR)
- Specify ionization polarity (POSITIVE, NEGATIVE)
- Filter by chromatography type (HILIC, RP, GC)
- Target specific species, sample sources, or diseases
- Combine multiple filters using semicolon-delimited format
**Example queries:**
```python
# Find human blood studies on diabetes using LC-MS
response = requests.get('https://www.metabolomicsworkbench.org/rest/metstat/LCMS;POSITIVE;HILIC;Human;Blood;Diabetes/json')
# Find all human blood studies containing tyrosine
response = requests.get('https://www.metabolomicsworkbench.org/rest/metstat/;;;Human;Blood;;;Tyrosine/json')
# Filter by analytical method only
response = requests.get('https://www.metabolomicsworkbench.org/rest/metstat/GCMS;;;;;;/json')
```
### 6. Accessing Gene and Protein Information
Retrieve gene and protein data associated with metabolic pathways and metabolite metabolism.
**Key operations:**
- Query genes by symbol, name, or ID
- Access protein sequences and annotations
- Cross-reference between gene IDs, RefSeq IDs, and UniProt IDs
- Retrieve gene-metabolite associations
**Example queries:**
```python
# Get gene information by symbol
response = requests.get('https://www.metabolomicsworkbench.org/rest/gene/gene_symbol/ACACA/all/json')
# Retrieve protein data by UniProt ID
response = requests.get('https://www.metabolomicsworkbench.org/rest/protein/uniprot_id/Q13085/all/json')
```
## Common Workflows
### Workflow 1: Finding Studies for a Specific Metabolite
To find all studies containing measurements of a specific metabolite:
1. First standardize the metabolite name using RefMet:
```python
response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/match/glucose/name/json')
```
2. Use the standardized name to search for studies:
```python
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/refmet_name/Glucose/summary/json')
```
3. Retrieve experimental data from specific studies:
```python
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/data/json')
```
### Workflow 2: Identifying Compounds from MS Data
To identify potential compounds from mass spectrometry m/z values:
1. Perform m/z search with appropriate adduct and tolerance:
```python
response = requests.get('https://www.metabolomicsworkbench.org/rest/moverz/MB/180.06/M+H/0.5/json')
```
2. Review candidate compounds from results
3. Retrieve detailed information for candidate compounds:
```python
response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/regno/{regno}/all/json')
```
4. Download structures for confirmation:
```python
response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/regno/{regno}/png')
```
### Workflow 3: Exploring Disease-Specific Metabolomics
To find metabolomics studies for a specific disease and analytical platform:
1. Use MetStat to filter studies:
```python
response = requests.get('https://www.metabolomicsworkbench.org/rest/metstat/LCMS;POSITIVE;;Human;;Cancer/json')
```
2. Review study IDs from results
3. Access detailed study information:
```python
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST{ID}/summary/json')
```
4. Retrieve complete experimental data:
```python
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST{ID}/data/json')
```
## Output Formats
The API supports two primary output formats:
- **JSON** (default): Machine-readable format, ideal for programmatic access
- **TXT**: Human-readable tab-delimited text format
Specify format by appending `/json` or `/txt` to API URLs. When format is omitted, JSON is returned by default.
## Best Practices
1. **Use RefMet for standardization**: Always standardize metabolite names through RefMet before searching studies to ensure consistent nomenclature
2. **Specify appropriate adducts**: When performing m/z searches, use the correct ion adduct type for your analytical method (e.g., M+H for positive mode ESI)
3. **Set reasonable tolerances**: Use appropriate mass tolerance values (typically 0.5 Da for low-resolution, 0.01 Da for high-resolution MS)
4. **Cache reference data**: Consider caching frequently used reference data (RefMet database, compound information) to minimize API calls
5. **Handle large result sets**: Responses to broad queries can differ in structure (for example, a single record versus a collection of records), so inspect the shape of the response before parsing it
6. **Validate identifiers**: Cross-reference metabolite identifiers across multiple databases when possible to ensure correct compound identification
## Resources
### references/
Detailed API reference documentation is available in `references/api_reference.md`, including:
- Complete REST API endpoint specifications
- All available contexts (compound, study, refmet, metstat, gene, protein, moverz)
- Input/output parameter details
- Ion adduct types for mass spectrometry
- Additional query examples
Load this reference file when detailed API specifications are needed or when working with less common endpoints.

View File

@@ -0,0 +1,494 @@
# Metabolomics Workbench REST API Reference
## Base URL
All API requests use the following base URL:
```
https://www.metabolomicsworkbench.org/rest/
```
## API Structure
The REST API follows a consistent URL pattern:
```
/context/input_item/input_value/output_item/output_format
```
- **context**: The type of resource to access (study, compound, refmet, metstat, gene, protein, moverz)
- **input_item**: The type of identifier or search parameter
- **input_value**: The specific value to search for
- **output_item**: What data to return (e.g., all, name, summary)
- **output_format**: json or txt (json is default if omitted)
## Output Formats
- **json**: Machine-readable JSON format (default)
- **txt**: Tab-delimited text format for human readability
## Context 1: Compound
Retrieve metabolite structure and identification data.
### Input Items
| Input Item | Description | Example |
|------------|-------------|---------|
| `regno` | Metabolomics Workbench registry number | 11 |
| `pubchem_cid` | PubChem Compound ID | 5281365 |
| `inchi_key` | International Chemical Identifier Key | WQZGKKKJIJFFOK-GASJEMHNSA-N |
| `formula` | Molecular formula | C6H12O6 |
| `lm_id` | LIPID MAPS ID | LM... |
| `hmdb_id` | Human Metabolome Database ID | HMDB0000122 |
| `kegg_id` | KEGG Compound ID | C00031 |
### Output Items
| Output Item | Description |
|-------------|-------------|
| `all` | All available compound data |
| `classification` | Compound classification |
| `regno` | Registry number |
| `formula` | Molecular formula |
| `exactmass` | Exact mass |
| `inchi_key` | InChI Key |
| `name` | Common name |
| `sys_name` | Systematic name |
| `smiles` | SMILES notation |
| `lm_id` | LIPID MAPS ID |
| `pubchem_cid` | PubChem CID |
| `hmdb_id` | HMDB ID |
| `kegg_id` | KEGG ID |
| `chebi_id` | ChEBI ID |
| `metacyc_id` | MetaCyc ID |
| `molfile` | MOL file structure |
| `png` | PNG image of structure |
### Example Requests
```bash
# Get all compound data by PubChem CID
curl "https://www.metabolomicsworkbench.org/rest/compound/pubchem_cid/5281365/all/json"
# Get compound name by registry number
curl "https://www.metabolomicsworkbench.org/rest/compound/regno/11/name/json"
# Download structure as PNG
curl "https://www.metabolomicsworkbench.org/rest/compound/regno/11/png" -o structure.png
# Get compound by KEGG ID
curl "https://www.metabolomicsworkbench.org/rest/compound/kegg_id/C00031/all/json"
# Get compound by molecular formula
curl "https://www.metabolomicsworkbench.org/rest/compound/formula/C6H12O6/all/json"
```
## Context 2: Study
Access metabolomics research study metadata and experimental results.
### Input Items
| Input Item | Description | Example |
|------------|-------------|---------|
| `study_id` | Study identifier | ST000001 |
| `analysis_id` | Analysis identifier | AN000001 |
| `study_title` | Keywords in study title | diabetes |
| `institute` | Institute name | UCSD |
| `last_name` | Investigator last name | Smith |
| `metabolite_id` | Metabolite registry number | 11 |
| `refmet_name` | RefMet standardized name | Glucose |
| `kegg_id` | KEGG compound ID | C00031 |
### Output Items
| Output Item | Description |
|-------------|-------------|
| `summary` | Study overview and metadata |
| `factors` | Experimental factors and design |
| `analysis` | Analysis methods and parameters |
| `metabolites` | List of measured metabolites |
| `data` | Complete experimental data |
| `mwtab` | Complete study in mwTab format |
| `number_of_metabolites` | Count of metabolites measured |
| `species` | Organism species |
| `disease` | Disease studied |
| `source` | Sample source/tissue type |
| `untarg_studies` | Untargeted study information |
| `untarg_factors` | Untargeted study factors |
| `untarg_data` | Untargeted experimental data |
| `datatable` | Formatted data table |
| `available` | List available studies (use with ST as input_value) |
### Example Requests
```bash
# List all publicly available studies
curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST/available/json"
# Get study summary
curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/summary/json"
# Get experimental data
curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/data/json"
# Get study factors
curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/factors/json"
# Find studies containing a specific metabolite
curl "https://www.metabolomicsworkbench.org/rest/study/refmet_name/Tyrosine/summary/json"
# Search studies by investigator
curl "https://www.metabolomicsworkbench.org/rest/study/last_name/Smith/summary/json"
# Download complete study in mwTab format
curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/mwtab/txt"
```
## Context 3: RefMet
Query the standardized metabolite nomenclature database with hierarchical classification.
### Input Items
| Input Item | Description | Example |
|------------|-------------|---------|
| `name` | Metabolite name | glucose |
| `inchi_key` | InChI Key | WQZGKKKJIJFFOK-GASJEMHNSA-N |
| `pubchem_cid` | PubChem CID | 5793 |
| `exactmass` | Exact mass | 180.0634 |
| `formula` | Molecular formula | C6H12O6 |
| `super_class` | Super class name | Organic compounds |
| `main_class` | Main class name | Carbohydrates |
| `sub_class` | Sub class name | Monosaccharides |
| `match` | Name matching/standardization | citrate |
| `refmet_id` | RefMet identifier | 12345 |
| `all` | Retrieve all RefMet entries | (no value needed) |
### Output Items
| Output Item | Description |
|-------------|-------------|
| `all` | All available RefMet data |
| `name` | Standardized RefMet name |
| `inchi_key` | InChI Key |
| `pubchem_cid` | PubChem CID |
| `exactmass` | Exact mass |
| `formula` | Molecular formula |
| `sys_name` | Systematic name |
| `super_class` | Super class classification |
| `main_class` | Main class classification |
| `sub_class` | Sub class classification |
| `refmet_id` | RefMet identifier |
### Example Requests
```bash
# Standardize a metabolite name
curl "https://www.metabolomicsworkbench.org/rest/refmet/match/citrate/name/json"
# Get all RefMet data for a metabolite
curl "https://www.metabolomicsworkbench.org/rest/refmet/name/Glucose/all/json"
# Query by molecular formula
curl "https://www.metabolomicsworkbench.org/rest/refmet/formula/C6H12O6/all/json"
# Get all metabolites in a main class
curl "https://www.metabolomicsworkbench.org/rest/refmet/main_class/Fatty%20Acids/all/json"
# Query by exact mass
curl "https://www.metabolomicsworkbench.org/rest/refmet/exactmass/180.0634/all/json"
# Download complete RefMet database
curl "https://www.metabolomicsworkbench.org/rest/refmet/all/json"
```
### RefMet Classification Hierarchy
RefMet provides four-level structural resolution:
1. **Super Class**: Broadest categorization (e.g., "Organic compounds", "Lipids")
2. **Main Class**: Major biochemical categories (e.g., "Fatty Acids", "Carbohydrates")
3. **Sub Class**: More specific groupings (e.g., "Monosaccharides", "Amino acids")
4. **Individual Metabolite**: Specific compound with standardized name
## Context 4: MetStat
Filter studies by analytical and biological parameters using semicolon-delimited format.
### Format
```
/metstat/ANALYSIS_TYPE;POLARITY;CHROMATOGRAPHY;SPECIES;SAMPLE_SOURCE;DISEASE;KEGG_ID;REFMET_NAME
```
### Parameters
| Position | Parameter | Options |
|----------|-----------|---------|
| 1 | Analysis Type | LCMS, GCMS, NMR, MS, ICPMS |
| 2 | Polarity | POSITIVE, NEGATIVE |
| 3 | Chromatography | HILIC, RP (Reverse Phase), GC, IC |
| 4 | Species | Human, Mouse, Rat, etc. |
| 5 | Sample Source | Blood, Plasma, Serum, Urine, Liver, etc. |
| 6 | Disease | Diabetes, Cancer, Alzheimer, etc. |
| 7 | KEGG ID | C00031, etc. |
| 8 | RefMet Name | Glucose, Tyrosine, etc. |
**Note**: Use empty positions (consecutive semicolons) to skip parameters. All parameters are optional.
### Example Requests
```bash
# Human blood diabetes studies with LC-MS HILIC positive mode
curl "https://www.metabolomicsworkbench.org/rest/metstat/LCMS;POSITIVE;HILIC;Human;Blood;Diabetes/json"
# All human blood studies containing tyrosine
curl "https://www.metabolomicsworkbench.org/rest/metstat/;;;Human;Blood;;;Tyrosine/json"
# All GC-MS studies regardless of other parameters
curl "https://www.metabolomicsworkbench.org/rest/metstat/GCMS;;;;;;/json"
# Mouse liver studies
curl "https://www.metabolomicsworkbench.org/rest/metstat/;;;Mouse;Liver;;/json"
# All studies measuring glucose
curl "https://www.metabolomicsworkbench.org/rest/metstat/;;;;;;;Glucose/json"
```
## Context 5: Moverz
Perform mass spectrometry precursor ion searches by m/z value.
### Format for m/z Search
```
/moverz/DATABASE/mass/adduct/tolerance/format
```
- **DATABASE**: MB (Metabolomics Workbench), LIPIDS, REFMET
- **mass**: m/z value (e.g., 635.52)
- **adduct**: Ion adduct type (see table below)
- **tolerance**: Mass tolerance in Daltons (e.g., 0.5)
- **format**: json or txt
### Format for Exact Mass Calculation
```
/moverz/exactmass/metabolite_name/adduct/format
```
### Ion Adduct Types
#### Positive Mode Adducts
| Adduct | Description | Example Use |
|--------|-------------|-------------|
| `M+H` | Protonated molecule | Most common positive ESI |
| `M+Na` | Sodium adduct | Common in ESI |
| `M+K` | Potassium adduct | Less common ESI |
| `M+NH4` | Ammonium adduct | Common with ammonium salts |
| `M+2H` | Doubly protonated | Multiply charged ions |
| `M+H-H2O` | Dehydrated protonated | Loss of water |
| `M+2Na-H` | Disodium minus hydrogen | Multiple sodium |
| `M+CH3OH+H` | Methanol adduct | Methanol in mobile phase |
| `M+ACN+H` | Acetonitrile adduct | ACN in mobile phase |
| `M+ACN+Na` | ACN + sodium | ACN and sodium |
#### Negative Mode Adducts
| Adduct | Description | Example Use |
|--------|-------------|-------------|
| `M-H` | Deprotonated molecule | Most common negative ESI |
| `M+Cl` | Chloride adduct | Chlorinated mobile phases |
| `M+FA-H` | Formate adduct | Formic acid in mobile phase |
| `M+HAc-H` | Acetate adduct | Acetic acid in mobile phase |
| `M-H-H2O` | Deprotonated minus water | Water loss |
| `M-2H` | Doubly deprotonated | Multiply charged ions |
| `M+Na-2H` | Sodium minus two protons | Mixed charge states |
#### Uncharged
| Adduct | Description | Example Use |
|--------|-------------|-------------|
| `M` | Uncharged molecule | Direct ionization methods |
### Example Requests
```bash
# Search for compounds with m/z 635.52 (M+H) in MB database
curl "https://www.metabolomicsworkbench.org/rest/moverz/MB/635.52/M+H/0.5/json"
# Search in RefMet with negative mode
curl "https://www.metabolomicsworkbench.org/rest/moverz/REFMET/200.15/M-H/0.3/json"
# Search lipids database
curl "https://www.metabolomicsworkbench.org/rest/moverz/LIPIDS/760.59/M+Na/0.5/json"
# Calculate exact mass for known metabolite
curl "https://www.metabolomicsworkbench.org/rest/moverz/exactmass/PC(34:1)/M+H/json"
# High-resolution MS search (tight tolerance)
curl "https://www.metabolomicsworkbench.org/rest/moverz/MB/180.0634/M+H/0.01/json"
```
## Context 6: Gene
Access gene information from the Metabolome Gene/Protein (MGP) database.
### Input Items
| Input Item | Description | Example |
|------------|-------------|---------|
| `mgp_id` | MGP database ID | MGP001 |
| `gene_id` | NCBI Gene ID | 31 |
| `gene_name` | Full gene name | acetyl-CoA carboxylase |
| `gene_symbol` | Gene symbol | ACACA |
| `taxid` | Taxonomy ID | 9606 (human) |
### Output Items
| Output Item | Description |
|-------------|-------------|
| `all` | All gene information |
| `mgp_id` | MGP identifier |
| `gene_id` | NCBI Gene ID |
| `gene_name` | Full gene name |
| `gene_symbol` | Gene symbol |
| `gene_synonyms` | Alternative names |
| `alt_names` | Alternative nomenclature |
| `chromosome` | Chromosomal location |
| `map_location` | Genetic map position |
| `summary` | Gene description |
| `taxid` | Taxonomy ID |
| `species` | Species short name |
| `species_long` | Full species name |
### Example Requests
```bash
# Get gene information by symbol
curl "https://www.metabolomicsworkbench.org/rest/gene/gene_symbol/ACACA/all/json"
# Get gene by NCBI Gene ID
curl "https://www.metabolomicsworkbench.org/rest/gene/gene_id/31/all/json"
# Search by gene name
curl "https://www.metabolomicsworkbench.org/rest/gene/gene_name/carboxylase/summary/json"
```
## Context 7: Protein
Retrieve protein sequence and annotation data.
### Input Items
| Input Item | Description | Example |
|------------|-------------|---------|
| `mgp_id` | MGP database ID | MGP001 |
| `gene_id` | NCBI Gene ID | 31 |
| `gene_name` | Gene name | acetyl-CoA carboxylase |
| `gene_symbol` | Gene symbol | ACACA |
| `taxid` | Taxonomy ID | 9606 |
| `mrna_id` | mRNA identifier | NM_001093.3 |
| `refseq_id` | RefSeq ID | NP_001084 |
| `protein_gi` | GenInfo Identifier | 4557237 |
| `uniprot_id` | UniProt ID | Q13085 |
| `protein_entry` | Protein entry name | ACACA_HUMAN |
| `protein_name` | Protein name | Acetyl-CoA carboxylase |
### Output Items
| Output Item | Description |
|-------------|-------------|
| `all` | All protein information |
| `mgp_id` | MGP identifier |
| `gene_id` | NCBI Gene ID |
| `gene_name` | Gene name |
| `gene_symbol` | Gene symbol |
| `taxid` | Taxonomy ID |
| `species` | Species short name |
| `species_long` | Full species name |
| `mrna_id` | mRNA identifier |
| `refseq_id` | RefSeq protein ID |
| `protein_gi` | GenInfo Identifier |
| `uniprot_id` | UniProt accession |
| `protein_entry` | Protein entry name |
| `protein_name` | Full protein name |
| `seqlength` | Sequence length |
| `seq` | Amino acid sequence |
| `is_identical_to` | Identical sequences |
### Example Requests
```bash
# Get protein information by UniProt ID
curl "https://www.metabolomicsworkbench.org/rest/protein/uniprot_id/Q13085/all/json"
# Get protein by gene symbol
curl "https://www.metabolomicsworkbench.org/rest/protein/gene_symbol/ACACA/all/json"
# Get protein sequence
curl "https://www.metabolomicsworkbench.org/rest/protein/uniprot_id/Q13085/seq/json"
# Search by RefSeq ID
curl "https://www.metabolomicsworkbench.org/rest/protein/refseq_id/NP_001084/all/json"
```
## Error Handling
The API returns appropriate HTTP status codes:
- **200 OK**: Successful request
- **400 Bad Request**: Invalid parameters or malformed request
- **404 Not Found**: Resource not found
- **500 Internal Server Error**: Server-side error
When no results are found, the API typically returns an empty array or object rather than an error code.
## Rate Limiting
As of 2025, the Metabolomics Workbench REST API does not enforce strict rate limits for reasonable use. However, best practices include:
- Implementing delays between bulk requests
- Caching frequently accessed reference data
- Using appropriate batch sizes for large-scale queries
## Additional Resources
- **Interactive REST URL Creator**: https://www.metabolomicsworkbench.org/tools/mw_rest.php
- **Official API Specification**: https://www.metabolomicsworkbench.org/tools/MWRestAPIv1.1.pdf
- **Python Library**: mwtab package for Python users
- **R Package**: metabolomicsWorkbenchR (Bioconductor)
- **Julia Package**: MetabolomicsWorkbenchAPI.jl
## Python Example: Complete Workflow
```python
import requests
import json

# End-to-end example: standardize a name, find studies, fetch data,
# run an m/z search, and download a structure image.
# Every request uses a timeout so the script cannot hang indefinitely, and
# raise_for_status() so HTTP errors surface before the body is parsed.

# 1. Standardize metabolite name using RefMet
metabolite = "citrate"
response = requests.get(
    f'https://www.metabolomicsworkbench.org/rest/refmet/match/{metabolite}/name/json',
    timeout=30,
)
response.raise_for_status()
standardized_name = response.json()['name']

# 2. Search for studies containing this metabolite
response = requests.get(
    f'https://www.metabolomicsworkbench.org/rest/study/refmet_name/{standardized_name}/summary/json',
    timeout=30,
)
response.raise_for_status()
studies = response.json()

# 3. Get detailed data from a specific study
# NOTE(review): assumes the parsed response is a non-empty list of records;
# confirm the response shape against the API before relying on studies[0].
study_id = studies[0]['study_id']
response = requests.get(
    f'https://www.metabolomicsworkbench.org/rest/study/study_id/{study_id}/data/json',
    timeout=30,
)
response.raise_for_status()
data = response.json()

# 4. Perform m/z search for compound identification
mz_value = 180.06
response = requests.get(
    f'https://www.metabolomicsworkbench.org/rest/moverz/MB/{mz_value}/M+H/0.5/json',
    timeout=30,
)
response.raise_for_status()
matches = response.json()

# 5. Get compound structure
# NOTE(review): same assumption as step 3 about the shape of `matches`.
regno = matches[0]['regno']
response = requests.get(
    f'https://www.metabolomicsworkbench.org/rest/compound/regno/{regno}/png',
    timeout=30,
)
response.raise_for_status()
# The PNG endpoint returns binary image data, so write raw bytes.
with open('structure.png', 'wb') as f:
    f.write(response.content)
```

View File

@@ -0,0 +1,261 @@
---
name: reactome-database
description: Work with Reactome pathway database for analyzing biological pathways, performing pathway enrichment analysis, querying molecular interactions, and analyzing gene expression data. This skill should be used when working with biological pathways, performing overrepresentation analysis, mapping gene identifiers to pathways, analyzing gene expression datasets, or exploring disease-related pathways. Supports both direct REST API access and the reactome2py Python package.
---
# Reactome Database
## Overview
This skill enables interaction with Reactome, a free, open-source, curated and peer-reviewed pathway database. Reactome provides comprehensive biological pathway data for research, genome analysis, modeling, and systems biology. The database contains thousands of human pathways, reactions, proteins, small molecules, and drugs, all supported by extensive literature references.
## Core Capabilities
Reactome provides two main API services and a Python client library:
### 1. Content Service - Data Retrieval
Query and retrieve biological pathway data, molecular interactions, and entity information.
**Common operations:**
- Retrieve pathway information and hierarchies
- Query specific entities (proteins, reactions, complexes)
- Get participating molecules in pathways
- Access database version and metadata
- Explore pathway compartments and locations
**API Base URL:** `https://reactome.org/ContentService`
### 2. Analysis Service - Pathway Analysis
Perform computational analysis on gene lists and expression data.
**Analysis types:**
- **Overrepresentation Analysis**: Identify statistically significant pathways from gene/protein lists
- **Expression Data Analysis**: Analyze gene expression datasets to find relevant pathways
- **Species Comparison**: Compare pathway data across different organisms
**API Base URL:** `https://reactome.org/AnalysisService`
### 3. reactome2py Python Package
Python client library that wraps Reactome API calls for easier programmatic access.
**Installation:**
```bash
pip install reactome2py
```
**Note:** The reactome2py package (version 3.0.0, released January 2021) is functional but not actively maintained. For the most up-to-date functionality, consider using direct REST API calls.
## Querying Pathway Data
### Using Content Service REST API
The Content Service uses REST protocol and returns data in JSON or plain text formats.
**Get database version:**
```python
import requests
response = requests.get("https://reactome.org/ContentService/data/database/version")
version = response.text
print(f"Reactome version: {version}")
```
**Query a specific entity:**
```python
import requests
entity_id = "R-HSA-69278" # Example pathway ID
response = requests.get(f"https://reactome.org/ContentService/data/query/{entity_id}")
data = response.json()
```
**Get participating molecules in a pathway:**
```python
import requests
event_id = "R-HSA-69278"
response = requests.get(
f"https://reactome.org/ContentService/data/event/{event_id}/participatingPhysicalEntities"
)
molecules = response.json()
```
### Using reactome2py Package
```python
import reactome2py
from reactome2py import content
# Query pathway information
pathway_info = content.query_by_id("R-HSA-69278")
# Get database version
version = content.get_database_version()
```
**For detailed API endpoints and parameters**, refer to `references/api_reference.md` in this skill.
## Performing Pathway Analysis
### Overrepresentation Analysis
Submit a list of gene/protein identifiers to find enriched pathways.
**Using REST API:**
```python
import requests

# Prepare identifier list (one identifier per line in the request body)
identifiers = ["TP53", "BRCA1", "EGFR", "MYC"]
data = "\n".join(identifiers)

# Submit analysis
response = requests.post(
    "https://reactome.org/AnalysisService/identifiers/",
    headers={"Content-Type": "text/plain"},
    data=data,
    timeout=60,  # avoid hanging indefinitely if the service is unresponsive
)
response.raise_for_status()  # surface HTTP errors before decoding the body
result = response.json()
token = result["summary"]["token"]  # Save token to retrieve results later

# Access pathways
for pathway in result["pathways"]:
    print(f"{pathway['stId']}: {pathway['name']} (p-value: {pathway['entities']['pValue']})")
```
**Retrieve analysis by token:**
```python
# Token is valid for 7 days
response = requests.get(f"https://reactome.org/AnalysisService/token/{token}")
results = response.json()
```
### Expression Data Analysis
Analyze gene expression datasets with quantitative values.
**Input format (TSV with header starting with #):**
```
#Gene Sample1 Sample2 Sample3
TP53 2.5 3.1 2.8
BRCA1 1.2 1.5 1.3
EGFR 4.5 4.2 4.8
```
**Submit expression data:**
```python
import requests

# Read TSV file (the header row must start with "#")
with open("expression_data.tsv", "r", encoding="utf-8") as f:
    data = f.read()

# Expression data is posted to the same endpoint as a plain identifier list.
response = requests.post(
    "https://reactome.org/AnalysisService/identifiers/",
    headers={"Content-Type": "text/plain"},
    data=data,
    timeout=60,  # avoid hanging indefinitely if the service is unresponsive
)
response.raise_for_status()  # surface HTTP errors before decoding the body
result = response.json()
```
### Species Projection
Use the `/projection/` endpoint to map identifiers (including non-human ones) onto human pathways, so that results are reported against human pathways only:
```python
response = requests.post(
"https://reactome.org/AnalysisService/identifiers/projection/",
headers={"Content-Type": "text/plain"},
data=data
)
```
## Visualizing Results
Analysis results can be visualized in the Reactome Pathway Browser by constructing URLs with the analysis token:
```python
token = result["summary"]["token"]
pathway_id = "R-HSA-69278"
url = f"https://reactome.org/PathwayBrowser/#{pathway_id}&DTAB=AN&ANALYSIS={token}"
print(f"View results: {url}")
```
## Working with Analysis Tokens
- Analysis tokens are valid for **7 days**
- Tokens allow retrieval of previously computed results without re-submission
- Store tokens to access results across sessions
- Use the Analysis Service `GET /token/{TOKEN}` endpoint to retrieve results
## Data Formats and Identifiers
### Supported Identifier Types
Reactome accepts various identifier formats:
- UniProt accessions (e.g., P04637)
- Gene symbols (e.g., TP53)
- Ensembl IDs (e.g., ENSG00000141510)
- EntrezGene IDs (e.g., 7157)
- ChEBI IDs for small molecules
The system automatically detects identifier types.
### Input Format Requirements
**For overrepresentation analysis:**
- Plain text list of identifiers (one per line)
- OR single column in TSV format
**For expression analysis:**
- TSV format with mandatory header row starting with "#"
- Column 1: identifiers
- Columns 2+: numeric expression values
- Use period (.) as decimal separator
### Output Format
Analysis Service responses are JSON objects containing:
- `pathways`: Array of enriched pathways with statistical metrics
- `summary`: Analysis metadata and token
- `entities`: Matched and unmapped identifiers
- Statistical values: pValue, FDR (false discovery rate)
## Helper Scripts
This skill includes `scripts/reactome_query.py`, a helper script for common Reactome operations:
```bash
# Query pathway information
python scripts/reactome_query.py query R-HSA-69278
# Perform overrepresentation analysis
python scripts/reactome_query.py analyze gene_list.txt
# Get database version
python scripts/reactome_query.py version
```
## Additional Resources
- **API Documentation**: https://reactome.org/dev
- **User Guide**: https://reactome.org/userguide
- **Documentation Portal**: https://reactome.org/documentation
- **Data Downloads**: https://reactome.org/download-data
- **reactome2py Docs**: https://reactome.github.io/reactome2py/
For comprehensive API endpoint documentation, see `references/api_reference.md` in this skill.
## Current Database Statistics (Version 94, September 2025)
- 2,825 human pathways
- 16,002 reactions
- 11,630 proteins
- 2,176 small molecules
- 1,070 drugs
- 41,373 literature references

View File

@@ -0,0 +1,465 @@
# Reactome API Reference
This document provides comprehensive reference information for Reactome's REST APIs.
## Base URLs
- **Content Service**: `https://reactome.org/ContentService`
- **Analysis Service**: `https://reactome.org/AnalysisService`
## Content Service API
The Content Service provides access to Reactome's curated pathway data through REST endpoints.
### Database Information
#### Get Database Version
```
GET /data/database/version
```
**Response:** Plain text containing the database version number
**Example:**
```python
import requests
response = requests.get("https://reactome.org/ContentService/data/database/version")
print(response.text) # e.g., "94"
```
#### Get Database Name
```
GET /data/database/name
```
**Response:** Plain text containing the database name
### Entity Queries
#### Query Entity by ID
```
GET /data/query/{id}
```
**Parameters:**
- `id` (path): Stable identifier or database ID (e.g., "R-HSA-69278")
**Response:** JSON object containing full entity information including:
- `stId`: Stable identifier
- `displayName`: Human-readable name
- `schemaClass`: Entity type (Pathway, Reaction, Complex, etc.)
- `species`: Array of species information
- Additional type-specific fields
**Example:**
```python
import requests
response = requests.get("https://reactome.org/ContentService/data/query/R-HSA-69278")
pathway = response.json()
print(f"Pathway: {pathway['displayName']}")
print(f"Species: {pathway['species'][0]['displayName']}")
```
#### Query Entity Attribute
```
GET /data/query/{id}/{attribute}
```
**Parameters:**
- `id` (path): Entity identifier
- `attribute` (path): Specific attribute name (e.g., "displayName", "compartment")
**Response:** JSON or plain text depending on attribute type
**Example:**
```python
response = requests.get("https://reactome.org/ContentService/data/query/R-HSA-69278/displayName")
name = response.text
```
### Pathway Queries
#### Get Pathway Entities
```
GET /data/event/{id}/participatingPhysicalEntities
```
**Parameters:**
- `id` (path): Pathway or reaction stable identifier
**Response:** JSON array of physical entities (proteins, complexes, small molecules) participating in the pathway
**Example:**
```python
response = requests.get(
    "https://reactome.org/ContentService/data/event/R-HSA-69278/participatingPhysicalEntities",
    timeout=30,  # avoid hanging indefinitely if the service is unresponsive
)
response.raise_for_status()  # surface HTTP errors before decoding the body
entities = response.json()
# Each entry is printed by stable ID, display name, and schema class.
for entity in entities:
    print(f"{entity['stId']}: {entity['displayName']} ({entity['schemaClass']})")
```
#### Get Contained Events
```
GET /data/pathway/{id}/containedEvents
```
**Parameters:**
- `id` (path): Pathway stable identifier
**Response:** JSON array of events (reactions, subpathways) contained within the pathway
### Search Queries
#### Search by Name
```
GET /data/query?name={query}
```
**Parameters:**
- `name` (query): Search term
**Response:** JSON array of matching entities
**Example:**
```python
response = requests.get(
"https://reactome.org/ContentService/data/query",
params={"name": "glycolysis"}
)
results = response.json()
```
## Analysis Service API
The Analysis Service performs pathway enrichment and expression analysis.
### Submit Analysis
#### Submit Identifiers (POST)
```
POST /identifiers/
POST /identifiers/projection/ # Map to human pathways only
```
**Headers:**
- `Content-Type: text/plain`
**Body:**
- For overrepresentation: Plain text list of identifiers (one per line)
- For expression analysis: TSV format with header starting with "#"
**Expression data format:**
```
#Gene Sample1 Sample2 Sample3
TP53 2.5 3.1 2.8
BRCA1 1.2 1.5 1.3
```
**Response:** JSON object containing:
```json
{
"summary": {
"token": "MzUxODM3NTQzMDAwMDA1ODI4MA==",
"type": "OVERREPRESENTATION",
"species": "9606",
"sampleName": null,
"fileName": null,
"text": true
},
"pathways": [
{
"stId": "R-HSA-69278",
"name": "Cell Cycle, Mitotic",
"species": {
"name": "Homo sapiens",
"taxId": "9606"
},
"entities": {
"found": 15,
"total": 450,
"pValue": 0.0000234,
"fdr": 0.00156
},
"reactions": {
"found": 12,
"total": 342
}
}
],
"resourceSummary": [
{
"resource": "TOTAL",
"pathways": 25
}
]
}
```
**Example:**
```python
import requests

# Overrepresentation analysis
identifiers = ["TP53", "BRCA1", "EGFR", "MYC", "CDK1"]
data = "\n".join(identifiers)  # one identifier per line
response = requests.post(
    "https://reactome.org/AnalysisService/identifiers/",
    headers={"Content-Type": "text/plain"},
    data=data,
    timeout=60,  # avoid hanging indefinitely if the service is unresponsive
)
response.raise_for_status()  # surface HTTP errors before decoding the body
result = response.json()
token = result["summary"]["token"]

# Process pathways
for pathway in result["pathways"]:
    print(f"Pathway: {pathway['name']}")
    print(f" Found: {pathway['entities']['found']}/{pathway['entities']['total']}")
    print(f" p-value: {pathway['entities']['pValue']:.6f}")
    print(f" FDR: {pathway['entities']['fdr']:.6f}")
```
#### Submit File (Form Upload)
```
POST /identifiers/form/
```
**Content-Type:** `multipart/form-data`
**Parameters:**
- `file`: File containing identifiers or expression data
#### Submit URL
```
POST /identifiers/url/
```
**Parameters:**
- `url`: URL pointing to data file
### Retrieve Analysis Results
#### Get Results by Token
```
GET /token/{token}
GET /token/{token}/projection/ # With species projection
```
**Parameters:**
- `token` (path): Analysis token returned from submission
**Response:** Same structure as initial analysis response
**Example:**
```python
token = "MzUxODM3NTQzMDAwMDA1ODI4MA=="
response = requests.get(f"https://reactome.org/AnalysisService/token/{token}")
results = response.json()
```
**Note:** Tokens are valid for 7 days
#### Filter Results
```
GET /token/{token}/filter/pathways?resource={resource}
```
**Parameters:**
- `token` (path): Analysis token
- `resource` (query): Resource filter (e.g., "TOTAL", "UNIPROT", "ENSEMBL")
### Download Results
#### Download as CSV
```
GET /download/{token}/pathways/{resource}/result.csv
```
#### Download Mapping
```
GET /download/{token}/entities/found/{resource}/mapping.tsv
```
## Supported Identifiers
Reactome automatically detects and processes various identifier types:
### Proteins and Genes
- **UniProt**: P04637
- **Gene Symbol**: TP53
- **Ensembl**: ENSG00000141510
- **EntrezGene**: 7157
- **RefSeq**: NM_000546
- **OMIM**: 191170
### Small Molecules
- **ChEBI**: CHEBI:15377
- **KEGG Compound**: C00031
- **PubChem**: 702
### Other
- **miRBase**: hsa-miR-21
- **InterPro**: IPR011616
## Response Formats
### JSON Objects
Entity objects contain standardized fields:
```json
{
"stId": "R-HSA-69278",
"displayName": "Cell Cycle, Mitotic",
"schemaClass": "Pathway",
"species": [
{
"dbId": 48887,
"displayName": "Homo sapiens",
"taxId": "9606"
}
],
"isInDisease": false
}
```
### TSV Format
For bulk queries, TSV returns:
```
stId displayName schemaClass
R-HSA-69278 Cell Cycle, Mitotic Pathway
R-HSA-69306 DNA Replication Pathway
```
## Error Responses
### HTTP Status Codes
- `200`: Success
- `400`: Bad Request (invalid parameters)
- `404`: Not Found (invalid ID)
- `415`: Unsupported Media Type
- `500`: Internal Server Error
### Error JSON Structure
```json
{
"code": 404,
"reason": "NOT_FOUND",
"messages": ["Pathway R-HSA-INVALID not found"]
}
```
## Rate Limiting
Reactome does not currently enforce strict rate limits, but consider:
- Implementing reasonable delays between requests
- Using batch operations when available
- Caching results when appropriate
- Respecting the 7-day token validity period
## Best Practices
### 1. Use Analysis Tokens
Store and reuse analysis tokens to avoid redundant computation:
```python
# Store token after analysis
token = result["summary"]["token"]
save_token(token) # Save to file or database
# Retrieve results later
result = requests.get(f"https://reactome.org/AnalysisService/token/{token}")
```
### 2. Batch Queries
Submit multiple identifiers in a single request rather than individual queries:
```python
# Good: Single batch request
identifiers = ["TP53", "BRCA1", "EGFR"]
result = analyze_batch(identifiers)
# Avoid: Multiple individual requests
# for gene in genes:
# result = analyze_single(gene) # Don't do this
```
### 3. Handle Species Appropriately
Use `/projection/` endpoints to map non-human identifiers to human pathways:
```python
# For mouse genes, project to human pathways
response = requests.post(
"https://reactome.org/AnalysisService/identifiers/projection/",
headers={"Content-Type": "text/plain"},
data=mouse_genes
)
```
### 4. Process Large Result Sets
For analyses returning many pathways, filter by significance:
```python
significant_pathways = [
p for p in result["pathways"]
if p["entities"]["fdr"] < 0.05
]
```
## Integration Examples
### Complete Analysis Workflow
```python
import requests
import json
def analyze_gene_list(genes, output_file="analysis_results.json",
                      fdr_threshold=0.05, timeout=30):
    """
    Perform pathway enrichment analysis on a list of genes.

    Args:
        genes: Iterable of identifier strings, submitted one per line.
        output_file: Path of the JSON summary file to write.
        fdr_threshold: Pathways whose entities FDR is below this value are
            kept as significant.
        timeout: Seconds to wait for the Analysis Service before giving up
            (requests waits indefinitely when no timeout is given).

    Returns:
        The full, unfiltered JSON response of the analysis as a dict.

    Raises:
        RuntimeError: If the service answers with a non-200 status.
    """
    # Submit analysis
    data = "\n".join(genes)
    response = requests.post(
        "https://reactome.org/AnalysisService/identifiers/",
        headers={"Content-Type": "text/plain"},
        data=data,
        timeout=timeout,
    )
    if response.status_code != 200:
        raise RuntimeError(f"Analysis failed: {response.text}")

    result = response.json()
    token = result["summary"]["token"]

    # Filter significant pathways (FDR below fdr_threshold)
    significant = [
        p for p in result["pathways"]
        if p["entities"]["fdr"] < fdr_threshold
    ]

    # Save results
    with open(output_file, "w") as f:
        json.dump({
            "token": token,
            "total_pathways": len(result["pathways"]),
            "significant_pathways": len(significant),
            "pathways": significant
        }, f, indent=2)

    # Generate browser URL for the first significant pathway in the response
    if significant:
        top_pathway = significant[0]
        url = f"https://reactome.org/PathwayBrowser/#{top_pathway['stId']}&DTAB=AN&ANALYSIS={token}"
        print(f"View top result: {url}")

    return result
# Usage
genes = ["TP53", "BRCA1", "BRCA2", "CDK1", "CDK2"]
result = analyze_gene_list(genes)
```
## Additional Resources
- **Interactive API Documentation**: https://reactome.org/dev/content-service
- **Analysis Service Docs**: https://reactome.org/dev/analysis
- **User Guide**: https://reactome.org/userguide
- **Data Downloads**: https://reactome.org/download-data

View File

@@ -0,0 +1,286 @@
#!/usr/bin/env python3
"""
Reactome Database Query Helper Script
This script provides convenient command-line access to common Reactome operations.
Usage:
python reactome_query.py version
python reactome_query.py query <pathway_id>
python reactome_query.py analyze <gene_list_file>
python reactome_query.py search <term>
python reactome_query.py entities <pathway_id>
Examples:
python reactome_query.py version
python reactome_query.py query R-HSA-69278
python reactome_query.py analyze genes.txt
python reactome_query.py search "cell cycle"
python reactome_query.py entities R-HSA-69278
"""
import json
import os
import sys
from typing import Dict, List, Optional

import requests
class ReactomeClient:
    """Client for interacting with Reactome REST APIs.

    Each public method issues a single HTTP request and raises
    ``requests.HTTPError`` (via ``raise_for_status``) when the server
    answers with an error status.
    """

    CONTENT_BASE = "https://reactome.org/ContentService"
    ANALYSIS_BASE = "https://reactome.org/AnalysisService"

    # Seconds to wait on the server. requests applies no timeout unless one
    # is passed, so without this a stalled connection would block forever.
    TIMEOUT = 30

    def _get(self, url: str, **kwargs) -> "requests.Response":
        """Send a GET with the class timeout and raise on HTTP error status."""
        response = requests.get(url, timeout=self.TIMEOUT, **kwargs)
        response.raise_for_status()
        return response

    def get_version(self) -> str:
        """Get Reactome database version as a whitespace-stripped string."""
        return self._get(f"{self.CONTENT_BASE}/data/database/version").text.strip()

    def query_pathway(self, pathway_id: str) -> Dict:
        """Query pathway information by ID and return the decoded JSON."""
        return self._get(f"{self.CONTENT_BASE}/data/query/{pathway_id}").json()

    def get_pathway_entities(self, pathway_id: str) -> List[Dict]:
        """Get participating physical entities in a pathway."""
        return self._get(
            f"{self.CONTENT_BASE}/data/event/{pathway_id}/participatingPhysicalEntities"
        ).json()

    def search_pathways(self, term: str) -> List[Dict]:
        """Search for pathways by name.

        NOTE(review): this sends ``GET {CONTENT_BASE}/data/query?name=<term>``;
        confirm against the ContentService documentation that this endpoint
        exists and returns a list of entries -- TODO verify.
        """
        return self._get(
            f"{self.CONTENT_BASE}/data/query",
            params={"name": term},
        ).json()

    def analyze_genes(self, gene_list: List[str]) -> Dict:
        """Perform pathway enrichment analysis on gene list.

        The identifiers are joined with newlines and posted as plain text.
        """
        response = requests.post(
            f"{self.ANALYSIS_BASE}/identifiers/",
            headers={"Content-Type": "text/plain"},
            data="\n".join(gene_list),
            timeout=self.TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def get_analysis_by_token(self, token: str) -> Dict:
        """Retrieve analysis results by token."""
        return self._get(f"{self.ANALYSIS_BASE}/token/{token}").json()
def print_json(data):
    """Serialize ``data`` to JSON with a two-space indent and print it."""
    rendered = json.dumps(data, indent=2)
    print(rendered)
def command_version():
    """Get and display Reactome version.

    On a failed request, prints ``Error: ...`` and exits with status 1,
    the same way the other ``command_*`` functions report request failures,
    instead of letting the exception escape as a traceback.
    """
    client = ReactomeClient()
    try:
        version = client.get_version()
    except requests.RequestException as e:
        # RequestException also covers HTTPError, connection errors, timeouts.
        print(f"Error: {e}")
        sys.exit(1)
    print(f"Reactome Database Version: {version}")
def command_query(pathway_id: str):
    """Query and display pathway information.

    Prints the display name, stable ID and schema class, then the first
    species and first summation text when present, then the full JSON.

    On a 404 prints a "not found" message; on any other request failure
    (HTTP error status, connection error, timeout) prints ``Error: ...``.
    Both cases exit with status 1.
    """
    client = ReactomeClient()
    try:
        pathway = client.query_pathway(pathway_id)
    except requests.HTTPError as e:
        # e.response may be None when the error carries no response object.
        if getattr(e.response, "status_code", None) == 404:
            print(f"Error: Pathway '{pathway_id}' not found")
        else:
            print(f"Error: {e}")
        sys.exit(1)
    except requests.RequestException as e:
        # Network-level failures never produce an HTTP status.
        print(f"Error: {e}")
        sys.exit(1)

    print(f"Pathway: {pathway['displayName']}")
    print(f"ID: {pathway['stId']}")
    print(f"Type: {pathway['schemaClass']}")

    if pathway.get('species'):
        species = pathway['species'][0]['displayName']
        print(f"Species: {species}")

    if pathway.get('summation'):
        summation = pathway['summation'][0]['text']
        print(f"\nDescription: {summation}")

    print("\nFull JSON response:")
    print_json(pathway)
def command_entities(pathway_id: str):
    """Display entities participating in a pathway.

    Entities are grouped by their ``schemaClass``; groups are listed in
    sorted order and at most the first 10 members of each are printed.

    On a 404 prints a "not found" message; on any other request failure
    (HTTP error status, connection error, timeout) prints ``Error: ...``.
    Both cases exit with status 1.
    """
    client = ReactomeClient()
    try:
        entities = client.get_pathway_entities(pathway_id)
    except requests.HTTPError as e:
        # e.response may be None when the error carries no response object.
        if getattr(e.response, "status_code", None) == 404:
            print(f"Error: Pathway '{pathway_id}' not found")
        else:
            print(f"Error: {e}")
        sys.exit(1)
    except requests.RequestException as e:
        # Network-level failures never produce an HTTP status.
        print(f"Error: {e}")
        sys.exit(1)

    print(f"Entities in pathway {pathway_id}: {len(entities)} total\n")

    # Group by type
    by_type = {}
    for entity in entities:
        by_type.setdefault(entity['schemaClass'], []).append(entity)

    # Display by type
    for entity_type in sorted(by_type):
        members = by_type[entity_type]
        print(f"{entity_type} ({len(members)}):")
        for entity in members[:10]:  # Show first 10
            print(f" - {entity['stId']}: {entity['displayName']}")
        if len(members) > 10:
            print(f" ... and {len(members) - 10} more")
        print()
def command_search(term: str):
    """Search for pathways by term and print up to the first 20 hits.

    Any request failure (HTTP error status, connection error, timeout)
    prints ``Error: ...`` and exits with status 1.
    """
    client = ReactomeClient()
    try:
        results = client.search_pathways(term)
    except requests.RequestException as e:
        # RequestException is the base of HTTPError, so HTTP error statuses
        # are still reported exactly as before.
        print(f"Error: {e}")
        sys.exit(1)

    print(f"Search results for '{term}': {len(results)} found\n")

    for result in results[:20]:  # Show first 20
        print(f"{result['stId']}: {result['displayName']}")
        if result.get('species'):
            species = result['species'][0]['displayName']
            print(f" Species: {species}")
        print(f" Type: {result['schemaClass']}")
        print()

    if len(results) > 20:
        print(f"... and {len(results) - 20} more results")
def _results_path(gene_file: str) -> str:
    """Return the results path: ``gene_file`` minus its extension plus ``_results.json``."""
    root, _ext = os.path.splitext(gene_file)
    return f"{root}_results.json"


def command_analyze(gene_file: str):
    """Perform pathway enrichment analysis on the identifiers in a file.

    Reads one identifier per line (blank lines ignored), submits them for
    analysis, prints a summary and the first 10 pathways, and saves the
    full JSON response next to the input file.

    The output path is built from the input path without its extension.
    The previous ``str.replace('.txt', ...)`` left the name unchanged for
    inputs without ``.txt``, so the results overwrote the gene list itself.

    Exits with status 1 if the file cannot be read, contains no
    identifiers, or the request fails.
    """
    # Read gene list
    try:
        with open(gene_file, 'r') as f:
            genes = [gene for gene in (line.strip() for line in f) if gene]
    except FileNotFoundError:
        print(f"Error: File '{gene_file}' not found")
        sys.exit(1)
    except OSError as e:
        # e.g. permission denied, or the path is a directory
        print(f"Error: Could not read '{gene_file}': {e}")
        sys.exit(1)

    if not genes:
        # Nothing to submit; fail early with a clear message.
        print(f"Error: No gene identifiers found in '{gene_file}'")
        sys.exit(1)

    print(f"Analyzing {len(genes)} genes...")

    client = ReactomeClient()
    try:
        result = client.analyze_genes(genes)
    except requests.RequestException as e:
        # Covers HTTP error statuses as well as connection errors/timeouts.
        print(f"Error: {e}")
        sys.exit(1)

    # Display summary
    summary = result['summary']
    print(f"\nAnalysis Type: {summary['type']}")
    print(f"Token: {summary['token']} (valid for 7 days)")
    print(f"Species: {summary.get('species', 'N/A')}")

    # Display pathways
    pathways = result.get('pathways', [])
    print(f"\nEnriched Pathways: {len(pathways)} found")

    # Show significant pathways (FDR < 0.05)
    significant = [p for p in pathways if p['entities']['fdr'] < 0.05]
    print(f"Significant (FDR < 0.05): {len(significant)}\n")

    # Display top 10 pathways
    print("Top 10 Pathways:")
    for i, pathway in enumerate(pathways[:10], 1):
        print(f"\n{i}. {pathway['name']}")
        print(f" ID: {pathway['stId']}")
        entities = pathway['entities']
        print(f" Found: {entities['found']}/{entities['total']} entities")
        print(f" p-value: {entities['pValue']:.6e}")
        print(f" FDR: {entities['fdr']:.6e}")

    # Generate browser URL for top pathway
    if pathways:
        token = summary['token']
        top_pathway = pathways[0]['stId']
        url = f"https://reactome.org/PathwayBrowser/#{top_pathway}&DTAB=AN&ANALYSIS={token}"
        print("\nView top result in browser:")
        print(url)

    # Save full results
    output_file = _results_path(gene_file)
    with open(output_file, 'w') as f:
        json.dump(result, f, indent=2)
    print(f"\nFull results saved to: {output_file}")
def print_usage():
    """Print the module docstring, which holds the CLI usage and examples."""
    usage_text = __doc__
    print(usage_text)
def main():
    """Dispatch ``sys.argv`` to the matching ``command_*`` function.

    The command name is matched case-insensitively. With no command, or an
    unknown one, the usage text is printed and the process exits with
    status 1. Commands other than ``version`` need one argument; when it is
    missing, an error and a usage line are printed and the exit status is 1.
    """
    argv = sys.argv
    if len(argv) < 2:
        print_usage()
        sys.exit(1)

    command = argv[1].lower()

    if command == "version":
        command_version()
        return

    # Per command: the two lines printed when its argument is missing, and a
    # thunk that runs it. Thunks defer argv[2] access until after the check.
    with_argument = {
        "query": (
            "Error: pathway_id required",
            "Usage: python reactome_query.py query <pathway_id>",
            lambda: command_query(argv[2]),
        ),
        "entities": (
            "Error: pathway_id required",
            "Usage: python reactome_query.py entities <pathway_id>",
            lambda: command_entities(argv[2]),
        ),
        "search": (
            "Error: search term required",
            "Usage: python reactome_query.py search <term>",
            # An unquoted multi-word term arrives as several argv entries.
            lambda: command_search(" ".join(argv[2:])),
        ),
        "analyze": (
            "Error: gene list file required",
            "Usage: python reactome_query.py analyze <gene_list_file>",
            lambda: command_analyze(argv[2]),
        ),
    }

    spec = with_argument.get(command)
    if spec is None:
        print(f"Error: Unknown command '{command}'")
        print_usage()
        sys.exit(1)

    problem, usage, run = spec
    if len(argv) < 3:
        print(problem)
        print(usage)
        sys.exit(1)

    run()


if __name__ == "__main__":
    main()