mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-01-26 16:58:56 +08:00
Add more databases
This commit is contained in:
@@ -7,7 +7,7 @@
|
|||||||
},
|
},
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"description": "Claude scientific skills from K-Dense Inc",
|
"description": "Claude scientific skills from K-Dense Inc",
|
||||||
"version": "1.12.0"
|
"version": "1.15.0"
|
||||||
},
|
},
|
||||||
"plugins": [
|
"plugins": [
|
||||||
{
|
{
|
||||||
@@ -61,15 +61,20 @@
|
|||||||
"skills": [
|
"skills": [
|
||||||
"./scientific-databases/alphafold-database",
|
"./scientific-databases/alphafold-database",
|
||||||
"./scientific-databases/chembl-database",
|
"./scientific-databases/chembl-database",
|
||||||
|
"./scientific-databases/clinpgx-database",
|
||||||
"./scientific-databases/clinvar-database",
|
"./scientific-databases/clinvar-database",
|
||||||
"./scientific-databases/cosmic-database",
|
"./scientific-databases/cosmic-database",
|
||||||
"./scientific-databases/ena-database",
|
"./scientific-databases/ena-database",
|
||||||
|
"./scientific-databases/ensembl-database",
|
||||||
"./scientific-databases/gene-database",
|
"./scientific-databases/gene-database",
|
||||||
"./scientific-databases/geo-database",
|
"./scientific-databases/geo-database",
|
||||||
|
"./scientific-databases/hmdb-database",
|
||||||
"./scientific-databases/kegg-database",
|
"./scientific-databases/kegg-database",
|
||||||
|
"./scientific-databases/metabolomics-workbench-database",
|
||||||
"./scientific-databases/pdb-database",
|
"./scientific-databases/pdb-database",
|
||||||
"./scientific-databases/pubchem-database",
|
"./scientific-databases/pubchem-database",
|
||||||
"./scientific-databases/pubmed-database",
|
"./scientific-databases/pubmed-database",
|
||||||
|
"./scientific-databases/reactome-database",
|
||||||
"./scientific-databases/string-database",
|
"./scientific-databases/string-database",
|
||||||
"./scientific-databases/uniprot-database",
|
"./scientific-databases/uniprot-database",
|
||||||
"./scientific-databases/zinc-database"
|
"./scientific-databases/zinc-database"
|
||||||
|
|||||||
@@ -8,16 +8,20 @@ A comprehensive collection of ready-to-use scientific skills for Claude, curated
|
|||||||
|
|
||||||
- **AlphaFold DB** - AI-predicted protein structure database with 200M+ predictions, confidence metrics (pLDDT, PAE), and Google Cloud bulk access
|
- **AlphaFold DB** - AI-predicted protein structure database with 200M+ predictions, confidence metrics (pLDDT, PAE), and Google Cloud bulk access
|
||||||
- **ChEMBL** - Bioactive molecule database with drug-like properties (2M+ compounds, 19M+ activities, 13K+ targets)
|
- **ChEMBL** - Bioactive molecule database with drug-like properties (2M+ compounds, 19M+ activities, 13K+ targets)
|
||||||
|
- **ClinPGx** - Clinical pharmacogenomics database (successor to PharmGKB) providing gene-drug interactions, CPIC clinical guidelines, allele functions, drug labels, and pharmacogenomic annotations for precision medicine and personalized pharmacotherapy (consolidates PharmGKB, CPIC, and PharmCAT resources)
|
||||||
- **ClinVar** - NCBI's public archive of genomic variants and their clinical significance with standardized classifications (pathogenic, benign, VUS), E-utilities API access, and bulk FTP downloads for variant interpretation and precision medicine research
|
- **ClinVar** - NCBI's public archive of genomic variants and their clinical significance with standardized classifications (pathogenic, benign, VUS), E-utilities API access, and bulk FTP downloads for variant interpretation and precision medicine research
|
||||||
- **COSMIC** - Catalogue of Somatic Mutations in Cancer, the world's largest database of somatic cancer mutations (millions of mutations across thousands of cancer types, Cancer Gene Census, mutational signatures, structural variants, and drug resistance data)
|
- **COSMIC** - Catalogue of Somatic Mutations in Cancer, the world's largest database of somatic cancer mutations (millions of mutations across thousands of cancer types, Cancer Gene Census, mutational signatures, structural variants, and drug resistance data)
|
||||||
- **ENA (European Nucleotide Archive)** - Comprehensive public repository for nucleotide sequence data and metadata with REST APIs for accessing sequences, assemblies, samples, studies, and reads; supports advanced search, taxonomy lookups, and bulk downloads via FTP/Aspera (rate limit: 50 req/sec)
|
- **ENA (European Nucleotide Archive)** - Comprehensive public repository for nucleotide sequence data and metadata with REST APIs for accessing sequences, assemblies, samples, studies, and reads; supports advanced search, taxonomy lookups, and bulk downloads via FTP/Aspera (rate limit: 50 req/sec)
|
||||||
|
- **Ensembl** - Genome browser and bioinformatics database providing genomic annotations, sequences, variants, and comparative genomics data for 250+ vertebrate species (Release 115, 2025) with comprehensive REST API for gene lookups, sequence retrieval, variant effect prediction (VEP), ortholog finding, assembly mapping (GRCh37/GRCh38), and region analysis
|
||||||
- **GEO (Gene Expression Omnibus)** - High-throughput gene expression and functional genomics data repository (264K+ studies, 8M+ samples) with microarray, RNA-seq, and expression profile access
|
- **GEO (Gene Expression Omnibus)** - High-throughput gene expression and functional genomics data repository (264K+ studies, 8M+ samples) with microarray, RNA-seq, and expression profile access
|
||||||
- **HMDB (Human Metabolome Database)** - Comprehensive metabolomics resource with 220K+ metabolite entries, detailed chemical/biological data, concentration ranges, disease associations, pathways, and spectral data for metabolite identification and biomarker discovery
|
- **HMDB (Human Metabolome Database)** - Comprehensive metabolomics resource with 220K+ metabolite entries, detailed chemical/biological data, concentration ranges, disease associations, pathways, and spectral data for metabolite identification and biomarker discovery
|
||||||
- **KEGG** - Kyoto Encyclopedia of Genes and Genomes for biological pathway analysis, gene-to-pathway mapping, compound searches, and molecular interaction networks (pathway enrichment, metabolic pathways, gene annotations, drug-drug interactions, ID conversion)
|
- **KEGG** - Kyoto Encyclopedia of Genes and Genomes for biological pathway analysis, gene-to-pathway mapping, compound searches, and molecular interaction networks (pathway enrichment, metabolic pathways, gene annotations, drug-drug interactions, ID conversion)
|
||||||
|
- **Metabolomics Workbench** - NIH Common Fund metabolomics data repository with 4,200+ processed studies, standardized nomenclature (RefMet), mass spectrometry searches, and comprehensive REST API for accessing metabolite structures, study metadata, experimental results, and gene/protein-metabolite associations
|
||||||
- **NCBI Gene** - Work with NCBI Gene database to search, retrieve, and analyze gene information including nomenclature, sequences, variations, phenotypes, and pathways using E-utilities and Datasets API
|
- **NCBI Gene** - Work with NCBI Gene database to search, retrieve, and analyze gene information including nomenclature, sequences, variations, phenotypes, and pathways using E-utilities and Datasets API
|
||||||
- **Protein Data Bank (PDB)** - Access 3D structural data of proteins, nucleic acids, and biological macromolecules (200K+ structures) with search, retrieval, and analysis capabilities
|
- **Protein Data Bank (PDB)** - Access 3D structural data of proteins, nucleic acids, and biological macromolecules (200K+ structures) with search, retrieval, and analysis capabilities
|
||||||
- **PubChem** - Access chemical compound data from the world's largest free chemical database (110M+ compounds, 270M+ bioactivities)
|
- **PubChem** - Access chemical compound data from the world's largest free chemical database (110M+ compounds, 270M+ bioactivities)
|
||||||
- **PubMed** - Access to PubMed literature database with advanced search capabilities
|
- **PubMed** - Access to PubMed literature database with advanced search capabilities
|
||||||
|
- **Reactome** - Curated pathway database for biological processes and molecular interactions (2,825+ human pathways, 16K+ reactions, 11K+ proteins) with pathway enrichment analysis, expression data analysis, and species comparison using Content Service and Analysis Service APIs
|
||||||
- **STRING** - Protein-protein interaction network database (5000+ genomes, 59.3M proteins, 20B+ interactions) with functional enrichment analysis, interaction partner discovery, and network visualization from experimental data, computational prediction, and text-mining
|
- **STRING** - Protein-protein interaction network database (5000+ genomes, 59.3M proteins, 20B+ interactions) with functional enrichment analysis, interaction partner discovery, and network visualization from experimental data, computational prediction, and text-mining
|
||||||
- **UniProt** - Universal Protein Resource for protein sequences, annotations, and functional information (UniProtKB/Swiss-Prot reviewed entries, TrEMBL unreviewed entries) with REST API access for search, retrieval, ID mapping, and batch operations across 200+ databases
|
- **UniProt** - Universal Protein Resource for protein sequences, annotations, and functional information (UniProtKB/Swiss-Prot reviewed entries, TrEMBL unreviewed entries) with REST API access for search, retrieval, ID mapping, and batch operations across 200+ databases
|
||||||
- **ZINC** - Free database of commercially-available compounds for virtual screening and drug discovery (230M+ purchasable compounds in ready-to-dock 3D formats)
|
- **ZINC** - Free database of commercially-available compounds for virtual screening and drug discovery (230M+ purchasable compounds in ready-to-dock 3D formats)
|
||||||
@@ -125,16 +129,12 @@ You can use Anthropic's pre-built skills, and upload custom skills, via the Clau
|
|||||||
- **DAVID** - Database for Annotation, Visualization and Integrated Discovery for functional enrichment analysis
|
- **DAVID** - Database for Annotation, Visualization and Integrated Discovery for functional enrichment analysis
|
||||||
- **dbSNP** - NCBI's database of single nucleotide polymorphisms and short genetic variations
|
- **dbSNP** - NCBI's database of single nucleotide polymorphisms and short genetic variations
|
||||||
- **DrugBank** - Comprehensive drug and drug target database with pharmacological and pharmaceutical data
|
- **DrugBank** - Comprehensive drug and drug target database with pharmacological and pharmaceutical data
|
||||||
- **Ensembl** - Genome browser with annotation, comparative genomics, and variant data
|
|
||||||
- **GenBank** - NIH genetic sequence database (part of NCBI but with specific access patterns)
|
- **GenBank** - NIH genetic sequence database (part of NCBI but with specific access patterns)
|
||||||
- **GWAS Catalog** - NHGRI-EBI catalog of published genome-wide association studies
|
- **GWAS Catalog** - NHGRI-EBI catalog of published genome-wide association studies
|
||||||
- **InterPro** - Protein sequence analysis and classification with functional annotations
|
- **InterPro** - Protein sequence analysis and classification with functional annotations
|
||||||
- **MetaboLights** - EMBL-EBI metabolomics database with experimental data and metadata
|
- **MetaboLights** - EMBL-EBI metabolomics database with experimental data and metadata
|
||||||
- **Metabolomics Workbench** - NIH Common Fund metabolomics data repository
|
|
||||||
- **OMIM** - Online Mendelian Inheritance in Man for genetic disorders and genes
|
- **OMIM** - Online Mendelian Inheritance in Man for genetic disorders and genes
|
||||||
- **Pfam** - Protein families database with multiple sequence alignments and HMMs
|
- **Pfam** - Protein families database with multiple sequence alignments and HMMs
|
||||||
- **PharmGKB** - Pharmacogenomics Knowledge Base linking genetic variation to drug response
|
|
||||||
- **Reactome** - Curated pathway database with biological processes and molecular interactions
|
|
||||||
- **RefSeq** - NCBI's non-redundant reference sequence database
|
- **RefSeq** - NCBI's non-redundant reference sequence database
|
||||||
- **TCGA** - The Cancer Genome Atlas with multi-omic cancer genomics data
|
- **TCGA** - The Cancer Genome Atlas with multi-omic cancer genomics data
|
||||||
- **UCSC Genome Browser** - Genomic data visualization and custom track integration
|
- **UCSC Genome Browser** - Genomic data visualization and custom track integration
|
||||||
|
|||||||
632
scientific-databases/clinpgx-database/SKILL.md
Normal file
632
scientific-databases/clinpgx-database/SKILL.md
Normal file
@@ -0,0 +1,632 @@
|
|||||||
|
---
|
||||||
|
name: clinpgx-database
|
||||||
|
description: Toolkit for accessing ClinPGx, a clinical pharmacogenomics database providing information on how genetic variation affects drug response. Use this skill when working with pharmacogenomics data, querying gene-drug interactions, accessing CPIC clinical guidelines, retrieving allele function and frequency information, exploring PharmGKB annotations, or conducting research on personalized medicine and precision pharmacotherapy. ClinPGx consolidates PharmGKB, CPIC, and PharmCAT resources.
|
||||||
|
---
|
||||||
|
|
||||||
|
# ClinPGx Database
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Facilitate access to and querying of ClinPGx (Clinical Pharmacogenomics Database), a comprehensive resource for clinical pharmacogenomics information. ClinPGx, officially launched in July 2025, is the successor to PharmGKB and consolidates data from PharmGKB, CPIC (Clinical Pharmacogenetics Implementation Consortium), and PharmCAT (Pharmacogenomics Clinical Annotation Tool). The database provides curated information on how human genetic variation affects medication response, including gene-drug pairs, clinical guidelines, allele functions, and drug labels. It is managed at Stanford University under a ClinGen (Clinical Genome Resource) affiliate grant.
|
||||||
|
|
||||||
|
## When to Use This Skill
|
||||||
|
|
||||||
|
Use this skill when queries involve:
|
||||||
|
|
||||||
|
- **Gene-drug interactions**: Querying how genetic variants affect drug metabolism, efficacy, or toxicity
|
||||||
|
- **CPIC guidelines**: Accessing evidence-based clinical practice guidelines for pharmacogenetics
|
||||||
|
- **Allele information**: Retrieving allele function, frequency, and phenotype data
|
||||||
|
- **Drug labels**: Exploring FDA and other regulatory pharmacogenomic drug labeling
|
||||||
|
- **Pharmacogenomic annotations**: Accessing curated literature on gene-drug-disease relationships
|
||||||
|
- **Clinical decision support**: Using the PharmDOG tool for phenoconversion and custom genotype interpretation
|
||||||
|
- **Precision medicine**: Implementing pharmacogenomic testing in clinical practice
|
||||||
|
- **Drug metabolism**: Understanding CYP450 and other pharmacogene functions
|
||||||
|
- **Personalized dosing**: Finding genotype-guided dosing recommendations
|
||||||
|
- **Adverse drug reactions**: Identifying genetic risk factors for drug toxicity
|
||||||
|
|
||||||
|
## Installation and Setup
|
||||||
|
|
||||||
|
### Python API Access
|
||||||
|
|
||||||
|
The ClinPGx REST API provides programmatic access to all database resources. Basic setup:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install requests
|
||||||
|
```
|
||||||
|
|
||||||
|
### API Endpoint
|
||||||
|
|
||||||
|
```python
|
||||||
|
BASE_URL = "https://api.clinpgx.org/v1/"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Rate Limits**:
|
||||||
|
- 2 requests per second maximum
|
||||||
|
- Excessive requests will result in HTTP 429 (Too Many Requests) response
|
||||||
|
|
||||||
|
**Authentication**: Not required for basic access
|
||||||
|
|
||||||
|
**Data License**: Creative Commons Attribution-ShareAlike 4.0 International License
|
||||||
|
|
||||||
|
For substantial API use, notify the ClinPGx team at api@clinpgx.org
|
||||||
|
|
||||||
|
## Core Capabilities
|
||||||
|
|
||||||
|
### 1. Gene Queries
|
||||||
|
|
||||||
|
**Retrieve gene information** including function, clinical annotations, and pharmacogenomic significance:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Get gene details
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/gene/CYP2D6")
|
||||||
|
gene_data = response.json()
|
||||||
|
|
||||||
|
# Search for genes by name
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/gene",
|
||||||
|
params={"q": "CYP"})
|
||||||
|
genes = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key pharmacogenes**:
|
||||||
|
- **CYP450 enzymes**: CYP2D6, CYP2C19, CYP2C9, CYP3A4, CYP3A5
|
||||||
|
- **Transporters**: SLCO1B1, ABCB1, ABCG2
|
||||||
|
- **Other metabolizers**: TPMT, DPYD, NUDT15, UGT1A1
|
||||||
|
- **Receptors**: OPRM1, HTR2A, ADRB1
|
||||||
|
- **HLA genes**: HLA-B, HLA-A
|
||||||
|
|
||||||
|
### 2. Drug and Chemical Queries
|
||||||
|
|
||||||
|
**Retrieve drug information** including pharmacogenomic annotations and mechanisms:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Get drug details
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/chemical/PA448515") # Warfarin
|
||||||
|
drug_data = response.json()
|
||||||
|
|
||||||
|
# Search drugs by name
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/chemical",
|
||||||
|
params={"name": "warfarin"})
|
||||||
|
drugs = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Drug categories with pharmacogenomic significance**:
|
||||||
|
- Anticoagulants (warfarin, clopidogrel)
|
||||||
|
- Antidepressants (SSRIs, TCAs)
|
||||||
|
- Immunosuppressants (tacrolimus, azathioprine)
|
||||||
|
- Oncology drugs (5-fluorouracil, irinotecan, tamoxifen)
|
||||||
|
- Cardiovascular drugs (statins, beta-blockers)
|
||||||
|
- Pain medications (codeine, tramadol)
|
||||||
|
- Antivirals (abacavir)
|
||||||
|
|
||||||
|
### 3. Gene-Drug Pair Queries
|
||||||
|
|
||||||
|
**Access curated gene-drug relationships** with clinical annotations:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Get gene-drug pair information
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
|
||||||
|
params={"gene": "CYP2D6", "drug": "codeine"})
|
||||||
|
pair_data = response.json()
|
||||||
|
|
||||||
|
# Get all pairs for a gene
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
|
||||||
|
params={"gene": "CYP2C19"})
|
||||||
|
all_pairs = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Clinical annotation sources**:
|
||||||
|
- CPIC (Clinical Pharmacogenetics Implementation Consortium)
|
||||||
|
- DPWG (Dutch Pharmacogenetics Working Group)
|
||||||
|
- FDA (Food and Drug Administration) labels
|
||||||
|
- Peer-reviewed literature summary annotations
|
||||||
|
|
||||||
|
### 4. CPIC Guidelines
|
||||||
|
|
||||||
|
**Access evidence-based clinical practice guidelines**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Get CPIC guideline
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/guideline/PA166104939")
|
||||||
|
guideline = response.json()
|
||||||
|
|
||||||
|
# List all CPIC guidelines
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/guideline",
|
||||||
|
params={"source": "CPIC"})
|
||||||
|
guidelines = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
**CPIC guideline components**:
|
||||||
|
- Gene-drug pairs covered
|
||||||
|
- Clinical recommendations by phenotype
|
||||||
|
- Evidence levels and strength ratings
|
||||||
|
- Supporting literature
|
||||||
|
- Downloadable PDFs and supplementary materials
|
||||||
|
- Implementation considerations
|
||||||
|
|
||||||
|
**Example guidelines**:
|
||||||
|
- CYP2D6-codeine (avoid in ultra-rapid metabolizers)
|
||||||
|
- CYP2C19-clopidogrel (alternative therapy for poor metabolizers)
|
||||||
|
- TPMT-azathioprine (dose reduction for intermediate/poor metabolizers)
|
||||||
|
- DPYD-fluoropyrimidines (dose adjustment based on activity)
|
||||||
|
- HLA-B*57:01-abacavir (avoid if positive)
|
||||||
|
|
||||||
|
### 5. Allele and Variant Information
|
||||||
|
|
||||||
|
**Query allele function and frequency data**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Get allele information
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/allele/CYP2D6*4")
|
||||||
|
allele_data = response.json()
|
||||||
|
|
||||||
|
# Get all alleles for a gene
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/allele",
|
||||||
|
params={"gene": "CYP2D6"})
|
||||||
|
alleles = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Allele information includes**:
|
||||||
|
- Functional status (normal, decreased, no function, increased, uncertain)
|
||||||
|
- Population frequencies across ethnic groups
|
||||||
|
- Defining variants (SNPs, indels, CNVs)
|
||||||
|
- Phenotype assignment
|
||||||
|
- References to PharmVar and other nomenclature systems
|
||||||
|
|
||||||
|
**Phenotype categories**:
|
||||||
|
- **Ultra-rapid metabolizer** (UM): Increased enzyme activity
|
||||||
|
- **Normal metabolizer** (NM): Normal enzyme activity
|
||||||
|
- **Intermediate metabolizer** (IM): Reduced enzyme activity
|
||||||
|
- **Poor metabolizer** (PM): Little to no enzyme activity
|
||||||
|
|
||||||
|
### 6. Variant Annotations
|
||||||
|
|
||||||
|
**Access clinical annotations for specific genetic variants**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Get variant information
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/variant/rs4244285")
|
||||||
|
variant_data = response.json()
|
||||||
|
|
||||||
|
# Search variants by position (if supported)
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/variant",
|
||||||
|
params={"chromosome": "10", "position": "94781859"})
|
||||||
|
variants = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Variant data includes**:
|
||||||
|
- rsID and genomic coordinates
|
||||||
|
- Gene and functional consequence
|
||||||
|
- Allele associations
|
||||||
|
- Clinical significance
|
||||||
|
- Population frequencies
|
||||||
|
- Literature references
|
||||||
|
|
||||||
|
### 7. Clinical Annotations
|
||||||
|
|
||||||
|
**Retrieve curated literature annotations** (formerly PharmGKB clinical annotations):
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Get clinical annotations
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/clinicalAnnotation",
|
||||||
|
params={"gene": "CYP2D6"})
|
||||||
|
annotations = response.json()
|
||||||
|
|
||||||
|
# Filter by evidence level
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/clinicalAnnotation",
|
||||||
|
params={"evidenceLevel": "1A"})
|
||||||
|
high_evidence = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Evidence levels** (from highest to lowest):
|
||||||
|
- **Level 1A**: High-quality evidence, CPIC/FDA/DPWG guidelines
|
||||||
|
- **Level 1B**: High-quality evidence, not yet guideline
|
||||||
|
- **Level 2A**: Moderate evidence from well-designed studies
|
||||||
|
- **Level 2B**: Moderate evidence with some limitations
|
||||||
|
- **Level 3**: Limited or conflicting evidence
|
||||||
|
- **Level 4**: Case reports or weak evidence
|
||||||
|
|
||||||
|
### 8. Drug Labels
|
||||||
|
|
||||||
|
**Access pharmacogenomic information from drug labels**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Get drug labels with PGx information
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/drugLabel",
|
||||||
|
params={"drug": "warfarin"})
|
||||||
|
labels = response.json()
|
||||||
|
|
||||||
|
# Filter by regulatory source
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/drugLabel",
|
||||||
|
params={"source": "FDA"})
|
||||||
|
fda_labels = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Label information includes**:
|
||||||
|
- Testing recommendations
|
||||||
|
- Dosing guidance by genotype
|
||||||
|
- Warnings and precautions
|
||||||
|
- Biomarker information
|
||||||
|
- Regulatory source (FDA, EMA, PMDA, etc.)
|
||||||
|
|
||||||
|
### 9. Pathways
|
||||||
|
|
||||||
|
**Explore pharmacokinetic and pharmacodynamic pathways**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Get pathway information
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/pathway/PA146123006") # Warfarin pathway
|
||||||
|
pathway_data = response.json()
|
||||||
|
|
||||||
|
# Search pathways by drug
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/pathway",
|
||||||
|
params={"drug": "warfarin"})
|
||||||
|
pathways = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Pathway diagrams** show:
|
||||||
|
- Drug metabolism steps
|
||||||
|
- Enzymes and transporters involved
|
||||||
|
- Gene variants affecting each step
|
||||||
|
- Downstream effects on efficacy/toxicity
|
||||||
|
- Interactions with other pathways
|
||||||
|
|
||||||
|
## Query Workflow
|
||||||
|
|
||||||
|
### Workflow 1: Clinical Decision Support for Drug Prescription
|
||||||
|
|
||||||
|
1. **Identify patient genotype** for relevant pharmacogenes:
|
||||||
|
```python
|
||||||
|
# Example: Patient is CYP2C19 *1/*2 (intermediate metabolizer)
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/allele/CYP2C19*2")
|
||||||
|
allele_function = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Query gene-drug pairs** for medication of interest:
|
||||||
|
```python
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
|
||||||
|
params={"gene": "CYP2C19", "drug": "clopidogrel"})
|
||||||
|
pair_info = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Retrieve CPIC guideline** for dosing recommendations:
|
||||||
|
```python
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/guideline",
|
||||||
|
params={"gene": "CYP2C19", "drug": "clopidogrel"})
|
||||||
|
guideline = response.json()
|
||||||
|
# Recommendation: Alternative antiplatelet therapy for IM/PM
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Check drug label** for regulatory guidance:
|
||||||
|
```python
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/drugLabel",
|
||||||
|
params={"drug": "clopidogrel"})
|
||||||
|
label = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Workflow 2: Gene Panel Analysis
|
||||||
|
|
||||||
|
1. **Get list of pharmacogenes** in clinical panel:
|
||||||
|
```python
|
||||||
|
pgx_panel = ["CYP2C19", "CYP2D6", "CYP2C9", "TPMT", "DPYD", "SLCO1B1"]
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **For each gene, retrieve all drug interactions**:
|
||||||
|
```python
|
||||||
|
import time

# Collect every gene-drug pair for each gene in the panel, keyed by gene symbol.
all_interactions = {}
for gene in pgx_panel:
    response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
                            params={"gene": gene})
    all_interactions[gene] = response.json()
    time.sleep(0.5)  # Stay under the API limit of 2 requests per second
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Filter for CPIC guideline-level evidence**:
|
||||||
|
```python
|
||||||
|
for gene, pairs in all_interactions.items():
|
||||||
|
for pair in pairs:
|
||||||
|
if pair.get('cpicLevel'): # Has CPIC guideline
|
||||||
|
print(f"{gene} - {pair['drug']}: {pair['cpicLevel']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Generate patient report** with actionable pharmacogenomic findings.
|
||||||
|
|
||||||
|
### Workflow 3: Drug Safety Assessment
|
||||||
|
|
||||||
|
1. **Query drug for PGx associations**:
|
||||||
|
```python
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/chemical",
|
||||||
|
params={"name": "abacavir"})
|
||||||
|
drug_id = response.json()[0]['id']
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Get clinical annotations**:
|
||||||
|
```python
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/clinicalAnnotation",
|
||||||
|
params={"drug": drug_id})
|
||||||
|
annotations = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Check for HLA associations** and toxicity risk:
|
||||||
|
```python
|
||||||
|
for annotation in annotations:
    # NOTE(review): assumes 'genes' holds gene symbols (a list of strings,
    # or a single string) — confirm against the actual API response.
    genes = annotation.get('genes', [])
    if isinstance(genes, str):
        genes = [genes]
    # HLA genes are named like "HLA-B" or "HLA-A", so look for the "HLA"
    # substring in each symbol; an exact match on "HLA" would never succeed.
    if any('HLA' in str(gene) for gene in genes):
        print(f"Toxicity risk: {annotation['phenotype']}")
        print(f"Evidence level: {annotation['evidenceLevel']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Retrieve screening recommendations** from guidelines and labels.
|
||||||
|
|
||||||
|
### Workflow 4: Research Analysis - Population Pharmacogenomics
|
||||||
|
|
||||||
|
1. **Get allele frequencies** for population comparison:
|
||||||
|
```python
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/allele",
|
||||||
|
params={"gene": "CYP2D6"})
|
||||||
|
alleles = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Extract population-specific frequencies**:
|
||||||
|
```python
|
||||||
|
populations = ['European', 'African', 'East Asian', 'Latino']
|
||||||
|
frequency_data = {}
|
||||||
|
for allele in alleles:
|
||||||
|
allele_name = allele['name']
|
||||||
|
frequency_data[allele_name] = {
|
||||||
|
pop: allele.get(f'{pop}_frequency', 'N/A')
|
||||||
|
for pop in populations
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Calculate phenotype distributions** by population:
|
||||||
|
```python
|
||||||
|
# Combine allele frequencies with function to predict phenotypes
|
||||||
|
phenotype_dist = calculate_phenotype_frequencies(frequency_data)
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Analyze implications** for drug dosing in diverse populations.
|
||||||
|
|
||||||
|
### Workflow 5: Literature Evidence Review
|
||||||
|
|
||||||
|
1. **Search for gene-drug pair**:
|
||||||
|
```python
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
|
||||||
|
params={"gene": "TPMT", "drug": "azathioprine"})
|
||||||
|
pair = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Retrieve all clinical annotations**:
|
||||||
|
```python
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/clinicalAnnotation",
|
||||||
|
params={"gene": "TPMT", "drug": "azathioprine"})
|
||||||
|
annotations = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Filter by evidence level** (add a publication-date filter if needed; the example below filters on evidence level only):
|
||||||
|
```python
|
||||||
|
high_quality = [a for a in annotations
|
||||||
|
if a['evidenceLevel'] in ['1A', '1B', '2A']]
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Extract PMIDs** and retrieve full references:
|
||||||
|
```python
|
||||||
|
pmids = [a['pmid'] for a in high_quality if 'pmid' in a]
|
||||||
|
# Use PubMed skill to retrieve full citations
|
||||||
|
```
|
||||||
|
|
||||||
|
## Rate Limiting and Best Practices
|
||||||
|
|
||||||
|
### Rate Limit Compliance
|
||||||
|
|
||||||
|
```python
|
||||||
|
import time
|
||||||
|
|
||||||
|
def rate_limited_request(url, params=None, delay=0.5, timeout=10):
    """Make a GET request, then pause so sequential calls stay rate limited.

    With the default delay of 0.5 seconds, back-to-back calls stay at or
    below 2 requests per second.

    Args:
        url: Full endpoint URL to request.
        params: Optional dict of query-string parameters.
        delay: Seconds to sleep after the request returns.
        timeout: Seconds to wait for the server before requests raises a
            timeout error; without it a stalled connection blocks forever.

    Returns:
        The requests.Response object. The HTTP status is not checked here.
    """
    response = requests.get(url, params=params, timeout=timeout)
    time.sleep(delay)  # Pause after every call so the next one is spaced out
    return response
|
||||||
|
|
||||||
|
# Use in loops
|
||||||
|
genes = ["CYP2D6", "CYP2C19", "CYP2C9"]
|
||||||
|
for gene in genes:
|
||||||
|
response = rate_limited_request(
|
||||||
|
"https://api.clinpgx.org/v1/gene/" + gene
|
||||||
|
)
|
||||||
|
data = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Error Handling
|
||||||
|
|
||||||
|
```python
|
||||||
|
def safe_api_call(url, params=None, max_retries=3):
    """Fetch JSON from the API, retrying on rate limits and request errors.

    Args:
        url: API endpoint URL.
        params: Optional dict of query parameters.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        Parsed JSON on HTTP 200, or None when the attempts are exhausted
        without a 200 response and without an error being raised (for
        example, every attempt was rate limited with HTTP 429).

    Raises:
        requests.exceptions.RequestException: If the final attempt fails
            with a connection error, timeout, or HTTP error status.
    """
    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            response = requests.get(url, params=params, timeout=10)

            if response.status_code == 200:
                return response.json()
            elif response.status_code == 429:
                # Rate limit exceeded
                if last_attempt:
                    # No retry follows, so backing off would only delay the caller
                    print("Rate limit hit. Giving up after final attempt.")
                    break
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Rate limit hit. Waiting {wait_time}s...")
                time.sleep(wait_time)
            else:
                # Raises for 4xx/5xx statuses; the handler below retries them
                response.raise_for_status()

        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if last_attempt:
                raise
            time.sleep(1)  # Brief pause before the next attempt

    # Retries exhausted without a usable response
    return None
|
||||||
|
```
|
||||||
|
|
||||||
|
### Caching Results
|
||||||
|
|
||||||
|
```python
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def cached_query(cache_file, api_func, *args, **kwargs):
    """Return the cached result from cache_file, or call api_func and cache it.

    Args:
        cache_file: Path of the JSON cache file.
        api_func: Callable invoked as api_func(*args, **kwargs) on a cache
            miss; its return value must be JSON-serializable.
        *args: Positional arguments forwarded to api_func.
        **kwargs: Keyword arguments forwarded to api_func.

    Returns:
        The data loaded from cache_file if it exists, otherwise the value
        returned by api_func.

    Raises:
        TypeError: If the result of api_func cannot be serialized to JSON.
            No cache file is created in that case.
    """
    cache_path = Path(cache_file)

    if cache_path.exists():
        with open(cache_path, encoding="utf-8") as f:
            return json.load(f)

    result = api_func(*args, **kwargs)

    # Serialize before opening the file: if the result is not
    # JSON-serializable this raises without leaving an empty or truncated
    # cache file that would break every later call.
    payload = json.dumps(result, indent=2)
    with open(cache_path, 'w', encoding="utf-8") as f:
        f.write(payload)

    return result
|
||||||
|
|
||||||
|
# Usage: the cached function must return JSON-serializable data, so pass
# safe_api_call (returns parsed JSON) rather than rate_limited_request
# (returns a requests.Response, which json cannot serialize)
gene_data = cached_query(
    'cyp2d6_cache.json',
    safe_api_call,
    "https://api.clinpgx.org/v1/gene/CYP2D6"
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## PharmDOG Tool
|
||||||
|
|
||||||
|
PharmDOG (formerly DDRx) is ClinPGx's clinical decision support tool for interpreting pharmacogenomic test results:
|
||||||
|
|
||||||
|
**Key features**:
|
||||||
|
- **Phenoconversion calculator**: Adjusts phenotype predictions for drug-drug interactions affecting CYP2D6
|
||||||
|
- **Custom genotypes**: Input patient genotypes to get phenotype predictions
|
||||||
|
- **QR code sharing**: Generate shareable patient reports
|
||||||
|
- **Flexible guidance sources**: Select which guidelines to apply (CPIC, DPWG, FDA)
|
||||||
|
- **Multi-drug analysis**: Assess multiple medications simultaneously
|
||||||
|
|
||||||
|
**Access**: Available at https://www.clinpgx.org/pharmacogenomic-decision-support
|
||||||
|
|
||||||
|
**Use cases**:
|
||||||
|
- Clinical interpretation of PGx panel results
|
||||||
|
- Medication review for patients with known genotypes
|
||||||
|
- Patient education materials
|
||||||
|
- Point-of-care decision support
|
||||||
|
|
||||||
|
## Resources
|
||||||
|
|
||||||
|
### scripts/query_clinpgx.py
|
||||||
|
|
||||||
|
Python script with ready-to-use functions for common ClinPGx queries:
|
||||||
|
|
||||||
|
- `get_gene_info(gene_symbol)` - Retrieve gene details
|
||||||
|
- `get_drug_info(drug_name)` - Get drug information
|
||||||
|
- `get_gene_drug_pairs(gene, drug)` - Query gene-drug interactions
|
||||||
|
- `get_cpic_guidelines(gene, drug)` - Retrieve CPIC guidelines
|
||||||
|
- `get_alleles(gene)` - Get all alleles for a gene
|
||||||
|
- `get_clinical_annotations(gene, drug, evidence_level)` - Query literature annotations
|
||||||
|
- `get_drug_labels(drug)` - Retrieve pharmacogenomic drug labels
|
||||||
|
- `search_variants(rsid)` - Search by variant rsID
|
||||||
|
- `export_to_dataframe(data)` - Convert results to pandas DataFrame
|
||||||
|
|
||||||
|
Consult this script for implementation examples with proper rate limiting and error handling.
|
||||||
|
|
||||||
|
### references/api_reference.md
|
||||||
|
|
||||||
|
Comprehensive API documentation including:
|
||||||
|
|
||||||
|
- Complete endpoint listing with parameters
|
||||||
|
- Request/response format specifications
|
||||||
|
- Example queries for each endpoint
|
||||||
|
- Filter operators and search patterns
|
||||||
|
- Data schema definitions
|
||||||
|
- Rate limiting details
|
||||||
|
- Authentication requirements (if any)
|
||||||
|
- Troubleshooting common errors
|
||||||
|
|
||||||
|
Refer to this document when detailed API information is needed or when constructing complex queries.
|
||||||
|
|
||||||
|
## Important Notes
|
||||||
|
|
||||||
|
### Data Sources and Integration
|
||||||
|
|
||||||
|
ClinPGx consolidates multiple authoritative sources:
|
||||||
|
- **PharmGKB**: Curated pharmacogenomics knowledge base (now part of ClinPGx)
|
||||||
|
- **CPIC**: Evidence-based clinical implementation guidelines
|
||||||
|
- **PharmCAT**: Allele calling and phenotype interpretation tool
|
||||||
|
- **DPWG**: Dutch pharmacogenetics guidelines
|
||||||
|
- **FDA/EMA labels**: Regulatory pharmacogenomic information
|
||||||
|
|
||||||
|
As of July 2025, all PharmGKB URLs redirect to corresponding ClinPGx pages.
|
||||||
|
|
||||||
|
### Clinical Implementation Considerations
|
||||||
|
|
||||||
|
- **Evidence levels**: Always check evidence strength before clinical application
|
||||||
|
- **Population differences**: Allele frequencies vary significantly across populations
|
||||||
|
- **Phenoconversion**: Consider drug-drug interactions that affect enzyme activity
|
||||||
|
- **Multi-gene effects**: Some drugs are affected by multiple pharmacogenes
|
||||||
|
- **Non-genetic factors**: Age, organ function, drug interactions also affect response
|
||||||
|
- **Testing limitations**: Not all clinically relevant alleles are detected by all assays
|
||||||
|
|
||||||
|
### Data Updates
|
||||||
|
|
||||||
|
- ClinPGx continuously updates with new evidence and guidelines
|
||||||
|
- Check publication dates for clinical annotations
|
||||||
|
- Monitor ClinPGx Blog (https://blog.clinpgx.org/) for announcements
|
||||||
|
- CPIC guidelines are updated as new evidence emerges
|
||||||
|
- PharmVar provides nomenclature updates for allele definitions
|
||||||
|
|
||||||
|
### API Stability
|
||||||
|
|
||||||
|
- API endpoints are relatively stable but may change during development
|
||||||
|
- Parameters and response formats are subject to modification
|
||||||
|
- Monitor API changelog and ClinPGx blog for updates
|
||||||
|
- Consider version pinning for production applications
|
||||||
|
- Test API changes in development before production deployment
|
||||||
|
|
||||||
|
## Common Use Cases
|
||||||
|
|
||||||
|
### Pre-emptive Pharmacogenomic Testing
|
||||||
|
|
||||||
|
Query all clinically actionable gene-drug pairs to guide panel selection:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Get all CPIC guideline pairs
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
|
||||||
|
params={"cpicLevel": "A"}) # Level A recommendations
|
||||||
|
actionable_pairs = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Medication Therapy Management
|
||||||
|
|
||||||
|
Review patient medications against known genotypes:
|
||||||
|
|
||||||
|
```python
|
||||||
|
patient_genes = {"CYP2C19": "*1/*2", "CYP2D6": "*1/*1", "SLCO1B1": "*1/*5"}
|
||||||
|
medications = ["clopidogrel", "simvastatin", "escitalopram"]
|
||||||
|
|
||||||
|
for med in medications:
|
||||||
|
for gene in patient_genes:
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
|
||||||
|
params={"gene": gene, "drug": med})
|
||||||
|
# Check for interactions and dosing guidance
|
||||||
|
```
|
||||||
|
|
||||||
|
### Clinical Trial Eligibility
|
||||||
|
|
||||||
|
Screen for pharmacogenomic contraindications:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Check for HLA-B*57:01 before abacavir trial
|
||||||
|
response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
|
||||||
|
params={"gene": "HLA-B", "drug": "abacavir"})
|
||||||
|
pair_info = response.json()
|
||||||
|
# CPIC: Do not use if HLA-B*57:01 positive
|
||||||
|
```
|
||||||
|
|
||||||
|
## Additional Resources
|
||||||
|
|
||||||
|
- **ClinPGx website**: https://www.clinpgx.org/
|
||||||
|
- **ClinPGx Blog**: https://blog.clinpgx.org/
|
||||||
|
- **API documentation**: https://api.clinpgx.org/
|
||||||
|
- **CPIC website**: https://cpicpgx.org/
|
||||||
|
- **PharmCAT**: https://pharmcat.clinpgx.org/
|
||||||
|
- **ClinGen**: https://clinicalgenome.org/
|
||||||
|
- **Contact**: api@clinpgx.org (for substantial API use)
|
||||||
@@ -0,0 +1,757 @@
|
|||||||
|
# ClinPGx API Reference
|
||||||
|
|
||||||
|
Complete reference documentation for the ClinPGx REST API.
|
||||||
|
|
||||||
|
## Base URL
|
||||||
|
|
||||||
|
```
|
||||||
|
https://api.clinpgx.org/v1/
|
||||||
|
```
|
||||||
|
|
||||||
|
## Rate Limiting
|
||||||
|
|
||||||
|
- **Maximum rate**: 2 requests per second
|
||||||
|
- **Enforcement**: Requests exceeding the limit will receive HTTP 429 (Too Many Requests)
|
||||||
|
- **Best practice**: Implement 500ms delay between requests (0.5 seconds)
|
||||||
|
- **Recommendation**: For substantial API use, contact api@clinpgx.org
|
||||||
|
|
||||||
|
## Authentication
|
||||||
|
|
||||||
|
No authentication is required for basic API access. All endpoints are publicly accessible.
|
||||||
|
|
||||||
|
## Data License
|
||||||
|
|
||||||
|
All data accessed through the API is subject to:
|
||||||
|
- Creative Commons Attribution-ShareAlike 4.0 International License
|
||||||
|
- ClinPGx Data Usage Policy
|
||||||
|
|
||||||
|
## Response Format
|
||||||
|
|
||||||
|
All responses are returned as JSON, and the HTTP status code indicates the outcome:
|
||||||
|
- `200 OK`: Successful request
|
||||||
|
- `404 Not Found`: Resource does not exist
|
||||||
|
- `429 Too Many Requests`: Rate limit exceeded
|
||||||
|
- `500 Internal Server Error`: Server error
|
||||||
|
|
||||||
|
## Core Endpoints
|
||||||
|
|
||||||
|
### 1. Gene Endpoint
|
||||||
|
|
||||||
|
Retrieve pharmacogene information including function, variants, and clinical significance.
|
||||||
|
|
||||||
|
#### Get Gene by Symbol
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/gene/{gene_symbol}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `gene_symbol` (path, required): Gene symbol (e.g., CYP2D6, TPMT, DPYD)
|
||||||
|
|
||||||
|
**Example Request:**
|
||||||
|
```bash
|
||||||
|
curl "https://api.clinpgx.org/v1/gene/CYP2D6"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "PA126",
|
||||||
|
"symbol": "CYP2D6",
|
||||||
|
"name": "cytochrome P450 family 2 subfamily D member 6",
|
||||||
|
"chromosome": "22",
|
||||||
|
"chromosomeLocation": "22q13.2",
|
||||||
|
"function": "Drug metabolism",
|
||||||
|
"description": "Highly polymorphic gene encoding enzyme...",
|
||||||
|
"clinicalAnnotations": [...],
|
||||||
|
"relatedDrugs": [...]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Search Genes
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/gene?q={search_term}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `q` (query, optional): Search term for gene name or symbol
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl "https://api.clinpgx.org/v1/gene?q=CYP"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Chemical/Drug Endpoint
|
||||||
|
|
||||||
|
Access drug and chemical compound information including pharmacogenomic annotations.
|
||||||
|
|
||||||
|
#### Get Drug by ID
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/chemical/{drug_id}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `drug_id` (path, required): ClinPGx drug identifier (e.g., PA448515)
|
||||||
|
|
||||||
|
**Example Request:**
|
||||||
|
```bash
|
||||||
|
curl "https://api.clinpgx.org/v1/chemical/PA448515"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Search Drugs by Name
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/chemical?name={drug_name}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `name` (query, optional): Drug name or synonym
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl "https://api.clinpgx.org/v1/chemical?name=warfarin"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example Response:**
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"id": "PA448515",
|
||||||
|
"name": "warfarin",
|
||||||
|
"genericNames": ["warfarin sodium"],
|
||||||
|
"tradeNames": ["Coumadin", "Jantoven"],
|
||||||
|
"drugClasses": ["Anticoagulants"],
|
||||||
|
"indication": "Prevention of thrombosis",
|
||||||
|
"relatedGenes": ["CYP2C9", "VKORC1", "CYP4F2"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Gene-Drug Pair Endpoint
|
||||||
|
|
||||||
|
Query curated gene-drug interaction relationships with clinical annotations.
|
||||||
|
|
||||||
|
#### Get Gene-Drug Pairs
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/geneDrugPair?gene={gene}&drug={drug}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `gene` (query, optional): Gene symbol
|
||||||
|
- `drug` (query, optional): Drug name
|
||||||
|
- `cpicLevel` (query, optional): Filter by CPIC recommendation level (A, B, C, D)
|
||||||
|
|
||||||
|
**Example Requests:**
|
||||||
|
```bash
|
||||||
|
# Get all pairs for a gene
|
||||||
|
curl "https://api.clinpgx.org/v1/geneDrugPair?gene=CYP2D6"
|
||||||
|
|
||||||
|
# Get specific gene-drug pair
|
||||||
|
curl "https://api.clinpgx.org/v1/geneDrugPair?gene=CYP2D6&drug=codeine"
|
||||||
|
|
||||||
|
# Get all CPIC Level A pairs
|
||||||
|
curl "https://api.clinpgx.org/v1/geneDrugPair?cpicLevel=A"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example Response:**
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"gene": "CYP2D6",
|
||||||
|
"drug": "codeine",
|
||||||
|
"sources": ["CPIC", "FDA", "DPWG"],
|
||||||
|
"cpicLevel": "A",
|
||||||
|
"evidenceLevel": "1A",
|
||||||
|
"clinicalAnnotationCount": 45,
|
||||||
|
"hasGuideline": true,
|
||||||
|
"guidelineUrl": "https://www.clinpgx.org/guideline/..."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Guideline Endpoint
|
||||||
|
|
||||||
|
Access clinical practice guidelines from CPIC, DPWG, and other sources.
|
||||||
|
|
||||||
|
#### Get Guidelines
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/guideline?source={source}&gene={gene}&drug={drug}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `source` (query, optional): Guideline source (CPIC, DPWG, FDA)
|
||||||
|
- `gene` (query, optional): Gene symbol
|
||||||
|
- `drug` (query, optional): Drug name
|
||||||
|
|
||||||
|
**Example Requests:**
|
||||||
|
```bash
|
||||||
|
# Get all CPIC guidelines
|
||||||
|
curl "https://api.clinpgx.org/v1/guideline?source=CPIC"
|
||||||
|
|
||||||
|
# Get guideline for specific gene-drug
|
||||||
|
curl "https://api.clinpgx.org/v1/guideline?gene=CYP2C19&drug=clopidogrel"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Get Guideline by ID
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/guideline/{guideline_id}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl "https://api.clinpgx.org/v1/guideline/PA166104939"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "PA166104939",
|
||||||
|
"name": "CPIC Guideline for CYP2C19 and Clopidogrel",
|
||||||
|
"source": "CPIC",
|
||||||
|
"genes": ["CYP2C19"],
|
||||||
|
"drugs": ["clopidogrel"],
|
||||||
|
"recommendationLevel": "A",
|
||||||
|
"lastUpdated": "2023-08-01",
|
||||||
|
"summary": "Alternative antiplatelet therapy recommended for...",
|
||||||
|
"recommendations": [...],
|
||||||
|
"pdfUrl": "https://www.clinpgx.org/...",
|
||||||
|
"pmid": "23400754"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Allele Endpoint
|
||||||
|
|
||||||
|
Query allele definitions, functions, and population frequencies.
|
||||||
|
|
||||||
|
#### Get All Alleles for a Gene
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/allele?gene={gene_symbol}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `gene` (query, required): Gene symbol
|
||||||
|
|
||||||
|
**Example Request:**
|
||||||
|
```bash
|
||||||
|
curl "https://api.clinpgx.org/v1/allele?gene=CYP2D6"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example Response:**
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"name": "CYP2D6*1",
|
||||||
|
"gene": "CYP2D6",
|
||||||
|
"function": "Normal function",
|
||||||
|
"activityScore": 1.0,
|
||||||
|
"frequencies": {
|
||||||
|
"European": 0.42,
|
||||||
|
"African": 0.37,
|
||||||
|
"East Asian": 0.50,
|
||||||
|
"Latino": 0.44
|
||||||
|
},
|
||||||
|
"definingVariants": ["Reference allele"],
|
||||||
|
"pharmVarId": "PV00001"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "CYP2D6*4",
|
||||||
|
"gene": "CYP2D6",
|
||||||
|
"function": "No function",
|
||||||
|
"activityScore": 0.0,
|
||||||
|
"frequencies": {
|
||||||
|
"European": 0.20,
|
||||||
|
"African": 0.05,
|
||||||
|
"East Asian": 0.01,
|
||||||
|
"Latino": 0.10
|
||||||
|
},
|
||||||
|
"definingVariants": ["rs3892097"],
|
||||||
|
"pharmVarId": "PV00004"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Get Specific Allele
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/allele/{allele_name}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `allele_name` (path, required): Allele name with star nomenclature (e.g., CYP2D6*4)
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl "https://api.clinpgx.org/v1/allele/CYP2D6*4"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Variant Endpoint
|
||||||
|
|
||||||
|
Search for genetic variants and their pharmacogenomic annotations.
|
||||||
|
|
||||||
|
#### Get Variant by rsID
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/variant/{rsid}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `rsid` (path, required): dbSNP reference SNP ID
|
||||||
|
|
||||||
|
**Example Request:**
|
||||||
|
```bash
|
||||||
|
curl "https://api.clinpgx.org/v1/variant/rs4244285"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"rsid": "rs4244285",
|
||||||
|
"chromosome": "10",
|
||||||
|
"position": 94781859,
|
||||||
|
"gene": "CYP2C19",
|
||||||
|
"alleles": ["CYP2C19*2"],
|
||||||
|
"consequence": "Splice site variant",
|
||||||
|
"clinicalSignificance": "Pathogenic - reduced enzyme activity",
|
||||||
|
"frequencies": {
|
||||||
|
"European": 0.15,
|
||||||
|
"African": 0.18,
|
||||||
|
"East Asian": 0.29,
|
||||||
|
"Latino": 0.12
|
||||||
|
},
|
||||||
|
"references": [...]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Search Variants by Position
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/variant?chromosome={chr}&position={pos}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `chromosome` (query, optional): Chromosome number (1-22, X, Y)
|
||||||
|
- `position` (query, optional): Genomic position (GRCh38)
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl "https://api.clinpgx.org/v1/variant?chromosome=10&position=94781859"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7. Clinical Annotation Endpoint
|
||||||
|
|
||||||
|
Access curated literature annotations for gene-drug-phenotype relationships.
|
||||||
|
|
||||||
|
#### Get Clinical Annotations
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/clinicalAnnotation?gene={gene}&drug={drug}&evidenceLevel={level}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `gene` (query, optional): Gene symbol
|
||||||
|
- `drug` (query, optional): Drug name
|
||||||
|
- `evidenceLevel` (query, optional): Evidence level (1A, 1B, 2A, 2B, 3, 4)
|
||||||
|
- `phenotype` (query, optional): Phenotype or outcome
|
||||||
|
|
||||||
|
**Example Requests:**
|
||||||
|
```bash
|
||||||
|
# Get all annotations for a gene
|
||||||
|
curl "https://api.clinpgx.org/v1/clinicalAnnotation?gene=CYP2D6"
|
||||||
|
|
||||||
|
# Get high-quality evidence only
|
||||||
|
curl "https://api.clinpgx.org/v1/clinicalAnnotation?evidenceLevel=1A"
|
||||||
|
|
||||||
|
# Get annotations for specific gene-drug pair
|
||||||
|
curl "https://api.clinpgx.org/v1/clinicalAnnotation?gene=TPMT&drug=azathioprine"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example Response:**
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"id": "PA166153683",
|
||||||
|
"gene": "CYP2D6",
|
||||||
|
"drug": "codeine",
|
||||||
|
"phenotype": "Reduced analgesic effect",
|
||||||
|
"evidenceLevel": "1A",
|
||||||
|
"annotation": "Poor metabolizers have reduced conversion...",
|
||||||
|
"pmid": "24618998",
|
||||||
|
"studyType": "Clinical trial",
|
||||||
|
"population": "European",
|
||||||
|
"sources": ["CPIC"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Evidence Levels:**
|
||||||
|
- **1A**: High-quality evidence from guidelines (CPIC, FDA, DPWG)
|
||||||
|
- **1B**: High-quality evidence not yet incorporated into a guideline
|
||||||
|
- **2A**: Moderate evidence from well-designed studies
|
||||||
|
- **2B**: Moderate evidence with some limitations
|
||||||
|
- **3**: Limited or conflicting evidence
|
||||||
|
- **4**: Case reports or weak evidence
|
||||||
|
|
||||||
|
### 8. Drug Label Endpoint
|
||||||
|
|
||||||
|
Retrieve regulatory drug label information with pharmacogenomic content.
|
||||||
|
|
||||||
|
#### Get Drug Labels
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/drugLabel?drug={drug_name}&source={source}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `drug` (query, required): Drug name
|
||||||
|
- `source` (query, optional): Regulatory source (FDA, EMA, PMDA, Health Canada)
|
||||||
|
|
||||||
|
**Example Requests:**
|
||||||
|
```bash
|
||||||
|
# Get all labels for warfarin
|
||||||
|
curl "https://api.clinpgx.org/v1/drugLabel?drug=warfarin"
|
||||||
|
|
||||||
|
# Get only FDA labels
|
||||||
|
curl "https://api.clinpgx.org/v1/drugLabel?drug=warfarin&source=FDA"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example Response:**
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"id": "DL001234",
|
||||||
|
"drug": "warfarin",
|
||||||
|
"source": "FDA",
|
||||||
|
"sections": {
|
||||||
|
"testing": "Consider CYP2C9 and VKORC1 genotyping...",
|
||||||
|
"dosing": "Dose adjustment based on genotype...",
|
||||||
|
"warnings": "Risk of bleeding in certain genotypes"
|
||||||
|
},
|
||||||
|
"biomarkers": ["CYP2C9", "VKORC1"],
|
||||||
|
"testingRecommended": true,
|
||||||
|
"labelUrl": "https://dailymed.nlm.nih.gov/...",
|
||||||
|
"lastUpdated": "2024-01-15"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
### 9. Pathway Endpoint
|
||||||
|
|
||||||
|
Access pharmacokinetic and pharmacodynamic pathway diagrams and information.
|
||||||
|
|
||||||
|
#### Get Pathway by ID
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/pathway/{pathway_id}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `pathway_id` (path, required): ClinPGx pathway identifier
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl "https://api.clinpgx.org/v1/pathway/PA146123006"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Search Pathways
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /v1/pathway?drug={drug_name}&gene={gene}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `drug` (query, optional): Drug name
|
||||||
|
- `gene` (query, optional): Gene symbol
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```bash
|
||||||
|
curl "https://api.clinpgx.org/v1/pathway?drug=warfarin"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "PA146123006",
|
||||||
|
"name": "Warfarin Pharmacokinetics and Pharmacodynamics",
|
||||||
|
"drugs": ["warfarin"],
|
||||||
|
"genes": ["CYP2C9", "VKORC1", "CYP4F2", "GGCX"],
|
||||||
|
"description": "Warfarin is metabolized primarily by CYP2C9...",
|
||||||
|
"diagramUrl": "https://www.clinpgx.org/pathway/...",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"step": 1,
|
||||||
|
"process": "Absorption",
|
||||||
|
"genes": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"step": 2,
|
||||||
|
"process": "Metabolism",
|
||||||
|
"genes": ["CYP2C9", "CYP2C19"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"step": 3,
|
||||||
|
"process": "Target interaction",
|
||||||
|
"genes": ["VKORC1"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Query Patterns and Examples
|
||||||
|
|
||||||
|
### Common Query Patterns
|
||||||
|
|
||||||
|
#### 1. Patient Medication Review
|
||||||
|
|
||||||
|
Query all gene-drug pairs for a patient's medications:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
patient_meds = ["clopidogrel", "simvastatin", "codeine"]
|
||||||
|
patient_genes = {"CYP2C19": "*1/*2", "CYP2D6": "*1/*1", "SLCO1B1": "*1/*5"}
|
||||||
|
|
||||||
|
for med in patient_meds:
|
||||||
|
for gene in patient_genes:
|
||||||
|
response = requests.get(
|
||||||
|
"https://api.clinpgx.org/v1/geneDrugPair",
|
||||||
|
params={"gene": gene, "drug": med}
|
||||||
|
)
|
||||||
|
pairs = response.json()
|
||||||
|
# Check for interactions
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. Actionable Gene Panel
|
||||||
|
|
||||||
|
Find all genes with CPIC Level A recommendations:
|
||||||
|
|
||||||
|
```python
|
||||||
|
response = requests.get(
|
||||||
|
"https://api.clinpgx.org/v1/geneDrugPair",
|
||||||
|
params={"cpicLevel": "A"}
|
||||||
|
)
|
||||||
|
actionable_pairs = response.json()
|
||||||
|
|
||||||
|
genes = set(pair['gene'] for pair in actionable_pairs)
|
||||||
|
print(f"Panel should include: {sorted(genes)}")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Population Frequency Analysis
|
||||||
|
|
||||||
|
Compare allele frequencies across populations:
|
||||||
|
|
||||||
|
```python
|
||||||
|
alleles = requests.get(
    "https://api.clinpgx.org/v1/allele",
    params={"gene": "CYP2D6"}
).json()

# Sum the no-function allele frequencies per population
no_function_freq = {}
for allele in alleles:
    if allele['function'] == 'No function':
        for pop, freq in allele['frequencies'].items():
            no_function_freq[pop] = no_function_freq.get(pop, 0) + freq

# Poor metabolizers carry two no-function alleles, so under Hardy-Weinberg
# equilibrium their expected frequency is q^2, where q is the summed
# no-function allele frequency (not q itself)
pm_freq = {pop: q ** 2 for pop, q in no_function_freq.items()}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 4. Drug Safety Screen
|
||||||
|
|
||||||
|
Check for high-risk gene-drug associations:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Screen for HLA-B*57:01 before abacavir
response = requests.get(
    "https://api.clinpgx.org/v1/geneDrugPair",
    params={"gene": "HLA-B", "drug": "abacavir"}
)
pairs = response.json()

# Guard before indexing: an empty result list would otherwise raise IndexError
pair = pairs[0] if pairs else None

if pair and pair['cpicLevel'] == 'A':
    print("CRITICAL: Do not use if HLA-B*57:01 positive")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
### Common Error Responses
|
||||||
|
|
||||||
|
#### 404 Not Found
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"error": "Resource not found",
|
||||||
|
"message": "Gene 'INVALID' does not exist"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 429 Too Many Requests
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"error": "Rate limit exceeded",
|
||||||
|
"message": "Maximum 2 requests per second allowed"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Recommended Error Handling Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
|
||||||
|
def safe_query(url, params=None, max_retries=3):
    """Query the API with retries, returning parsed JSON or None.

    Args:
        url: API endpoint URL.
        params: Optional dict of query parameters.
        max_retries: Maximum number of attempts.

    Returns:
        Parsed JSON on HTTP 200; None on HTTP 404 or when all attempts are
        used up without a 200 response.

    Raises:
        requests.RequestException: If the final attempt raises a request
            or HTTP error.
    """
    final_index = max_retries - 1

    for attempt in range(max_retries):
        try:
            reply = requests.get(url, params=params, timeout=10)
            code = reply.status_code

            if code == 200:
                # Pause after a successful call to respect the rate limit
                time.sleep(0.5)
                return reply.json()

            if code == 404:
                print("Resource not found")
                return None

            if code == 429:
                # Back off exponentially before the next attempt
                backoff = 2 ** attempt
                print(f"Rate limited. Waiting {backoff}s...")
                time.sleep(backoff)
            else:
                reply.raise_for_status()

        except requests.RequestException as err:
            print(f"Attempt {attempt + 1} failed: {err}")
            if attempt == final_index:
                raise

    return None
|
||||||
|
```
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
### Rate Limiting
|
||||||
|
- Implement 500ms delay between requests (2 requests/second maximum)
|
||||||
|
- Use exponential backoff for rate limit errors
|
||||||
|
- Consider caching results for frequently accessed data
|
||||||
|
- For bulk operations, contact api@clinpgx.org
|
||||||
|
|
||||||
|
### Caching Strategy
|
||||||
|
```python
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def cached_query(cache_file, query_func, *args, **kwargs):
    """Return cached JSON from cache_file, or run query_func and cache it.

    Args:
        cache_file: Path of the JSON cache file.
        query_func: Callable invoked as query_func(*args, **kwargs) on a
            cache miss; its return value must be JSON-serializable.
        *args: Positional arguments forwarded to query_func.
        **kwargs: Keyword arguments forwarded to query_func.

    Returns:
        The data loaded from cache_file if it exists, otherwise the value
        returned by query_func. Falsy results (None, empty list/dict) are
        returned but not cached, so the query is retried on the next call.

    Raises:
        TypeError: If a truthy result cannot be serialized to JSON. No
            cache file is created in that case.
    """
    cache_path = Path(cache_file)

    if cache_path.exists():
        with open(cache_path, encoding="utf-8") as f:
            return json.load(f)

    result = query_func(*args, **kwargs)

    if result:
        # Serialize before opening the file so a serialization error cannot
        # leave behind an empty or truncated cache file.
        payload = json.dumps(result)
        with open(cache_path, 'w', encoding="utf-8") as f:
            f.write(payload)

    return result
|
||||||
|
```
|
||||||
|
|
||||||
|
### Batch Processing
|
||||||
|
```python
|
||||||
|
import time
|
||||||
|
|
||||||
|
def batch_gene_query(genes, delay=0.5, timeout=10):
    """Fetch gene records one at a time, pausing after each request.

    Args:
        genes: Iterable of gene symbols; each is appended to the gene
            endpoint URL.
        delay: Seconds to sleep after each request (0.5 s keeps the loop
            at or below 2 requests/second).
        timeout: Seconds to wait for each response before giving up, so a
            stalled connection cannot hang the whole batch.

    Returns:
        Dict mapping gene symbol to its parsed JSON record. Genes whose
        request did not return HTTP 200 are omitted.
    """
    results = {}
    for gene in genes:
        response = requests.get(
            f"https://api.clinpgx.org/v1/gene/{gene}",
            timeout=timeout,
        )
        if response.status_code == 200:
            results[gene] = response.json()
        time.sleep(delay)
    return results
|
||||||
|
```
|
||||||
|
|
||||||
|
## Data Schema Definitions
|
||||||
|
|
||||||
|
### Gene Object
|
||||||
|
```typescript
|
||||||
|
{
|
||||||
|
id: string; // ClinPGx gene ID
|
||||||
|
symbol: string; // HGNC gene symbol
|
||||||
|
name: string; // Full gene name
|
||||||
|
chromosome: string; // Chromosome location
|
||||||
|
function: string; // Pharmacogenomic function
|
||||||
|
clinicalAnnotations: number; // Count of annotations
|
||||||
|
relatedDrugs: string[]; // Associated drugs
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Drug Object
|
||||||
|
```typescript
|
||||||
|
{
|
||||||
|
id: string; // ClinPGx drug ID
|
||||||
|
name: string; // Generic name
|
||||||
|
tradeNames: string[]; // Brand names
|
||||||
|
drugClasses: string[]; // Therapeutic classes
|
||||||
|
indication: string; // Primary indication
|
||||||
|
relatedGenes: string[]; // Pharmacogenes
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Gene-Drug Pair Object
|
||||||
|
```typescript
|
||||||
|
{
|
||||||
|
gene: string; // Gene symbol
|
||||||
|
drug: string; // Drug name
|
||||||
|
sources: string[]; // CPIC, FDA, DPWG, etc.
|
||||||
|
cpicLevel: string; // A, B, C, D
|
||||||
|
evidenceLevel: string; // 1A, 1B, 2A, 2B, 3, 4
|
||||||
|
hasGuideline: boolean; // Has clinical guideline
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Allele Object
|
||||||
|
```typescript
|
||||||
|
{
|
||||||
|
name: string; // Allele name (e.g., CYP2D6*4)
|
||||||
|
gene: string; // Gene symbol
|
||||||
|
function: string; // Normal/decreased/no/increased/uncertain
|
||||||
|
activityScore: number; // 0.0 to 2.0+
|
||||||
|
frequencies: { // Population frequencies
|
||||||
|
[population: string]: number;
|
||||||
|
};
|
||||||
|
definingVariants: string[]; // rsIDs or descriptions
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Stability and Versioning
|
||||||
|
|
||||||
|
### Current Status
|
||||||
|
- API version: v1
|
||||||
|
- Stability: Beta - endpoints stable, parameters may change
|
||||||
|
- Monitor: https://blog.clinpgx.org/ for updates
|
||||||
|
|
||||||
|
### Migration from PharmGKB
|
||||||
|
As of July 2025, PharmGKB URLs redirect to ClinPGx. Update references:
|
||||||
|
- Old: `https://api.pharmgkb.org/`
|
||||||
|
- New: `https://api.clinpgx.org/`
|
||||||
|
|
||||||
|
### Future Changes
|
||||||
|
- Watch for API v2 announcements
|
||||||
|
- Breaking changes will be announced on ClinPGx Blog
|
||||||
|
- Consider version pinning for production applications
|
||||||
|
|
||||||
|
## Support and Contact
|
||||||
|
|
||||||
|
- **API Issues**: api@clinpgx.org
|
||||||
|
- **Documentation**: https://api.clinpgx.org/
|
||||||
|
- **General Questions**: https://www.clinpgx.org/page/faqs
|
||||||
|
- **Blog**: https://blog.clinpgx.org/
|
||||||
|
- **CPIC Guidelines**: https://cpicpgx.org/
|
||||||
|
|
||||||
|
## Related Resources
|
||||||
|
|
||||||
|
- **PharmCAT**: Pharmacogenomic variant calling and annotation tool
|
||||||
|
- **PharmVar**: Pharmacogene allele nomenclature database
|
||||||
|
- **CPIC**: Clinical Pharmacogenetics Implementation Consortium
|
||||||
|
- **DPWG**: Dutch Pharmacogenetics Working Group
|
||||||
|
- **ClinGen**: Clinical Genome Resource
|
||||||
518
scientific-databases/clinpgx-database/scripts/query_clinpgx.py
Executable file
518
scientific-databases/clinpgx-database/scripts/query_clinpgx.py
Executable file
@@ -0,0 +1,518 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
ClinPGx API Query Helper Script
|
||||||
|
|
||||||
|
Provides ready-to-use functions for querying the ClinPGx database API.
|
||||||
|
Includes rate limiting, error handling, and caching functionality.
|
||||||
|
|
||||||
|
ClinPGx API: https://api.clinpgx.org/
|
||||||
|
Rate limit: 2 requests per second
|
||||||
|
License: Creative Commons Attribution-ShareAlike 4.0 International
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional, Any
|
||||||
|
|
||||||
|
# API Configuration
|
||||||
|
BASE_URL = "https://api.clinpgx.org/v1/"
|
||||||
|
RATE_LIMIT_DELAY = 0.5 # 500ms delay = 2 requests/second
|
||||||
|
|
||||||
|
|
||||||
|
def rate_limited_request(url: str, params: Optional[Dict] = None, delay: float = RATE_LIMIT_DELAY,
                         timeout: float = 10) -> requests.Response:
    """
    Make a GET request, then pause so back-to-back calls stay under the rate limit.

    Args:
        url: API endpoint URL
        params: Query parameters
        delay: Seconds to sleep after the request (default 0.5s for 2 req/sec)
        timeout: Seconds to wait for the server before giving up (default 10s)

    Returns:
        Response object; the HTTP status is not checked here

    Raises:
        requests.exceptions.RequestException: If the connection fails or times out
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=timeout)
    time.sleep(delay)
    return response
|
||||||
|
|
||||||
|
|
||||||
|
def safe_api_call(url: str, params: Optional[Dict] = None, max_retries: int = 3) -> Optional[Dict]:
    """
    Make API call with error handling and exponential backoff retry.

    Rate-limit responses (429), server errors and network failures are retried
    with an exponentially growing pause. A 404 or any other 4xx response is
    returned as a failure immediately, because repeating the same request
    cannot succeed.

    Args:
        url: API endpoint URL
        params: Query parameters
        max_retries: Maximum number of attempts

    Returns:
        JSON response data or None on failure
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        backoff = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s

        try:
            response = requests.get(url, params=params, timeout=10)
            status = response.status_code

            if status == 200:
                # Decode first so a malformed body is handled by the except below.
                data = response.json()
                time.sleep(RATE_LIMIT_DELAY)
                return data

            if status == 404:
                print(f"Resource not found: {url}")
                return None

            if status == 429:
                # Rate limit exceeded; only wait when another attempt will follow.
                if not is_last_attempt:
                    print(f"Rate limit exceeded. Waiting {backoff}s before retry...")
                    time.sleep(backoff)
                continue

            if 400 <= status < 500:
                # Client errors (bad request, unauthorized, ...) are not transient.
                print(f"Request rejected with HTTP {status}: {url}")
                return None

            # Raises HTTPError for 5xx, which is retried by the handler below.
            response.raise_for_status()

            # Any remaining status is a non-error response without the expected body.
            print(f"Unexpected HTTP status {status}: {url}")
            return None

        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers invalid JSON on requests versions where the
            # decode error is not a RequestException subclass.
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
            if not is_last_attempt:
                time.sleep(backoff)

    print(f"Failed after {max_retries} attempts")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def cached_query(cache_file: str, query_func, *args, **kwargs) -> Any:
    """
    Cache API results to avoid repeated queries.

    A readable cache file is returned as-is. A missing, unreadable or corrupt
    cache file is treated as a cache miss and overwritten with the fresh result.

    Args:
        cache_file: Path to cache file (parent directories are created on write)
        query_func: Function to call if cache miss
        *args, **kwargs: Arguments to pass to query_func

    Returns:
        Cached or freshly queried data; a None result is returned but never cached

    Raises:
        TypeError: If the fresh result is not JSON-serializable
    """
    cache_path = Path(cache_file)

    if cache_path.exists():
        try:
            with open(cache_path, encoding="utf-8") as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"Ignoring unreadable cache {cache_file}: {e}")
        else:
            print(f"Loading from cache: {cache_file}")
            return cached

    print("Cache miss. Querying API...")
    result = query_func(*args, **kwargs)

    if result is not None:
        # Serialize before opening the file so a failure cannot leave a
        # truncated cache behind that would break later calls.
        payload = json.dumps(result, indent=2)
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        cache_path.write_text(payload, encoding="utf-8")
        print(f"Cached to: {cache_file}")

    return result
|
||||||
|
|
||||||
|
|
||||||
|
# Core Query Functions
|
||||||
|
|
||||||
|
def get_gene_info(gene_symbol: str) -> Optional[Dict]:
    """
    Fetch the ClinPGx record for a single pharmacogene.

    Args:
        gene_symbol: Gene symbol appended to the gene endpoint (e.g., "CYP2D6", "TPMT")

    Returns:
        Whatever safe_api_call returns for the gene endpoint (None on failure)

    Example:
        >>> gene_data = get_gene_info("CYP2D6")
        >>> print(gene_data['symbol'], gene_data['name'])
    """
    endpoint = "{}gene/{}".format(BASE_URL, gene_symbol)
    return safe_api_call(endpoint)
|
||||||
|
|
||||||
|
|
||||||
|
def get_drug_info(drug_name: str) -> Optional[List[Dict]]:
    """
    Look up drugs/chemicals whose name matches the given string.

    Args:
        drug_name: Drug name sent as the "name" query parameter (e.g., "warfarin", "codeine")

    Returns:
        Whatever safe_api_call returns for the chemical endpoint (None on failure)

    Example:
        >>> drugs = get_drug_info("warfarin")
        >>> for drug in drugs:
        ...     print(drug['name'], drug['id'])
    """
    return safe_api_call(f"{BASE_URL}chemical", {"name": drug_name})
|
||||||
|
|
||||||
|
|
||||||
|
def get_gene_drug_pairs(gene: Optional[str] = None, drug: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Query the gene-drug pair endpoint, optionally filtered by gene and/or drug.

    Args:
        gene: Gene symbol filter; omitted from the query when empty or None
        drug: Drug name filter; omitted from the query when empty or None

    Returns:
        Whatever safe_api_call returns for the geneDrugPair endpoint (None on failure)

    Example:
        >>> # Get all pairs for CYP2D6
        >>> pairs = get_gene_drug_pairs(gene="CYP2D6")
        >>>
        >>> # Get specific gene-drug pair
        >>> pair = get_gene_drug_pairs(gene="CYP2D6", drug="codeine")
    """
    # Only filters the caller actually supplied become query parameters.
    filters = {key: value for key, value in (("gene", gene), ("drug", drug)) if value}
    return safe_api_call(f"{BASE_URL}geneDrugPair", filters)
|
||||||
|
|
||||||
|
|
||||||
|
def get_cpic_guidelines(gene: Optional[str] = None, drug: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Query the guideline endpoint with the source fixed to CPIC.

    Args:
        gene: Gene symbol filter; omitted from the query when empty or None
        drug: Drug name filter; omitted from the query when empty or None

    Returns:
        Whatever safe_api_call returns for the guideline endpoint (None on failure)

    Example:
        >>> # Get all CPIC guidelines
        >>> guidelines = get_cpic_guidelines()
        >>>
        >>> # Get guideline for specific gene-drug
        >>> guideline = get_cpic_guidelines(gene="CYP2C19", drug="clopidogrel")
    """
    query = {"source": "CPIC"}
    # Append the optional filters in a fixed order: gene first, then drug.
    query.update((key, value) for key, value in (("gene", gene), ("drug", drug)) if value)
    return safe_api_call("{}guideline".format(BASE_URL), query)
|
||||||
|
|
||||||
|
|
||||||
|
def get_alleles(gene: str) -> Optional[List[Dict]]:
    """
    List the alleles recorded for a pharmacogene.

    Args:
        gene: Gene symbol sent as the "gene" query parameter (e.g., "CYP2D6")

    Returns:
        Whatever safe_api_call returns for the allele endpoint (None on failure)

    Example:
        >>> alleles = get_alleles("CYP2D6")
        >>> for allele in alleles:
        ...     print(f"{allele['name']}: {allele['function']}")
    """
    return safe_api_call(f"{BASE_URL}allele", {"gene": gene})
|
||||||
|
|
||||||
|
|
||||||
|
def get_allele_info(allele_name: str) -> Optional[Dict]:
    """
    Fetch the record for one named allele.

    Args:
        allele_name: Allele name appended to the allele endpoint (e.g., "CYP2D6*4")

    Returns:
        Whatever safe_api_call returns for that allele (None on failure)

    Example:
        >>> allele = get_allele_info("CYP2D6*4")
        >>> print(allele['function'], allele['frequencies'])
    """
    endpoint = "{}allele/{}".format(BASE_URL, allele_name)
    return safe_api_call(endpoint)
|
||||||
|
|
||||||
|
|
||||||
|
def get_clinical_annotations(
    gene: Optional[str] = None,
    drug: Optional[str] = None,
    evidence_level: Optional[str] = None
) -> Optional[List[Dict]]:
    """
    Query the clinical annotation endpoint with optional filters.

    Args:
        gene: Gene symbol filter; omitted from the query when empty or None
        drug: Drug name filter; omitted from the query when empty or None
        evidence_level: Evidence level filter (1A, 1B, 2A, 2B, 3, 4), sent as "evidenceLevel"

    Returns:
        Whatever safe_api_call returns for the clinicalAnnotation endpoint (None on failure)

    Example:
        >>> # Get all annotations for CYP2D6
        >>> annotations = get_clinical_annotations(gene="CYP2D6")
        >>>
        >>> # Get high-quality evidence only
        >>> high_quality = get_clinical_annotations(evidence_level="1A")
    """
    candidates = (
        ("gene", gene),
        ("drug", drug),
        ("evidenceLevel", evidence_level),
    )
    # Keep only the filters that were actually provided.
    query = {name: value for name, value in candidates if value}
    return safe_api_call(f"{BASE_URL}clinicalAnnotation", query)
|
||||||
|
|
||||||
|
|
||||||
|
def get_drug_labels(drug: str, source: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Query the drug label endpoint for one drug.

    Args:
        drug: Drug name, always sent as the "drug" query parameter
        source: Regulatory source filter (e.g., "FDA", "EMA"); omitted when empty or None

    Returns:
        Whatever safe_api_call returns for the drugLabel endpoint (None on failure)

    Example:
        >>> # Get all labels for warfarin
        >>> labels = get_drug_labels("warfarin")
        >>>
        >>> # Get only FDA labels
        >>> fda_labels = get_drug_labels("warfarin", source="FDA")
    """
    query = {"drug": drug, "source": source} if source else {"drug": drug}
    return safe_api_call(f"{BASE_URL}drugLabel", query)
|
||||||
|
|
||||||
|
|
||||||
|
def search_variants(rsid: Optional[str] = None, chromosome: Optional[str] = None,
                    position: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Look up variants either by rsID or by genomic location.

    When an rsID is given it takes precedence and the location arguments are
    ignored; otherwise the supplied location fields are sent as query filters.

    Args:
        rsid: dbSNP rsID appended to the variant endpoint (e.g., "rs4244285")
        chromosome: Chromosome filter; omitted from the query when empty or None
        position: Position filter; omitted from the query when empty or None

    Returns:
        Whatever safe_api_call returns for the variant endpoint (None on failure)

    Example:
        >>> # Search by rsID
        >>> variant = search_variants(rsid="rs4244285")
        >>>
        >>> # Search by position
        >>> variants = search_variants(chromosome="10", position="94781859")
    """
    if rsid:
        return safe_api_call(f"{BASE_URL}variant/{rsid}")

    location = {
        key: value
        for key, value in (("chromosome", chromosome), ("position", position))
        if value
    }
    return safe_api_call(f"{BASE_URL}variant", location)
|
||||||
|
|
||||||
|
|
||||||
|
def get_pathway_info(pathway_id: Optional[str] = None, drug: Optional[str] = None) -> Optional[Any]:
    """
    Fetch one pathway by ID, or query the pathway endpoint filtered by drug.

    A pathway ID takes precedence: when it is given, the drug argument is ignored.

    Args:
        pathway_id: Pathway ID appended to the pathway endpoint (optional)
        drug: Drug name filter, used only when no pathway ID is given (optional)

    Returns:
        Whatever safe_api_call returns for the pathway endpoint (None on failure)

    Example:
        >>> # Get specific pathway
        >>> pathway = get_pathway_info(pathway_id="PA146123006")
        >>>
        >>> # Get all pathways for a drug
        >>> pathways = get_pathway_info(drug="warfarin")
    """
    if not pathway_id:
        query = {"drug": drug} if drug else {}
        return safe_api_call(f"{BASE_URL}pathway", query)

    return safe_api_call(f"{BASE_URL}pathway/{pathway_id}")
|
||||||
|
|
||||||
|
|
||||||
|
# Utility Functions
|
||||||
|
|
||||||
|
def export_to_dataframe(data: List[Dict], output_file: Optional[str] = None):
    """
    Build a pandas DataFrame from API records and optionally save it as CSV.

    Args:
        data: List of dictionaries from API
        output_file: CSV path to write (without the index column); skipped when empty or None

    Returns:
        pandas DataFrame, or None when pandas is not installed

    Example:
        >>> pairs = get_gene_drug_pairs(gene="CYP2D6")
        >>> df = export_to_dataframe(pairs, "cyp2d6_pairs.csv")
        >>> print(df.head())
    """
    # pandas is imported here so the rest of the module works without it.
    try:
        import pandas
    except ImportError:
        print("pandas not installed. Install with: pip install pandas")
        return None

    frame = pandas.DataFrame(data)
    if not output_file:
        return frame

    frame.to_csv(output_file, index=False)
    print(f"Data exported to: {output_file}")
    return frame
|
||||||
|
|
||||||
|
|
||||||
|
def batch_gene_query(gene_list: List[str], delay: float = 0.5) -> Dict[str, Dict]:
    """
    Query multiple genes in batch with rate limiting.

    Args:
        gene_list: List of gene symbols
        delay: Delay between consecutive requests (default 0.5s)

    Returns:
        Dictionary mapping gene symbols to gene data; genes whose lookup
        returned nothing are left out

    Example:
        >>> genes = ["CYP2D6", "CYP2C19", "CYP2C9", "TPMT"]
        >>> results = batch_gene_query(genes)
        >>> for gene, data in results.items():
        ...     print(f"{gene}: {data['name']}")
    """
    results = {}
    total = len(gene_list)

    print(f"Querying {total} genes with {delay}s delay between requests...")

    for index, gene in enumerate(gene_list):
        if index:
            # Pause only between requests, so no time is wasted after the last gene.
            time.sleep(delay)
        print(f"Fetching: {gene}")
        data = get_gene_info(gene)
        if data:
            results[gene] = data

    print(f"Completed: {len(results)}/{total} successful")
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def find_actionable_gene_drug_pairs(cpic_level: str = "A") -> Optional[List[Dict]]:
    """
    Query gene-drug pairs filtered by CPIC recommendation level.

    Args:
        cpic_level: CPIC recommendation level (A, B, C, D), sent as "cpicLevel"

    Returns:
        Whatever safe_api_call returns for the geneDrugPair endpoint (None on failure)

    Example:
        >>> # Get all Level A recommendations
        >>> actionable = find_actionable_gene_drug_pairs(cpic_level="A")
        >>> for pair in actionable:
        ...     print(f"{pair['gene']} - {pair['drug']}")
    """
    return safe_api_call(f"{BASE_URL}geneDrugPair", {"cpicLevel": cpic_level})
|
||||||
|
|
||||||
|
|
||||||
|
# Example Usage
|
||||||
|
if __name__ == "__main__":
    # Demo run: each example prints a banner, calls one query helper and
    # shows a few fields of the result when the call succeeded.
    separator = "=" * 60

    print("ClinPGx API Query Examples\n")

    # Example 1: single gene record
    print(separator, "Example 1: Get CYP2D6 gene information", separator, sep="\n")
    gene_record = get_gene_info("CYP2D6")
    if gene_record:
        print(f"Gene: {gene_record.get('symbol')}")
        print(f"Name: {gene_record.get('name')}")
    print()

    # Example 2: drug search, showing only the first match
    print(separator, "Example 2: Search for warfarin", separator, sep="\n")
    drug_matches = get_drug_info("warfarin")
    if drug_matches:
        for match in drug_matches[:1]:
            print(f"Drug: {match.get('name')}")
            print(f"ID: {match.get('id')}")
    print()

    # Example 3: one specific gene-drug pair
    print(separator, "Example 3: Get CYP2C19-clopidogrel pair", separator, sep="\n")
    pairs = get_gene_drug_pairs(gene="CYP2C19", drug="clopidogrel")
    if pairs:
        print(f"Found {len(pairs)} gene-drug pair(s)")
        print(f"Annotations: {pairs[0].get('sources', [])}")
    print()

    # Example 4: CPIC guidelines, showing the first two
    print(separator, "Example 4: Get CPIC guidelines for CYP2C19", separator, sep="\n")
    cpic_guidelines = get_cpic_guidelines(gene="CYP2C19")
    if cpic_guidelines:
        print(f"Found {len(cpic_guidelines)} guideline(s)")
        for guideline in cpic_guidelines[:2]:
            print(f"  - {guideline.get('name')}")
    print()

    # Example 5: alleles of a gene, showing the first three
    print(separator, "Example 5: Get CYP2D6 alleles", separator, sep="\n")
    allele_records = get_alleles("CYP2D6")
    if allele_records:
        print(f"Found {len(allele_records)} allele(s)")
        for record in allele_records[:3]:
            print(f"  - {record.get('name')}: {record.get('function')}")
    print()

    print(separator, "Examples completed!", separator, sep="\n")
|
||||||
292
scientific-databases/ensembl-database/SKILL.md
Normal file
292
scientific-databases/ensembl-database/SKILL.md
Normal file
@@ -0,0 +1,292 @@
|
|||||||
|
---
|
||||||
|
name: ensembl-database
|
||||||
|
description: Work with the Ensembl genome database to query genomic data, retrieve sequences, analyze variants, and perform comparative genomics. This skill should be used when working with vertebrate genomic data, gene annotations, variant analysis, ortholog identification, or when users need to query the Ensembl REST API for genomic information across multiple species.
|
||||||
|
---
|
||||||
|
|
||||||
|
# Ensembl Database
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Access and query the Ensembl genome database, a comprehensive resource for vertebrate genomic data maintained by EMBL-EBI. The database provides gene annotations, sequences, variants, regulatory information, and comparative genomics data for over 250 species. Current release is 115 (September 2025).
|
||||||
|
|
||||||
|
## Core Capabilities
|
||||||
|
|
||||||
|
### 1. Gene Information Retrieval
|
||||||
|
|
||||||
|
Query gene data by symbol, Ensembl ID, or external database identifiers.
|
||||||
|
|
||||||
|
**Common operations:**
|
||||||
|
- Look up gene information by symbol (e.g., "BRCA2", "TP53")
|
||||||
|
- Retrieve transcript and protein information
|
||||||
|
- Get gene coordinates and chromosomal locations
|
||||||
|
- Access cross-references to external databases (UniProt, RefSeq, etc.)
|
||||||
|
|
||||||
|
**Using the ensembl_rest package:**
|
||||||
|
```python
|
||||||
|
from ensembl_rest import EnsemblClient
|
||||||
|
|
||||||
|
client = EnsemblClient()
|
||||||
|
|
||||||
|
# Look up gene by symbol
|
||||||
|
gene_data = client.symbol_lookup(
|
||||||
|
species='human',
|
||||||
|
symbol='BRCA2'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get detailed gene information
|
||||||
|
gene_info = client.lookup_id(
|
||||||
|
id='ENSG00000139618', # BRCA2 Ensembl ID
|
||||||
|
expand=True
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Direct REST API (no package):**
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
server = "https://rest.ensembl.org"
|
||||||
|
|
||||||
|
# Symbol lookup
|
||||||
|
response = requests.get(
|
||||||
|
f"{server}/lookup/symbol/homo_sapiens/BRCA2",
|
||||||
|
headers={"Content-Type": "application/json"}
|
||||||
|
)
|
||||||
|
gene_data = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Sequence Retrieval
|
||||||
|
|
||||||
|
Fetch genomic, transcript, or protein sequences in various formats (JSON, FASTA, plain text).
|
||||||
|
|
||||||
|
**Operations:**
|
||||||
|
- Get DNA sequences for genes or genomic regions
|
||||||
|
- Retrieve transcript sequences (cDNA)
|
||||||
|
- Access protein sequences
|
||||||
|
- Extract sequences with flanking regions or modifications
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
# Using ensembl_rest package
|
||||||
|
sequence = client.sequence_id(
|
||||||
|
id='ENSG00000139618', # Gene ID
|
||||||
|
content_type='application/json'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get sequence for a genomic region
|
||||||
|
region_seq = client.sequence_region(
|
||||||
|
species='human',
|
||||||
|
region='7:140424943-140624564' # chromosome:start-end
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Variant Analysis
|
||||||
|
|
||||||
|
Query genetic variation data and predict variant consequences using the Variant Effect Predictor (VEP).
|
||||||
|
|
||||||
|
**Capabilities:**
|
||||||
|
- Look up variants by rsID or genomic coordinates
|
||||||
|
- Predict functional consequences of variants
|
||||||
|
- Access population frequency data
|
||||||
|
- Retrieve phenotype associations
|
||||||
|
|
||||||
|
**VEP example:**
|
||||||
|
```python
|
||||||
|
# Predict variant consequences
|
||||||
|
vep_result = client.vep_hgvs(
|
||||||
|
species='human',
|
||||||
|
hgvs_notation='ENST00000380152.7:c.803C>T'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Query variant by rsID
|
||||||
|
variant = client.variation_id(
|
||||||
|
species='human',
|
||||||
|
id='rs699'
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Comparative Genomics
|
||||||
|
|
||||||
|
Perform cross-species comparisons to identify orthologs, paralogs, and evolutionary relationships.
|
||||||
|
|
||||||
|
**Operations:**
|
||||||
|
- Find orthologs (same gene in different species)
|
||||||
|
- Identify paralogs (related genes in same species)
|
||||||
|
- Access gene trees showing evolutionary relationships
|
||||||
|
- Retrieve gene family information
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
# Find orthologs for a human gene
|
||||||
|
orthologs = client.homology_ensemblgene(
|
||||||
|
id='ENSG00000139618', # Human BRCA2
|
||||||
|
target_species='mouse'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get gene tree
|
||||||
|
gene_tree = client.genetree_member_symbol(
|
||||||
|
species='human',
|
||||||
|
symbol='BRCA2'
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Genomic Region Analysis
|
||||||
|
|
||||||
|
Find all genomic features (genes, transcripts, regulatory elements) in a specific region.
|
||||||
|
|
||||||
|
**Use cases:**
|
||||||
|
- Identify all genes in a chromosomal region
|
||||||
|
- Find regulatory features (promoters, enhancers)
|
||||||
|
- Locate variants within a region
|
||||||
|
- Retrieve structural features
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
# Find all features in a region
|
||||||
|
features = client.overlap_region(
|
||||||
|
species='human',
|
||||||
|
region='7:140424943-140624564',
|
||||||
|
feature='gene'
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Assembly Mapping
|
||||||
|
|
||||||
|
Convert coordinates between different genome assemblies (e.g., GRCh37 to GRCh38).
|
||||||
|
|
||||||
|
**Important:** Use `https://grch37.rest.ensembl.org` for GRCh37/hg19 queries and `https://rest.ensembl.org` for current assemblies.
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
from ensembl_rest import AssemblyMapper
|
||||||
|
|
||||||
|
# Map coordinates from GRCh37 to GRCh38
|
||||||
|
mapper = AssemblyMapper(
|
||||||
|
species='human',
|
||||||
|
asm_from='GRCh37',
|
||||||
|
asm_to='GRCh38'
|
||||||
|
)
|
||||||
|
|
||||||
|
mapped = mapper.map(chrom='7', start=140453136, end=140453136)
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Best Practices
|
||||||
|
|
||||||
|
### Rate Limiting
|
||||||
|
|
||||||
|
The Ensembl REST API has rate limits. Follow these practices:
|
||||||
|
|
||||||
|
1. **Respect rate limits:** 55,000 requests per hour per client, which averages out to about 15 requests per second
|
||||||
|
2. **Handle 429 responses:** When rate-limited, check the `Retry-After` header and wait
|
||||||
|
3. **Use batch endpoints:** When querying multiple items, use batch endpoints where available
|
||||||
|
4. **Cache results:** Store frequently accessed data to reduce API calls
|
||||||
|
|
||||||
|
### Error Handling
|
||||||
|
|
||||||
|
Always implement proper error handling:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
|
||||||
|
def query_ensembl(endpoint, params=None, max_retries=3):
    """Query an Ensembl REST endpoint and return the decoded JSON body.

    Args:
        endpoint: Path appended to the server URL (e.g. "/lookup/id/ENSG00000139618")
        params: Optional query parameters
        max_retries: Maximum number of attempts when the server answers 429

    Returns:
        Decoded JSON response

    Raises:
        requests.exceptions.HTTPError: For 4xx/5xx responses other than 429
        requests.exceptions.RequestException: On connection failure or timeout
        RuntimeError: If no attempt returned a 200 response
    """
    server = "https://rest.ensembl.org"
    headers = {"Content-Type": "application/json"}

    for attempt in range(max_retries):
        response = requests.get(
            f"{server}{endpoint}",
            headers=headers,
            params=params,
            timeout=30,  # never hang forever on a stalled connection
        )

        if response.status_code == 200:
            return response.json()
        elif response.status_code == 429:
            # Rate limited - wait and retry. Retry-After may be a decimal
            # number of seconds (e.g. "40.0"), which int() would reject.
            if attempt < max_retries - 1:
                retry_after = float(response.headers.get('Retry-After', 1))
                time.sleep(retry_after)
        else:
            response.raise_for_status()

    raise RuntimeError(f"Failed after {max_retries} attempts")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
### Python Package (Recommended)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install ensembl_rest
|
||||||
|
```
|
||||||
|
|
||||||
|
The `ensembl_rest` package provides a Pythonic interface to all Ensembl REST API endpoints.
|
||||||
|
|
||||||
|
### Direct REST API
|
||||||
|
|
||||||
|
No installation needed - use standard HTTP libraries like `requests`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install requests
|
||||||
|
```
|
||||||
|
|
||||||
|
## Resources
|
||||||
|
|
||||||
|
### references/
|
||||||
|
|
||||||
|
- `api_endpoints.md`: Comprehensive documentation of all 17 API endpoint categories with examples and parameters
|
||||||
|
|
||||||
|
### scripts/
|
||||||
|
|
||||||
|
- `ensembl_query.py`: Reusable Python script for common Ensembl queries with built-in rate limiting and error handling
|
||||||
|
|
||||||
|
## Common Workflows
|
||||||
|
|
||||||
|
### Workflow 1: Gene Annotation Pipeline
|
||||||
|
|
||||||
|
1. Look up gene by symbol to get Ensembl ID
|
||||||
|
2. Retrieve transcript information
|
||||||
|
3. Get protein sequences for all transcripts
|
||||||
|
4. Find orthologs in other species
|
||||||
|
5. Export results
|
||||||
|
|
||||||
|
### Workflow 2: Variant Analysis
|
||||||
|
|
||||||
|
1. Query variant by rsID or coordinates
|
||||||
|
2. Use VEP to predict functional consequences
|
||||||
|
3. Check population frequencies
|
||||||
|
4. Retrieve phenotype associations
|
||||||
|
5. Generate report
|
||||||
|
|
||||||
|
### Workflow 3: Comparative Analysis
|
||||||
|
|
||||||
|
1. Start with gene of interest in reference species
|
||||||
|
2. Find orthologs in target species
|
||||||
|
3. Retrieve sequences for all orthologs
|
||||||
|
4. Compare gene structures and features
|
||||||
|
5. Analyze evolutionary conservation
|
||||||
|
|
||||||
|
## Species and Assembly Information
|
||||||
|
|
||||||
|
To query available species and assemblies:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# List all available species
|
||||||
|
species_list = client.info_species()
|
||||||
|
|
||||||
|
# Get assembly information for a species
|
||||||
|
assembly_info = client.info_assembly(species='human')
|
||||||
|
```
|
||||||
|
|
||||||
|
Common species identifiers:
|
||||||
|
- Human: `homo_sapiens` or `human`
|
||||||
|
- Mouse: `mus_musculus` or `mouse`
|
||||||
|
- Zebrafish: `danio_rerio` or `zebrafish`
|
||||||
|
- Fruit fly: `drosophila_melanogaster`
|
||||||
|
|
||||||
|
## Additional Resources
|
||||||
|
|
||||||
|
- **Official Documentation:** https://rest.ensembl.org/documentation
|
||||||
|
- **Python Package Docs:** https://ensemblrest.readthedocs.io
|
||||||
|
- **EBI Training:** https://www.ebi.ac.uk/training/online/courses/ensembl-rest-api/
|
||||||
|
- **Ensembl Browser:** https://useast.ensembl.org
|
||||||
|
- **GitHub Examples:** https://github.com/Ensembl/ensembl-rest/wiki
|
||||||
@@ -0,0 +1,346 @@
|
|||||||
|
# Ensembl REST API Endpoints Reference
|
||||||
|
|
||||||
|
Comprehensive documentation of the 16 API endpoint categories of the Ensembl REST API covered below (Release 115, September 2025).
|
||||||
|
|
||||||
|
**Base URLs:**
|
||||||
|
- Current assemblies: `https://rest.ensembl.org`
|
||||||
|
- GRCh37/hg19 (human): `https://grch37.rest.ensembl.org`
|
||||||
|
|
||||||
|
**Rate Limits:**
|
||||||
|
- Anonymous: 15 requests/second
|
||||||
|
- Registered: 55,000 requests/hour
|
||||||
|
|
||||||
|
## 1. Archive
|
||||||
|
|
||||||
|
Retrieve historical information about retired Ensembl identifiers.
|
||||||
|
|
||||||
|
**GET /archive/id/:id**
|
||||||
|
- Retrieve archived entries for a retired identifier
|
||||||
|
- Example: `/archive/id/ENSG00000157764` (returns the latest version record for this stable ID; the same call works for retired IDs)
|
||||||
|
|
||||||
|
## 2. Comparative Genomics
|
||||||
|
|
||||||
|
Access gene trees, genomic alignments, and homology data across species.
|
||||||
|
|
||||||
|
**GET /alignment/region/:species/:region**
|
||||||
|
- Get genomic alignments for a region
|
||||||
|
- Example: `/alignment/region/human/2:106040000-106040050:1?species_set_group=mammals`
|
||||||
|
|
||||||
|
**GET /genetree/id/:id**
|
||||||
|
- Retrieve gene tree for a gene family
|
||||||
|
- Example: `/genetree/id/ENSGT00390000003602`
|
||||||
|
|
||||||
|
**GET /genetree/member/id/:id**
|
||||||
|
- Get gene tree by member gene ID
|
||||||
|
- Example: `/genetree/member/id/ENSG00000139618`
|
||||||
|
|
||||||
|
**GET /homology/id/:id**
|
||||||
|
- Find orthologs and paralogs for a gene
|
||||||
|
- Parameters: `target_species`, `type` (orthologues, paralogues, all)
|
||||||
|
- Example: `/homology/id/ENSG00000139618?target_species=mouse`
|
||||||
|
|
||||||
|
**GET /homology/symbol/:species/:symbol**
|
||||||
|
- Find homologs by gene symbol
|
||||||
|
- Example: `/homology/symbol/human/BRCA2?target_species=mouse`
|
||||||
|
|
||||||
|
## 3. Cross References
|
||||||
|
|
||||||
|
Link external database identifiers to Ensembl objects.
|
||||||
|
|
||||||
|
**GET /xrefs/id/:id**
|
||||||
|
- Get external references for Ensembl ID
|
||||||
|
- Example: `/xrefs/id/ENSG00000139618`
|
||||||
|
|
||||||
|
**GET /xrefs/symbol/:species/:symbol**
|
||||||
|
- Get cross-references by gene symbol
|
||||||
|
- Example: `/xrefs/symbol/human/BRCA2`
|
||||||
|
|
||||||
|
**GET /xrefs/name/:species/:name**
|
||||||
|
- Search for objects by external name
|
||||||
|
- Example: `/xrefs/name/human/NP_000050`
|
||||||
|
|
||||||
|
## 4. Information
|
||||||
|
|
||||||
|
Query metadata about species, assemblies, biotypes, and database versions.
|
||||||
|
|
||||||
|
**GET /info/species**
|
||||||
|
- List all available species
|
||||||
|
- Returns species names, assemblies, taxonomy IDs
|
||||||
|
|
||||||
|
**GET /info/assembly/:species**
|
||||||
|
- Get assembly information for a species
|
||||||
|
- Example: `/info/assembly/human` (returns GRCh38.p14)
|
||||||
|
|
||||||
|
**GET /info/assembly/:species/:region**
|
||||||
|
- Get detailed information about a chromosomal region
|
||||||
|
- Example: `/info/assembly/human/X`
|
||||||
|
|
||||||
|
**GET /info/biotypes/:species**
|
||||||
|
- List all available biotypes (gene types)
|
||||||
|
- Example: `/info/biotypes/human`
|
||||||
|
|
||||||
|
**GET /info/analysis/:species**
|
||||||
|
- List available analysis types
|
||||||
|
- Example: `/info/analysis/human`
|
||||||
|
|
||||||
|
**GET /info/data**
|
||||||
|
- Get general information about the current Ensembl release
|
||||||
|
|
||||||
|
## 5. Linkage Disequilibrium (LD)
|
||||||
|
|
||||||
|
Calculate linkage disequilibrium between variants.
|
||||||
|
|
||||||
|
**GET /ld/:species/:id/:population_name**
|
||||||
|
- Calculate LD for a variant
|
||||||
|
- Example: `/ld/human/rs1042522/1000GENOMES:phase_3:KHV`
|
||||||
|
|
||||||
|
**GET /ld/:species/pairwise/:id1/:id2**
|
||||||
|
- Calculate LD between two variants
|
||||||
|
- Example: `/ld/human/pairwise/rs1042522/rs11540652`
|
||||||
|
|
||||||
|
## 6. Lookup
|
||||||
|
|
||||||
|
Identify species and database information for identifiers.
|
||||||
|
|
||||||
|
**GET /lookup/id/:id**
|
||||||
|
- Look up object by Ensembl ID
|
||||||
|
- Parameter: `expand` (include child objects)
|
||||||
|
- Example: `/lookup/id/ENSG00000139618?expand=1`
|
||||||
|
|
||||||
|
**POST /lookup/id**
|
||||||
|
- Batch lookup multiple IDs
|
||||||
|
- Submit a JSON object whose `ids` field is an array of IDs
|
||||||
|
- Example: `{"ids": ["ENSG00000139618", "ENSG00000157764"]}`
|
||||||
|
|
||||||
|
**GET /lookup/symbol/:species/:symbol**
|
||||||
|
- Look up gene by symbol
|
||||||
|
- Parameter: `expand` (include transcripts)
|
||||||
|
- Example: `/lookup/symbol/human/BRCA2?expand=1`
|
||||||
|
|
||||||
|
## 7. Mapping
|
||||||
|
|
||||||
|
Convert coordinates between assemblies, cDNA, CDS, and protein positions.
|
||||||
|
|
||||||
|
**GET /map/cdna/:id/:region**
|
||||||
|
- Map cDNA coordinates to genomic
|
||||||
|
- Example: `/map/cdna/ENST00000288602/100..300`
|
||||||
|
|
||||||
|
**GET /map/cds/:id/:region**
|
||||||
|
- Map CDS coordinates to genomic
|
||||||
|
- Example: `/map/cds/ENST00000288602/1..300`
|
||||||
|
|
||||||
|
**GET /map/translation/:id/:region**
|
||||||
|
- Map protein coordinates to genomic
|
||||||
|
- Example: `/map/translation/ENSP00000288602/1..100`
|
||||||
|
|
||||||
|
**GET /map/:species/:asm_one/:region/:asm_two**
|
||||||
|
- Map coordinates between assemblies
|
||||||
|
- Example: `/map/human/GRCh37/7:140453136..140453136/GRCh38`
|
||||||
|
|
||||||
|
**POST /map/:species/:asm_one/:asm_two**
|
||||||
|
- Batch assembly mapping
|
||||||
|
- Submit JSON array of regions
|
||||||
|
|
||||||
|
## 8. Ontologies and Taxonomy
|
||||||
|
|
||||||
|
Search biological ontologies and taxonomic classifications.
|
||||||
|
|
||||||
|
**GET /ontology/id/:id**
|
||||||
|
- Get ontology term information
|
||||||
|
- Example: `/ontology/id/GO:0005515`
|
||||||
|
|
||||||
|
**GET /ontology/name/:name**
|
||||||
|
- Search ontology by term name
|
||||||
|
- Example: `/ontology/name/protein%20binding`
|
||||||
|
|
||||||
|
**GET /taxonomy/classification/:id**
|
||||||
|
- Get taxonomic classification
|
||||||
|
- Example: `/taxonomy/classification/9606` (human)
|
||||||
|
|
||||||
|
**GET /taxonomy/id/:id**
|
||||||
|
- Get taxonomy information by ID
|
||||||
|
- Example: `/taxonomy/id/9606`
|
||||||
|
|
||||||
|
## 9. Overlap
|
||||||
|
|
||||||
|
Find genomic features overlapping a region.
|
||||||
|
|
||||||
|
**GET /overlap/id/:id**
|
||||||
|
- Get features overlapping a gene/transcript
|
||||||
|
- Parameters: `feature` (gene, transcript, cds, exon, repeat, etc.)
|
||||||
|
- Example: `/overlap/id/ENSG00000139618?feature=transcript`
|
||||||
|
|
||||||
|
**GET /overlap/region/:species/:region**
|
||||||
|
- Get all features in a genomic region
|
||||||
|
- Parameters: `feature` (gene, transcript, variation, regulatory, etc.)
|
||||||
|
- Example: `/overlap/region/human/7:140424943..140624564?feature=gene`
|
||||||
|
|
||||||
|
**GET /overlap/translation/:id**
|
||||||
|
- Get protein features
|
||||||
|
- Example: `/overlap/translation/ENSP00000288602`
|
||||||
|
|
||||||
|
## 10. Phenotype Annotations
|
||||||
|
|
||||||
|
Retrieve disease and trait associations.
|
||||||
|
|
||||||
|
**GET /phenotype/accession/:species/:accession**
|
||||||
|
- Get phenotypes by ontology accession
|
||||||
|
- Example: `/phenotype/accession/human/EFO:0003767`
|
||||||
|
|
||||||
|
**GET /phenotype/gene/:species/:gene**
|
||||||
|
- Get phenotype associations for a gene
|
||||||
|
- Example: `/phenotype/gene/human/ENSG00000139618`
|
||||||
|
|
||||||
|
**GET /phenotype/region/:species/:region**
|
||||||
|
- Get phenotypes in genomic region
|
||||||
|
- Example: `/phenotype/region/human/7:140424943-140624564`
|
||||||
|
|
||||||
|
**GET /phenotype/term/:species/:term**
|
||||||
|
- Search phenotypes by term
|
||||||
|
- Example: `/phenotype/term/human/cancer`
|
||||||
|
|
||||||
|
## 11. Regulation
|
||||||
|
|
||||||
|
Access regulatory feature and binding motif data.
|
||||||
|
|
||||||
|
**GET /regulatory/species/:species/microarray/:microarray/probe/:probe**
|
||||||
|
- Get microarray probe information
|
||||||
|
- Example: `/regulatory/species/human/microarray/HumanWG_6_V2/probe/ILMN_1773626`
|
||||||
|
|
||||||
|
**GET /species/:species/binding_matrix/:binding_matrix_id**
|
||||||
|
- Get transcription factor binding matrix
|
||||||
|
- Example: `/species/human/binding_matrix/ENSPFM0001`
|
||||||
|
|
||||||
|
## 12. Sequence
|
||||||
|
|
||||||
|
Retrieve genomic, transcript, and protein sequences.
|
||||||
|
|
||||||
|
**GET /sequence/id/:id**
|
||||||
|
- Get sequence by ID
|
||||||
|
- Parameters: `type` (genomic, cds, cdna, protein), `format` (json, fasta, text)
|
||||||
|
- Example: `/sequence/id/ENSG00000139618?type=genomic`
|
||||||
|
|
||||||
|
**POST /sequence/id**
|
||||||
|
- Batch sequence retrieval
|
||||||
|
- Example: `{"ids": ["ENSG00000139618", "ENSG00000157764"]}`
|
||||||
|
|
||||||
|
**GET /sequence/region/:species/:region**
|
||||||
|
- Get genomic sequence for region
|
||||||
|
- Parameters: `coord_system`, `format`
|
||||||
|
- Example: `/sequence/region/human/7:140424943..140624564?format=fasta`
|
||||||
|
|
||||||
|
**POST /sequence/region/:species**
|
||||||
|
- Batch region sequence retrieval
|
||||||
|
|
||||||
|
## 13. Transcript Haplotypes
|
||||||
|
|
||||||
|
Compute transcript haplotypes from phased genotypes.
|
||||||
|
|
||||||
|
**GET /transcript_haplotypes/:species/:id**
|
||||||
|
- Get transcript haplotypes
|
||||||
|
- Example: `/transcript_haplotypes/human/ENST00000288602`
|
||||||
|
|
||||||
|
## 14. Variant Effect Predictor (VEP)
|
||||||
|
|
||||||
|
Predict functional consequences of variants.
|
||||||
|
|
||||||
|
**GET /vep/:species/hgvs/:hgvs_notation**
|
||||||
|
- Predict variant effects using HGVS notation
|
||||||
|
- Parameters: numerous VEP options
|
||||||
|
- Example: `/vep/human/hgvs/ENST00000288602:c.803C>T`
|
||||||
|
|
||||||
|
**POST /vep/:species/hgvs**
|
||||||
|
- Batch VEP analysis with HGVS
|
||||||
|
- Example: `{"hgvs_notations": ["ENST00000288602:c.803C>T"]}`
|
||||||
|
|
||||||
|
**GET /vep/:species/id/:id**
|
||||||
|
- Predict effects for variant ID
|
||||||
|
- Example: `/vep/human/id/rs699`
|
||||||
|
|
||||||
|
**POST /vep/:species/id**
|
||||||
|
- Batch VEP by variant IDs
|
||||||
|
|
||||||
|
**GET /vep/:species/region/:region/:allele**
|
||||||
|
- Predict effects for region and allele
|
||||||
|
- Example: `/vep/human/region/7:140453136-140453136:1/T` (region is `chr:start-end:strand`, followed by the alternate allele)
|
||||||
|
|
||||||
|
**POST /vep/:species/region**
|
||||||
|
- Batch VEP by regions
|
||||||
|
|
||||||
|
## 15. Variation
|
||||||
|
|
||||||
|
Query genetic variation data and associated publications.
|
||||||
|
|
||||||
|
**GET /variation/:species/:id**
|
||||||
|
- Get variant information by ID
|
||||||
|
- Parameters: `pops` (include population frequencies), `genotypes`
|
||||||
|
- Example: `/variation/human/rs699?pops=1`
|
||||||
|
|
||||||
|
**POST /variation/:species**
|
||||||
|
- Batch variant queries
|
||||||
|
- Example: `{"ids": ["rs699", "rs6025"]}`
|
||||||
|
|
||||||
|
**GET /variation/:species/pmcid/:pmcid**
|
||||||
|
- Get variants from PubMed Central article
|
||||||
|
- Example: `/variation/human/pmcid/PMC5002951`
|
||||||
|
|
||||||
|
**GET /variation/:species/pmid/:pmid**
|
||||||
|
- Get variants from PubMed article
|
||||||
|
- Example: `/variation/human/pmid/26318936`
|
||||||
|
|
||||||
|
## 16. Variation GA4GH
|
||||||
|
|
||||||
|
Access genomic variation data using GA4GH standards.
|
||||||
|
|
||||||
|
**POST /ga4gh/beacon/query**
|
||||||
|
- Query beacon for variant presence
|
||||||
|
|
||||||
|
**GET /ga4gh/features/:id**
|
||||||
|
- Get feature by ID in GA4GH format
|
||||||
|
|
||||||
|
**POST /ga4gh/features/search**
|
||||||
|
- Search features using GA4GH protocol
|
||||||
|
|
||||||
|
**POST /ga4gh/variants/search**
|
||||||
|
- Search variants using GA4GH protocol
|
||||||
|
|
||||||
|
## Response Formats
|
||||||
|
|
||||||
|
Most endpoints support multiple response formats:
|
||||||
|
- **JSON** (default): `Content-Type: application/json`
|
||||||
|
- **FASTA**: For sequence data
|
||||||
|
- **XML**: Some endpoints support XML
|
||||||
|
- **Text**: Plain text output
|
||||||
|
|
||||||
|
Specify format using:
|
||||||
|
1. `Content-Type` header
|
||||||
|
2. URL parameter: `content-type=text/x-fasta`
|
||||||
|
3. File extension: `/sequence/id/ENSG00000139618.fasta`
|
||||||
|
|
||||||
|
## Common Parameters
|
||||||
|
|
||||||
|
Many endpoints share these parameters:
|
||||||
|
|
||||||
|
- **expand**: Include child objects (transcripts, proteins)
|
||||||
|
- **format**: Output format (json, xml, fasta)
|
||||||
|
- **db_type**: Database type (core, otherfeatures, variation)
|
||||||
|
- **object_type**: Type of object to return
|
||||||
|
- **species**: Species name (can be common or scientific)
|
||||||
|
|
||||||
|
## Error Codes
|
||||||
|
|
||||||
|
- **200**: Success
|
||||||
|
- **400**: Bad request (invalid parameters)
|
||||||
|
- **404**: Not found (ID doesn't exist)
|
||||||
|
- **429**: Rate limit exceeded
|
||||||
|
- **500**: Internal server error
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
1. **Use batch endpoints** for multiple queries (more efficient)
|
||||||
|
2. **Cache responses** to minimize API calls
|
||||||
|
3. **Check rate limit headers** in responses
|
||||||
|
4. **Handle 429 errors** by respecting `Retry-After` header
|
||||||
|
5. **Use appropriate content types** for sequence data
|
||||||
|
6. **Specify assembly** when querying older genome versions
|
||||||
|
7. **Enable expand parameter** when you need full object details
|
||||||
427
scientific-databases/ensembl-database/scripts/ensembl_query.py
Normal file
427
scientific-databases/ensembl-database/scripts/ensembl_query.py
Normal file
@@ -0,0 +1,427 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Ensembl REST API Query Script
|
||||||
|
Reusable functions for common Ensembl database queries with built-in rate limiting and error handling.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python ensembl_query.py --gene BRCA2 --species human
|
||||||
|
python ensembl_query.py --variant rs699 --species human
|
||||||
|
python ensembl_query.py --region "7:140424943-140624564" --species human
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
from typing import Dict, List, Optional, Any
|
||||||
|
|
||||||
|
|
||||||
|
class EnsemblAPIClient:
    """Client for querying the Ensembl REST API with rate limiting and error handling.

    Every public query method funnels through ``_make_request``, which
    throttles to ``rate_limit`` requests per one-second window, retries
    transient failures with exponential backoff, and waits for the server's
    ``Retry-After`` interval on HTTP 429.
    """

    _JSON_CONTENT_TYPE = "application/json"

    # Maps the ``format`` argument of the sequence methods to the Content-Type
    # header sent with the request. Formats not listed here fall back to JSON.
    _SEQUENCE_CONTENT_TYPES = {
        "fasta": "text/x-fasta",
        "text": "text/plain",
    }

    def __init__(self, server: str = "https://rest.ensembl.org", rate_limit: int = 15):
        """
        Initialize the Ensembl API client.

        Args:
            server: Base URL for the Ensembl REST API
            rate_limit: Maximum requests per second (default 15 for anonymous users)
        """
        self.server = server
        self.rate_limit = rate_limit
        # Requests issued in the current one-second window.
        self.request_count = 0
        # Start of the current window as a time.time() timestamp (0 = no window yet).
        self.last_request_time = 0

    @staticmethod
    def _parse_retry_after(value: Any, default: float = 1.0) -> float:
        """
        Convert a ``Retry-After`` header value to a delay in seconds.

        The header may carry a fractional number such as ``"40.0"``, which
        ``int()`` rejects, so the value is parsed as a float.

        Args:
            value: Raw header value, or None when the header is absent
            default: Delay to use when the value is missing or unusable

        Returns:
            A finite, non-negative number of seconds to wait
        """
        if value is None:
            return default
        try:
            delay = float(value)
        except (TypeError, ValueError):
            # e.g. an HTTP-date form of Retry-After; fall back rather than crash.
            return default
        # Rejects negatives, infinity, and NaN (every comparison with NaN is False).
        if not (0 <= delay < float("inf")):
            return default
        return delay

    def _rate_limit_check(self):
        """Enforce rate limiting before making requests.

        Uses a fixed one-second window: once ``rate_limit`` requests have been
        counted inside the window, sleep for the remainder of the window and
        start a new one. A window older than one second is simply reset.
        """
        current_time = time.time()
        time_since_last = current_time - self.last_request_time

        if time_since_last < 1.0:
            if self.request_count >= self.rate_limit:
                sleep_time = 1.0 - time_since_last
                time.sleep(sleep_time)
                self.request_count = 0
                self.last_request_time = time.time()
        else:
            self.request_count = 0
            self.last_request_time = current_time

    def _make_request(
        self,
        endpoint: str,
        params: Optional[Dict] = None,
        max_retries: int = 3,
        method: str = "GET",
        data: Optional[Dict] = None,
        content_type: str = "application/json"
    ) -> Any:
        """
        Make an API request with error handling and retries.

        Args:
            endpoint: API endpoint path
            params: Query parameters
            max_retries: Maximum number of retry attempts
            method: HTTP method (GET or POST)
            data: JSON data for POST requests
            content_type: Content-Type header to send; any value other than
                'application/json' makes the method return the raw text body

        Returns:
            Parsed JSON for JSON requests, otherwise the response text

        Raises:
            Exception: If the resource is not found (HTTP 404) or the request
                fails after max retries
        """
        headers = {"Content-Type": content_type}
        url = f"{self.server}{endpoint}"
        wants_json = content_type == self._JSON_CONTENT_TYPE

        for attempt in range(max_retries):
            self._rate_limit_check()
            self.request_count += 1
            is_last_attempt = attempt == max_retries - 1

            try:
                if method == "POST":
                    response = requests.post(url, headers=headers, params=params, json=data)
                else:
                    response = requests.get(url, headers=headers, params=params)

                if response.status_code == 200:
                    return response.json() if wants_json else response.text
                elif response.status_code == 429:
                    # Rate limited - wait and retry. There is no point in
                    # sleeping when no further attempt will be made.
                    if not is_last_attempt:
                        retry_after = self._parse_retry_after(
                            response.headers.get('Retry-After')
                        )
                        print(f"Rate limited. Waiting {retry_after} seconds...")
                        time.sleep(retry_after)
                elif response.status_code == 404:
                    # Not a RequestException, so this propagates immediately
                    # instead of being retried.
                    raise Exception(f"Resource not found: {endpoint}")
                else:
                    response.raise_for_status()
            except requests.exceptions.RequestException as e:
                if is_last_attempt:
                    raise Exception(
                        f"Request failed after {max_retries} attempts: {e}"
                    ) from e
                time.sleep(2 ** attempt)  # Exponential backoff

        raise Exception(f"Failed after {max_retries} attempts")

    def lookup_gene_by_symbol(self, species: str, symbol: str, expand: bool = True) -> Dict:
        """
        Look up gene information by symbol.

        Args:
            species: Species name (e.g., 'human', 'mouse')
            symbol: Gene symbol (e.g., 'BRCA2', 'TP53')
            expand: Include transcript information

        Returns:
            Gene information dictionary
        """
        endpoint = f"/lookup/symbol/{species}/{symbol}"
        params = {"expand": 1} if expand else {}
        return self._make_request(endpoint, params=params)

    def lookup_by_id(self, ensembl_id: str, expand: bool = False) -> Dict:
        """
        Look up object by Ensembl ID.

        Args:
            ensembl_id: Ensembl identifier (e.g., 'ENSG00000139618')
            expand: Include child objects

        Returns:
            Object information dictionary
        """
        endpoint = f"/lookup/id/{ensembl_id}"
        params = {"expand": 1} if expand else {}
        return self._make_request(endpoint, params=params)

    def get_sequence(
        self,
        ensembl_id: str,
        seq_type: str = "genomic",
        format: str = "json"
    ) -> Any:
        """
        Retrieve sequence by Ensembl ID.

        Args:
            ensembl_id: Ensembl identifier
            seq_type: Sequence type ('genomic', 'cds', 'cdna', 'protein')
            format: Output format ('json', 'fasta', 'text'); any other value
                is treated as 'json'

        Returns:
            Parsed JSON for 'json', otherwise the response body as a string

        Raises:
            Exception: If the request fails (see ``_make_request``)
        """
        endpoint = f"/sequence/id/{ensembl_id}"
        params = {"type": seq_type}
        content_type = self._SEQUENCE_CONTENT_TYPES.get(format, self._JSON_CONTENT_TYPE)
        # Route every format through _make_request so non-JSON output gets the
        # same throttling, retry, and HTTP error handling as JSON.
        return self._make_request(endpoint, params=params, content_type=content_type)

    def get_region_sequence(
        self,
        species: str,
        region: str,
        format: str = "json"
    ) -> Any:
        """
        Get genomic sequence for a region.

        Args:
            species: Species name
            region: Region string (e.g., '7:140424943-140624564')
            format: Output format ('json', 'fasta', 'text'); any other value
                is treated as 'json'

        Returns:
            Parsed JSON for 'json', otherwise the response body as a string

        Raises:
            Exception: If the request fails (see ``_make_request``)
        """
        endpoint = f"/sequence/region/{species}/{region}"
        content_type = self._SEQUENCE_CONTENT_TYPES.get(format, self._JSON_CONTENT_TYPE)
        return self._make_request(endpoint, content_type=content_type)

    def get_variant(self, species: str, variant_id: str, include_pops: bool = True) -> Dict:
        """
        Get variant information by ID.

        Args:
            species: Species name
            variant_id: Variant identifier (e.g., 'rs699')
            include_pops: Include population frequencies

        Returns:
            Variant information dictionary
        """
        endpoint = f"/variation/{species}/{variant_id}"
        params = {"pops": 1} if include_pops else {}
        return self._make_request(endpoint, params=params)

    def predict_variant_effect(
        self,
        species: str,
        hgvs_notation: str
    ) -> List[Dict]:
        """
        Predict variant consequences using VEP.

        Args:
            species: Species name
            hgvs_notation: HGVS notation (e.g., 'ENST00000288602:c.803C>T')

        Returns:
            List of predicted consequences
        """
        endpoint = f"/vep/{species}/hgvs/{hgvs_notation}"
        return self._make_request(endpoint)

    def find_orthologs(
        self,
        ensembl_id: str,
        target_species: Optional[str] = None
    ) -> Dict:
        """
        Find orthologs for a gene.

        Args:
            ensembl_id: Source gene Ensembl ID
            target_species: Target species (optional, returns all if not specified)

        Returns:
            Homology information dictionary
        """
        endpoint = f"/homology/id/{ensembl_id}"
        params = {}
        if target_species:
            params["target_species"] = target_species
        return self._make_request(endpoint, params=params)

    def get_region_features(
        self,
        species: str,
        region: str,
        feature_type: str = "gene"
    ) -> List[Dict]:
        """
        Get genomic features in a region.

        Args:
            species: Species name
            region: Region string (e.g., '7:140424943-140624564')
            feature_type: Feature type ('gene', 'transcript', 'variation', etc.)

        Returns:
            List of features
        """
        endpoint = f"/overlap/region/{species}/{region}"
        params = {"feature": feature_type}
        return self._make_request(endpoint, params=params)

    def get_species_info(self) -> List[Dict]:
        """
        Get information about all available species.

        Returns:
            List of species information dictionaries (empty if the response
            has no 'species' key)
        """
        endpoint = "/info/species"
        result = self._make_request(endpoint)
        return result.get("species", [])

    def get_assembly_info(self, species: str) -> Dict:
        """
        Get assembly information for a species.

        Args:
            species: Species name

        Returns:
            Assembly information dictionary
        """
        endpoint = f"/info/assembly/{species}"
        return self._make_request(endpoint)

    def map_coordinates(
        self,
        species: str,
        asm_from: str,
        region: str,
        asm_to: str
    ) -> Dict:
        """
        Map coordinates between genome assemblies.

        Args:
            species: Species name
            asm_from: Source assembly (e.g., 'GRCh37')
            region: Region string (e.g., '7:140453136-140453136')
            asm_to: Target assembly (e.g., 'GRCh38')

        Returns:
            Mapped coordinates
        """
        endpoint = f"/map/{species}/{asm_from}/{region}/{asm_to}"
        return self._make_request(endpoint)
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line interface for common Ensembl queries.

    Exactly one query is run per invocation, chosen in this priority order:
    --gene, --ensembl-id, --variant, --region, --orthologs. With none of
    them, the help text is printed.

    Args:
        argv: Argument list to parse; when None, argparse reads sys.argv[1:]

    Returns:
        Exit status: 0 on success (or when only help was shown), 1 if the
        query raised an error
    """
    import sys  # local import: sys is not among this module's top-level imports

    parser = argparse.ArgumentParser(
        description="Query the Ensembl database via REST API"
    )
    parser.add_argument("--gene", help="Gene symbol to look up")
    parser.add_argument("--ensembl-id", help="Ensembl ID to look up")
    parser.add_argument("--variant", help="Variant ID (e.g., rs699)")
    parser.add_argument("--region", help="Genomic region (chr:start-end)")
    parser.add_argument(
        "--species",
        default="human",
        help="Species name (default: human)"
    )
    parser.add_argument(
        "--orthologs",
        help="Find orthologs for gene (provide Ensembl ID)"
    )
    parser.add_argument(
        "--target-species",
        help="Target species for ortholog search"
    )
    parser.add_argument(
        "--sequence",
        action="store_true",
        help="Retrieve sequence (requires --gene or --ensembl-id or --region)"
    )
    parser.add_argument(
        "--format",
        choices=["json", "fasta"],
        default="json",
        help="Output format (default: json)"
    )
    parser.add_argument(
        "--assembly",
        # Default to the current assembly; a GRCh37 default would silently
        # send every query to the legacy grch37 server.
        default="GRCh38",
        help="Genome assembly; pass GRCh37 to use the grch37.rest.ensembl.org "
             "server (default: GRCh38)"
    )

    args = parser.parse_args(argv)

    # Nothing to query: show usage without creating a client.
    if not (args.gene or args.ensembl_id or args.variant or args.region or args.orthologs):
        parser.print_help()
        return 0

    # Select appropriate server
    server = "https://rest.ensembl.org"
    if args.assembly.lower() == "grch37":
        server = "https://grch37.rest.ensembl.org"

    client = EnsemblAPIClient(server=server)

    def show(payload):
        """Print a sequence result: pretty JSON for json, raw text otherwise."""
        print(json.dumps(payload, indent=2) if args.format == "json" else payload)

    try:
        if args.gene:
            print(f"Looking up gene: {args.gene}")
            result = client.lookup_gene_by_symbol(args.species, args.gene)
            if args.sequence:
                print(f"\nRetrieving sequence for {result['id']}...")
                show(client.get_sequence(result['id'], format=args.format))
            else:
                print(json.dumps(result, indent=2))

        elif args.ensembl_id:
            print(f"Looking up ID: {args.ensembl_id}")
            # Performed even with --sequence so an unknown ID fails here first.
            result = client.lookup_by_id(args.ensembl_id, expand=True)
            if args.sequence:
                print("\nRetrieving sequence...")
                show(client.get_sequence(args.ensembl_id, format=args.format))
            else:
                print(json.dumps(result, indent=2))

        elif args.variant:
            print(f"Looking up variant: {args.variant}")
            result = client.get_variant(args.species, args.variant)
            print(json.dumps(result, indent=2))

        elif args.region:
            if args.sequence:
                print(f"Retrieving sequence for region: {args.region}")
                show(client.get_region_sequence(
                    args.species,
                    args.region,
                    format=args.format
                ))
            else:
                print(f"Finding features in region: {args.region}")
                result = client.get_region_features(args.species, args.region)
                print(json.dumps(result, indent=2))

        else:
            # Only --orthologs can reach this point (see the guard above).
            print(f"Finding orthologs for: {args.orthologs}")
            result = client.find_orthologs(
                args.orthologs,
                target_species=args.target_species
            )
            print(json.dumps(result, indent=2))

    except Exception as e:
        # Top-level CLI boundary: report on stderr so stdout stays clean for
        # piping results, and signal failure through the exit status.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Raise SystemExit directly rather than calling ``exit()``: that helper is
    # added by the ``site`` module for interactive sessions and is not
    # guaranteed to exist (e.g. under ``python -S``).
    raise SystemExit(main())
|
||||||
251
scientific-databases/metabolomics-workbench-database/SKILL.md
Normal file
251
scientific-databases/metabolomics-workbench-database/SKILL.md
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
---
|
||||||
|
name: metabolomics-workbench-database
|
||||||
|
description: Toolkit for accessing and querying the Metabolomics Workbench, an NIH-sponsored repository containing 4,200+ metabolomics studies with standardized nomenclature (RefMet), study metadata, experimental results, and comprehensive metabolite databases. Use this skill when working with metabolomics data, querying metabolite structures, accessing study results, standardizing metabolite names, performing mass spectrometry searches, or retrieving gene/protein associations with metabolites.
|
||||||
|
---
|
||||||
|
|
||||||
|
# Metabolomics Workbench Database
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The Metabolomics Workbench is a comprehensive NIH Common Fund-sponsored platform hosted at UCSD that serves as the primary repository for metabolomics research data. It provides programmatic access to over 4,200 processed studies (3,790+ publicly available), standardized metabolite nomenclature through RefMet, and powerful search capabilities across multiple analytical platforms (GC-MS, LC-MS, NMR).
|
||||||
|
|
||||||
|
This skill enables efficient interaction with the Metabolomics Workbench REST API to query metabolite structures, access study data, standardize nomenclature, perform mass spectrometry searches, and retrieve gene/protein-metabolite associations.
|
||||||
|
|
||||||
|
## Core Capabilities
|
||||||
|
|
||||||
|
### 1. Querying Metabolite Structures and Data
|
||||||
|
|
||||||
|
Access comprehensive metabolite information including structures, identifiers, and cross-references to external databases.
|
||||||
|
|
||||||
|
**Key operations:**
|
||||||
|
- Retrieve compound data by various identifiers (PubChem CID, InChI Key, KEGG ID, HMDB ID, etc.)
|
||||||
|
- Download molecular structures as MOL files or PNG images
|
||||||
|
- Access standardized compound classifications
|
||||||
|
- Cross-reference between different metabolite databases
|
||||||
|
|
||||||
|
**Example queries:**
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Get compound information by PubChem CID
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/pubchem_cid/5281365/all/json')
|
||||||
|
|
||||||
|
# Download molecular structure as PNG
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/regno/11/png')
|
||||||
|
|
||||||
|
# Get compound name by registry number
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/regno/11/name/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Accessing Study Metadata and Experimental Results
|
||||||
|
|
||||||
|
Query metabolomics studies by various criteria and retrieve complete experimental datasets.
|
||||||
|
|
||||||
|
**Key operations:**
|
||||||
|
- Search studies by metabolite, institute, investigator, or title
|
||||||
|
- Access study summaries, experimental factors, and analysis details
|
||||||
|
- Retrieve complete experimental data in various formats
|
||||||
|
- Download mwTab format files for complete study information
|
||||||
|
- Query untargeted metabolomics data
|
||||||
|
|
||||||
|
**Example queries:**
|
||||||
|
```python
|
||||||
|
# List all available public studies
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST/available/json')
|
||||||
|
|
||||||
|
# Get study summary
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/summary/json')
|
||||||
|
|
||||||
|
# Retrieve experimental data
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/data/json')
|
||||||
|
|
||||||
|
# Find studies containing a specific metabolite
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/refmet_name/Tyrosine/summary/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Standardizing Metabolite Nomenclature with RefMet
|
||||||
|
|
||||||
|
Use the RefMet database to standardize metabolite names and access systematic classification across four structural resolution levels.
|
||||||
|
|
||||||
|
**Key operations:**
|
||||||
|
- Match common metabolite names to standardized RefMet names
|
||||||
|
- Query by chemical formula, exact mass, or InChI Key
|
||||||
|
- Access hierarchical classification (super class, main class, sub class)
|
||||||
|
- Retrieve all RefMet entries or filter by classification
|
||||||
|
|
||||||
|
**Example queries:**
|
||||||
|
```python
|
||||||
|
# Standardize a metabolite name
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/match/citrate/name/json')
|
||||||
|
|
||||||
|
# Query by molecular formula
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/formula/C12H24O2/all/json')
|
||||||
|
|
||||||
|
# Get all metabolites in a specific class
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/main_class/Fatty%20Acids/all/json')
|
||||||
|
|
||||||
|
# Retrieve complete RefMet database
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/all/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Performing Mass Spectrometry Searches
|
||||||
|
|
||||||
|
Search for compounds by mass-to-charge ratio (m/z) with specified ion adducts and tolerance levels.
|
||||||
|
|
||||||
|
**Key operations:**
|
||||||
|
- Search precursor ion masses across multiple databases (Metabolomics Workbench, LIPIDS, RefMet)
|
||||||
|
- Specify ion adduct types (M+H, M-H, M+Na, M+NH4, M+2H, etc.)
|
||||||
|
- Calculate exact masses for known metabolites with specific adducts
|
||||||
|
- Set mass tolerance for flexible matching
|
||||||
|
|
||||||
|
**Example queries:**
|
||||||
|
```python
|
||||||
|
# Search by m/z value with M+H adduct
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/moverz/MB/635.52/M+H/0.5/json')
|
||||||
|
|
||||||
|
# Calculate exact mass for a metabolite with specific adduct
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/moverz/exactmass/PC(34:1)/M+H/json')
|
||||||
|
|
||||||
|
# Search across RefMet database
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/moverz/REFMET/200.15/M-H/0.3/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Filtering Studies by Analytical and Biological Parameters
|
||||||
|
|
||||||
|
Use the MetStat context to find studies matching specific experimental conditions.
|
||||||
|
|
||||||
|
**Key operations:**
|
||||||
|
- Filter by analytical method (LCMS, GCMS, NMR)
|
||||||
|
- Specify ionization polarity (POSITIVE, NEGATIVE)
|
||||||
|
- Filter by chromatography type (HILIC, RP, GC)
|
||||||
|
- Target specific species, sample sources, or diseases
|
||||||
|
- Combine multiple filters using semicolon-delimited format
|
||||||
|
|
||||||
|
**Example queries:**
|
||||||
|
```python
|
||||||
|
# Find human blood studies on diabetes using LC-MS
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/metstat/LCMS;POSITIVE;HILIC;Human;Blood;Diabetes/json')
|
||||||
|
|
||||||
|
# Find all human blood studies containing tyrosine
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/metstat/;;;Human;Blood;;;Tyrosine/json')
|
||||||
|
|
||||||
|
# Filter by analytical method only
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/metstat/GCMS;;;;;;/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Accessing Gene and Protein Information
|
||||||
|
|
||||||
|
Retrieve gene and protein data associated with metabolic pathways and metabolite metabolism.
|
||||||
|
|
||||||
|
**Key operations:**
|
||||||
|
- Query genes by symbol, name, or ID
|
||||||
|
- Access protein sequences and annotations
|
||||||
|
- Cross-reference between gene IDs, RefSeq IDs, and UniProt IDs
|
||||||
|
- Retrieve gene-metabolite associations
|
||||||
|
|
||||||
|
**Example queries:**
|
||||||
|
```python
|
||||||
|
# Get gene information by symbol
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/gene/gene_symbol/ACACA/all/json')
|
||||||
|
|
||||||
|
# Retrieve protein data by UniProt ID
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/protein/uniprot_id/Q13085/all/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Workflows
|
||||||
|
|
||||||
|
### Workflow 1: Finding Studies for a Specific Metabolite
|
||||||
|
|
||||||
|
To find all studies containing measurements of a specific metabolite:
|
||||||
|
|
||||||
|
1. First standardize the metabolite name using RefMet:
|
||||||
|
```python
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/match/glucose/name/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Use the standardized name to search for studies:
|
||||||
|
```python
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/refmet_name/Glucose/summary/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Retrieve experimental data from specific studies:
|
||||||
|
```python
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/data/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
### Workflow 2: Identifying Compounds from MS Data
|
||||||
|
|
||||||
|
To identify potential compounds from mass spectrometry m/z values:
|
||||||
|
|
||||||
|
1. Perform m/z search with appropriate adduct and tolerance:
|
||||||
|
```python
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/moverz/MB/180.06/M+H/0.5/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Review candidate compounds from results
|
||||||
|
|
||||||
|
3. Retrieve detailed information for candidate compounds:
|
||||||
|
```python
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/regno/{regno}/all/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Download structures for confirmation:
|
||||||
|
```python
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/regno/{regno}/png')
|
||||||
|
```
|
||||||
|
|
||||||
|
### Workflow 3: Exploring Disease-Specific Metabolomics
|
||||||
|
|
||||||
|
To find metabolomics studies for a specific disease and analytical platform:
|
||||||
|
|
||||||
|
1. Use MetStat to filter studies:
|
||||||
|
```python
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/metstat/LCMS;POSITIVE;;Human;;Cancer/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Review study IDs from results
|
||||||
|
|
||||||
|
3. Access detailed study information:
|
||||||
|
```python
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST{ID}/summary/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Retrieve complete experimental data:
|
||||||
|
```python
|
||||||
|
response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST{ID}/data/json')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Output Formats
|
||||||
|
|
||||||
|
The API supports two primary output formats:
|
||||||
|
- **JSON** (default): Machine-readable format, ideal for programmatic access
|
||||||
|
- **TXT**: Human-readable tab-delimited text format
|
||||||
|
|
||||||
|
Specify format by appending `/json` or `/txt` to API URLs. When format is omitted, JSON is returned by default.
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
1. **Use RefMet for standardization**: Always standardize metabolite names through RefMet before searching studies to ensure consistent nomenclature
|
||||||
|
|
||||||
|
2. **Specify appropriate adducts**: When performing m/z searches, use the correct ion adduct type for your analytical method (e.g., M+H for positive mode ESI)
|
||||||
|
|
||||||
|
3. **Set reasonable tolerances**: Use appropriate mass tolerance values (typically 0.5 Da for low-resolution, 0.01 Da for high-resolution MS)
|
||||||
|
|
||||||
|
4. **Cache reference data**: Consider caching frequently used reference data (RefMet database, compound information) to minimize API calls
|
||||||
|
|
||||||
|
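5. **Handle large result sets**: Queries that match many records may return a differently shaped response (for example, a collection of records rather than a single object), so inspect the response structure before indexing into it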
|
||||||
|
|
||||||
|
6. **Validate identifiers**: Cross-reference metabolite identifiers across multiple databases when possible to ensure correct compound identification
|
||||||
|
|
||||||
|
## Resources
|
||||||
|
|
||||||
|
### references/
|
||||||
|
|
||||||
|
Detailed API reference documentation is available in `references/api_reference.md`, including:
|
||||||
|
- Complete REST API endpoint specifications
|
||||||
|
- All available contexts (compound, study, refmet, metstat, gene, protein, moverz)
|
||||||
|
- Input/output parameter details
|
||||||
|
- Ion adduct types for mass spectrometry
|
||||||
|
- Additional query examples
|
||||||
|
|
||||||
|
Load this reference file when detailed API specifications are needed or when working with less common endpoints.
|
||||||
@@ -0,0 +1,494 @@
|
|||||||
|
# Metabolomics Workbench REST API Reference
|
||||||
|
|
||||||
|
## Base URL
|
||||||
|
|
||||||
|
All API requests use the following base URL:
|
||||||
|
```
|
||||||
|
https://www.metabolomicsworkbench.org/rest/
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Structure
|
||||||
|
|
||||||
|
The REST API follows a consistent URL pattern:
|
||||||
|
```
|
||||||
|
/context/input_item/input_value/output_item/output_format
|
||||||
|
```
|
||||||
|
|
||||||
|
- **context**: The type of resource to access (study, compound, refmet, metstat, gene, protein, moverz)
|
||||||
|
- **input_item**: The type of identifier or search parameter
|
||||||
|
- **input_value**: The specific value to search for
|
||||||
|
- **output_item**: What data to return (e.g., all, name, summary)
|
||||||
|
- **output_format**: json or txt (json is default if omitted)
|
||||||
|
|
||||||
|
## Output Formats
|
||||||
|
|
||||||
|
- **json**: Machine-readable JSON format (default)
|
||||||
|
- **txt**: Tab-delimited text format for human readability
|
||||||
|
|
||||||
|
## Context 1: Compound
|
||||||
|
|
||||||
|
Retrieve metabolite structure and identification data.
|
||||||
|
|
||||||
|
### Input Items
|
||||||
|
|
||||||
|
| Input Item | Description | Example |
|
||||||
|
|------------|-------------|---------|
|
||||||
|
| `regno` | Metabolomics Workbench registry number | 11 |
|
||||||
|
| `pubchem_cid` | PubChem Compound ID | 5281365 |
|
||||||
|
| `inchi_key` | International Chemical Identifier Key | WQZGKKKJIJFFOK-GASJEMHNSA-N |
|
||||||
|
| `formula` | Molecular formula | C6H12O6 |
|
||||||
|
| `lm_id` | LIPID MAPS ID | LM... |
|
||||||
|
| `hmdb_id` | Human Metabolome Database ID | HMDB0000122 |
|
||||||
|
| `kegg_id` | KEGG Compound ID | C00031 |
|
||||||
|
|
||||||
|
### Output Items
|
||||||
|
|
||||||
|
| Output Item | Description |
|
||||||
|
|-------------|-------------|
|
||||||
|
| `all` | All available compound data |
|
||||||
|
| `classification` | Compound classification |
|
||||||
|
| `regno` | Registry number |
|
||||||
|
| `formula` | Molecular formula |
|
||||||
|
| `exactmass` | Exact mass |
|
||||||
|
| `inchi_key` | InChI Key |
|
||||||
|
| `name` | Common name |
|
||||||
|
| `sys_name` | Systematic name |
|
||||||
|
| `smiles` | SMILES notation |
|
||||||
|
| `lm_id` | LIPID MAPS ID |
|
||||||
|
| `pubchem_cid` | PubChem CID |
|
||||||
|
| `hmdb_id` | HMDB ID |
|
||||||
|
| `kegg_id` | KEGG ID |
|
||||||
|
| `chebi_id` | ChEBI ID |
|
||||||
|
| `metacyc_id` | MetaCyc ID |
|
||||||
|
| `molfile` | MOL file structure |
|
||||||
|
| `png` | PNG image of structure |
|
||||||
|
|
||||||
|
### Example Requests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Get all compound data by PubChem CID
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/compound/pubchem_cid/5281365/all/json"
|
||||||
|
|
||||||
|
# Get compound name by registry number
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/compound/regno/11/name/json"
|
||||||
|
|
||||||
|
# Download structure as PNG
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/compound/regno/11/png" -o structure.png
|
||||||
|
|
||||||
|
# Get compound by KEGG ID
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/compound/kegg_id/C00031/all/json"
|
||||||
|
|
||||||
|
# Get compound by molecular formula
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/compound/formula/C6H12O6/all/json"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Context 2: Study
|
||||||
|
|
||||||
|
Access metabolomics research study metadata and experimental results.
|
||||||
|
|
||||||
|
### Input Items
|
||||||
|
|
||||||
|
| Input Item | Description | Example |
|
||||||
|
|------------|-------------|---------|
|
||||||
|
| `study_id` | Study identifier | ST000001 |
|
||||||
|
| `analysis_id` | Analysis identifier | AN000001 |
|
||||||
|
| `study_title` | Keywords in study title | diabetes |
|
||||||
|
| `institute` | Institute name | UCSD |
|
||||||
|
| `last_name` | Investigator last name | Smith |
|
||||||
|
| `metabolite_id` | Metabolite registry number | 11 |
|
||||||
|
| `refmet_name` | RefMet standardized name | Glucose |
|
||||||
|
| `kegg_id` | KEGG compound ID | C00031 |
|
||||||
|
|
||||||
|
### Output Items
|
||||||
|
|
||||||
|
| Output Item | Description |
|
||||||
|
|-------------|-------------|
|
||||||
|
| `summary` | Study overview and metadata |
|
||||||
|
| `factors` | Experimental factors and design |
|
||||||
|
| `analysis` | Analysis methods and parameters |
|
||||||
|
| `metabolites` | List of measured metabolites |
|
||||||
|
| `data` | Complete experimental data |
|
||||||
|
| `mwtab` | Complete study in mwTab format |
|
||||||
|
| `number_of_metabolites` | Count of metabolites measured |
|
||||||
|
| `species` | Organism species |
|
||||||
|
| `disease` | Disease studied |
|
||||||
|
| `source` | Sample source/tissue type |
|
||||||
|
| `untarg_studies` | Untargeted study information |
|
||||||
|
| `untarg_factors` | Untargeted study factors |
|
||||||
|
| `untarg_data` | Untargeted experimental data |
|
||||||
|
| `datatable` | Formatted data table |
|
||||||
|
| `available` | List available studies (use with ST as input_value) |
|
||||||
|
|
||||||
|
### Example Requests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# List all publicly available studies
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST/available/json"
|
||||||
|
|
||||||
|
# Get study summary
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/summary/json"
|
||||||
|
|
||||||
|
# Get experimental data
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/data/json"
|
||||||
|
|
||||||
|
# Get study factors
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/factors/json"
|
||||||
|
|
||||||
|
# Find studies containing a specific metabolite
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/study/refmet_name/Tyrosine/summary/json"
|
||||||
|
|
||||||
|
# Search studies by investigator
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/study/last_name/Smith/summary/json"
|
||||||
|
|
||||||
|
# Download complete study in mwTab format
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/mwtab/txt"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Context 3: RefMet
|
||||||
|
|
||||||
|
Query the standardized metabolite nomenclature database with hierarchical classification.
|
||||||
|
|
||||||
|
### Input Items
|
||||||
|
|
||||||
|
| Input Item | Description | Example |
|
||||||
|
|------------|-------------|---------|
|
||||||
|
| `name` | Metabolite name | glucose |
|
||||||
|
| `inchi_key` | InChI Key | WQZGKKKJIJFFOK-GASJEMHNSA-N |
|
||||||
|
| `pubchem_cid` | PubChem CID | 5793 |
|
||||||
|
| `exactmass` | Exact mass | 180.0634 |
|
||||||
|
| `formula` | Molecular formula | C6H12O6 |
|
||||||
|
| `super_class` | Super class name | Organic compounds |
|
||||||
|
| `main_class` | Main class name | Carbohydrates |
|
||||||
|
| `sub_class` | Sub class name | Monosaccharides |
|
||||||
|
| `match` | Name matching/standardization | citrate |
|
||||||
|
| `refmet_id` | RefMet identifier | 12345 |
|
||||||
|
| `all` | Retrieve all RefMet entries | (no value needed) |
|
||||||
|
|
||||||
|
### Output Items
|
||||||
|
|
||||||
|
| Output Item | Description |
|
||||||
|
|-------------|-------------|
|
||||||
|
| `all` | All available RefMet data |
|
||||||
|
| `name` | Standardized RefMet name |
|
||||||
|
| `inchi_key` | InChI Key |
|
||||||
|
| `pubchem_cid` | PubChem CID |
|
||||||
|
| `exactmass` | Exact mass |
|
||||||
|
| `formula` | Molecular formula |
|
||||||
|
| `sys_name` | Systematic name |
|
||||||
|
| `super_class` | Super class classification |
|
||||||
|
| `main_class` | Main class classification |
|
||||||
|
| `sub_class` | Sub class classification |
|
||||||
|
| `refmet_id` | RefMet identifier |
|
||||||
|
|
||||||
|
### Example Requests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Standardize a metabolite name
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/refmet/match/citrate/name/json"
|
||||||
|
|
||||||
|
# Get all RefMet data for a metabolite
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/refmet/name/Glucose/all/json"
|
||||||
|
|
||||||
|
# Query by molecular formula
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/refmet/formula/C6H12O6/all/json"
|
||||||
|
|
||||||
|
# Get all metabolites in a main class
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/refmet/main_class/Fatty%20Acids/all/json"
|
||||||
|
|
||||||
|
# Query by exact mass
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/refmet/exactmass/180.0634/all/json"
|
||||||
|
|
||||||
|
# Download complete RefMet database
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/refmet/all/json"
|
||||||
|
```
|
||||||
|
|
||||||
|
### RefMet Classification Hierarchy
|
||||||
|
|
||||||
|
RefMet provides four-level structural resolution:
|
||||||
|
|
||||||
|
1. **Super Class**: Broadest categorization (e.g., "Organic compounds", "Lipids")
|
||||||
|
2. **Main Class**: Major biochemical categories (e.g., "Fatty Acids", "Carbohydrates")
|
||||||
|
3. **Sub Class**: More specific groupings (e.g., "Monosaccharides", "Amino acids")
|
||||||
|
4. **Individual Metabolite**: Specific compound with standardized name
|
||||||
|
|
||||||
|
## Context 4: MetStat
|
||||||
|
|
||||||
|
Filter studies by analytical and biological parameters using semicolon-delimited format.
|
||||||
|
|
||||||
|
### Format
|
||||||
|
|
||||||
|
```
|
||||||
|
/metstat/ANALYSIS_TYPE;POLARITY;CHROMATOGRAPHY;SPECIES;SAMPLE_SOURCE;DISEASE;KEGG_ID;REFMET_NAME
|
||||||
|
```
|
||||||
|
|
||||||
|
### Parameters
|
||||||
|
|
||||||
|
| Position | Parameter | Options |
|
||||||
|
|----------|-----------|---------|
|
||||||
|
| 1 | Analysis Type | LCMS, GCMS, NMR, MS, ICPMS |
|
||||||
|
| 2 | Polarity | POSITIVE, NEGATIVE |
|
||||||
|
| 3 | Chromatography | HILIC, RP (Reverse Phase), GC, IC |
|
||||||
|
| 4 | Species | Human, Mouse, Rat, etc. |
|
||||||
|
| 5 | Sample Source | Blood, Plasma, Serum, Urine, Liver, etc. |
|
||||||
|
| 6 | Disease | Diabetes, Cancer, Alzheimer, etc. |
|
||||||
|
| 7 | KEGG ID | C00031, etc. |
|
||||||
|
| 8 | RefMet Name | Glucose, Tyrosine, etc. |
|
||||||
|
|
||||||
|
**Note**: Use empty positions (consecutive semicolons) to skip parameters. All parameters are optional.
|
||||||
|
|
||||||
|
### Example Requests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Human blood diabetes studies with LC-MS HILIC positive mode
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/metstat/LCMS;POSITIVE;HILIC;Human;Blood;Diabetes/json"
|
||||||
|
|
||||||
|
# All human blood studies containing tyrosine
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/metstat/;;;Human;Blood;;;Tyrosine/json"
|
||||||
|
|
||||||
|
# All GC-MS studies regardless of other parameters
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/metstat/GCMS;;;;;;/json"
|
||||||
|
|
||||||
|
# Mouse liver studies
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/metstat/;;;Mouse;Liver;;/json"
|
||||||
|
|
||||||
|
# All studies measuring glucose
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/metstat/;;;;;;;Glucose/json"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Context 5: Moverz
|
||||||
|
|
||||||
|
Perform mass spectrometry precursor ion searches by m/z value.
|
||||||
|
|
||||||
|
### Format for m/z Search
|
||||||
|
|
||||||
|
```
|
||||||
|
/moverz/DATABASE/mass/adduct/tolerance/format
|
||||||
|
```
|
||||||
|
|
||||||
|
- **DATABASE**: MB (Metabolomics Workbench), LIPIDS, REFMET
|
||||||
|
- **mass**: m/z value (e.g., 635.52)
|
||||||
|
- **adduct**: Ion adduct type (see table below)
|
||||||
|
- **tolerance**: Mass tolerance in Daltons (e.g., 0.5)
|
||||||
|
- **format**: json or txt
|
||||||
|
|
||||||
|
### Format for Exact Mass Calculation
|
||||||
|
|
||||||
|
```
|
||||||
|
/moverz/exactmass/metabolite_name/adduct/format
|
||||||
|
```
|
||||||
|
|
||||||
|
### Ion Adduct Types
|
||||||
|
|
||||||
|
#### Positive Mode Adducts
|
||||||
|
|
||||||
|
| Adduct | Description | Example Use |
|
||||||
|
|--------|-------------|-------------|
|
||||||
|
| `M+H` | Protonated molecule | Most common positive ESI |
|
||||||
|
| `M+Na` | Sodium adduct | Common in ESI |
|
||||||
|
| `M+K` | Potassium adduct | Less common ESI |
|
||||||
|
| `M+NH4` | Ammonium adduct | Common with ammonium salts |
|
||||||
|
| `M+2H` | Doubly protonated | Multiply charged ions |
|
||||||
|
| `M+H-H2O` | Dehydrated protonated | Loss of water |
|
||||||
|
| `M+2Na-H` | Disodium minus hydrogen | Multiple sodium |
|
||||||
|
| `M+CH3OH+H` | Methanol adduct | Methanol in mobile phase |
|
||||||
|
| `M+ACN+H` | Acetonitrile adduct | ACN in mobile phase |
|
||||||
|
| `M+ACN+Na` | ACN + sodium | ACN and sodium |
|
||||||
|
|
||||||
|
#### Negative Mode Adducts
|
||||||
|
|
||||||
|
| Adduct | Description | Example Use |
|
||||||
|
|--------|-------------|-------------|
|
||||||
|
| `M-H` | Deprotonated molecule | Most common negative ESI |
|
||||||
|
| `M+Cl` | Chloride adduct | Chlorinated mobile phases |
|
||||||
|
| `M+FA-H` | Formate adduct | Formic acid in mobile phase |
|
||||||
|
| `M+HAc-H` | Acetate adduct | Acetic acid in mobile phase |
|
||||||
|
| `M-H-H2O` | Deprotonated minus water | Water loss |
|
||||||
|
| `M-2H` | Doubly deprotonated | Multiply charged ions |
|
||||||
|
| `M+Na-2H` | Sodium minus two protons | Mixed charge states |
|
||||||
|
|
||||||
|
#### Uncharged
|
||||||
|
|
||||||
|
| Adduct | Description | Example Use |
|
||||||
|
|--------|-------------|-------------|
|
||||||
|
| `M` | Uncharged molecule | Direct ionization methods |
|
||||||
|
|
||||||
|
### Example Requests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Search for compounds with m/z 635.52 (M+H) in MB database
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/moverz/MB/635.52/M+H/0.5/json"
|
||||||
|
|
||||||
|
# Search in RefMet with negative mode
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/moverz/REFMET/200.15/M-H/0.3/json"
|
||||||
|
|
||||||
|
# Search lipids database
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/moverz/LIPIDS/760.59/M+Na/0.5/json"
|
||||||
|
|
||||||
|
# Calculate exact mass for known metabolite
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/moverz/exactmass/PC(34:1)/M+H/json"
|
||||||
|
|
||||||
|
# High-resolution MS search (tight tolerance)
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/moverz/MB/180.0634/M+H/0.01/json"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Context 6: Gene
|
||||||
|
|
||||||
|
Access gene information from the Metabolome Gene/Protein (MGP) database.
|
||||||
|
|
||||||
|
### Input Items
|
||||||
|
|
||||||
|
| Input Item | Description | Example |
|
||||||
|
|------------|-------------|---------|
|
||||||
|
| `mgp_id` | MGP database ID | MGP001 |
|
||||||
|
| `gene_id` | NCBI Gene ID | 31 |
|
||||||
|
| `gene_name` | Full gene name | acetyl-CoA carboxylase |
|
||||||
|
| `gene_symbol` | Gene symbol | ACACA |
|
||||||
|
| `taxid` | Taxonomy ID | 9606 (human) |
|
||||||
|
|
||||||
|
### Output Items
|
||||||
|
|
||||||
|
| Output Item | Description |
|
||||||
|
|-------------|-------------|
|
||||||
|
| `all` | All gene information |
|
||||||
|
| `mgp_id` | MGP identifier |
|
||||||
|
| `gene_id` | NCBI Gene ID |
|
||||||
|
| `gene_name` | Full gene name |
|
||||||
|
| `gene_symbol` | Gene symbol |
|
||||||
|
| `gene_synonyms` | Alternative names |
|
||||||
|
| `alt_names` | Alternative nomenclature |
|
||||||
|
| `chromosome` | Chromosomal location |
|
||||||
|
| `map_location` | Genetic map position |
|
||||||
|
| `summary` | Gene description |
|
||||||
|
| `taxid` | Taxonomy ID |
|
||||||
|
| `species` | Species short name |
|
||||||
|
| `species_long` | Full species name |
|
||||||
|
|
||||||
|
### Example Requests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Get gene information by symbol
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/gene/gene_symbol/ACACA/all/json"
|
||||||
|
|
||||||
|
# Get gene by NCBI Gene ID
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/gene/gene_id/31/all/json"
|
||||||
|
|
||||||
|
# Search by gene name
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/gene/gene_name/carboxylase/summary/json"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Context 7: Protein
|
||||||
|
|
||||||
|
Retrieve protein sequence and annotation data.
|
||||||
|
|
||||||
|
### Input Items
|
||||||
|
|
||||||
|
| Input Item | Description | Example |
|
||||||
|
|------------|-------------|---------|
|
||||||
|
| `mgp_id` | MGP database ID | MGP001 |
|
||||||
|
| `gene_id` | NCBI Gene ID | 31 |
|
||||||
|
| `gene_name` | Gene name | acetyl-CoA carboxylase |
|
||||||
|
| `gene_symbol` | Gene symbol | ACACA |
|
||||||
|
| `taxid` | Taxonomy ID | 9606 |
|
||||||
|
| `mrna_id` | mRNA identifier | NM_001093.3 |
|
||||||
|
| `refseq_id` | RefSeq ID | NP_001084 |
|
||||||
|
| `protein_gi` | GenInfo Identifier | 4557237 |
|
||||||
|
| `uniprot_id` | UniProt ID | Q13085 |
|
||||||
|
| `protein_entry` | Protein entry name | ACACA_HUMAN |
|
||||||
|
| `protein_name` | Protein name | Acetyl-CoA carboxylase |
|
||||||
|
|
||||||
|
### Output Items
|
||||||
|
|
||||||
|
| Output Item | Description |
|
||||||
|
|-------------|-------------|
|
||||||
|
| `all` | All protein information |
|
||||||
|
| `mgp_id` | MGP identifier |
|
||||||
|
| `gene_id` | NCBI Gene ID |
|
||||||
|
| `gene_name` | Gene name |
|
||||||
|
| `gene_symbol` | Gene symbol |
|
||||||
|
| `taxid` | Taxonomy ID |
|
||||||
|
| `species` | Species short name |
|
||||||
|
| `species_long` | Full species name |
|
||||||
|
| `mrna_id` | mRNA identifier |
|
||||||
|
| `refseq_id` | RefSeq protein ID |
|
||||||
|
| `protein_gi` | GenInfo Identifier |
|
||||||
|
| `uniprot_id` | UniProt accession |
|
||||||
|
| `protein_entry` | Protein entry name |
|
||||||
|
| `protein_name` | Full protein name |
|
||||||
|
| `seqlength` | Sequence length |
|
||||||
|
| `seq` | Amino acid sequence |
|
||||||
|
| `is_identical_to` | Identical sequences |
|
||||||
|
|
||||||
|
### Example Requests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Get protein information by UniProt ID
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/protein/uniprot_id/Q13085/all/json"
|
||||||
|
|
||||||
|
# Get protein by gene symbol
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/protein/gene_symbol/ACACA/all/json"
|
||||||
|
|
||||||
|
# Get protein sequence
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/protein/uniprot_id/Q13085/seq/json"
|
||||||
|
|
||||||
|
# Search by RefSeq ID
|
||||||
|
curl "https://www.metabolomicsworkbench.org/rest/protein/refseq_id/NP_001084/all/json"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
The API returns appropriate HTTP status codes:
|
||||||
|
|
||||||
|
- **200 OK**: Successful request
|
||||||
|
- **400 Bad Request**: Invalid parameters or malformed request
|
||||||
|
- **404 Not Found**: Resource not found
|
||||||
|
- **500 Internal Server Error**: Server-side error
|
||||||
|
|
||||||
|
When no results are found, the API typically returns an empty array or object rather than an error code.
|
||||||
|
|
||||||
|
## Rate Limiting
|
||||||
|
|
||||||
|
As of 2025, the Metabolomics Workbench REST API does not enforce strict rate limits for reasonable use. However, best practices include:
|
||||||
|
|
||||||
|
- Implementing delays between bulk requests
|
||||||
|
- Caching frequently accessed reference data
|
||||||
|
- Using appropriate batch sizes for large-scale queries
|
||||||
|
|
||||||
|
## Additional Resources
|
||||||
|
|
||||||
|
- **Interactive REST URL Creator**: https://www.metabolomicsworkbench.org/tools/mw_rest.php
|
||||||
|
- **Official API Specification**: https://www.metabolomicsworkbench.org/tools/MWRestAPIv1.1.pdf
|
||||||
|
- **Python Library**: mwtab package for Python users
|
||||||
|
- **R Package**: metabolomicsWorkbenchR (Bioconductor)
|
||||||
|
- **Julia Package**: MetabolomicsWorkbenchAPI.jl
|
||||||
|
|
||||||
|
## Python Example: Complete Workflow
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
# 1. Standardize metabolite name using RefMet
|
||||||
|
metabolite = "citrate"
|
||||||
|
response = requests.get(f'https://www.metabolomicsworkbench.org/rest/refmet/match/{metabolite}/name/json')
|
||||||
|
standardized_name = response.json()['name']
|
||||||
|
|
||||||
|
# 2. Search for studies containing this metabolite
|
||||||
|
response = requests.get(f'https://www.metabolomicsworkbench.org/rest/study/refmet_name/{standardized_name}/summary/json')
|
||||||
|
studies = response.json()
|
||||||
|
|
||||||
|
# 3. Get detailed data from a specific study
|
||||||
|
study_id = studies[0]['study_id']
|
||||||
|
response = requests.get(f'https://www.metabolomicsworkbench.org/rest/study/study_id/{study_id}/data/json')
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
# 4. Perform m/z search for compound identification
|
||||||
|
mz_value = 180.06
|
||||||
|
response = requests.get(f'https://www.metabolomicsworkbench.org/rest/moverz/MB/{mz_value}/M+H/0.5/json')
|
||||||
|
matches = response.json()
|
||||||
|
|
||||||
|
# 5. Get compound structure
|
||||||
|
regno = matches[0]['regno']
|
||||||
|
response = requests.get(f'https://www.metabolomicsworkbench.org/rest/compound/regno/{regno}/png')
|
||||||
|
with open('structure.png', 'wb') as f:
|
||||||
|
f.write(response.content)
|
||||||
|
```
|
||||||
@@ -0,0 +1,261 @@
|
|||||||
|
---
|
||||||
|
name: reactome-database
|
||||||
|
description: Work with Reactome pathway database for analyzing biological pathways, performing pathway enrichment analysis, querying molecular interactions, and analyzing gene expression data. This skill should be used when working with biological pathways, performing overrepresentation analysis, mapping gene identifiers to pathways, analyzing gene expression datasets, or exploring disease-related pathways. Supports both direct REST API access and the reactome2py Python package.
|
||||||
|
---
|
||||||
|
|
||||||
|
# Reactome Database
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This skill enables interaction with Reactome, a free, open-source, curated and peer-reviewed pathway database. Reactome provides comprehensive biological pathway data for research, genome analysis, modeling, and systems biology. The database contains thousands of human pathways, reactions, proteins, small molecules, and drugs, all supported by extensive literature references.
|
||||||
|
|
||||||
|
## Core Capabilities
|
||||||
|
|
||||||
|
Reactome provides two main API services and a Python client library:
|
||||||
|
|
||||||
|
### 1. Content Service - Data Retrieval
|
||||||
|
|
||||||
|
Query and retrieve biological pathway data, molecular interactions, and entity information.
|
||||||
|
|
||||||
|
**Common operations:**
|
||||||
|
- Retrieve pathway information and hierarchies
|
||||||
|
- Query specific entities (proteins, reactions, complexes)
|
||||||
|
- Get participating molecules in pathways
|
||||||
|
- Access database version and metadata
|
||||||
|
- Explore pathway compartments and locations
|
||||||
|
|
||||||
|
**API Base URL:** `https://reactome.org/ContentService`
|
||||||
|
|
||||||
|
### 2. Analysis Service - Pathway Analysis
|
||||||
|
|
||||||
|
Perform computational analysis on gene lists and expression data.
|
||||||
|
|
||||||
|
**Analysis types:**
|
||||||
|
- **Overrepresentation Analysis**: Identify statistically significant pathways from gene/protein lists
|
||||||
|
- **Expression Data Analysis**: Analyze gene expression datasets to find relevant pathways
|
||||||
|
- **Species Comparison**: Compare pathway data across different organisms
|
||||||
|
|
||||||
|
**API Base URL:** `https://reactome.org/AnalysisService`
|
||||||
|
|
||||||
|
### 3. reactome2py Python Package
|
||||||
|
|
||||||
|
Python client library that wraps Reactome API calls for easier programmatic access.
|
||||||
|
|
||||||
|
**Installation:**
|
||||||
|
```bash
|
||||||
|
pip install reactome2py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** The reactome2py package (version 3.0.0, released January 2021) is functional but not actively maintained. For the most up-to-date functionality, consider using direct REST API calls.
|
||||||
|
|
||||||
|
## Querying Pathway Data
|
||||||
|
|
||||||
|
### Using Content Service REST API
|
||||||
|
|
||||||
|
The Content Service is a REST API that returns data in JSON or plain-text format.
|
||||||
|
|
||||||
|
**Get database version:**
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
response = requests.get("https://reactome.org/ContentService/data/database/version")
|
||||||
|
version = response.text
|
||||||
|
print(f"Reactome version: {version}")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Query a specific entity:**
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
entity_id = "R-HSA-69278" # Example pathway ID
|
||||||
|
response = requests.get(f"https://reactome.org/ContentService/data/query/{entity_id}")
|
||||||
|
data = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Get participating molecules in a pathway:**
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
event_id = "R-HSA-69278"
|
||||||
|
response = requests.get(
|
||||||
|
f"https://reactome.org/ContentService/data/participants/{event_id}/participatingPhysicalEntities"
|
||||||
|
)
|
||||||
|
molecules = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Using reactome2py Package
|
||||||
|
|
||||||
|
```python
|
||||||
|
import reactome2py
|
||||||
|
from reactome2py import content
|
||||||
|
|
||||||
|
# Query pathway information
|
||||||
|
pathway_info = content.query_by_id("R-HSA-69278")
|
||||||
|
|
||||||
|
# Get database version
|
||||||
|
version = content.get_database_version()
|
||||||
|
```
|
||||||
|
|
||||||
|
**For detailed API endpoints and parameters**, refer to `references/api_reference.md` in this skill.
|
||||||
|
|
||||||
|
## Performing Pathway Analysis
|
||||||
|
|
||||||
|
### Overrepresentation Analysis
|
||||||
|
|
||||||
|
Submit a list of gene/protein identifiers to find enriched pathways.
|
||||||
|
|
||||||
|
**Using REST API:**
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Prepare identifier list
|
||||||
|
identifiers = ["TP53", "BRCA1", "EGFR", "MYC"]
|
||||||
|
data = "\n".join(identifiers)
|
||||||
|
|
||||||
|
# Submit analysis
|
||||||
|
response = requests.post(
|
||||||
|
"https://reactome.org/AnalysisService/identifiers/",
|
||||||
|
headers={"Content-Type": "text/plain"},
|
||||||
|
data=data
|
||||||
|
)
|
||||||
|
|
||||||
|
result = response.json()
|
||||||
|
token = result["summary"]["token"] # Save token to retrieve results later
|
||||||
|
|
||||||
|
# Access pathways
|
||||||
|
for pathway in result["pathways"]:
|
||||||
|
print(f"{pathway['stId']}: {pathway['name']} (p-value: {pathway['entities']['pValue']})")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Retrieve analysis by token:**
|
||||||
|
```python
|
||||||
|
# Token is valid for 7 days
|
||||||
|
response = requests.get(f"https://reactome.org/AnalysisService/token/{token}")
|
||||||
|
results = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Expression Data Analysis
|
||||||
|
|
||||||
|
Analyze gene expression datasets with quantitative values.
|
||||||
|
|
||||||
|
**Input format (TSV with header starting with #):**
|
||||||
|
```
|
||||||
|
#Gene Sample1 Sample2 Sample3
|
||||||
|
TP53 2.5 3.1 2.8
|
||||||
|
BRCA1 1.2 1.5 1.3
|
||||||
|
EGFR 4.5 4.2 4.8
|
||||||
|
```
|
||||||
|
|
||||||
|
**Submit expression data:**
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Read TSV file
|
||||||
|
with open("expression_data.tsv", "r") as f:
|
||||||
|
data = f.read()
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
"https://reactome.org/AnalysisService/identifiers/",
|
||||||
|
headers={"Content-Type": "text/plain"},
|
||||||
|
data=data
|
||||||
|
)
|
||||||
|
|
||||||
|
result = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Species Projection
|
||||||
|
|
||||||
|
Use the `/projection/` endpoint to project all submitted identifiers onto their human equivalents, so that results are reported for human pathways only:
|
||||||
|
|
||||||
|
```python
|
||||||
|
response = requests.post(
|
||||||
|
"https://reactome.org/AnalysisService/identifiers/projection/",
|
||||||
|
headers={"Content-Type": "text/plain"},
|
||||||
|
data=data
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Visualizing Results
|
||||||
|
|
||||||
|
Analysis results can be visualized in the Reactome Pathway Browser by constructing URLs with the analysis token:
|
||||||
|
|
||||||
|
```python
|
||||||
|
token = result["summary"]["token"]
|
||||||
|
pathway_id = "R-HSA-69278"
|
||||||
|
url = f"https://reactome.org/PathwayBrowser/#{pathway_id}&DTAB=AN&ANALYSIS={token}"
|
||||||
|
print(f"View results: {url}")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Working with Analysis Tokens
|
||||||
|
|
||||||
|
- Analysis tokens are valid for **7 days**
|
||||||
|
- Tokens allow retrieval of previously computed results without re-submission
|
||||||
|
- Store tokens to access results across sessions
|
||||||
|
- Use `GET /token/{TOKEN}` endpoint to retrieve results
|
||||||
|
|
||||||
|
## Data Formats and Identifiers
|
||||||
|
|
||||||
|
### Supported Identifier Types
|
||||||
|
|
||||||
|
Reactome accepts various identifier formats:
|
||||||
|
- UniProt accessions (e.g., P04637)
|
||||||
|
- Gene symbols (e.g., TP53)
|
||||||
|
- Ensembl IDs (e.g., ENSG00000141510)
|
||||||
|
- EntrezGene IDs (e.g., 7157)
|
||||||
|
- ChEBI IDs for small molecules
|
||||||
|
|
||||||
|
The system automatically detects identifier types.
|
||||||
|
|
||||||
|
### Input Format Requirements
|
||||||
|
|
||||||
|
**For overrepresentation analysis:**
|
||||||
|
- Plain text list of identifiers (one per line)
|
||||||
|
- OR single column in TSV format
|
||||||
|
|
||||||
|
**For expression analysis:**
|
||||||
|
- TSV format with mandatory header row starting with "#"
|
||||||
|
- Column 1: identifiers
|
||||||
|
- Columns 2+: numeric expression values
|
||||||
|
- Use period (.) as decimal separator
|
||||||
|
|
||||||
|
### Output Format
|
||||||
|
|
||||||
|
Analysis Service responses are JSON objects containing:
|
||||||
|
- `pathways`: Array of enriched pathways with statistical metrics
|
||||||
|
- `summary`: Analysis metadata and token
|
||||||
|
- `entities`: Matched and unmapped identifiers
|
||||||
|
- Statistical values: pValue, FDR (false discovery rate)
|
||||||
|
|
||||||
|
## Helper Scripts
|
||||||
|
|
||||||
|
This skill includes `scripts/reactome_query.py`, a helper script for common Reactome operations:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Query pathway information
|
||||||
|
python scripts/reactome_query.py query R-HSA-69278
|
||||||
|
|
||||||
|
# Perform overrepresentation analysis
|
||||||
|
python scripts/reactome_query.py analyze gene_list.txt
|
||||||
|
|
||||||
|
# Get database version
|
||||||
|
python scripts/reactome_query.py version
|
||||||
|
```
|
||||||
|
|
||||||
|
## Additional Resources
|
||||||
|
|
||||||
|
- **API Documentation**: https://reactome.org/dev
|
||||||
|
- **User Guide**: https://reactome.org/userguide
|
||||||
|
- **Documentation Portal**: https://reactome.org/documentation
|
||||||
|
- **Data Downloads**: https://reactome.org/download-data
|
||||||
|
- **reactome2py Docs**: https://reactome.github.io/reactome2py/
|
||||||
|
|
||||||
|
For comprehensive API endpoint documentation, see `references/api_reference.md` in this skill.
|
||||||
|
|
||||||
|
## Current Database Statistics (Version 94, September 2025)
|
||||||
|
|
||||||
|
- 2,825 human pathways
|
||||||
|
- 16,002 reactions
|
||||||
|
- 11,630 proteins
|
||||||
|
- 2,176 small molecules
|
||||||
|
- 1,070 drugs
|
||||||
|
- 41,373 literature references
|
||||||
@@ -0,0 +1,465 @@
|
|||||||
|
# Reactome API Reference
|
||||||
|
|
||||||
|
This document provides comprehensive reference information for Reactome's REST APIs.
|
||||||
|
|
||||||
|
## Base URLs
|
||||||
|
|
||||||
|
- **Content Service**: `https://reactome.org/ContentService`
|
||||||
|
- **Analysis Service**: `https://reactome.org/AnalysisService`
|
||||||
|
|
||||||
|
## Content Service API
|
||||||
|
|
||||||
|
The Content Service provides access to Reactome's curated pathway data through REST endpoints.
|
||||||
|
|
||||||
|
### Database Information
|
||||||
|
|
||||||
|
#### Get Database Version
|
||||||
|
```
|
||||||
|
GET /data/database/version
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:** Plain text containing the database version number
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
response = requests.get("https://reactome.org/ContentService/data/database/version")
|
||||||
|
print(response.text) # e.g., "94"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Get Database Name
|
||||||
|
```
|
||||||
|
GET /data/database/name
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:** Plain text containing the database name
|
||||||
|
|
||||||
|
### Entity Queries
|
||||||
|
|
||||||
|
#### Query Entity by ID
|
||||||
|
```
|
||||||
|
GET /data/query/{id}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `id` (path): Stable identifier or database ID (e.g., "R-HSA-69278")
|
||||||
|
|
||||||
|
**Response:** JSON object containing full entity information including:
|
||||||
|
- `stId`: Stable identifier
|
||||||
|
- `displayName`: Human-readable name
|
||||||
|
- `schemaClass`: Entity type (Pathway, Reaction, Complex, etc.)
|
||||||
|
- `species`: Array of species information
|
||||||
|
- Additional type-specific fields
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
response = requests.get("https://reactome.org/ContentService/data/query/R-HSA-69278")
|
||||||
|
pathway = response.json()
|
||||||
|
print(f"Pathway: {pathway['displayName']}")
|
||||||
|
print(f"Species: {pathway['species'][0]['displayName']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Query Entity Attribute
|
||||||
|
```
|
||||||
|
GET /data/query/{id}/{attribute}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `id` (path): Entity identifier
|
||||||
|
- `attribute` (path): Specific attribute name (e.g., "displayName", "compartment")
|
||||||
|
|
||||||
|
**Response:** JSON or plain text depending on attribute type
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
response = requests.get("https://reactome.org/ContentService/data/query/R-HSA-69278/displayName")
|
||||||
|
name = response.text
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pathway Queries
|
||||||
|
|
||||||
|
#### Get Pathway Entities
|
||||||
|
```
|
||||||
|
GET /data/participants/{id}/participatingPhysicalEntities
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `id` (path): Pathway or reaction stable identifier
|
||||||
|
|
||||||
|
**Response:** JSON array of physical entities (proteins, complexes, small molecules) participating in the pathway
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
response = requests.get(
|
||||||
|
"https://reactome.org/ContentService/data/participants/R-HSA-69278/participatingPhysicalEntities"
|
||||||
|
)
|
||||||
|
entities = response.json()
|
||||||
|
for entity in entities:
|
||||||
|
print(f"{entity['stId']}: {entity['displayName']} ({entity['schemaClass']})")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Get Contained Events
|
||||||
|
```
|
||||||
|
GET /data/pathway/{id}/containedEvents
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `id` (path): Pathway stable identifier
|
||||||
|
|
||||||
|
**Response:** JSON array of events (reactions, subpathways) contained within the pathway
|
||||||
|
|
||||||
|
### Search Queries
|
||||||
|
|
||||||
|
#### Search by Name
|
||||||
|
```
|
||||||
|
GET /data/query?name={query}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `name` (query): Search term
|
||||||
|
|
||||||
|
**Response:** JSON array of matching entities
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
response = requests.get(
|
||||||
|
"https://reactome.org/ContentService/data/query",
|
||||||
|
params={"name": "glycolysis"}
|
||||||
|
)
|
||||||
|
results = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Analysis Service API
|
||||||
|
|
||||||
|
The Analysis Service performs pathway enrichment and expression analysis.
|
||||||
|
|
||||||
|
### Submit Analysis
|
||||||
|
|
||||||
|
#### Submit Identifiers (POST)
|
||||||
|
```
|
||||||
|
POST /identifiers/
|
||||||
|
POST /identifiers/projection/ # Map to human pathways only
|
||||||
|
```
|
||||||
|
|
||||||
|
**Headers:**
|
||||||
|
- `Content-Type: text/plain`
|
||||||
|
|
||||||
|
**Body:**
|
||||||
|
- For overrepresentation: Plain text list of identifiers (one per line)
|
||||||
|
- For expression analysis: TSV format with header starting with "#"
|
||||||
|
|
||||||
|
**Expression data format:**
|
||||||
|
```
|
||||||
|
#Gene Sample1 Sample2 Sample3
|
||||||
|
TP53 2.5 3.1 2.8
|
||||||
|
BRCA1 1.2 1.5 1.3
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:** JSON object containing:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"summary": {
|
||||||
|
"token": "MzUxODM3NTQzMDAwMDA1ODI4MA==",
|
||||||
|
"type": "OVERREPRESENTATION",
|
||||||
|
"species": "9606",
|
||||||
|
"sampleName": null,
|
||||||
|
"fileName": null,
|
||||||
|
"text": true
|
||||||
|
},
|
||||||
|
"pathways": [
|
||||||
|
{
|
||||||
|
"stId": "R-HSA-69278",
|
||||||
|
"name": "Cell Cycle, Mitotic",
|
||||||
|
"species": {
|
||||||
|
"name": "Homo sapiens",
|
||||||
|
"taxId": "9606"
|
||||||
|
},
|
||||||
|
"entities": {
|
||||||
|
"found": 15,
|
||||||
|
"total": 450,
|
||||||
|
"pValue": 0.0000234,
|
||||||
|
"fdr": 0.00156
|
||||||
|
},
|
||||||
|
"reactions": {
|
||||||
|
"found": 12,
|
||||||
|
"total": 342
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"resourceSummary": [
|
||||||
|
{
|
||||||
|
"resource": "TOTAL",
|
||||||
|
"pathways": 25
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Overrepresentation analysis
|
||||||
|
identifiers = ["TP53", "BRCA1", "EGFR", "MYC", "CDK1"]
|
||||||
|
data = "\n".join(identifiers)
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
"https://reactome.org/AnalysisService/identifiers/",
|
||||||
|
headers={"Content-Type": "text/plain"},
|
||||||
|
data=data
|
||||||
|
)
|
||||||
|
|
||||||
|
result = response.json()
|
||||||
|
token = result["summary"]["token"]
|
||||||
|
|
||||||
|
# Process pathways
|
||||||
|
for pathway in result["pathways"]:
|
||||||
|
print(f"Pathway: {pathway['name']}")
|
||||||
|
print(f" Found: {pathway['entities']['found']}/{pathway['entities']['total']}")
|
||||||
|
print(f" p-value: {pathway['entities']['pValue']:.6f}")
|
||||||
|
print(f" FDR: {pathway['entities']['fdr']:.6f}")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Submit File (Form Upload)
|
||||||
|
```
|
||||||
|
POST /identifiers/form/
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content-Type:** `multipart/form-data`
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `file`: File containing identifiers or expression data
|
||||||
|
|
||||||
|
#### Submit URL
|
||||||
|
```
|
||||||
|
POST /identifiers/url/
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `url`: URL pointing to data file
|
||||||
|
|
||||||
|
### Retrieve Analysis Results
|
||||||
|
|
||||||
|
#### Get Results by Token
|
||||||
|
```
|
||||||
|
GET /token/{token}
|
||||||
|
GET /token/{token}/projection/ # With species projection
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `token` (path): Analysis token returned from submission
|
||||||
|
|
||||||
|
**Response:** Same structure as initial analysis response
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
token = "MzUxODM3NTQzMDAwMDA1ODI4MA=="
|
||||||
|
response = requests.get(f"https://reactome.org/AnalysisService/token/{token}")
|
||||||
|
results = response.json()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** Tokens are valid for 7 days
|
||||||
|
|
||||||
|
#### Filter Results
|
||||||
|
```
|
||||||
|
GET /token/{token}/filter/pathways?resource={resource}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `token` (path): Analysis token
|
||||||
|
- `resource` (query): Resource filter (e.g., "TOTAL", "UNIPROT", "ENSEMBL")
|
||||||
|
|
||||||
|
### Download Results
|
||||||
|
|
||||||
|
#### Download as CSV
|
||||||
|
```
|
||||||
|
GET /download/{token}/pathways/{resource}/result.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Download Mapping
|
||||||
|
```
|
||||||
|
GET /download/{token}/entities/found/{resource}/mapping.tsv
|
||||||
|
```
|
||||||
|
|
||||||
|
## Supported Identifiers
|
||||||
|
|
||||||
|
Reactome automatically detects and processes various identifier types:
|
||||||
|
|
||||||
|
### Proteins and Genes
|
||||||
|
- **UniProt**: P04637
|
||||||
|
- **Gene Symbol**: TP53
|
||||||
|
- **Ensembl**: ENSG00000141510
|
||||||
|
- **EntrezGene**: 7157
|
||||||
|
- **RefSeq**: NM_000546
|
||||||
|
- **OMIM**: 191170
|
||||||
|
|
||||||
|
### Small Molecules
|
||||||
|
- **ChEBI**: CHEBI:15377
|
||||||
|
- **KEGG Compound**: C00031
|
||||||
|
- **PubChem**: 702
|
||||||
|
|
||||||
|
### Other
|
||||||
|
- **miRBase**: hsa-miR-21
|
||||||
|
- **InterPro**: IPR011616
|
||||||
|
|
||||||
|
## Response Formats
|
||||||
|
|
||||||
|
### JSON Objects
|
||||||
|
|
||||||
|
Entity objects contain standardized fields:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"stId": "R-HSA-69278",
|
||||||
|
"displayName": "Cell Cycle, Mitotic",
|
||||||
|
"schemaClass": "Pathway",
|
||||||
|
"species": [
|
||||||
|
{
|
||||||
|
"dbId": 48887,
|
||||||
|
"displayName": "Homo sapiens",
|
||||||
|
"taxId": "9606"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"isInDisease": false
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### TSV Format
|
||||||
|
|
||||||
|
For bulk queries, the TSV output has a header row followed by one tab-separated row per entity:
|
||||||
|
```
|
||||||
|
stId displayName schemaClass
|
||||||
|
R-HSA-69278 Cell Cycle, Mitotic Pathway
|
||||||
|
R-HSA-69306 DNA Replication Pathway
|
||||||
|
```
|
||||||
|
|
||||||
|
## Error Responses
|
||||||
|
|
||||||
|
### HTTP Status Codes
|
||||||
|
- `200`: Success
|
||||||
|
- `400`: Bad Request (invalid parameters)
|
||||||
|
- `404`: Not Found (invalid ID)
|
||||||
|
- `415`: Unsupported Media Type
|
||||||
|
- `500`: Internal Server Error
|
||||||
|
|
||||||
|
### Error JSON Structure
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"code": 404,
|
||||||
|
"reason": "NOT_FOUND",
|
||||||
|
"messages": ["Pathway R-HSA-INVALID not found"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Rate Limiting
|
||||||
|
|
||||||
|
Reactome does not currently enforce strict rate limits, but consider:
|
||||||
|
- Implementing reasonable delays between requests
|
||||||
|
- Using batch operations when available
|
||||||
|
- Caching results when appropriate
|
||||||
|
- Respecting the 7-day token validity period
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
### 1. Use Analysis Tokens
|
||||||
|
Store and reuse analysis tokens to avoid redundant computation:
|
||||||
|
```python
|
||||||
|
# Store token after analysis
|
||||||
|
token = result["summary"]["token"]
|
||||||
|
save_token(token) # Save to file or database
|
||||||
|
|
||||||
|
# Retrieve results later
|
||||||
|
result = requests.get(f"https://reactome.org/AnalysisService/token/{token}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Batch Queries
|
||||||
|
Submit multiple identifiers in a single request rather than individual queries:
|
||||||
|
```python
|
||||||
|
# Good: Single batch request
|
||||||
|
identifiers = ["TP53", "BRCA1", "EGFR"]
|
||||||
|
result = analyze_batch(identifiers)
|
||||||
|
|
||||||
|
# Avoid: Multiple individual requests
|
||||||
|
# for gene in genes:
|
||||||
|
# result = analyze_single(gene) # Don't do this
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Handle Species Appropriately
|
||||||
|
Use `/projection/` endpoints to map non-human identifiers to human pathways:
|
||||||
|
```python
|
||||||
|
# For mouse genes, project to human pathways
|
||||||
|
response = requests.post(
|
||||||
|
"https://reactome.org/AnalysisService/identifiers/projection/",
|
||||||
|
headers={"Content-Type": "text/plain"},
|
||||||
|
data=mouse_genes
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Process Large Result Sets
|
||||||
|
For analyses returning many pathways, filter by significance:
|
||||||
|
```python
|
||||||
|
significant_pathways = [
|
||||||
|
p for p in result["pathways"]
|
||||||
|
if p["entities"]["fdr"] < 0.05
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Integration Examples
|
||||||
|
|
||||||
|
### Complete Analysis Workflow
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
def analyze_gene_list(genes, output_file="analysis_results.json", timeout=30):
    """Run a Reactome overrepresentation analysis and save the significant hits.

    Args:
        genes: Iterable of identifier strings, submitted one per line.
        output_file: Path of the JSON summary written by this function.
        timeout: Seconds to wait for the Analysis Service before giving up.

    Returns:
        The full decoded JSON response from the Analysis Service.

    Raises:
        Exception: If the service answers with a status other than 200.
    """
    # Submit analysis; the service expects one identifier per line.
    response = requests.post(
        "https://reactome.org/AnalysisService/identifiers/",
        headers={"Content-Type": "text/plain"},
        data="\n".join(genes),
        # Without a timeout requests would wait forever on a stalled server.
        timeout=timeout,
    )

    if response.status_code != 200:
        raise Exception(f"Analysis failed: {response.text}")

    result = response.json()
    token = result["summary"]["token"]

    # Filter significant pathways (FDR < 0.05)
    significant = [
        pathway for pathway in result["pathways"]
        if pathway["entities"]["fdr"] < 0.05
    ]

    # Save results; explicit encoding keeps the file identical across platforms.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump({
            "token": token,
            "total_pathways": len(result["pathways"]),
            "significant_pathways": len(significant),
            "pathways": significant
        }, f, indent=2)

    # Generate browser URL for the first significant pathway in response order.
    if significant:
        top_pathway = significant[0]
        url = f"https://reactome.org/PathwayBrowser/#{top_pathway['stId']}&DTAB=AN&ANALYSIS={token}"
        print(f"View top result: {url}")

    return result
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
genes = ["TP53", "BRCA1", "BRCA2", "CDK1", "CDK2"]
|
||||||
|
result = analyze_gene_list(genes)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Additional Resources
|
||||||
|
|
||||||
|
- **Interactive API Documentation**: https://reactome.org/dev/content-service
|
||||||
|
- **Analysis Service Docs**: https://reactome.org/dev/analysis
|
||||||
|
- **User Guide**: https://reactome.org/userguide
|
||||||
|
- **Data Downloads**: https://reactome.org/download-data
|
||||||
@@ -0,0 +1,286 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Reactome Database Query Helper Script
|
||||||
|
|
||||||
|
This script provides convenient command-line access to common Reactome operations.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python reactome_query.py version
|
||||||
|
python reactome_query.py query <pathway_id>
|
||||||
|
python reactome_query.py analyze <gene_list_file>
|
||||||
|
python reactome_query.py search <term>
|
||||||
|
python reactome_query.py entities <pathway_id>
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
python reactome_query.py version
|
||||||
|
python reactome_query.py query R-HSA-69278
|
||||||
|
python reactome_query.py analyze genes.txt
|
||||||
|
python reactome_query.py search "cell cycle"
|
||||||
|
python reactome_query.py entities R-HSA-69278
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class ReactomeClient:
    """Client for interacting with Reactome REST APIs.

    Each public method performs a single HTTP request through ``requests``
    and calls ``raise_for_status()``, so a 4xx/5xx reply surfaces to the
    caller as ``requests.HTTPError``.
    """

    CONTENT_BASE = "https://reactome.org/ContentService"
    ANALYSIS_BASE = "https://reactome.org/AnalysisService"
    # Seconds allowed per request. requests waits indefinitely when no
    # timeout is given, which would hang the CLI on a stalled connection.
    TIMEOUT = 30

    def _get(self, url: str, params: Optional[Dict] = None):
        """GET *url*, raise on an HTTP error status, and return the response."""
        response = requests.get(url, params=params, timeout=self.TIMEOUT)
        response.raise_for_status()
        return response

    def get_version(self) -> str:
        """Get Reactome database version as stripped plain text."""
        return self._get(f"{self.CONTENT_BASE}/data/database/version").text.strip()

    def query_pathway(self, pathway_id: str) -> Dict:
        """Query pathway information by ID and return the decoded JSON."""
        return self._get(f"{self.CONTENT_BASE}/data/query/{pathway_id}").json()

    def get_pathway_entities(self, pathway_id: str) -> List[Dict]:
        """Get participating physical entities for a pathway or reaction."""
        # The ContentService exposes this under the "participants" resource.
        return self._get(
            f"{self.CONTENT_BASE}/data/participants/{pathway_id}/participatingPhysicalEntities"
        ).json()

    def search_pathways(self, term: str) -> List[Dict]:
        """Search for pathways by name and return the decoded JSON."""
        # NOTE(review): confirm this endpoint against the ContentService docs;
        # the documented full-text search appears to be /search/query?query=...
        return self._get(
            f"{self.CONTENT_BASE}/data/query",
            params={"name": term},
        ).json()

    def analyze_genes(self, gene_list: List[str]) -> Dict:
        """Perform pathway enrichment analysis on a gene list.

        The identifiers are posted as plain text, one per line.
        """
        response = requests.post(
            f"{self.ANALYSIS_BASE}/identifiers/",
            headers={"Content-Type": "text/plain"},
            data="\n".join(gene_list),
            timeout=self.TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def get_analysis_by_token(self, token: str) -> Dict:
        """Retrieve previously computed analysis results by token."""
        return self._get(f"{self.ANALYSIS_BASE}/token/{token}").json()
|
||||||
|
|
||||||
|
|
||||||
|
def print_json(data):
    """Pretty print JSON-serializable *data* to stdout with 2-space indent."""
    print(json.dumps(data, indent=2))
|
||||||
|
|
||||||
|
|
||||||
|
def command_version():
    """Get and display Reactome version.

    On an HTTP error the error is printed and the process exits with
    status 1, matching the other command handlers in this script.
    """
    client = ReactomeClient()
    try:
        version = client.get_version()
    except requests.HTTPError as e:
        print(f"Error: {e}")
        sys.exit(1)
    print(f"Reactome Database Version: {version}")
|
||||||
|
|
||||||
|
|
||||||
|
def command_query(pathway_id: str):
    """Query and display pathway information.

    Prints the display name, stable ID and schema class, then the first
    species name and first summation text when the response carries them,
    and finally the whole response as pretty-printed JSON.  On an HTTP
    error a message is printed and the process exits with status 1.
    """
    client = ReactomeClient()
    # Only the network call can raise HTTPError, so keep the try body minimal.
    try:
        pathway = client.query_pathway(pathway_id)
    except requests.HTTPError as e:
        # e.response may be absent if the error was raised without one.
        if e.response is not None and e.response.status_code == 404:
            print(f"Error: Pathway '{pathway_id}' not found")
        else:
            print(f"Error: {e}")
        sys.exit(1)

    print(f"Pathway: {pathway['displayName']}")
    print(f"ID: {pathway['stId']}")
    print(f"Type: {pathway['schemaClass']}")

    if pathway.get('species'):
        species = pathway['species'][0]['displayName']
        print(f"Species: {species}")

    if pathway.get('summation'):
        summation = pathway['summation'][0]['text']
        print(f"\nDescription: {summation}")

    print("\nFull JSON response:")
    print_json(pathway)
|
||||||
|
|
||||||
|
|
||||||
|
def command_entities(pathway_id: str):
    """List the entities of a pathway, grouped by their schema class.

    Prints the total count, then one section per schema class (sorted by
    class name) showing at most the first 10 entities of that class and a
    count of the ones left out.

    Args:
        pathway_id: Identifier passed to ``ReactomeClient.get_pathway_entities``.

    On ``requests.HTTPError`` an error line is printed (a dedicated
    "not found" message for status 404) and the process exits with status 1.
    """
    reactome = ReactomeClient()
    try:
        entities = reactome.get_pathway_entities(pathway_id)
        print(f"Entities in pathway {pathway_id}: {len(entities)} total\n")

        # Bucket entities under their schemaClass, keeping response order
        by_type = {}
        for entity in entities:
            by_type.setdefault(entity['schemaClass'], []).append(entity)

        # One section per class, in sorted class-name order
        for entity_type in sorted(by_type):
            entities_list = by_type[entity_type]
            print(f"{entity_type} ({len(entities_list)}):")
            for entity in entities_list[:10]:  # preview is capped at 10 rows
                print(f" - {entity['stId']}: {entity['displayName']}")
            if len(entities_list) > 10:
                print(f" ... and {len(entities_list) - 10} more")
            print()

    except requests.HTTPError as e:
        missing = e.response.status_code == 404
        message = f"Error: Pathway '{pathway_id}' not found" if missing else f"Error: {e}"
        print(message)
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def command_search(term: str):
    """Search pathways for *term* and print up to the first 20 hits.

    Each hit shows its stable ID and display name, the first species name
    when the hit has one, and its schema class. When there are more than 20
    results, a trailing line reports how many were not shown.

    Args:
        term: Search text passed to ``ReactomeClient.search_pathways``.

    On ``requests.HTTPError`` the error is printed and the process exits
    with status 1.
    """
    reactome = ReactomeClient()
    try:
        results = reactome.search_pathways(term)
        print(f"Search results for '{term}': {len(results)} found\n")

        shown = results[:20]  # output is capped at 20 entries
        for result in shown:
            print(f"{result['stId']}: {result['displayName']}")
            if result.get('species'):
                species = result['species'][0]['displayName']
                print(f" Species: {species}")
            print(f" Type: {result['schemaClass']}")
            print()

        if len(results) > 20:
            print(f"... and {len(results) - 20} more results")

    except requests.HTTPError as e:
        print(f"Error: {e}")
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def _results_path(gene_file: str) -> str:
    """Return the path where the full analysis JSON for *gene_file* is saved.

    The final extension of *gene_file* (whatever it is, or none) is replaced
    by ``_results.json``; ``genes.txt`` still maps to ``genes_results.json``.
    Working on the extension only guarantees the result differs from the
    input path, so the gene list can never be overwritten by its own results,
    and directory names containing ``.txt`` are left alone.
    """
    import os  # local import: this block cannot edit the file's import header

    root, _ext = os.path.splitext(gene_file)
    return f"{root}_results.json"


def command_analyze(gene_file: str):
    """Perform pathway enrichment analysis for the genes listed in a file.

    Reads one identifier per line from *gene_file* (blank lines ignored),
    submits the list through ``ReactomeClient.analyze_genes``, prints a
    summary, the number of pathways with FDR < 0.05, the first 10 pathways
    and a PathwayBrowser URL for the first pathway, then saves the complete
    response as JSON next to the input file.

    Args:
        gene_file: Path of the text file holding the gene identifiers.

    Exits with status 1 (after printing an error) when the file is missing,
    when it contains no identifiers, or when the request raises
    ``requests.HTTPError``.
    """
    # Read gene list
    try:
        with open(gene_file, 'r') as f:
            stripped = (line.strip() for line in f)
            genes = [gene for gene in stripped if gene]
    except FileNotFoundError:
        print(f"Error: File '{gene_file}' not found")
        sys.exit(1)

    # Nothing to analyze: fail early instead of submitting an empty request
    if not genes:
        print(f"Error: File '{gene_file}' contains no gene identifiers")
        sys.exit(1)

    print(f"Analyzing {len(genes)} genes...")

    # The client is created only once the input is known to be usable
    client = ReactomeClient()

    try:
        result = client.analyze_genes(genes)

        # Display summary
        summary = result['summary']
        print(f"\nAnalysis Type: {summary['type']}")
        print(f"Token: {summary['token']} (valid for 7 days)")
        print(f"Species: {summary.get('species', 'N/A')}")

        # Display pathways
        pathways = result.get('pathways', [])
        print(f"\nEnriched Pathways: {len(pathways)} found")

        # Show significant pathways (FDR < 0.05)
        significant = [p for p in pathways if p['entities']['fdr'] < 0.05]
        print(f"Significant (FDR < 0.05): {len(significant)}\n")

        # Display top 10 pathways (in the order the service returned them)
        print("Top 10 Pathways:")
        for i, pathway in enumerate(pathways[:10], 1):
            print(f"\n{i}. {pathway['name']}")
            print(f" ID: {pathway['stId']}")
            entities = pathway['entities']
            print(f" Found: {entities['found']}/{entities['total']} entities")
            print(f" p-value: {entities['pValue']:.6e}")
            print(f" FDR: {entities['fdr']:.6e}")

        # Generate browser URL for top pathway
        if pathways:
            token = summary['token']
            top_pathway = pathways[0]['stId']
            url = f"https://reactome.org/PathwayBrowser/#{top_pathway}&DTAB=AN&ANALYSIS={token}"
            print("\nView top result in browser:")
            print(url)

        # Save full results; the helper guarantees a path distinct from the input
        output_file = _results_path(gene_file)
        with open(output_file, 'w') as f:
            json.dump(result, f, indent=2)
        print(f"\nFull results saved to: {output_file}")

    except requests.HTTPError as e:
        print(f"Error: {e}")
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def print_usage():
    """Show the command-line help, i.e. this module's docstring."""
    usage_text = __doc__
    print(usage_text)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Dispatch the sub-command named by the first command-line argument.

    Recognized commands (case-insensitive): ``version``, ``query``,
    ``entities``, ``search`` and ``analyze``. Every command except
    ``version`` needs at least one further argument; ``search`` joins all
    remaining arguments with spaces, the others use only the first one.

    With no command or an unknown one, the usage text is printed and the
    process exits with status 1. A missing argument prints an error plus a
    one-line usage hint and also exits with status 1.
    """
    args = sys.argv[1:]
    if not args:
        print_usage()
        sys.exit(1)

    command = args[0].lower()

    if command == "version":
        command_version()
        return

    def first(values):
        return values[0]

    # command -> (handler, argument builder, missing-argument error, usage hint)
    table = {
        "query": (
            command_query,
            first,
            "Error: pathway_id required",
            "Usage: python reactome_query.py query <pathway_id>",
        ),
        "entities": (
            command_entities,
            first,
            "Error: pathway_id required",
            "Usage: python reactome_query.py entities <pathway_id>",
        ),
        "search": (
            command_search,
            " ".join,
            "Error: search term required",
            "Usage: python reactome_query.py search <term>",
        ),
        "analyze": (
            command_analyze,
            first,
            "Error: gene list file required",
            "Usage: python reactome_query.py analyze <gene_list_file>",
        ),
    }

    if command not in table:
        print(f"Error: Unknown command '{command}'")
        print_usage()
        sys.exit(1)

    handler, build_argument, error_line, usage_line = table[command]
    operands = args[1:]
    if not operands:
        print(error_line)
        print(usage_line)
        sys.exit(1)

    handler(build_argument(operands))
|
||||||
|
|
||||||
|
|
||||||
|
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user