Add more databases

2026-01-26 16:58:56 +08:00 · 2025-10-19 19:16:45 -07:00
parent 56a8312fc9
commit 9f4154a9ed
13 changed files with 4739 additions and 5 deletions
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -7,7 +7,7 @@
  },
  "metadata": {
    "description": "Claude scientific skills from K-Dense Inc",
-    "version": "1.12.0"
+    "version": "1.15.0"
  },
  "plugins": [
    {
@@ -61,15 +61,20 @@
      "skills": [
        "./scientific-databases/alphafold-database",
        "./scientific-databases/chembl-database",
        "./scientific-databases/clinpgx-database",
        "./scientific-databases/clinvar-database",
        "./scientific-databases/cosmic-database",
        "./scientific-databases/ena-database",
        "./scientific-databases/ensembl-database",
        "./scientific-databases/gene-database",
        "./scientific-databases/geo-database",
        "./scientific-databases/hmdb-database",
        "./scientific-databases/kegg-database",
        "./scientific-databases/metabolomics-workbench-database",
        "./scientific-databases/pdb-database",
        "./scientific-databases/pubchem-database",
        "./scientific-databases/pubmed-database",
        "./scientific-databases/reactome-database",
        "./scientific-databases/string-database",
        "./scientific-databases/uniprot-database",
        "./scientific-databases/zinc-database"
--- a/README.md
+++ b/README.md
@@ -8,16 +8,20 @@ A comprehensive collection of ready-to-use scientific skills for Claude, curated
 - **AlphaFold DB** - AI-predicted protein structure database with 200M+ predictions, confidence metrics (pLDDT, PAE), and Google Cloud bulk access
 - **ChEMBL** - Bioactive molecule database with drug-like properties (2M+ compounds, 19M+ activities, 13K+ targets)
 - **ClinPGx** - Clinical pharmacogenomics database (successor to PharmGKB) providing gene-drug interactions, CPIC clinical guidelines, allele functions, drug labels, and pharmacogenomic annotations for precision medicine and personalized pharmacotherapy (consolidates PharmGKB, CPIC, and PharmCAT resources)
 - **ClinVar** - NCBI's public archive of genomic variants and their clinical significance with standardized classifications (pathogenic, benign, VUS), E-utilities API access, and bulk FTP downloads for variant interpretation and precision medicine research
 - **COSMIC** - Catalogue of Somatic Mutations in Cancer, the world's largest database of somatic cancer mutations (millions of mutations across thousands of cancer types, Cancer Gene Census, mutational signatures, structural variants, and drug resistance data)
 - **ENA (European Nucleotide Archive)** - Comprehensive public repository for nucleotide sequence data and metadata with REST APIs for accessing sequences, assemblies, samples, studies, and reads; supports advanced search, taxonomy lookups, and bulk downloads via FTP/Aspera (rate limit: 50 req/sec)
 - **Ensembl** - Genome browser and bioinformatics database providing genomic annotations, sequences, variants, and comparative genomics data for 250+ vertebrate species (Release 115, 2025) with comprehensive REST API for gene lookups, sequence retrieval, variant effect prediction (VEP), ortholog finding, assembly mapping (GRCh37/GRCh38), and region analysis
 - **GEO (Gene Expression Omnibus)** - High-throughput gene expression and functional genomics data repository (264K+ studies, 8M+ samples) with microarray, RNA-seq, and expression profile access
 - **HMDB (Human Metabolome Database)** - Comprehensive metabolomics resource with 220K+ metabolite entries, detailed chemical/biological data, concentration ranges, disease associations, pathways, and spectral data for metabolite identification and biomarker discovery
 - **KEGG** - Kyoto Encyclopedia of Genes and Genomes for biological pathway analysis, gene-to-pathway mapping, compound searches, and molecular interaction networks (pathway enrichment, metabolic pathways, gene annotations, drug-drug interactions, ID conversion)
 - **Metabolomics Workbench** - NIH Common Fund metabolomics data repository with 4,200+ processed studies, standardized nomenclature (RefMet), mass spectrometry searches, and comprehensive REST API for accessing metabolite structures, study metadata, experimental results, and gene/protein-metabolite associations
 - **NCBI Gene** - Work with NCBI Gene database to search, retrieve, and analyze gene information including nomenclature, sequences, variations, phenotypes, and pathways using E-utilities and Datasets API
 - **Protein Data Bank (PDB)** - Access 3D structural data of proteins, nucleic acids, and biological macromolecules (200K+ structures) with search, retrieval, and analysis capabilities
 - **PubChem** - Access chemical compound data from the world's largest free chemical database (110M+ compounds, 270M+ bioactivities)
 - **PubMed** - Access to PubMed literature database with advanced search capabilities
 - **Reactome** - Curated pathway database for biological processes and molecular interactions (2,825+ human pathways, 16K+ reactions, 11K+ proteins) with pathway enrichment analysis, expression data analysis, and species comparison using Content Service and Analysis Service APIs
 - **STRING** - Protein-protein interaction network database (5000+ genomes, 59.3M proteins, 20B+ interactions) with functional enrichment analysis, interaction partner discovery, and network visualization from experimental data, computational prediction, and text-mining
 - **UniProt** - Universal Protein Resource for protein sequences, annotations, and functional information (UniProtKB/Swiss-Prot reviewed entries, TrEMBL unreviewed entries) with REST API access for search, retrieval, ID mapping, and batch operations across 200+ databases
 - **ZINC** - Free database of commercially-available compounds for virtual screening and drug discovery (230M+ purchasable compounds in ready-to-dock 3D formats)
@@ -125,16 +129,12 @@ You can use Anthropic's pre-built skills, and upload custom skills, via the Clau
 - **DAVID** - Database for Annotation, Visualization and Integrated Discovery for functional enrichment analysis
 - **dbSNP** - NCBI's database of single nucleotide polymorphisms and short genetic variations
 - **DrugBank** - Comprehensive drug and drug target database with pharmacological and pharmaceutical data
 - **Ensembl** - Genome browser with annotation, comparative genomics, and variant data
 - **GenBank** - NIH genetic sequence database (part of NCBI but with specific access patterns)
 - **GWAS Catalog** - NHGRI-EBI catalog of published genome-wide association studies
 - **InterPro** - Protein sequence analysis and classification with functional annotations
 - **MetaboLights** - EMBL-EBI metabolomics database with experimental data and metadata
 - **Metabolomics Workbench** - NIH Common Fund metabolomics data repository
 - **OMIM** - Online Mendelian Inheritance in Man for genetic disorders and genes
 - **Pfam** - Protein families database with multiple sequence alignments and HMMs
 - **PharmGKB** - Pharmacogenomics Knowledge Base linking genetic variation to drug response
 - **Reactome** - Curated pathway database with biological processes and molecular interactions
 - **RefSeq** - NCBI's non-redundant reference sequence database
 - **TCGA** - The Cancer Genome Atlas with multi-omic cancer genomics data
 - **UCSC Genome Browser** - Genomic data visualization and custom track integration
--- a/scientific-databases/clinpgx-database/SKILL.md
+++ b/scientific-databases/clinpgx-database/SKILL.md
@@ -0,0 +1,632 @@
 ---
 name: clinpgx-database
 description: Toolkit for accessing ClinPGx, a clinical pharmacogenomics database providing information on how genetic variation affects drug response. Use this skill when working with pharmacogenomics data, querying gene-drug interactions, accessing CPIC clinical guidelines, retrieving allele function and frequency information, exploring PharmGKB annotations, or conducting research on personalized medicine and precision pharmacotherapy. ClinPGx consolidates PharmGKB, CPIC, and PharmCAT resources.
 ---
 # ClinPGx Database
 ## Overview
 Facilitate access to and querying of ClinPGx (Clinical Pharmacogenomics Database), a comprehensive resource for clinical pharmacogenomics information. ClinPGx, officially launched in July 2025, is the successor to PharmGKB and consolidates data from PharmGKB, CPIC (Clinical Pharmacogenetics Implementation Consortium), and PharmCAT (Pharmacogenomics Clinical Annotation Tool). The database provides curated information on how human genetic variation affects medication response, including gene-drug pairs, clinical guidelines, allele functions, and drug labels. ClinPGx is managed at Stanford University under a ClinGen (Clinical Genome Resource) affiliate grant.
 ## When to Use This Skill
 Use this skill when queries involve:
 - **Gene-drug interactions**: Querying how genetic variants affect drug metabolism, efficacy, or toxicity
 - **CPIC guidelines**: Accessing evidence-based clinical practice guidelines for pharmacogenetics
 - **Allele information**: Retrieving allele function, frequency, and phenotype data
 - **Drug labels**: Exploring FDA and other regulatory pharmacogenomic drug labeling
 - **Pharmacogenomic annotations**: Accessing curated literature on gene-drug-disease relationships
 - **Clinical decision support**: Using PharmDOG tool for phenoconversion and custom genotype interpretation
 - **Precision medicine**: Implementing pharmacogenomic testing in clinical practice
 - **Drug metabolism**: Understanding CYP450 and other pharmacogene functions
 - **Personalized dosing**: Finding genotype-guided dosing recommendations
 - **Adverse drug reactions**: Identifying genetic risk factors for drug toxicity
 ## Installation and Setup
 ### Python API Access
 The ClinPGx REST API provides programmatic access to all database resources. Basic setup:
 ```bash
 pip install requests
 ```
 ### API Endpoint
 ```python
 BASE_URL = "https://api.clinpgx.org/v1/"
 ```
 **Rate Limits**:
 - 2 requests per second maximum
 - Excessive requests will result in HTTP 429 (Too Many Requests) response
 **Authentication**: Not required for basic access
 **Data License**: Creative Commons Attribution-ShareAlike 4.0 International License
 For substantial API use, notify the ClinPGx team at api@clinpgx.org
 ## Core Capabilities
 ### 1. Gene Queries
 **Retrieve gene information** including function, clinical annotations, and pharmacogenomic significance:
 ```python
 import requests
 # Get gene details
 response = requests.get("https://api.clinpgx.org/v1/gene/CYP2D6")
 gene_data = response.json()
 # Search for genes by name
 response = requests.get("https://api.clinpgx.org/v1/gene",
                       params={"q": "CYP"})
 genes = response.json()
 ```
 **Key pharmacogenes**:
 - **CYP450 enzymes**: CYP2D6, CYP2C19, CYP2C9, CYP3A4, CYP3A5
 - **Transporters**: SLCO1B1, ABCB1, ABCG2
 - **Other metabolizers**: TPMT, DPYD, NUDT15, UGT1A1
 - **Receptors**: OPRM1, HTR2A, ADRB1
 - **HLA genes**: HLA-B, HLA-A
 ### 2. Drug and Chemical Queries
 **Retrieve drug information** including pharmacogenomic annotations and mechanisms:
 ```python
 # Get drug details
 response = requests.get("https://api.clinpgx.org/v1/chemical/PA448515")  # Warfarin
 drug_data = response.json()
 # Search drugs by name
 response = requests.get("https://api.clinpgx.org/v1/chemical",
                       params={"name": "warfarin"})
 drugs = response.json()
 ```
 **Drug categories with pharmacogenomic significance**:
 - Anticoagulants and antiplatelet agents (warfarin, clopidogrel)
 - Antidepressants (SSRIs, TCAs)
 - Immunosuppressants (tacrolimus, azathioprine)
 - Oncology drugs (5-fluorouracil, irinotecan, tamoxifen)
 - Cardiovascular drugs (statins, beta-blockers)
 - Pain medications (codeine, tramadol)
 - Antivirals (abacavir)
 ### 3. Gene-Drug Pair Queries
 **Access curated gene-drug relationships** with clinical annotations:
 ```python
 # Get gene-drug pair information
 response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
                       params={"gene": "CYP2D6", "drug": "codeine"})
 pair_data = response.json()
 # Get all pairs for a gene
 response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
                       params={"gene": "CYP2C19"})
 all_pairs = response.json()
 ```
 **Clinical annotation sources**:
 - CPIC (Clinical Pharmacogenetics Implementation Consortium)
 - DPWG (Dutch Pharmacogenetics Working Group)
 - FDA (Food and Drug Administration) labels
 - Peer-reviewed literature summary annotations
 ### 4. CPIC Guidelines
 **Access evidence-based clinical practice guidelines**:
 ```python
 # Get CPIC guideline
 response = requests.get("https://api.clinpgx.org/v1/guideline/PA166104939")
 guideline = response.json()
 # List all CPIC guidelines
 response = requests.get("https://api.clinpgx.org/v1/guideline",
                       params={"source": "CPIC"})
 guidelines = response.json()
 ```
 **CPIC guideline components**:
 - Gene-drug pairs covered
 - Clinical recommendations by phenotype
 - Evidence levels and strength ratings
 - Supporting literature
 - Downloadable PDFs and supplementary materials
 - Implementation considerations
 **Example guidelines**:
 - CYP2D6-codeine (avoid in ultra-rapid and poor metabolizers)
 - CYP2C19-clopidogrel (alternative therapy for poor metabolizers)
 - TPMT-azathioprine (dose reduction for intermediate/poor metabolizers)
 - DPYD-fluoropyrimidines (dose adjustment based on activity)
 - HLA-B*57:01-abacavir (avoid if positive)
 ### 5. Allele and Variant Information
 **Query allele function and frequency data**:
 ```python
 # Get allele information
 response = requests.get("https://api.clinpgx.org/v1/allele/CYP2D6*4")
 allele_data = response.json()
 # Get all alleles for a gene
 response = requests.get("https://api.clinpgx.org/v1/allele",
                       params={"gene": "CYP2D6"})
 alleles = response.json()
 ```
 **Allele information includes**:
 - Functional status (normal, decreased, no function, increased, uncertain)
 - Population frequencies across ethnic groups
 - Defining variants (SNPs, indels, CNVs)
 - Phenotype assignment
 - References to PharmVar and other nomenclature systems
 **Phenotype categories**:
 - **Ultra-rapid metabolizer** (UM): Increased enzyme activity
 - **Normal metabolizer** (NM): Normal enzyme activity
 - **Intermediate metabolizer** (IM): Reduced enzyme activity
 - **Poor metabolizer** (PM): Little to no enzyme activity
 ### 6. Variant Annotations
 **Access clinical annotations for specific genetic variants**:
 ```python
 # Get variant information
 response = requests.get("https://api.clinpgx.org/v1/variant/rs4244285")
 variant_data = response.json()
 # Search variants by position (if supported)
 response = requests.get("https://api.clinpgx.org/v1/variant",
                       params={"chromosome": "10", "position": "94781859"})
 variants = response.json()
 ```
 **Variant data includes**:
 - rsID and genomic coordinates
 - Gene and functional consequence
 - Allele associations
 - Clinical significance
 - Population frequencies
 - Literature references
 ### 7. Clinical Annotations
 **Retrieve curated literature annotations** (formerly PharmGKB clinical annotations):
 ```python
 # Get clinical annotations
 response = requests.get("https://api.clinpgx.org/v1/clinicalAnnotation",
                       params={"gene": "CYP2D6"})
 annotations = response.json()
 # Filter by evidence level
 response = requests.get("https://api.clinpgx.org/v1/clinicalAnnotation",
                       params={"evidenceLevel": "1A"})
 high_evidence = response.json()
 ```
 **Evidence levels** (from highest to lowest):
 - **Level 1A**: High-quality evidence, CPIC/FDA/DPWG guidelines
 - **Level 1B**: High-quality evidence, not yet guideline
 - **Level 2A**: Moderate evidence from well-designed studies
 - **Level 2B**: Moderate evidence with some limitations
 - **Level 3**: Limited or conflicting evidence
 - **Level 4**: Case reports or weak evidence
 ### 8. Drug Labels
 **Access pharmacogenomic information from drug labels**:
 ```python
 # Get drug labels with PGx information
 response = requests.get("https://api.clinpgx.org/v1/drugLabel",
                       params={"drug": "warfarin"})
 labels = response.json()
 # Filter by regulatory source
 response = requests.get("https://api.clinpgx.org/v1/drugLabel",
                       params={"source": "FDA"})
 fda_labels = response.json()
 ```
 **Label information includes**:
 - Testing recommendations
 - Dosing guidance by genotype
 - Warnings and precautions
 - Biomarker information
 - Regulatory source (FDA, EMA, PMDA, etc.)
 ### 9. Pathways
 **Explore pharmacokinetic and pharmacodynamic pathways**:
 ```python
 # Get pathway information
 response = requests.get("https://api.clinpgx.org/v1/pathway/PA146123006")  # Warfarin pathway
 pathway_data = response.json()
 # Search pathways by drug
 response = requests.get("https://api.clinpgx.org/v1/pathway",
                       params={"drug": "warfarin"})
 pathways = response.json()
 ```
 **Pathway diagrams** show:
 - Drug metabolism steps
 - Enzymes and transporters involved
 - Gene variants affecting each step
 - Downstream effects on efficacy/toxicity
 - Interactions with other pathways
 ## Query Workflow
 ### Workflow 1: Clinical Decision Support for Drug Prescription
 1. **Identify patient genotype** for relevant pharmacogenes:
   ```python
   # Example: Patient is CYP2C19 *1/*2 (intermediate metabolizer)
   response = requests.get("https://api.clinpgx.org/v1/allele/CYP2C19*2")
   allele_function = response.json()
   ```
 2. **Query gene-drug pairs** for medication of interest:
   ```python
   response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
                          params={"gene": "CYP2C19", "drug": "clopidogrel"})
   pair_info = response.json()
   ```
 3. **Retrieve CPIC guideline** for dosing recommendations:
   ```python
   response = requests.get("https://api.clinpgx.org/v1/guideline",
                          params={"gene": "CYP2C19", "drug": "clopidogrel"})
   guideline = response.json()
   # Recommendation: Alternative antiplatelet therapy for IM/PM
   ```
 4. **Check drug label** for regulatory guidance:
   ```python
   response = requests.get("https://api.clinpgx.org/v1/drugLabel",
                          params={"drug": "clopidogrel"})
   label = response.json()
   ```
 ### Workflow 2: Gene Panel Analysis
 1. **Get list of pharmacogenes** in clinical panel:
   ```python
   pgx_panel = ["CYP2C19", "CYP2D6", "CYP2C9", "TPMT", "DPYD", "SLCO1B1"]
   ```
 2. **For each gene, retrieve all drug interactions**:
   ```python
   import time
   # Collect every gene-drug pair for each gene in the panel, keyed by gene.
   all_interactions = {}
   for gene in pgx_panel:
       response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
                               params={"gene": gene})
       all_interactions[gene] = response.json()
       time.sleep(0.5)  # Stay under the 2 requests/second API limit
   ```
 3. **Filter for CPIC guideline-level evidence**:
   ```python
   for gene, pairs in all_interactions.items():
       for pair in pairs:
           if pair.get('cpicLevel'):  # Has CPIC guideline
               print(f"{gene} - {pair['drug']}: {pair['cpicLevel']}")
   ```
 4. **Generate patient report** with actionable pharmacogenomic findings.
 ### Workflow 3: Drug Safety Assessment
 1. **Query drug for PGx associations**:
   ```python
   response = requests.get("https://api.clinpgx.org/v1/chemical",
                          params={"name": "abacavir"})
   drug_id = response.json()[0]['id']
   ```
 2. **Get clinical annotations**:
   ```python
   response = requests.get("https://api.clinpgx.org/v1/clinicalAnnotation",
                          params={"drug": drug_id})
   annotations = response.json()
   ```
 3. **Check for HLA associations** and toxicity risk:
   ```python
   for annotation in annotations:
       # Match on the "HLA" prefix: testing for the literal string 'HLA' in
       # the list never matches symbols such as "HLA-B" or "HLA-A".
       # NOTE(review): assumes 'genes' is a list of gene-symbol strings - confirm
       # against the actual response schema.
       genes = annotation.get('genes', [])
       if any(gene.startswith('HLA') for gene in genes):
           print(f"Toxicity risk: {annotation['phenotype']}")
           print(f"Evidence level: {annotation['evidenceLevel']}")
   ```
 4. **Retrieve screening recommendations** from guidelines and labels.
 ### Workflow 4: Research Analysis - Population Pharmacogenomics
 1. **Get allele frequencies** for population comparison:
   ```python
   response = requests.get("https://api.clinpgx.org/v1/allele",
                          params={"gene": "CYP2D6"})
   alleles = response.json()
   ```
 2. **Extract population-specific frequencies**:
   ```python
   populations = ['European', 'African', 'East Asian', 'Latino']
   frequency_data = {}
   for allele in alleles:
       allele_name = allele['name']
       frequency_data[allele_name] = {
           pop: allele.get(f'{pop}_frequency', 'N/A')
           for pop in populations
       }
   ```
 3. **Calculate phenotype distributions** by population:
   ```python
   # Combine allele frequencies with function to predict phenotypes
   phenotype_dist = calculate_phenotype_frequencies(frequency_data)
   ```
 4. **Analyze implications** for drug dosing in diverse populations.
 ### Workflow 5: Literature Evidence Review
 1. **Search for gene-drug pair**:
   ```python
   response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
                          params={"gene": "TPMT", "drug": "azathioprine"})
   pair = response.json()
   ```
 2. **Retrieve all clinical annotations**:
   ```python
   response = requests.get("https://api.clinpgx.org/v1/clinicalAnnotation",
                          params={"gene": "TPMT", "drug": "azathioprine"})
   annotations = response.json()
   ```
 3. **Filter by evidence level**:
   ```python
   high_quality = [a for a in annotations
                   if a['evidenceLevel'] in ['1A', '1B', '2A']]
   ```
 4. **Extract PMIDs** and retrieve full references:
   ```python
   pmids = [a['pmid'] for a in high_quality if 'pmid' in a]
   # Use PubMed skill to retrieve full citations
   ```
 ## Rate Limiting and Best Practices
 ### Rate Limit Compliance
 ```python
 import time
 def rate_limited_request(url, params=None, delay=0.5, timeout=10):
     """Make a GET request, then pause to respect the 2 req/sec API limit.

     Args:
         url: Endpoint URL to request.
         params: Optional dict of query-string parameters.
         delay: Seconds to sleep after the request returns; the default of
             0.5 keeps sequential callers at or below 2 requests per second.
         timeout: Seconds to wait for the server before requests raises a
             timeout error, so a stalled connection cannot hang the caller.

     Returns:
         The requests.Response object (call .json() to parse the body).
     """
     response = requests.get(url, params=params, timeout=timeout)
     time.sleep(delay)  # Pause after every request, including the last one
     return response
 # Use in loops
 genes = ["CYP2D6", "CYP2C19", "CYP2C9"]
 for gene in genes:
    response = rate_limited_request(
        "https://api.clinpgx.org/v1/gene/" + gene
    )
    data = response.json()
 ```
 ### Error Handling
 ```python
 def safe_api_call(url, params=None, max_retries=3):
     """GET a URL and return its parsed JSON body, retrying on failure.

     HTTP 429 responses are retried with exponential backoff (1s, 2s, 4s, ...);
     other request failures are retried after a fixed 1-second pause.

     Args:
         url: Endpoint URL to request.
         params: Optional dict of query-string parameters.
         max_retries: Total number of attempts to make; must be at least 1.

     Returns:
         The decoded JSON payload of the first successful response.

     Raises:
         ValueError: If max_retries is less than 1.
         requests.exceptions.RequestException: If the final attempt fails,
             including a final HTTP 429, rather than silently returning None.
     """
     if max_retries < 1:
         raise ValueError("max_retries must be at least 1")
     for attempt in range(max_retries):
         is_last_attempt = attempt == max_retries - 1
         try:
             response = requests.get(url, params=params, timeout=10)
             if response.status_code == 429 and not is_last_attempt:
                 # Rate limit exceeded: back off exponentially, then retry
                 wait_time = 2 ** attempt
                 print(f"Rate limit hit. Waiting {wait_time}s...")
                 time.sleep(wait_time)
                 continue
             # Raises HTTPError for any 4xx/5xx status, including a final 429
             response.raise_for_status()
             return response.json()
         except requests.exceptions.RequestException as e:
             print(f"Attempt {attempt + 1} failed: {e}")
             if is_last_attempt:
                 raise
             time.sleep(1)
 ```
 ### Caching Results
 ```python
 import json
 from pathlib import Path
 def cached_query(cache_file, api_func, *args, **kwargs):
     """Return api_func(*args, **kwargs), caching the result as JSON on disk.

     Args:
         cache_file: Path of the JSON cache file to read or create.
         api_func: Callable invoked on a cache miss; its return value must be
             JSON-serializable (e.g. parsed JSON, not a Response object).
         *args: Positional arguments forwarded to api_func.
         **kwargs: Keyword arguments forwarded to api_func.

     Returns:
         The cached value if cache_file holds valid JSON, otherwise the fresh
         result of api_func.

     Raises:
         TypeError: If the result of api_func cannot be serialized to JSON;
             no cache file is written in that case.
     """
     cache_path = Path(cache_file)
     if cache_path.exists():
         try:
             return json.loads(cache_path.read_text(encoding="utf-8"))
         except json.JSONDecodeError:
             # Unreadable cache (e.g. a truncated earlier write): re-query
             pass
     result = api_func(*args, **kwargs)
     # Serialize before touching the file so a failure cannot leave a
     # truncated cache behind that would break every later call.
     payload = json.dumps(result, indent=2)
     cache_path.write_text(payload, encoding="utf-8")
     return result
 # Usage: the cached function must return JSON-serializable data, so pass
 # safe_api_call (returns parsed JSON) rather than a function that returns
 # a Response object, which json cannot serialize.
 gene_data = cached_query(
     'cyp2d6_cache.json',
     safe_api_call,
     "https://api.clinpgx.org/v1/gene/CYP2D6"
 )
 ```
 ## PharmDOG Tool
 PharmDOG (formerly DDRx) is ClinPGx's clinical decision support tool for interpreting pharmacogenomic test results:
 **Key features**:
 - **Phenoconversion calculator**: Adjusts phenotype predictions for drug-drug interactions affecting CYP2D6
 - **Custom genotypes**: Input patient genotypes to get phenotype predictions
 - **QR code sharing**: Generate shareable patient reports
 - **Flexible guidance sources**: Select which guidelines to apply (CPIC, DPWG, FDA)
 - **Multi-drug analysis**: Assess multiple medications simultaneously
 **Access**: Available at https://www.clinpgx.org/pharmacogenomic-decision-support
 **Use cases**:
 - Clinical interpretation of PGx panel results
 - Medication review for patients with known genotypes
 - Patient education materials
 - Point-of-care decision support
 ## Resources
 ### scripts/query_clinpgx.py
 Python script with ready-to-use functions for common ClinPGx queries:
 - `get_gene_info(gene_symbol)` - Retrieve gene details
 - `get_drug_info(drug_name)` - Get drug information
 - `get_gene_drug_pairs(gene, drug)` - Query gene-drug interactions
 - `get_cpic_guidelines(gene, drug)` - Retrieve CPIC guidelines
 - `get_alleles(gene)` - Get all alleles for a gene
 - `get_clinical_annotations(gene, drug, evidence_level)` - Query literature annotations
 - `get_drug_labels(drug)` - Retrieve pharmacogenomic drug labels
 - `search_variants(rsid)` - Search by variant rsID
 - `export_to_dataframe(data)` - Convert results to pandas DataFrame
 Consult this script for implementation examples with proper rate limiting and error handling.
 ### references/api_reference.md
 Comprehensive API documentation including:
 - Complete endpoint listing with parameters
 - Request/response format specifications
 - Example queries for each endpoint
 - Filter operators and search patterns
 - Data schema definitions
 - Rate limiting details
 - Authentication requirements (if any)
 - Troubleshooting common errors
 Refer to this document when detailed API information is needed or when constructing complex queries.
 ## Important Notes
 ### Data Sources and Integration
 ClinPGx consolidates multiple authoritative sources:
 - **PharmGKB**: Curated pharmacogenomics knowledge base (now part of ClinPGx)
 - **CPIC**: Evidence-based clinical implementation guidelines
 - **PharmCAT**: Allele calling and phenotype interpretation tool
 - **DPWG**: Dutch pharmacogenetics guidelines
 - **FDA/EMA labels**: Regulatory pharmacogenomic information
 As of July 2025, all PharmGKB URLs redirect to corresponding ClinPGx pages.
 ### Clinical Implementation Considerations
 - **Evidence levels**: Always check evidence strength before clinical application
 - **Population differences**: Allele frequencies vary significantly across populations
 - **Phenoconversion**: Consider drug-drug interactions that affect enzyme activity
 - **Multi-gene effects**: Some drugs affected by multiple pharmacogenes
 - **Non-genetic factors**: Age, organ function, drug interactions also affect response
 - **Testing limitations**: Not all clinically relevant alleles detected by all assays
 ### Data Updates
 - ClinPGx continuously updates with new evidence and guidelines
 - Check publication dates for clinical annotations
 - Monitor ClinPGx Blog (https://blog.clinpgx.org/) for announcements
 - CPIC guidelines updated as new evidence emerges
 - PharmVar provides nomenclature updates for allele definitions
 ### API Stability
 - API endpoints are relatively stable but may change during development
 - Parameters and response formats subject to modification
 - Monitor API changelog and ClinPGx blog for updates
 - Consider version pinning for production applications
 - Test API changes in development before production deployment
 ## Common Use Cases
 ### Pre-emptive Pharmacogenomic Testing
 Query all clinically actionable gene-drug pairs to guide panel selection:
 ```python
 # Get all CPIC guideline pairs
 response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
                       params={"cpicLevel": "A"})  # Level A recommendations
 actionable_pairs = response.json()
 ```
 ### Medication Therapy Management
 Review patient medications against known genotypes:
 ```python
 import time
 # Known patient diplotypes and the medications under review.
 patient_genes = {"CYP2C19": "*1/*2", "CYP2D6": "*1/*1", "SLCO1B1": "*1/*5"}
 medications = ["clopidogrel", "simvastatin", "escitalopram"]
 for med in medications:
     for gene in patient_genes:
         response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
                                 params={"gene": gene, "drug": med})
         # Check for interactions and dosing guidance
         time.sleep(0.5)  # Stay under the 2 requests/second API limit
 ```
 ### Clinical Trial Eligibility
 Screen for pharmacogenomic contraindications:
 ```python
 # Check for HLA-B*57:01 before abacavir trial
 response = requests.get("https://api.clinpgx.org/v1/geneDrugPair",
                       params={"gene": "HLA-B", "drug": "abacavir"})
 pair_info = response.json()
 # CPIC: Do not use if HLA-B*57:01 positive
 ```
 ## Additional Resources
 - **ClinPGx website**: https://www.clinpgx.org/
 - **ClinPGx Blog**: https://blog.clinpgx.org/
 - **API documentation**: https://api.clinpgx.org/
 - **CPIC website**: https://cpicpgx.org/
 - **PharmCAT**: https://pharmcat.clinpgx.org/
 - **ClinGen**: https://clinicalgenome.org/
 - **Contact**: api@clinpgx.org (for substantial API use)
--- a/scientific-databases/clinpgx-database/references/api_reference.md
+++ b/scientific-databases/clinpgx-database/references/api_reference.md
@@ -0,0 +1,757 @@
 # ClinPGx API Reference
 Complete reference documentation for the ClinPGx REST API.
 ## Base URL
 ```
 https://api.clinpgx.org/v1/
 ```
 ## Rate Limiting
 - **Maximum rate**: 2 requests per second
 - **Enforcement**: Requests exceeding the limit will receive HTTP 429 (Too Many Requests)
 - **Best practice**: Implement 500ms delay between requests (0.5 seconds)
 - **Recommendation**: For substantial API use, contact api@clinpgx.org
 ## Authentication
 No authentication is required for basic API access. All endpoints are publicly accessible.
 ## Data License
 All data accessed through the API is subject to:
 - Creative Commons Attribution-ShareAlike 4.0 International License
 - ClinPGx Data Usage Policy
 ## Response Format
 All responses are returned as JSON with a standard HTTP status code:
 - `200 OK`: Successful request
 - `404 Not Found`: Resource does not exist
 - `429 Too Many Requests`: Rate limit exceeded
 - `500 Internal Server Error`: Server error
 ## Core Endpoints
 ### 1. Gene Endpoint
 Retrieve pharmacogene information including function, variants, and clinical significance.
 #### Get Gene by Symbol
 ```http
 GET /v1/gene/{gene_symbol}
 ```
 **Parameters:**
 - `gene_symbol` (path, required): Gene symbol (e.g., CYP2D6, TPMT, DPYD)
 **Example Request:**
 ```bash
 curl "https://api.clinpgx.org/v1/gene/CYP2D6"
 ```
 **Example Response:**
 ```json
 {
  "id": "PA126",
  "symbol": "CYP2D6",
  "name": "cytochrome P450 family 2 subfamily D member 6",
  "chromosome": "22",
  "chromosomeLocation": "22q13.2",
  "function": "Drug metabolism",
  "description": "Highly polymorphic gene encoding enzyme...",
  "clinicalAnnotations": [...],
  "relatedDrugs": [...]
 }
 ```
 #### Search Genes
 ```http
 GET /v1/gene?q={search_term}
 ```
 **Parameters:**
 - `q` (query, optional): Search term for gene name or symbol
 **Example:**
 ```bash
 curl "https://api.clinpgx.org/v1/gene?q=CYP"
 ```
 ### 2. Chemical/Drug Endpoint
 Access drug and chemical compound information including pharmacogenomic annotations.
 #### Get Drug by ID
 ```http
 GET /v1/chemical/{drug_id}
 ```
 **Parameters:**
 - `drug_id` (path, required): ClinPGx drug identifier (e.g., PA448515)
 **Example Request:**
 ```bash
 curl "https://api.clinpgx.org/v1/chemical/PA448515"
 ```
 #### Search Drugs by Name
 ```http
 GET /v1/chemical?name={drug_name}
 ```
 **Parameters:**
 - `name` (query, optional): Drug name or synonym
 **Example:**
 ```bash
 curl "https://api.clinpgx.org/v1/chemical?name=warfarin"
 ```
 **Example Response:**
 ```json
 [
  {
    "id": "PA448515",
    "name": "warfarin",
    "genericNames": ["warfarin sodium"],
    "tradeNames": ["Coumadin", "Jantoven"],
    "drugClasses": ["Anticoagulants"],
    "indication": "Prevention of thrombosis",
    "relatedGenes": ["CYP2C9", "VKORC1", "CYP4F2"]
  }
 ]
 ```
 ### 3. Gene-Drug Pair Endpoint
 Query curated gene-drug interaction relationships with clinical annotations.
 #### Get Gene-Drug Pairs
 ```http
 GET /v1/geneDrugPair?gene={gene}&drug={drug}
 ```
 **Parameters:**
 - `gene` (query, optional): Gene symbol
 - `drug` (query, optional): Drug name
 - `cpicLevel` (query, optional): Filter by CPIC recommendation level (A, B, C, D)
 **Example Requests:**
 ```bash
 # Get all pairs for a gene
 curl "https://api.clinpgx.org/v1/geneDrugPair?gene=CYP2D6"
 # Get specific gene-drug pair
 curl "https://api.clinpgx.org/v1/geneDrugPair?gene=CYP2D6&drug=codeine"
 # Get all CPIC Level A pairs
 curl "https://api.clinpgx.org/v1/geneDrugPair?cpicLevel=A"
 ```
 **Example Response:**
 ```json
 [
  {
    "gene": "CYP2D6",
    "drug": "codeine",
    "sources": ["CPIC", "FDA", "DPWG"],
    "cpicLevel": "A",
    "evidenceLevel": "1A",
    "clinicalAnnotationCount": 45,
    "hasGuideline": true,
    "guidelineUrl": "https://www.clinpgx.org/guideline/..."
  }
 ]
 ```
 ### 4. Guideline Endpoint
 Access clinical practice guidelines from CPIC, DPWG, and other sources.
 #### Get Guidelines
 ```http
 GET /v1/guideline?source={source}&gene={gene}&drug={drug}
 ```
 **Parameters:**
 - `source` (query, optional): Guideline source (CPIC, DPWG, FDA)
 - `gene` (query, optional): Gene symbol
 - `drug` (query, optional): Drug name
 **Example Requests:**
 ```bash
 # Get all CPIC guidelines
 curl "https://api.clinpgx.org/v1/guideline?source=CPIC"
 # Get guideline for specific gene-drug
 curl "https://api.clinpgx.org/v1/guideline?gene=CYP2C19&drug=clopidogrel"
 ```
 #### Get Guideline by ID
 ```http
 GET /v1/guideline/{guideline_id}
 ```
 **Example:**
 ```bash
 curl "https://api.clinpgx.org/v1/guideline/PA166104939"
 ```
 **Example Response:**
 ```json
 {
  "id": "PA166104939",
  "name": "CPIC Guideline for CYP2C19 and Clopidogrel",
  "source": "CPIC",
  "genes": ["CYP2C19"],
  "drugs": ["clopidogrel"],
  "recommendationLevel": "A",
  "lastUpdated": "2023-08-01",
  "summary": "Alternative antiplatelet therapy recommended for...",
  "recommendations": [...],
  "pdfUrl": "https://www.clinpgx.org/...",
  "pmid": "23400754"
 }
 ```
 ### 5. Allele Endpoint
 Query allele definitions, functions, and population frequencies.
 #### Get All Alleles for a Gene
 ```http
 GET /v1/allele?gene={gene_symbol}
 ```
 **Parameters:**
 - `gene` (query, required): Gene symbol
 **Example Request:**
 ```bash
 curl "https://api.clinpgx.org/v1/allele?gene=CYP2D6"
 ```
 **Example Response:**
 ```json
 [
  {
    "name": "CYP2D6*1",
    "gene": "CYP2D6",
    "function": "Normal function",
    "activityScore": 1.0,
    "frequencies": {
      "European": 0.42,
      "African": 0.37,
      "East Asian": 0.50,
      "Latino": 0.44
    },
    "definingVariants": ["Reference allele"],
    "pharmVarId": "PV00001"
  },
  {
    "name": "CYP2D6*4",
    "gene": "CYP2D6",
    "function": "No function",
    "activityScore": 0.0,
    "frequencies": {
      "European": 0.20,
      "African": 0.05,
      "East Asian": 0.01,
      "Latino": 0.10
    },
    "definingVariants": ["rs3892097"],
    "pharmVarId": "PV00004"
  }
 ]
 ```
 #### Get Specific Allele
 ```http
 GET /v1/allele/{allele_name}
 ```
 **Parameters:**
 - `allele_name` (path, required): Allele name with star nomenclature (e.g., CYP2D6*4)
 **Example:**
 ```bash
 curl "https://api.clinpgx.org/v1/allele/CYP2D6*4"
 ```
 ### 6. Variant Endpoint
 Search for genetic variants and their pharmacogenomic annotations.
 #### Get Variant by rsID
 ```http
 GET /v1/variant/{rsid}
 ```
 **Parameters:**
 - `rsid` (path, required): dbSNP reference SNP ID
 **Example Request:**
 ```bash
 curl "https://api.clinpgx.org/v1/variant/rs4244285"
 ```
 **Example Response:**
 ```json
 {
  "rsid": "rs4244285",
  "chromosome": "10",
  "position": 94781859,
  "gene": "CYP2C19",
  "alleles": ["CYP2C19*2"],
  "consequence": "Splice site variant",
  "clinicalSignificance": "Pathogenic - reduced enzyme activity",
  "frequencies": {
    "European": 0.15,
    "African": 0.18,
    "East Asian": 0.29,
    "Latino": 0.12
  },
  "references": [...]
 }
 ```
 #### Search Variants by Position
 ```http
 GET /v1/variant?chromosome={chr}&position={pos}
 ```
 **Parameters:**
 - `chromosome` (query, optional): Chromosome number (1-22, X, Y)
 - `position` (query, optional): Genomic position (GRCh38)
 **Example:**
 ```bash
 curl "https://api.clinpgx.org/v1/variant?chromosome=10&position=94781859"
 ```
 ### 7. Clinical Annotation Endpoint
 Access curated literature annotations for gene-drug-phenotype relationships.
 #### Get Clinical Annotations
 ```http
 GET /v1/clinicalAnnotation?gene={gene}&drug={drug}&evidenceLevel={level}
 ```
 **Parameters:**
 - `gene` (query, optional): Gene symbol
 - `drug` (query, optional): Drug name
 - `evidenceLevel` (query, optional): Evidence level (1A, 1B, 2A, 2B, 3, 4)
 - `phenotype` (query, optional): Phenotype or outcome
 **Example Requests:**
 ```bash
 # Get all annotations for a gene
 curl "https://api.clinpgx.org/v1/clinicalAnnotation?gene=CYP2D6"
 # Get high-quality evidence only
 curl "https://api.clinpgx.org/v1/clinicalAnnotation?evidenceLevel=1A"
 # Get annotations for specific gene-drug pair
 curl "https://api.clinpgx.org/v1/clinicalAnnotation?gene=TPMT&drug=azathioprine"
 ```
 **Example Response:**
 ```json
 [
  {
    "id": "PA166153683",
    "gene": "CYP2D6",
    "drug": "codeine",
    "phenotype": "Reduced analgesic effect",
    "evidenceLevel": "1A",
    "annotation": "Poor metabolizers have reduced conversion...",
    "pmid": "24618998",
    "studyType": "Clinical trial",
    "population": "European",
    "sources": ["CPIC"]
  }
 ]
 ```
 **Evidence Levels:**
 - **1A**: High-quality evidence from guidelines (CPIC, FDA, DPWG)
 - **1B**: High-quality evidence not yet incorporated into a guideline
 - **2A**: Moderate evidence from well-designed studies
 - **2B**: Moderate evidence with some limitations
 - **3**: Limited or conflicting evidence
 - **4**: Case reports or weak evidence
 ### 8. Drug Label Endpoint
 Retrieve regulatory drug label information with pharmacogenomic content.
 #### Get Drug Labels
 ```http
 GET /v1/drugLabel?drug={drug_name}&source={source}
 ```
 **Parameters:**
 - `drug` (query, required): Drug name
 - `source` (query, optional): Regulatory source (FDA, EMA, PMDA, Health Canada)
 **Example Requests:**
 ```bash
 # Get all labels for warfarin
 curl "https://api.clinpgx.org/v1/drugLabel?drug=warfarin"
 # Get only FDA labels
 curl "https://api.clinpgx.org/v1/drugLabel?drug=warfarin&source=FDA"
 ```
 **Example Response:**
 ```json
 [
  {
    "id": "DL001234",
    "drug": "warfarin",
    "source": "FDA",
    "sections": {
      "testing": "Consider CYP2C9 and VKORC1 genotyping...",
      "dosing": "Dose adjustment based on genotype...",
      "warnings": "Risk of bleeding in certain genotypes"
    },
    "biomarkers": ["CYP2C9", "VKORC1"],
    "testingRecommended": true,
    "labelUrl": "https://dailymed.nlm.nih.gov/...",
    "lastUpdated": "2024-01-15"
  }
 ]
 ```
 ### 9. Pathway Endpoint
 Access pharmacokinetic and pharmacodynamic pathway diagrams and information.
 #### Get Pathway by ID
 ```http
 GET /v1/pathway/{pathway_id}
 ```
 **Parameters:**
 - `pathway_id` (path, required): ClinPGx pathway identifier
 **Example:**
 ```bash
 curl "https://api.clinpgx.org/v1/pathway/PA146123006"
 ```
 #### Search Pathways
 ```http
 GET /v1/pathway?drug={drug_name}&gene={gene}
 ```
 **Parameters:**
 - `drug` (query, optional): Drug name
 - `gene` (query, optional): Gene symbol
 **Example:**
 ```bash
 curl "https://api.clinpgx.org/v1/pathway?drug=warfarin"
 ```
 **Example Response:**
 ```json
 {
  "id": "PA146123006",
  "name": "Warfarin Pharmacokinetics and Pharmacodynamics",
  "drugs": ["warfarin"],
  "genes": ["CYP2C9", "VKORC1", "CYP4F2", "GGCX"],
  "description": "Warfarin is metabolized primarily by CYP2C9...",
  "diagramUrl": "https://www.clinpgx.org/pathway/...",
  "steps": [
    {
      "step": 1,
      "process": "Absorption",
      "genes": []
    },
    {
      "step": 2,
      "process": "Metabolism",
      "genes": ["CYP2C9", "CYP2C19"]
    },
    {
      "step": 3,
      "process": "Target interaction",
      "genes": ["VKORC1"]
    }
  ]
 }
 ```
 ## Query Patterns and Examples
 ### Common Query Patterns
 #### 1. Patient Medication Review
 Query all gene-drug pairs for a patient's medications:
 ```python
 import requests
 patient_meds = ["clopidogrel", "simvastatin", "codeine"]
 patient_genes = {"CYP2C19": "*1/*2", "CYP2D6": "*1/*1", "SLCO1B1": "*1/*5"}
 for med in patient_meds:
    for gene in patient_genes:
        response = requests.get(
            "https://api.clinpgx.org/v1/geneDrugPair",
            params={"gene": gene, "drug": med}
        )
        pairs = response.json()
        # Check for interactions
 ```
 #### 2. Actionable Gene Panel
 Find all genes with CPIC Level A recommendations:
 ```python
 response = requests.get(
    "https://api.clinpgx.org/v1/geneDrugPair",
    params={"cpicLevel": "A"}
 )
 actionable_pairs = response.json()
 genes = set(pair['gene'] for pair in actionable_pairs)
 print(f"Panel should include: {sorted(genes)}")
 ```
 #### 3. Population Frequency Analysis
 Compare allele frequencies across populations:
 ```python
 alleles = requests.get(
    "https://api.clinpgx.org/v1/allele",
    params={"gene": "CYP2D6"}
 ).json()
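 # Sum no-function allele frequencies per population (an allele-level total,
 # not the poor-metabolizer phenotype frequency, which depends on diplotypes)
 pm_freq = {}  # population -> combined no-function allele frequency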
 for allele in alleles:
    if allele['function'] == 'No function':
        for pop, freq in allele['frequencies'].items():
            pm_freq[pop] = pm_freq.get(pop, 0) + freq
 ```
 #### 4. Drug Safety Screen
 Check for high-risk gene-drug associations:
 ```python
 # Screen for HLA-B*57:01 before abacavir
 response = requests.get(
    "https://api.clinpgx.org/v1/geneDrugPair",
    params={"gene": "HLA-B", "drug": "abacavir"}
 )
 pair = response.json()[0]
 if pair['cpicLevel'] == 'A':
    print("CRITICAL: Do not use if HLA-B*57:01 positive")
 ```
 ## Error Handling
 ### Common Error Responses
 #### 404 Not Found
 ```json
 {
  "error": "Resource not found",
  "message": "Gene 'INVALID' does not exist"
 }
 ```
 #### 429 Too Many Requests
 ```json
 {
  "error": "Rate limit exceeded",
  "message": "Maximum 2 requests per second allowed"
 }
 ```
 ### Recommended Error Handling Pattern
 ```python
 import requests
 import time
 def safe_query(url, params=None, max_retries=3):
    """GET `url` and return the decoded JSON, retrying on rate limits and request errors.
    Returns None for a 404 or when the attempts run out without a result;
    re-raises the request error if the final attempt fails with one.
    """
    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            response = requests.get(url, params=params, timeout=10)
            status = response.status_code
            if status == 200:
                time.sleep(0.5)  # Rate limiting
                return response.json()
            if status == 404:
                print("Resource not found")
                return None
            if status != 429:
                # Raises for 4xx/5xx; the except clause below handles the retry.
                response.raise_for_status()
                continue
            # 429: back off exponentially (1s, 2s, 4s, ...) before the next attempt
            wait = 2 ** attempt
            print(f"Rate limited. Waiting {wait}s...")
            time.sleep(wait)
        except requests.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt == final_attempt:
                raise
    return None
 ```
 ## Best Practices
 ### Rate Limiting
 - Implement 500ms delay between requests (2 requests/second maximum)
 - Use exponential backoff for rate limit errors
 - Consider caching results for frequently accessed data
 - For bulk operations, contact api@clinpgx.org
 ### Caching Strategy
 ```python
 import json
 from pathlib import Path
 def cached_query(cache_file, query_func, *args, **kwargs):
    """Return the JSON stored in `cache_file`, or call `query_func` and cache its result.
    If the cache file exists it is loaded and returned without calling
    `query_func`. Otherwise `query_func(*args, **kwargs)` is called and any
    result other than None is written to the cache file as JSON.
    """
    cache_path = Path(cache_file)
    if cache_path.exists():
        with open(cache_path) as f:
            return json.load(f)
    result = query_func(*args, **kwargs)
    # Only None means "no result". Empty lists/dicts are valid answers and are
    # cached too, so the same empty query is not sent to the API again.
    if result is not None:
        with open(cache_path, 'w') as f:
            json.dump(result, f)
    return result
 ```
 ### Batch Processing
 ```python
 import time
 def batch_gene_query(genes, delay=0.5):
    """Fetch each gene record in turn, sleeping `delay` seconds after every request.
    Returns a dict mapping gene symbol to decoded JSON for the requests that
    returned HTTP 200; genes with any other status are left out.
    """
    fetched = {}
    for symbol in genes:
        reply = requests.get(f"https://api.clinpgx.org/v1/gene/{symbol}")
        succeeded = reply.status_code == 200
        if succeeded:
            fetched[symbol] = reply.json()
        time.sleep(delay)
    return fetched
 ```
 ## Data Schema Definitions
 ### Gene Object
 ```typescript
 {
  id: string;              // ClinPGx gene ID
  symbol: string;          // HGNC gene symbol
  name: string;            // Full gene name
  chromosome: string;      // Chromosome location
  function: string;        // Pharmacogenomic function
  clinicalAnnotations: number;  // Count of annotations
  relatedDrugs: string[];  // Associated drugs
 }
 ```
 ### Drug Object
 ```typescript
 {
  id: string;              // ClinPGx drug ID
  name: string;            // Generic name
  tradeNames: string[];    // Brand names
  drugClasses: string[];   // Therapeutic classes
  indication: string;      // Primary indication
  relatedGenes: string[];  // Pharmacogenes
 }
 ```
 ### Gene-Drug Pair Object
 ```typescript
 {
  gene: string;            // Gene symbol
  drug: string;            // Drug name
  sources: string[];       // CPIC, FDA, DPWG, etc.
  cpicLevel: string;       // A, B, C, D
  evidenceLevel: string;   // 1A, 1B, 2A, 2B, 3, 4
  hasGuideline: boolean;   // Has clinical guideline
 }
 ```
 ### Allele Object
 ```typescript
 {
  name: string;            // Allele name (e.g., CYP2D6*4)
  gene: string;            // Gene symbol
  function: string;        // Normal/decreased/no/increased/uncertain
  activityScore: number;   // 0.0 to 2.0+
  frequencies: {           // Population frequencies
    [population: string]: number;
  };
  definingVariants: string[];  // rsIDs or descriptions
 }
 ```
 ## API Stability and Versioning
 ### Current Status
 - API version: v1
 - Stability: Beta - endpoints stable, parameters may change
 - Monitor: https://blog.clinpgx.org/ for updates
 ### Migration from PharmGKB
 As of July 2025, PharmGKB URLs redirect to ClinPGx. Update references:
 - Old: `https://api.pharmgkb.org/`
 - New: `https://api.clinpgx.org/`
 ### Future Changes
 - Watch for API v2 announcements
 - Breaking changes will be announced on ClinPGx Blog
 - Consider version pinning for production applications
 ## Support and Contact
 - **API Issues**: api@clinpgx.org
 - **Documentation**: https://api.clinpgx.org/
 - **General Questions**: https://www.clinpgx.org/page/faqs
 - **Blog**: https://blog.clinpgx.org/
 - **CPIC Guidelines**: https://cpicpgx.org/
 ## Related Resources
 - **PharmCAT**: Pharmacogenomic variant calling and annotation tool
 - **PharmVar**: Pharmacogene allele nomenclature database
 - **CPIC**: Clinical Pharmacogenetics Implementation Consortium
 - **DPWG**: Dutch Pharmacogenetics Working Group
 - **ClinGen**: Clinical Genome Resource
--- a/scientific-databases/clinpgx-database/scripts/query_clinpgx.py
+++ b/scientific-databases/clinpgx-database/scripts/query_clinpgx.py
@@ -0,0 +1,518 @@
 #!/usr/bin/env python3
 """
 ClinPGx API Query Helper Script
 Provides ready-to-use functions for querying the ClinPGx database API.
 Includes rate limiting, error handling, and caching functionality.
 ClinPGx API: https://api.clinpgx.org/
 Rate limit: 2 requests per second
 License: Creative Commons Attribution-ShareAlike 4.0 International
 """
 import requests
 import time
 import json
 from pathlib import Path
 from typing import Dict, List, Optional, Any
 # API Configuration
 # Root URL shared by every endpoint helper below. It ends with "/" so the
 # helpers append the endpoint path directly (e.g. f"{BASE_URL}gene/...").
 BASE_URL = "https://api.clinpgx.org/v1/"
 # Seconds to sleep after a request so calls stay within the rate limit.
 RATE_LIMIT_DELAY = 0.5  # 500ms delay = 2 requests/second
 def rate_limited_request(url: str, params: Optional[Dict] = None, delay: float = RATE_LIMIT_DELAY,
                          timeout: float = 10) -> requests.Response:
    """
    Make a single GET request, then pause to comply with the API rate limit.
    The status code is not inspected and nothing is retried: the caller gets
    the raw response, and exceptions raised by ``requests`` propagate.
    Args:
        url: API endpoint URL
        params: Query parameters
        delay: Seconds to sleep after the request (default 0.5s for 2 req/sec)
        timeout: Seconds to wait for the server before giving up (default 10s,
            the same value safe_api_call uses)
    Returns:
        Response object
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, params=params, timeout=timeout)
    time.sleep(delay)
    return response
 def safe_api_call(url: str, params: Optional[Dict] = None, max_retries: int = 3) -> Optional[Any]:
    """
    Make API call with error handling and exponential backoff retry.
    Behavior per response:
        200: pause for RATE_LIMIT_DELAY, then return the decoded JSON body
        404: report the missing resource and return None without retrying
        429: back off 1s, 2s, 4s, ... and retry; give up on the final attempt
        other 4xx/5xx, network errors, malformed JSON: retry after a 1s pause
    Args:
        url: API endpoint URL
        params: Query parameters
        max_retries: Maximum number of attempts
    Returns:
        Decoded JSON (a dict or a list, depending on the endpoint), or None
        when the resource was not found or every attempt failed
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            response = requests.get(url, params=params, timeout=10)
            if response.status_code == 200:
                time.sleep(RATE_LIMIT_DELAY)
                return response.json()
            elif response.status_code == 429:
                # Rate limit exceeded
                if is_last_attempt:
                    # No retry follows, so backing off would only delay the failure.
                    print(f"Rate limit exceeded. Failed after {max_retries} attempts")
                    return None
                wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s
                print(f"Rate limit exceeded. Waiting {wait_time}s before retry...")
                time.sleep(wait_time)
            elif response.status_code == 404:
                print(f"Resource not found: {url}")
                return None
            else:
                # Raises for 4xx/5xx so the handler below can retry.
                response.raise_for_status()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON on requests
            # versions whose JSON decode error is not a RequestException.
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
            if is_last_attempt:
                print(f"Failed after {max_retries} attempts")
                return None
            time.sleep(1)
    return None
 def cached_query(cache_file: str, query_func, *args, **kwargs) -> Any:
    """
    Cache API results to avoid repeated queries.
    The cache is a single JSON file. If it exists and can be parsed, its
    contents are returned and query_func is not called. A cache file that
    cannot be read or parsed (for example one truncated by an interrupted
    write) is treated as a cache miss and replaced by the fresh result.
    A result of None is never written to the cache.
    Args:
        cache_file: Path to cache file (parent directories are created on write)
        query_func: Function to call if cache miss
        *args, **kwargs: Arguments to pass to query_func
    Returns:
        Cached or freshly queried data
    """
    cache_path = Path(cache_file)
    if cache_path.exists():
        try:
            with open(cache_path, encoding="utf-8") as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError;
            # fall through and re-query instead of failing on a bad cache file.
            print(f"Ignoring unreadable cache {cache_file}: {e}")
        else:
            print(f"Loading from cache: {cache_file}")
            return cached
    print("Cache miss. Querying API...")
    result = query_func(*args, **kwargs)
    if result is not None:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_path, 'w', encoding="utf-8") as f:
            json.dump(result, f, indent=2)
        print(f"Cached to: {cache_file}")
    return result
 # Core Query Functions
 def get_gene_info(gene_symbol: str) -> Optional[Dict]:
    """
    Look up a single pharmacogene by its symbol.
    Args:
        gene_symbol: Gene symbol (e.g., "CYP2D6", "TPMT")
    Returns:
        The result of safe_api_call for the gene/{gene_symbol} endpoint:
        decoded JSON on success, otherwise None
    Example:
        >>> gene_data = get_gene_info("CYP2D6")
        >>> print(gene_data['symbol'], gene_data['name'])
    """
    return safe_api_call(f"{BASE_URL}gene/{gene_symbol}")
 def get_drug_info(drug_name: str) -> Optional[List[Dict]]:
    """
    Query the chemical endpoint for entries matching a drug name.
    Args:
        drug_name: Drug name (e.g., "warfarin", "codeine")
    Returns:
        The result of safe_api_call for the chemical endpoint with
        name=drug_name: decoded JSON on success, otherwise None
    Example:
        >>> drugs = get_drug_info("warfarin")
        >>> for drug in drugs:
        >>>     print(drug['name'], drug['id'])
    """
    return safe_api_call(f"{BASE_URL}chemical", {"name": drug_name})
 def get_gene_drug_pairs(gene: Optional[str] = None, drug: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Query the geneDrugPair endpoint, optionally filtered by gene and/or drug.
    Filters that are None or empty are left out of the request entirely.
    Args:
        gene: Gene symbol (optional)
        drug: Drug name (optional)
    Returns:
        The result of safe_api_call: decoded JSON on success, otherwise None
    Example:
        >>> # Get all pairs for CYP2D6
        >>> pairs = get_gene_drug_pairs(gene="CYP2D6")
        >>>
        >>> # Get specific gene-drug pair
        >>> pair = get_gene_drug_pairs(gene="CYP2D6", drug="codeine")
    """
    filters = {
        key: value
        for key, value in (("gene", gene), ("drug", drug))
        if value
    }
    return safe_api_call(f"{BASE_URL}geneDrugPair", filters)
 def get_cpic_guidelines(gene: Optional[str] = None, drug: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Query the guideline endpoint with the source fixed to "CPIC".
    Gene and drug filters are added to the request only when they are given.
    Args:
        gene: Gene symbol (optional)
        drug: Drug name (optional)
    Returns:
        The result of safe_api_call: decoded JSON on success, otherwise None
    Example:
        >>> # Get all CPIC guidelines
        >>> guidelines = get_cpic_guidelines()
        >>>
        >>> # Get guideline for specific gene-drug
        >>> guideline = get_cpic_guidelines(gene="CYP2C19", drug="clopidogrel")
    """
    query = {"source": "CPIC"}
    for key, value in (("gene", gene), ("drug", drug)):
        if value:
            query[key] = value
    return safe_api_call(f"{BASE_URL}guideline", query)
 def get_alleles(gene: str) -> Optional[List[Dict]]:
    """
    Query the allele endpoint for every allele of one gene.
    Args:
        gene: Gene symbol (e.g., "CYP2D6")
    Returns:
        The result of safe_api_call for the allele endpoint with gene=gene:
        decoded JSON on success, otherwise None
    Example:
        >>> alleles = get_alleles("CYP2D6")
        >>> for allele in alleles:
        >>>     print(f"{allele['name']}: {allele['function']}")
    """
    return safe_api_call(f"{BASE_URL}allele", {"gene": gene})
 def get_allele_info(allele_name: str) -> Optional[Dict]:
    """
    Look up one allele by name.
    The name is placed in the URL path as given.
    Args:
        allele_name: Allele name (e.g., "CYP2D6*4")
    Returns:
        The result of safe_api_call for the allele/{allele_name} endpoint:
        decoded JSON on success, otherwise None
    Example:
        >>> allele = get_allele_info("CYP2D6*4")
        >>> print(allele['function'], allele['frequencies'])
    """
    return safe_api_call(f"{BASE_URL}allele/{allele_name}")
 def get_clinical_annotations(
    gene: Optional[str] = None,
    drug: Optional[str] = None,
    evidence_level: Optional[str] = None
 ) -> Optional[List[Dict]]:
    """
    Query the clinicalAnnotation endpoint with optional filters.
    Only filters that are given (non-empty) are sent; evidence_level is sent
    under the query key "evidenceLevel".
    Args:
        gene: Gene symbol (optional)
        drug: Drug name (optional)
        evidence_level: Filter by evidence level (1A, 1B, 2A, 2B, 3, 4)
    Returns:
        The result of safe_api_call: decoded JSON on success, otherwise None
    Example:
        >>> # Get all annotations for CYP2D6
        >>> annotations = get_clinical_annotations(gene="CYP2D6")
        >>>
        >>> # Get high-quality evidence only
        >>> high_quality = get_clinical_annotations(evidence_level="1A")
    """
    candidates = (
        ("gene", gene),
        ("drug", drug),
        ("evidenceLevel", evidence_level),
    )
    filters = {key: value for key, value in candidates if value}
    return safe_api_call(f"{BASE_URL}clinicalAnnotation", filters)
 def get_drug_labels(drug: str, source: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Query the drugLabel endpoint for one drug, optionally for a single source.
    Args:
        drug: Drug name
        source: Regulatory source (e.g., "FDA", "EMA"); omitted from the
            request when not given
    Returns:
        The result of safe_api_call: decoded JSON on success, otherwise None
    Example:
        >>> # Get all labels for warfarin
        >>> labels = get_drug_labels("warfarin")
        >>>
        >>> # Get only FDA labels
        >>> fda_labels = get_drug_labels("warfarin", source="FDA")
    """
    source_filter = {"source": source} if source else {}
    query = {"drug": drug, **source_filter}
    return safe_api_call(f"{BASE_URL}drugLabel", query)
 def search_variants(rsid: Optional[str] = None, chromosome: Optional[str] = None,
                    position: Optional[str] = None) -> Optional[List[Dict]]:
    """
    Look up variants either by rsID or by genomic position.
    When rsid is given it takes precedence: the variant/{rsid} endpoint is
    queried and chromosome/position are ignored. Otherwise the variant
    endpoint is queried with whichever of chromosome and position are given.
    Args:
        rsid: dbSNP rsID (e.g., "rs4244285")
        chromosome: Chromosome number
        position: Genomic position
    Returns:
        The result of safe_api_call: decoded JSON on success, otherwise None
    Example:
        >>> # Search by rsID
        >>> variant = search_variants(rsid="rs4244285")
        >>>
        >>> # Search by position
        >>> variants = search_variants(chromosome="10", position="94781859")
    """
    if rsid:
        return safe_api_call(f"{BASE_URL}variant/{rsid}")
    locus = {
        key: value
        for key, value in (("chromosome", chromosome), ("position", position))
        if value
    }
    return safe_api_call(f"{BASE_URL}variant", locus)
 def get_pathway_info(pathway_id: Optional[str] = None, drug: Optional[str] = None) -> Optional[Any]:
    """
    Fetch one pathway by ID, or query pathways by drug.
    When pathway_id is given it takes precedence and drug is ignored.
    Without pathway_id the pathway endpoint is queried, filtered by drug
    when a drug is given.
    Args:
        pathway_id: ClinPGx pathway ID (optional)
        drug: Drug name (optional)
    Returns:
        The result of safe_api_call: decoded JSON on success, otherwise None
    Example:
        >>> # Get specific pathway
        >>> pathway = get_pathway_info(pathway_id="PA146123006")
        >>>
        >>> # Get all pathways for a drug
        >>> pathways = get_pathway_info(drug="warfarin")
    """
    if not pathway_id:
        drug_filter = {"drug": drug} if drug else {}
        return safe_api_call(f"{BASE_URL}pathway", drug_filter)
    return safe_api_call(f"{BASE_URL}pathway/{pathway_id}")
 # Utility Functions
 def export_to_dataframe(data: List[Dict], output_file: Optional[str] = None):
    """
    Build a pandas DataFrame from API results and optionally save it as CSV.
    pandas is imported lazily; if it is not installed a hint is printed and
    None is returned.
    Args:
        data: List of dictionaries from API
        output_file: Optional CSV output file path (written without the index)
    Returns:
        pandas DataFrame, or None when pandas is unavailable
    Example:
        >>> pairs = get_gene_drug_pairs(gene="CYP2D6")
        >>> df = export_to_dataframe(pairs, "cyp2d6_pairs.csv")
        >>> print(df.head())
    """
    try:
        import pandas
    except ImportError:
        print("pandas not installed. Install with: pip install pandas")
        return None
    frame = pandas.DataFrame(data)
    if not output_file:
        return frame
    frame.to_csv(output_file, index=False)
    print(f"Data exported to: {output_file}")
    return frame
 def batch_gene_query(gene_list: List[str], delay: float = 0.5) -> Dict[str, Dict]:
    """
    Query multiple genes in batch with rate limiting.
    Each gene is fetched with get_gene_info. The pause of `delay` seconds is
    inserted between consecutive requests only, so nothing is slept after the
    last gene. It comes on top of the pause safe_api_call already takes after
    a successful response.
    Args:
        gene_list: List of gene symbols
        delay: Delay between requests (default 0.5s)
    Returns:
        Dictionary mapping gene symbols to gene data; genes whose lookup
        returned nothing are omitted
    Example:
        >>> genes = ["CYP2D6", "CYP2C19", "CYP2C9", "TPMT"]
        >>> results = batch_gene_query(genes)
        >>> for gene, data in results.items():
        >>>     print(f"{gene}: {data['name']}")
    """
    results = {}
    print(f"Querying {len(gene_list)} genes with {delay}s delay between requests...")
    last_index = len(gene_list) - 1
    for index, gene in enumerate(gene_list):
        print(f"Fetching: {gene}")
        data = get_gene_info(gene)
        if data:
            results[gene] = data
        # Sleeping after the final request would only delay the return.
        if index < last_index:
            time.sleep(delay)
    print(f"Completed: {len(results)}/{len(gene_list)} successful")
    return results
 def find_actionable_gene_drug_pairs(cpic_level: str = "A") -> Optional[List[Dict]]:
    """
    Query the geneDrugPair endpoint filtered by CPIC recommendation level.
    Args:
        cpic_level: CPIC recommendation level (A, B, C, D), sent as the
            "cpicLevel" query parameter
    Returns:
        The result of safe_api_call: decoded JSON on success, otherwise None
    Example:
        >>> # Get all Level A recommendations
        >>> actionable = find_actionable_gene_drug_pairs(cpic_level="A")
        >>> for pair in actionable:
        >>>     print(f"{pair['gene']} - {pair['drug']}")
    """
    return safe_api_call(f"{BASE_URL}geneDrugPair", {"cpicLevel": cpic_level})
 # Example Usage
 if __name__ == "__main__":
    # Demo run: calls five of the query helpers above and prints a short
    # summary of each result. A section whose query returns nothing prints
    # only its banner.
    rule = "=" * 60
    print("ClinPGx API Query Examples\n")
    # Example 1: single-gene lookup
    print(rule, "Example 1: Get CYP2D6 gene information", rule, sep="\n")
    gene_record = get_gene_info("CYP2D6")
    if gene_record:
        print(f"Gene: {gene_record.get('symbol')}")
        print(f"Name: {gene_record.get('name')}")
        print()
    # Example 2: drug search (only the first hit is shown)
    print(rule, "Example 2: Search for warfarin", rule, sep="\n")
    drug_hits = get_drug_info("warfarin")
    if drug_hits:
        for hit in drug_hits[:1]:
            print(f"Drug: {hit.get('name')}")
            print(f"ID: {hit.get('id')}")
        print()
    # Example 3: one gene-drug pair
    print(rule, "Example 3: Get CYP2C19-clopidogrel pair", rule, sep="\n")
    pair_hits = get_gene_drug_pairs(gene="CYP2C19", drug="clopidogrel")
    if pair_hits:
        print(f"Found {len(pair_hits)} gene-drug pair(s)")
        if len(pair_hits) >= 1:
            print(f"Annotations: {pair_hits[0].get('sources', [])}")
        print()
    # Example 4: CPIC guidelines for a gene (first two are listed)
    print(rule, "Example 4: Get CPIC guidelines for CYP2C19", rule, sep="\n")
    guideline_hits = get_cpic_guidelines(gene="CYP2C19")
    if guideline_hits:
        print(f"Found {len(guideline_hits)} guideline(s)")
        for guideline in guideline_hits[:2]:
            print(f"  - {guideline.get('name')}")
        print()
    # Example 5: alleles of a gene (first three are listed)
    print(rule, "Example 5: Get CYP2D6 alleles", rule, sep="\n")
    allele_hits = get_alleles("CYP2D6")
    if allele_hits:
        print(f"Found {len(allele_hits)} allele(s)")
        for entry in allele_hits[:3]:
            print(f"  - {entry.get('name')}: {entry.get('function')}")
        print()
    print(rule, "Examples completed!", rule, sep="\n")
--- a/scientific-databases/ensembl-database/SKILL.md
+++ b/scientific-databases/ensembl-database/SKILL.md
@@ -0,0 +1,292 @@
 ---
 name: ensembl-database
 description: Work with the Ensembl genome database to query genomic data, retrieve sequences, analyze variants, and perform comparative genomics. This skill should be used when working with vertebrate genomic data, gene annotations, variant analysis, ortholog identification, or when users need to query the Ensembl REST API for genomic information across multiple species.
 ---
 # Ensembl Database
 ## Overview
 Access and query the Ensembl genome database, a comprehensive resource for vertebrate genomic data maintained by EMBL-EBI. The database provides gene annotations, sequences, variants, regulatory information, and comparative genomics data for over 250 species. Current release is 115 (September 2025).
 ## Core Capabilities
 ### 1. Gene Information Retrieval
 Query gene data by symbol, Ensembl ID, or external database identifiers.
 **Common operations:**
 - Look up gene information by symbol (e.g., "BRCA2", "TP53")
 - Retrieve transcript and protein information
 - Get gene coordinates and chromosomal locations
 - Access cross-references to external databases (UniProt, RefSeq, etc.)
 **Using the ensembl_rest package:**
 ```python
 from ensembl_rest import EnsemblClient
 client = EnsemblClient()
 # Look up gene by symbol
 gene_data = client.symbol_lookup(
    species='human',
    symbol='BRCA2'
 )
 # Get detailed gene information
 gene_info = client.lookup_id(
    id='ENSG00000139618',  # BRCA2 Ensembl ID
    expand=True
 )
 ```
 **Direct REST API (no package):**
 ```python
 import requests
 server = "https://rest.ensembl.org"
 # Symbol lookup
 response = requests.get(
    f"{server}/lookup/symbol/homo_sapiens/BRCA2",
    headers={"Content-Type": "application/json"}
 )
 gene_data = response.json()
 ```
 ### 2. Sequence Retrieval
 Fetch genomic, transcript, or protein sequences in various formats (JSON, FASTA, plain text).
 **Operations:**
 - Get DNA sequences for genes or genomic regions
 - Retrieve transcript sequences (cDNA)
 - Access protein sequences
 - Extract sequences with flanking regions or modifications
 **Example:**
 ```python
 # Using ensembl_rest package
 sequence = client.sequence_id(
    id='ENSG00000139618',  # Gene ID
    content_type='application/json'
 )
 # Get sequence for a genomic region
 region_seq = client.sequence_region(
    species='human',
    region='7:140424943-140624564'  # chromosome:start-end
 )
 ```
 ### 3. Variant Analysis
 Query genetic variation data and predict variant consequences using the Variant Effect Predictor (VEP).
 **Capabilities:**
 - Look up variants by rsID or genomic coordinates
 - Predict functional consequences of variants
 - Access population frequency data
 - Retrieve phenotype associations
 **VEP example:**
 ```python
 # Predict variant consequences
 vep_result = client.vep_hgvs(
    species='human',
    hgvs_notation='ENST00000380152.7:c.803C>T'
 )
 # Query variant by rsID
 variant = client.variation_id(
    species='human',
    id='rs699'
 )
 ```
 ### 4. Comparative Genomics
 Perform cross-species comparisons to identify orthologs, paralogs, and evolutionary relationships.
 **Operations:**
 - Find orthologs (same gene in different species)
 - Identify paralogs (related genes in same species)
 - Access gene trees showing evolutionary relationships
 - Retrieve gene family information
 **Example:**
 ```python
 # Find orthologs for a human gene
 orthologs = client.homology_ensemblgene(
    id='ENSG00000139618',  # Human BRCA2
    target_species='mouse'
 )
 # Get gene tree
 gene_tree = client.genetree_member_symbol(
    species='human',
    symbol='BRCA2'
 )
 ```
 ### 5. Genomic Region Analysis
 Find all genomic features (genes, transcripts, regulatory elements) in a specific region.
 **Use cases:**
 - Identify all genes in a chromosomal region
 - Find regulatory features (promoters, enhancers)
 - Locate variants within a region
 - Retrieve structural features
 **Example:**
 ```python
 # Find all features in a region
 features = client.overlap_region(
    species='human',
    region='7:140424943-140624564',
    feature='gene'
 )
 ```
 ### 6. Assembly Mapping
 Convert coordinates between different genome assemblies (e.g., GRCh37 to GRCh38).
 **Important:** Use `https://grch37.rest.ensembl.org` for GRCh37/hg19 queries and `https://rest.ensembl.org` for current assemblies.
 **Example:**
 ```python
 from ensembl_rest import AssemblyMapper
 # Map coordinates from GRCh37 to GRCh38
 mapper = AssemblyMapper(
    species='human',
    asm_from='GRCh37',
    asm_to='GRCh38'
 )
 mapped = mapper.map(chrom='7', start=140453136, end=140453136)
 ```
 ## API Best Practices
 ### Rate Limiting
 The Ensembl REST API has rate limits. Follow these practices:
 1. **Respect rate limits:** Maximum 15 requests per second for anonymous users
 2. **Handle 429 responses:** When rate-limited, check the `Retry-After` header and wait
 3. **Use batch endpoints:** When querying multiple items, use batch endpoints where available
 4. **Cache results:** Store frequently accessed data to reduce API calls
 ### Error Handling
 Always implement proper error handling:
 ```python
 import requests
 import time
 def query_ensembl(endpoint, params=None, max_retries=3):
    """Send a GET request to the Ensembl REST API, retrying when rate-limited.
    Args:
        endpoint: Path appended to the server URL (e.g. "/lookup/id/ENSG00000139618").
        params: Optional dict of query-string parameters.
        max_retries: Maximum number of attempts before giving up.
    Returns:
        The decoded JSON body of the first 200 response.
    Raises:
        requests.exceptions.HTTPError: For error statuses other than 429.
        Exception: If no attempt returned 200 within max_retries attempts.
    """
    server = "https://rest.ensembl.org"
    headers = {"Content-Type": "application/json"}
    for attempt in range(max_retries):
        response = requests.get(
            f"{server}{endpoint}",
            headers=headers,
            params=params,
            timeout=30  # never hang forever on a stalled connection
        )
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 429:
            # Rate limited - wait and retry. Ensembl reports Retry-After as a
            # decimal number of seconds (e.g. "40.0"), which int() cannot parse.
            retry_after = float(response.headers.get('Retry-After', 1))
            time.sleep(retry_after)
        else:
            response.raise_for_status()
    raise Exception(f"Failed after {max_retries} attempts")
 ```
 ## Installation
 ### Python Package (Recommended)
 ```bash
 pip install ensembl_rest
 ```
 The `ensembl_rest` package provides a Pythonic interface to all Ensembl REST API endpoints.
 ### Direct REST API
 No installation needed - use standard HTTP libraries like `requests`:
 ```bash
 pip install requests
 ```
 ## Resources
 ### references/
 - `api_endpoints.md`: Reference documentation for the Ensembl REST API endpoint categories (16 numbered sections) with examples and parameters
 ### scripts/
 - `ensembl_query.py`: Reusable Python script for common Ensembl queries with built-in rate limiting and error handling
 ## Common Workflows
 ### Workflow 1: Gene Annotation Pipeline
 1. Look up gene by symbol to get Ensembl ID
 2. Retrieve transcript information
 3. Get protein sequences for all transcripts
 4. Find orthologs in other species
 5. Export results
 ### Workflow 2: Variant Analysis
 1. Query variant by rsID or coordinates
 2. Use VEP to predict functional consequences
 3. Check population frequencies
 4. Retrieve phenotype associations
 5. Generate report
 ### Workflow 3: Comparative Analysis
 1. Start with gene of interest in reference species
 2. Find orthologs in target species
 3. Retrieve sequences for all orthologs
 4. Compare gene structures and features
 5. Analyze evolutionary conservation
 ## Species and Assembly Information
 To query available species and assemblies:
 ```python
 # List all available species
 species_list = client.info_species()
 # Get assembly information for a species
 assembly_info = client.info_assembly(species='human')
 ```
 Common species identifiers:
 - Human: `homo_sapiens` or `human`
 - Mouse: `mus_musculus` or `mouse`
 - Zebrafish: `danio_rerio` or `zebrafish`
 - Fruit fly: `drosophila_melanogaster`
 ## Additional Resources
 - **Official Documentation:** https://rest.ensembl.org/documentation
 - **Python Package Docs:** https://ensemblrest.readthedocs.io
 - **EBI Training:** https://www.ebi.ac.uk/training/online/courses/ensembl-rest-api/
 - **Ensembl Browser:** https://useast.ensembl.org
 - **GitHub Examples:** https://github.com/Ensembl/ensembl-rest/wiki
--- a/scientific-databases/ensembl-database/references/api_endpoints.md
+++ b/scientific-databases/ensembl-database/references/api_endpoints.md
@@ -0,0 +1,346 @@
 # Ensembl REST API Endpoints Reference
 Comprehensive documentation of the 16 API endpoint categories covered below, from the Ensembl REST API (Release 115, September 2025).
 **Base URLs:**
 - Current assemblies: `https://rest.ensembl.org`
 - GRCh37/hg19 (human): `https://grch37.rest.ensembl.org`
 **Rate Limits:**
 - Anonymous: 15 requests/second
 - Registered: 55,000 requests/hour
 ## 1. Archive
 Retrieve historical information about retired Ensembl identifiers.
 **GET /archive/id/:id**
 - Retrieve archived entries for a retired identifier
 - Example: `/archive/id/ENSG00000157764` (retired gene ID)
 ## 2. Comparative Genomics
 Access gene trees, genomic alignments, and homology data across species.
 **GET /alignment/region/:species/:region**
 - Get genomic alignments for a region
 - Example: `/alignment/region/human/2:106040000-106040050:1?species_set_group=mammals`
 **GET /genetree/id/:id**
 - Retrieve gene tree for a gene family
 - Example: `/genetree/id/ENSGT00390000003602`
 **GET /genetree/member/id/:id**
 - Get gene tree by member gene ID
 - Example: `/genetree/member/id/ENSG00000139618`
 **GET /homology/id/:id**
 - Find orthologs and paralogs for a gene
 - Parameters: `target_species`, `type` (orthologues, paralogues, all)
 - Example: `/homology/id/ENSG00000139618?target_species=mouse`
 **GET /homology/symbol/:species/:symbol**
 - Find homologs by gene symbol
 - Example: `/homology/symbol/human/BRCA2?target_species=mouse`
 ## 3. Cross References
 Link external database identifiers to Ensembl objects.
 **GET /xrefs/id/:id**
 - Get external references for Ensembl ID
 - Example: `/xrefs/id/ENSG00000139618`
 **GET /xrefs/symbol/:species/:symbol**
 - Get cross-references by gene symbol
 - Example: `/xrefs/symbol/human/BRCA2`
 **GET /xrefs/name/:species/:name**
 - Search for objects by external name
 - Example: `/xrefs/name/human/NP_000050`
 ## 4. Information
 Query metadata about species, assemblies, biotypes, and database versions.
 **GET /info/species**
 - List all available species
 - Returns species names, assemblies, taxonomy IDs
 **GET /info/assembly/:species**
 - Get assembly information for a species
 - Example: `/info/assembly/human` (returns GRCh38.p14)
 **GET /info/assembly/:species/:region**
 - Get detailed information about a chromosomal region
 - Example: `/info/assembly/human/X`
 **GET /info/biotypes/:species**
 - List all available biotypes (gene types)
 - Example: `/info/biotypes/human`
 **GET /info/analysis/:species**
 - List available analysis types
 - Example: `/info/analysis/human`
 **GET /info/data**
 - Get general information about the current Ensembl release
 ## 5. Linkage Disequilibrium (LD)
 Calculate linkage disequilibrium between variants.
 **GET /ld/:species/:id/:population_name**
 - Calculate LD for a variant
 - Example: `/ld/human/rs1042522/1000GENOMES:phase_3:KHV`
 **GET /ld/pairwise/:species/:id1/:id2**
 - Calculate LD between two variants
 - Example: `/ld/pairwise/human/rs1042522/rs11540652`
 ## 6. Lookup
 Identify species and database information for identifiers.
 **GET /lookup/id/:id**
 - Look up object by Ensembl ID
 - Parameter: `expand` (include child objects)
 - Example: `/lookup/id/ENSG00000139618?expand=1`
 **POST /lookup/id**
 - Batch lookup multiple IDs
 - Submit JSON array of IDs
 - Example: `{"ids": ["ENSG00000139618", "ENSG00000157764"]}`
 **GET /lookup/symbol/:species/:symbol**
 - Look up gene by symbol
 - Parameter: `expand` (include transcripts)
 - Example: `/lookup/symbol/human/BRCA2?expand=1`
 ## 7. Mapping
 Convert coordinates between assemblies, cDNA, CDS, and protein positions.
 **GET /map/cdna/:id/:region**
 - Map cDNA coordinates to genomic
 - Example: `/map/cdna/ENST00000288602/100..300`
 **GET /map/cds/:id/:region**
 - Map CDS coordinates to genomic
 - Example: `/map/cds/ENST00000288602/1..300`
 **GET /map/translation/:id/:region**
 - Map protein coordinates to genomic
 - Example: `/map/translation/ENSP00000288602/1..100`
 **GET /map/:species/:asm_one/:region/:asm_two**
 - Map coordinates between assemblies
 - Example: `/map/human/GRCh37/7:140453136..140453136/GRCh38`
 **POST /map/:species/:asm_one/:asm_two**
 - Batch assembly mapping
 - Submit JSON array of regions
 ## 8. Ontologies and Taxonomy
 Search biological ontologies and taxonomic classifications.
 **GET /ontology/id/:id**
 - Get ontology term information
 - Example: `/ontology/id/GO:0005515`
 **GET /ontology/name/:name**
 - Search ontology by term name
 - Example: `/ontology/name/protein%20binding`
 **GET /taxonomy/classification/:id**
 - Get taxonomic classification
 - Example: `/taxonomy/classification/9606` (human)
 **GET /taxonomy/id/:id**
 - Get taxonomy information by ID
 - Example: `/taxonomy/id/9606`
 ## 9. Overlap
 Find genomic features overlapping a region.
 **GET /overlap/id/:id**
 - Get features overlapping a gene/transcript
 - Parameters: `feature` (gene, transcript, cds, exon, repeat, etc.)
 - Example: `/overlap/id/ENSG00000139618?feature=transcript`
 **GET /overlap/region/:species/:region**
 - Get all features in a genomic region
 - Parameters: `feature` (gene, transcript, variation, regulatory, etc.)
 - Example: `/overlap/region/human/7:140424943..140624564?feature=gene`
 **GET /overlap/translation/:id**
 - Get protein features
 - Example: `/overlap/translation/ENSP00000288602`
 ## 10. Phenotype Annotations
 Retrieve disease and trait associations.
 **GET /phenotype/accession/:species/:accession**
 - Get phenotypes by ontology accession
 - Example: `/phenotype/accession/human/EFO:0003767`
 **GET /phenotype/gene/:species/:gene**
 - Get phenotype associations for a gene
 - Example: `/phenotype/gene/human/ENSG00000139618`
 **GET /phenotype/region/:species/:region**
 - Get phenotypes in genomic region
 - Example: `/phenotype/region/human/7:140424943-140624564`
 **GET /phenotype/term/:species/:term**
 - Search phenotypes by term
 - Example: `/phenotype/term/human/cancer`
 ## 11. Regulation
 Access regulatory feature and binding motif data.
 **GET /regulatory/species/:species/microarray/:microarray/:probe**
 - Get microarray probe information
 - Example: `/regulatory/species/human/microarray/HumanWG_6_V2/ILMN_1773626`
 **GET /species/:species/binding_matrix/:binding_matrix_id**
 - Get transcription factor binding matrix
 - Example: `/species/human/binding_matrix/ENSPFM0001`
 ## 12. Sequence
 Retrieve genomic, transcript, and protein sequences.
 **GET /sequence/id/:id**
 - Get sequence by ID
 - Parameters: `type` (genomic, cds, cdna, protein), `format` (json, fasta, text)
 - Example: `/sequence/id/ENSG00000139618?type=genomic`
 **POST /sequence/id**
 - Batch sequence retrieval
 - Example: `{"ids": ["ENSG00000139618", "ENSG00000157764"]}`
 **GET /sequence/region/:species/:region**
 - Get genomic sequence for region
 - Parameters: `coord_system`, `format`
 - Example: `/sequence/region/human/7:140424943..140624564?format=fasta`
 **POST /sequence/region/:species**
 - Batch region sequence retrieval
 ## 13. Transcript Haplotypes
 Compute transcript haplotypes from phased genotypes.
 **GET /transcript_haplotypes/:species/:id**
 - Get transcript haplotypes
 - Example: `/transcript_haplotypes/human/ENST00000288602`
 ## 14. Variant Effect Predictor (VEP)
 Predict functional consequences of variants.
 **GET /vep/:species/hgvs/:hgvs_notation**
 - Predict variant effects using HGVS notation
 - Parameters: numerous VEP options
 - Example: `/vep/human/hgvs/ENST00000288602:c.803C>T`
 **POST /vep/:species/hgvs**
 - Batch VEP analysis with HGVS
 - Example: `{"hgvs_notations": ["ENST00000288602:c.803C>T"]}`
 **GET /vep/:species/id/:id**
 - Predict effects for variant ID
 - Example: `/vep/human/id/rs699`
 **POST /vep/:species/id**
 - Batch VEP by variant IDs
 **GET /vep/:species/region/:region/:allele**
 - Predict effects for region and allele
 - Example: `/vep/human/region/7:140453136-140453136:1/T` (region is `chromosome:start-end:strand`, followed by the variant allele)
 **POST /vep/:species/region**
 - Batch VEP by regions
 ## 15. Variation
 Query genetic variation data and associated publications.
 **GET /variation/:species/:id**
 - Get variant information by ID
 - Parameters: `pops` (include population frequencies), `genotypes`
 - Example: `/variation/human/rs699?pops=1`
 **POST /variation/:species**
 - Batch variant queries
 - Example: `{"ids": ["rs699", "rs6025"]}`
 **GET /variation/:species/pmcid/:pmcid**
 - Get variants from PubMed Central article
 - Example: `/variation/human/pmcid/PMC5002951`
 **GET /variation/:species/pmid/:pmid**
 - Get variants from PubMed article
 - Example: `/variation/human/pmid/26318936`
 ## 16. Variation GA4GH
 Access genomic variation data using GA4GH standards.
 **POST /ga4gh/beacon**
 - Query beacon for variant presence
 **GET /ga4gh/features/:id**
 - Get feature by ID in GA4GH format
 **POST /ga4gh/features/search**
 - Search features using GA4GH protocol
 **POST /ga4gh/variants/search**
 - Search variants using GA4GH protocol
 ## Response Formats
 Most endpoints support multiple response formats:
 - **JSON** (default): `Content-Type: application/json`
 - **FASTA**: For sequence data
 - **XML**: Some endpoints support XML
 - **Text**: Plain text output
 Specify format using:
 1. `Content-Type` header
 2. URL parameter: `content-type=text/x-fasta`
 3. File extension: `/sequence/id/ENSG00000139618.fasta`
 ## Common Parameters
 Many endpoints share these parameters:
 - **expand**: Include child objects (transcripts, proteins)
 - **format**: Output format (json, xml, fasta)
 - **db_type**: Database type (core, otherfeatures, variation)
 - **object_type**: Type of object to return
 - **species**: Species name (can be common or scientific)
 ## Error Codes
 - **200**: Success
 - **400**: Bad request (invalid parameters)
 - **404**: Not found (ID doesn't exist)
 - **429**: Rate limit exceeded
 - **500**: Internal server error
 ## Best Practices
 1. **Use batch endpoints** for multiple queries (more efficient)
 2. **Cache responses** to minimize API calls
 3. **Check rate limit headers** in responses
 4. **Handle 429 errors** by respecting `Retry-After` header
 5. **Use appropriate content types** for sequence data
 6. **Specify assembly** when querying older genome versions
 7. **Enable expand parameter** when you need full object details
--- a/scientific-databases/ensembl-database/scripts/ensembl_query.py
+++ b/scientific-databases/ensembl-database/scripts/ensembl_query.py
@@ -0,0 +1,427 @@
 #!/usr/bin/env python3
 """
 Ensembl REST API Query Script
 Reusable functions for common Ensembl database queries with built-in rate limiting and error handling.
 Usage:
    python ensembl_query.py --gene BRCA2 --species human
    python ensembl_query.py --variant rs699 --species human
    python ensembl_query.py --region "7:140424943-140624564" --species human
 """
 import requests
 import time
 import json
 import argparse
 from typing import Dict, List, Optional, Any
 class EnsemblAPIClient:
    """Client for querying the Ensembl REST API with rate limiting and error handling.
    Every request goes through ``_make_request``, which throttles calls with a
    one-second counting window, retries transient failures, honours the
    server's ``Retry-After`` header on HTTP 429 and applies a network timeout.
    """
    # Maps the public ``format`` argument of the sequence methods to the MIME
    # type requested from the server. Unknown formats fall back to JSON.
    _CONTENT_TYPES = {
        "json": "application/json",
        "fasta": "text/x-fasta",
        "text": "text/plain",
    }
    def __init__(
        self,
        server: str = "https://rest.ensembl.org",
        rate_limit: int = 15,
        timeout: float = 30.0
    ):
        """
        Initialize the Ensembl API client.
        Args:
            server: Base URL for the Ensembl REST API
            rate_limit: Maximum requests per second (default 15 for anonymous users)
            timeout: Seconds to wait for the server on each HTTP call before
                the attempt is treated as failed
        """
        self.server = server
        self.rate_limit = rate_limit
        self.timeout = timeout
        self.request_count = 0
        self.last_request_time = 0
    @staticmethod
    def _parse_retry_after(value: Any, default: float = 1.0) -> float:
        """
        Convert a Retry-After header value into a number of seconds to sleep.
        Ensembl sends the delay as a decimal string such as "40.0", so the
        value is parsed as a float rather than an int.
        Args:
            value: Raw header value (may be None when the header is absent)
            default: Delay used when the value is missing or unusable
        Returns:
            A finite, non-negative delay in seconds
        """
        try:
            delay = float(value)
        except (TypeError, ValueError):
            # Header missing, or not a plain number (e.g. an HTTP date)
            return default
        # Rejects negatives, infinity and NaN (every comparison with NaN is False)
        if not (0.0 <= delay < float("inf")):
            return default
        return delay
    def _rate_limit_check(self):
        """Enforce rate limiting before making requests."""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        if time_since_last < 1.0:
            # Still inside the current one-second window: only block once the
            # window's request budget has been used up.
            if self.request_count >= self.rate_limit:
                sleep_time = 1.0 - time_since_last
                time.sleep(sleep_time)
                self.request_count = 0
                self.last_request_time = time.time()
        else:
            # The previous window has elapsed; start a fresh one.
            self.request_count = 0
            self.last_request_time = current_time
    def _make_request(
        self,
        endpoint: str,
        params: Optional[Dict] = None,
        max_retries: int = 3,
        method: str = "GET",
        data: Optional[Dict] = None,
        content_type: str = "application/json"
    ) -> Any:
        """
        Make an API request with error handling and retries.
        Args:
            endpoint: API endpoint path
            params: Query parameters
            max_retries: Maximum number of retry attempts
            method: HTTP method (GET or POST)
            data: JSON data for POST requests
            content_type: MIME type requested for the response body
        Returns:
            Parsed JSON when content_type is "application/json", otherwise the
            response body as text
        Raises:
            Exception: If the resource is not found (404) or the request fails
                after max retries
        """
        url = f"{self.server}{endpoint}"
        if method == "POST":
            # The POST body is always JSON; the response type goes in Accept.
            headers = {"Content-Type": "application/json", "Accept": content_type}
        else:
            headers = {"Content-Type": content_type}
        for attempt in range(max_retries):
            is_last_attempt = attempt == max_retries - 1
            self._rate_limit_check()
            self.request_count += 1
            try:
                if method == "POST":
                    response = requests.post(
                        url,
                        headers=headers,
                        params=params,
                        json=data,
                        timeout=self.timeout
                    )
                else:
                    response = requests.get(
                        url,
                        headers=headers,
                        params=params,
                        timeout=self.timeout
                    )
                if response.status_code == 200:
                    if content_type == "application/json":
                        return response.json()
                    return response.text
                elif response.status_code == 429:
                    # Rate limited - wait and retry (no point sleeping when
                    # there is no attempt left to make afterwards)
                    if not is_last_attempt:
                        retry_after = self._parse_retry_after(
                            response.headers.get('Retry-After')
                        )
                        print(f"Rate limited. Waiting {retry_after:g} seconds...")
                        time.sleep(retry_after)
                elif response.status_code == 404:
                    # Not a RequestException, so this propagates immediately
                    # instead of being retried.
                    raise Exception(f"Resource not found: {endpoint}")
                else:
                    response.raise_for_status()
            except requests.exceptions.RequestException as e:
                if is_last_attempt:
                    raise Exception(
                        f"Request failed after {max_retries} attempts: {e}"
                    ) from e
                time.sleep(2 ** attempt)  # Exponential backoff
        raise Exception(f"Failed after {max_retries} attempts")
    def lookup_gene_by_symbol(self, species: str, symbol: str, expand: bool = True) -> Dict:
        """
        Look up gene information by symbol.
        Args:
            species: Species name (e.g., 'human', 'mouse')
            symbol: Gene symbol (e.g., 'BRCA2', 'TP53')
            expand: Include transcript information
        Returns:
            Gene information dictionary
        """
        endpoint = f"/lookup/symbol/{species}/{symbol}"
        params = {"expand": 1} if expand else {}
        return self._make_request(endpoint, params=params)
    def lookup_by_id(self, ensembl_id: str, expand: bool = False) -> Dict:
        """
        Look up object by Ensembl ID.
        Args:
            ensembl_id: Ensembl identifier (e.g., 'ENSG00000139618')
            expand: Include child objects
        Returns:
            Object information dictionary
        """
        endpoint = f"/lookup/id/{ensembl_id}"
        params = {"expand": 1} if expand else {}
        return self._make_request(endpoint, params=params)
    def get_sequence(
        self,
        ensembl_id: str,
        seq_type: str = "genomic",
        format: str = "json"
    ) -> Any:
        """
        Retrieve sequence by Ensembl ID.
        Args:
            ensembl_id: Ensembl identifier
            seq_type: Sequence type ('genomic', 'cds', 'cdna', 'protein')
            format: Output format ('json', 'fasta', 'text'); any other value
                is treated as 'json'
        Returns:
            Parsed JSON for 'json', otherwise the response body as a string
        """
        endpoint = f"/sequence/id/{ensembl_id}"
        params = {"type": seq_type}
        content_type = self._CONTENT_TYPES.get(format, "application/json")
        # Routed through _make_request so non-JSON formats get the same
        # throttling, retries and status checks as JSON requests.
        return self._make_request(endpoint, params=params, content_type=content_type)
    def get_region_sequence(
        self,
        species: str,
        region: str,
        format: str = "json"
    ) -> Any:
        """
        Get genomic sequence for a region.
        Args:
            species: Species name
            region: Region string (e.g., '7:140424943-140624564')
            format: Output format ('json', 'fasta', 'text'); any other value
                is treated as 'json'
        Returns:
            Parsed JSON for 'json', otherwise the response body as a string
        """
        endpoint = f"/sequence/region/{species}/{region}"
        content_type = self._CONTENT_TYPES.get(format, "application/json")
        return self._make_request(endpoint, content_type=content_type)
    def get_variant(self, species: str, variant_id: str, include_pops: bool = True) -> Dict:
        """
        Get variant information by ID.
        Args:
            species: Species name
            variant_id: Variant identifier (e.g., 'rs699')
            include_pops: Include population frequencies
        Returns:
            Variant information dictionary
        """
        endpoint = f"/variation/{species}/{variant_id}"
        params = {"pops": 1} if include_pops else {}
        return self._make_request(endpoint, params=params)
    def predict_variant_effect(
        self,
        species: str,
        hgvs_notation: str
    ) -> List[Dict]:
        """
        Predict variant consequences using VEP.
        Args:
            species: Species name
            hgvs_notation: HGVS notation (e.g., 'ENST00000288602:c.803C>T')
        Returns:
            List of predicted consequences
        """
        endpoint = f"/vep/{species}/hgvs/{hgvs_notation}"
        return self._make_request(endpoint)
    def find_orthologs(
        self,
        ensembl_id: str,
        target_species: Optional[str] = None
    ) -> Dict:
        """
        Find orthologs for a gene.
        Args:
            ensembl_id: Source gene Ensembl ID
            target_species: Target species (optional; the parameter is only
                sent to the server when it is given)
        Returns:
            Homology information dictionary
        """
        endpoint = f"/homology/id/{ensembl_id}"
        params = {}
        if target_species:
            params["target_species"] = target_species
        return self._make_request(endpoint, params=params)
    def get_region_features(
        self,
        species: str,
        region: str,
        feature_type: str = "gene"
    ) -> List[Dict]:
        """
        Get genomic features in a region.
        Args:
            species: Species name
            region: Region string (e.g., '7:140424943-140624564')
            feature_type: Feature type ('gene', 'transcript', 'variation', etc.)
        Returns:
            List of features
        """
        endpoint = f"/overlap/region/{species}/{region}"
        params = {"feature": feature_type}
        return self._make_request(endpoint, params=params)
    def get_species_info(self) -> List[Dict]:
        """
        Get information about all available species.
        Returns:
            The list stored under the "species" key of the response, or an
            empty list when that key is absent
        """
        endpoint = "/info/species"
        result = self._make_request(endpoint)
        return result.get("species", [])
    def get_assembly_info(self, species: str) -> Dict:
        """
        Get assembly information for a species.
        Args:
            species: Species name
        Returns:
            Assembly information dictionary
        """
        endpoint = f"/info/assembly/{species}"
        return self._make_request(endpoint)
    def map_coordinates(
        self,
        species: str,
        asm_from: str,
        region: str,
        asm_to: str
    ) -> Dict:
        """
        Map coordinates between genome assemblies.
        Args:
            species: Species name
            asm_from: Source assembly (e.g., 'GRCh37')
            region: Region string (e.g., '7:140453136-140453136')
            asm_to: Target assembly (e.g., 'GRCh38')
        Returns:
            Mapped coordinates
        """
        endpoint = f"/map/{species}/{asm_from}/{region}/{asm_to}"
        return self._make_request(endpoint)
 def main():
    """Command-line interface for common Ensembl queries.
    One query is run per invocation, chosen in this priority order: --gene,
    --ensembl-id, --variant, --region, --orthologs. When none of them is given
    the help text is printed instead.
    Returns:
        Exit status: 0 on success (including when only help is shown), 1 if
        the query raised an exception.
    """
    parser = argparse.ArgumentParser(
        description="Query the Ensembl database via REST API"
    )
    parser.add_argument("--gene", help="Gene symbol to look up")
    parser.add_argument("--ensembl-id", help="Ensembl ID to look up")
    parser.add_argument("--variant", help="Variant ID (e.g., rs699)")
    parser.add_argument("--region", help="Genomic region (chr:start-end)")
    parser.add_argument(
        "--species",
        default="human",
        help="Species name (default: human)"
    )
    parser.add_argument(
        "--orthologs",
        help="Find orthologs for gene (provide Ensembl ID)"
    )
    parser.add_argument(
        "--target-species",
        help="Target species for ortholog search"
    )
    parser.add_argument(
        "--sequence",
        action="store_true",
        help="Retrieve sequence (requires --gene or --ensembl-id or --region)"
    )
    parser.add_argument(
        "--format",
        choices=["json", "fasta"],
        default="json",
        help="Output format (default: json)"
    )
    # The default must be the current assembly: defaulting to GRCh37 would send
    # every run to the legacy grch37 server, which the docs reserve for
    # GRCh37/hg19 human queries.
    parser.add_argument(
        "--assembly",
        default="GRCh38",
        help="Genome assembly (default: GRCh38); pass GRCh37 to query the "
             "grch37.rest.ensembl.org server"
    )
    args = parser.parse_args()
    # Select appropriate server
    server = "https://rest.ensembl.org"
    if args.assembly.lower() == "grch37":
        server = "https://grch37.rest.ensembl.org"
    client = EnsemblAPIClient(server=server)
    try:
        if args.gene:
            print(f"Looking up gene: {args.gene}")
            result = client.lookup_gene_by_symbol(args.species, args.gene)
            if args.sequence:
                print(f"\nRetrieving sequence for {result['id']}...")
                seq_result = client.get_sequence(
                    result['id'],
                    format=args.format
                )
                # JSON results are pretty-printed; FASTA is already plain text
                print(json.dumps(seq_result, indent=2) if args.format == "json" else seq_result)
            else:
                print(json.dumps(result, indent=2))
        elif args.ensembl_id:
            print(f"Looking up ID: {args.ensembl_id}")
            result = client.lookup_by_id(args.ensembl_id, expand=True)
            if args.sequence:
                print("\nRetrieving sequence...")
                seq_result = client.get_sequence(
                    args.ensembl_id,
                    format=args.format
                )
                print(json.dumps(seq_result, indent=2) if args.format == "json" else seq_result)
            else:
                print(json.dumps(result, indent=2))
        elif args.variant:
            print(f"Looking up variant: {args.variant}")
            result = client.get_variant(args.species, args.variant)
            print(json.dumps(result, indent=2))
        elif args.region:
            if args.sequence:
                print(f"Retrieving sequence for region: {args.region}")
                result = client.get_region_sequence(
                    args.species,
                    args.region,
                    format=args.format
                )
                print(json.dumps(result, indent=2) if args.format == "json" else result)
            else:
                print(f"Finding features in region: {args.region}")
                result = client.get_region_features(args.species, args.region)
                print(json.dumps(result, indent=2))
        elif args.orthologs:
            print(f"Finding orthologs for: {args.orthologs}")
            result = client.find_orthologs(
                args.orthologs,
                target_species=args.target_species
            )
            print(json.dumps(result, indent=2))
        else:
            parser.print_help()
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit status
        print(f"Error: {e}")
        return 1
    return 0
 if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit is
    # raised directly because the exit() builtin is installed by the site
    # module for interactive use and is missing when Python runs with -S.
    raise SystemExit(main())
--- a/scientific-databases/metabolomics-workbench-database/SKILL.md
+++ b/scientific-databases/metabolomics-workbench-database/SKILL.md
@@ -0,0 +1,251 @@
 ---
 name: metabolomics-workbench-database
 description: Toolkit for accessing and querying the Metabolomics Workbench, an NIH-sponsored repository containing 4,200+ metabolomics studies with standardized nomenclature (RefMet), study metadata, experimental results, and comprehensive metabolite databases. Use this skill when working with metabolomics data, querying metabolite structures, accessing study results, standardizing metabolite names, performing mass spectrometry searches, or retrieving gene/protein associations with metabolites.
 ---
 # Metabolomics Workbench Database
 ## Overview
 The Metabolomics Workbench is a comprehensive NIH Common Fund-sponsored platform hosted at UCSD that serves as the primary repository for metabolomics research data. It provides programmatic access to over 4,200 processed studies (3,790+ publicly available), standardized metabolite nomenclature through RefMet, and powerful search capabilities across multiple analytical platforms (GC-MS, LC-MS, NMR).
 This skill enables efficient interaction with the Metabolomics Workbench REST API to query metabolite structures, access study data, standardize nomenclature, perform mass spectrometry searches, and retrieve gene/protein-metabolite associations.
 ## Core Capabilities
 ### 1. Querying Metabolite Structures and Data
 Access comprehensive metabolite information including structures, identifiers, and cross-references to external databases.
 **Key operations:**
 - Retrieve compound data by various identifiers (PubChem CID, InChI Key, KEGG ID, HMDB ID, etc.)
 - Download molecular structures as MOL files or PNG images
 - Access standardized compound classifications
 - Cross-reference between different metabolite databases
 **Example queries:**
 ```python
 import requests
 # Get compound information by PubChem CID
 response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/pubchem_cid/5281365/all/json')
 # Download molecular structure as PNG
 response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/regno/11/png')
 # Get compound name by registry number
 response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/regno/11/name/json')
 ```
 ### 2. Accessing Study Metadata and Experimental Results
 Query metabolomics studies by various criteria and retrieve complete experimental datasets.
 **Key operations:**
 - Search studies by metabolite, institute, investigator, or title
 - Access study summaries, experimental factors, and analysis details
 - Retrieve complete experimental data in various formats
 - Download mwTab format files for complete study information
 - Query untargeted metabolomics data
 **Example queries:**
 ```python
 # List all available public studies
 response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST/available/json')
 # Get study summary
 response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/summary/json')
 # Retrieve experimental data
 response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/data/json')
 # Find studies containing a specific metabolite
 response = requests.get('https://www.metabolomicsworkbench.org/rest/study/refmet_name/Tyrosine/summary/json')
 ```
 ### 3. Standardizing Metabolite Nomenclature with RefMet
 Use the RefMet database to standardize metabolite names and access systematic classification across four structural resolution levels.
 **Key operations:**
 - Match common metabolite names to standardized RefMet names
 - Query by chemical formula, exact mass, or InChI Key
 - Access hierarchical classification (super class, main class, sub class)
 - Retrieve all RefMet entries or filter by classification
 **Example queries:**
 ```python
 # Standardize a metabolite name
 response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/match/citrate/name/json')
 # Query by molecular formula
 response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/formula/C12H24O2/all/json')
 # Get all metabolites in a specific class
 response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/main_class/Fatty%20Acids/all/json')
 # Retrieve complete RefMet database
 response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/all/json')
 ```
 ### 4. Performing Mass Spectrometry Searches
 Search for compounds by mass-to-charge ratio (m/z) with specified ion adducts and tolerance levels.
 **Key operations:**
 - Search precursor ion masses across multiple databases (Metabolomics Workbench, LIPIDS, RefMet)
 - Specify ion adduct types (M+H, M-H, M+Na, M+NH4, M+2H, etc.)
 - Calculate exact masses for known metabolites with specific adducts
 - Set mass tolerance for flexible matching
 **Example queries:**
 ```python
 # Search by m/z value with M+H adduct
 response = requests.get('https://www.metabolomicsworkbench.org/rest/moverz/MB/635.52/M+H/0.5/json')
 # Calculate exact mass for a metabolite with specific adduct
 response = requests.get('https://www.metabolomicsworkbench.org/rest/moverz/exactmass/PC(34:1)/M+H/json')
 # Search across RefMet database
 response = requests.get('https://www.metabolomicsworkbench.org/rest/moverz/REFMET/200.15/M-H/0.3/json')
 ```
 ### 5. Filtering Studies by Analytical and Biological Parameters
 Use the MetStat context to find studies matching specific experimental conditions.
 **Key operations:**
 - Filter by analytical method (LCMS, GCMS, NMR)
 - Specify ionization polarity (POSITIVE, NEGATIVE)
 - Filter by chromatography type (HILIC, RP, GC)
 - Target specific species, sample sources, or diseases
 - Combine multiple filters using semicolon-delimited format
 **Example queries:**
 ```python
 # Find human blood studies on diabetes using LC-MS
 response = requests.get('https://www.metabolomicsworkbench.org/rest/metstat/LCMS;POSITIVE;HILIC;Human;Blood;Diabetes/json')
 # Find all human blood studies containing tyrosine
 response = requests.get('https://www.metabolomicsworkbench.org/rest/metstat/;;;Human;Blood;;;Tyrosine/json')
 # Filter by analytical method only
 response = requests.get('https://www.metabolomicsworkbench.org/rest/metstat/GCMS;;;;;;/json')
 ```
 ### 6. Accessing Gene and Protein Information
 Retrieve gene and protein data associated with metabolic pathways and metabolite metabolism.
 **Key operations:**
 - Query genes by symbol, name, or ID
 - Access protein sequences and annotations
 - Cross-reference between gene IDs, RefSeq IDs, and UniProt IDs
 - Retrieve gene-metabolite associations
 **Example queries:**
 ```python
 # Get gene information by symbol
 response = requests.get('https://www.metabolomicsworkbench.org/rest/gene/gene_symbol/ACACA/all/json')
 # Retrieve protein data by UniProt ID
 response = requests.get('https://www.metabolomicsworkbench.org/rest/protein/uniprot_id/Q13085/all/json')
 ```
 ## Common Workflows
 ### Workflow 1: Finding Studies for a Specific Metabolite
 To find all studies containing measurements of a specific metabolite:
 1. First standardize the metabolite name using RefMet:
   ```python
   response = requests.get('https://www.metabolomicsworkbench.org/rest/refmet/match/glucose/name/json')
   ```
 2. Use the standardized name to search for studies:
   ```python
   response = requests.get('https://www.metabolomicsworkbench.org/rest/study/refmet_name/Glucose/summary/json')
   ```
 3. Retrieve experimental data from specific studies:
   ```python
   response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/data/json')
   ```
 ### Workflow 2: Identifying Compounds from MS Data
 To identify potential compounds from mass spectrometry m/z values:
 1. Perform m/z search with appropriate adduct and tolerance:
   ```python
   response = requests.get('https://www.metabolomicsworkbench.org/rest/moverz/MB/180.06/M+H/0.5/json')
   ```
 2. Review candidate compounds from results
 3. Retrieve detailed information for candidate compounds:
   ```python
   response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/regno/{regno}/all/json')
   ```
 4. Download structures for confirmation:
   ```python
   response = requests.get('https://www.metabolomicsworkbench.org/rest/compound/regno/{regno}/png')
   ```
 ### Workflow 3: Exploring Disease-Specific Metabolomics
 To find metabolomics studies for a specific disease and analytical platform:
 1. Use MetStat to filter studies:
   ```python
   response = requests.get('https://www.metabolomicsworkbench.org/rest/metstat/LCMS;POSITIVE;;Human;;Cancer/json')
   ```
 2. Review study IDs from results
 3. Access detailed study information:
   ```python
   response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST{ID}/summary/json')
   ```
 4. Retrieve complete experimental data:
   ```python
   response = requests.get('https://www.metabolomicsworkbench.org/rest/study/study_id/ST{ID}/data/json')
   ```
 ## Output Formats
 The API supports two primary output formats:
 - **JSON** (default): Machine-readable format, ideal for programmatic access
 - **TXT**: Human-readable tab-delimited text format
 Specify format by appending `/json` or `/txt` to API URLs. When format is omitted, JSON is returned by default.
 ## Best Practices
 1. **Use RefMet for standardization**: Always standardize metabolite names through RefMet before searching studies to ensure consistent nomenclature
 2. **Specify appropriate adducts**: When performing m/z searches, use the correct ion adduct type for your analytical method (e.g., M+H for positive mode ESI)
 3. **Set reasonable tolerances**: Use appropriate mass tolerance values (typically 0.5 Da for low-resolution, 0.01 Da for high-resolution MS)
 4. **Cache reference data**: Consider caching frequently used reference data (RefMet database, compound information) to minimize API calls
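 5. **Handle large and variable responses**: Large result sets can be sizeable, and the JSON structure may differ between single-result and multi-result queries, so inspect the parsed response before indexing into it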
 6. **Validate identifiers**: Cross-reference metabolite identifiers across multiple databases when possible to ensure correct compound identification
 ## Resources
 ### references/
 Detailed API reference documentation is available in `references/api_reference.md`, including:
 - Complete REST API endpoint specifications
 - All available contexts (compound, study, refmet, metstat, gene, protein, moverz)
 - Input/output parameter details
 - Ion adduct types for mass spectrometry
 - Additional query examples
 Load this reference file when detailed API specifications are needed or when working with less common endpoints.
--- a/scientific-databases/metabolomics-workbench-database/references/api_reference.md
+++ b/scientific-databases/metabolomics-workbench-database/references/api_reference.md
@@ -0,0 +1,494 @@
 # Metabolomics Workbench REST API Reference
 ## Base URL
 All API requests use the following base URL:
 ```
 https://www.metabolomicsworkbench.org/rest/
 ```
 ## API Structure
 Most contexts follow a consistent URL pattern (the `metstat` and `moverz` contexts use their own formats, described in their sections below):
 ```
 /context/input_item/input_value/output_item/output_format
 ```
 - **context**: The type of resource to access (study, compound, refmet, metstat, gene, protein, moverz)
 - **input_item**: The type of identifier or search parameter
 - **input_value**: The specific value to search for
 - **output_item**: What data to return (e.g., all, name, summary)
 - **output_format**: json or txt (json is default if omitted)
 ## Output Formats
 - **json**: Machine-readable JSON format (default)
 - **txt**: Tab-delimited text format for human readability
 ## Context 1: Compound
 Retrieve metabolite structure and identification data.
 ### Input Items
 | Input Item | Description | Example |
 |------------|-------------|---------|
 | `regno` | Metabolomics Workbench registry number | 11 |
 | `pubchem_cid` | PubChem Compound ID | 5281365 |
 | `inchi_key` | International Chemical Identifier Key | WQZGKKKJIJFFOK-GASJEMHNSA-N |
 | `formula` | Molecular formula | C6H12O6 |
 | `lm_id` | LIPID MAPS ID | LM... |
 | `hmdb_id` | Human Metabolome Database ID | HMDB0000122 |
 | `kegg_id` | KEGG Compound ID | C00031 |
 ### Output Items
 | Output Item | Description |
 |-------------|-------------|
 | `all` | All available compound data |
 | `classification` | Compound classification |
 | `regno` | Registry number |
 | `formula` | Molecular formula |
 | `exactmass` | Exact mass |
 | `inchi_key` | InChI Key |
 | `name` | Common name |
 | `sys_name` | Systematic name |
 | `smiles` | SMILES notation |
 | `lm_id` | LIPID MAPS ID |
 | `pubchem_cid` | PubChem CID |
 | `hmdb_id` | HMDB ID |
 | `kegg_id` | KEGG ID |
 | `chebi_id` | ChEBI ID |
 | `metacyc_id` | MetaCyc ID |
 | `molfile` | MOL file structure |
 | `png` | PNG image of structure |
 ### Example Requests
 ```bash
 # Get all compound data by PubChem CID
 curl "https://www.metabolomicsworkbench.org/rest/compound/pubchem_cid/5281365/all/json"
 # Get compound name by registry number
 curl "https://www.metabolomicsworkbench.org/rest/compound/regno/11/name/json"
 # Download structure as PNG
 curl "https://www.metabolomicsworkbench.org/rest/compound/regno/11/png" -o structure.png
 # Get compound by KEGG ID
 curl "https://www.metabolomicsworkbench.org/rest/compound/kegg_id/C00031/all/json"
 # Get compound by molecular formula
 curl "https://www.metabolomicsworkbench.org/rest/compound/formula/C6H12O6/all/json"
 ```
 ## Context 2: Study
 Access metabolomics research study metadata and experimental results.
 ### Input Items
 | Input Item | Description | Example |
 |------------|-------------|---------|
 | `study_id` | Study identifier | ST000001 |
 | `analysis_id` | Analysis identifier | AN000001 |
 | `study_title` | Keywords in study title | diabetes |
 | `institute` | Institute name | UCSD |
 | `last_name` | Investigator last name | Smith |
 | `metabolite_id` | Metabolite registry number | 11 |
 | `refmet_name` | RefMet standardized name | Glucose |
 | `kegg_id` | KEGG compound ID | C00031 |
 ### Output Items
 | Output Item | Description |
 |-------------|-------------|
 | `summary` | Study overview and metadata |
 | `factors` | Experimental factors and design |
 | `analysis` | Analysis methods and parameters |
 | `metabolites` | List of measured metabolites |
 | `data` | Complete experimental data |
 | `mwtab` | Complete study in mwTab format |
 | `number_of_metabolites` | Count of metabolites measured |
 | `species` | Organism species |
 | `disease` | Disease studied |
 | `source` | Sample source/tissue type |
 | `untarg_studies` | Untargeted study information |
 | `untarg_factors` | Untargeted study factors |
 | `untarg_data` | Untargeted experimental data |
 | `datatable` | Formatted data table |
 | `available` | List available studies (use with ST as input_value) |
 ### Example Requests
 ```bash
 # List all publicly available studies
 curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST/available/json"
 # Get study summary
 curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/summary/json"
 # Get experimental data
 curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/data/json"
 # Get study factors
 curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/factors/json"
 # Find studies containing a specific metabolite
 curl "https://www.metabolomicsworkbench.org/rest/study/refmet_name/Tyrosine/summary/json"
 # Search studies by investigator
 curl "https://www.metabolomicsworkbench.org/rest/study/last_name/Smith/summary/json"
 # Download complete study in mwTab format
 curl "https://www.metabolomicsworkbench.org/rest/study/study_id/ST000001/mwtab/txt"
 ```
 ## Context 3: RefMet
 Query the standardized metabolite nomenclature database with hierarchical classification.
 ### Input Items
 | Input Item | Description | Example |
 |------------|-------------|---------|
 | `name` | Metabolite name | glucose |
 | `inchi_key` | InChI Key | WQZGKKKJIJFFOK-GASJEMHNSA-N |
 | `pubchem_cid` | PubChem CID | 5793 |
 | `exactmass` | Exact mass | 180.0634 |
 | `formula` | Molecular formula | C6H12O6 |
 | `super_class` | Super class name | Organic compounds |
 | `main_class` | Main class name | Carbohydrates |
 | `sub_class` | Sub class name | Monosaccharides |
 | `match` | Name matching/standardization | citrate |
 | `refmet_id` | RefMet identifier | 12345 |
 | `all` | Retrieve all RefMet entries | (no value needed) |
 ### Output Items
 | Output Item | Description |
 |-------------|-------------|
 | `all` | All available RefMet data |
 | `name` | Standardized RefMet name |
 | `inchi_key` | InChI Key |
 | `pubchem_cid` | PubChem CID |
 | `exactmass` | Exact mass |
 | `formula` | Molecular formula |
 | `sys_name` | Systematic name |
 | `super_class` | Super class classification |
 | `main_class` | Main class classification |
 | `sub_class` | Sub class classification |
 | `refmet_id` | RefMet identifier |
 ### Example Requests
 ```bash
 # Standardize a metabolite name
 curl "https://www.metabolomicsworkbench.org/rest/refmet/match/citrate/name/json"
 # Get all RefMet data for a metabolite
 curl "https://www.metabolomicsworkbench.org/rest/refmet/name/Glucose/all/json"
 # Query by molecular formula
 curl "https://www.metabolomicsworkbench.org/rest/refmet/formula/C6H12O6/all/json"
 # Get all metabolites in a main class
 curl "https://www.metabolomicsworkbench.org/rest/refmet/main_class/Fatty%20Acids/all/json"
 # Query by exact mass
 curl "https://www.metabolomicsworkbench.org/rest/refmet/exactmass/180.0634/all/json"
 # Download complete RefMet database
 curl "https://www.metabolomicsworkbench.org/rest/refmet/all/json"
 ```
 ### RefMet Classification Hierarchy
 RefMet provides four-level structural resolution:
 1. **Super Class**: Broadest categorization (e.g., "Organic compounds", "Lipids")
 2. **Main Class**: Major biochemical categories (e.g., "Fatty Acids", "Carbohydrates")
 3. **Sub Class**: More specific groupings (e.g., "Monosaccharides", "Amino acids")
 4. **Individual Metabolite**: Specific compound with standardized name
 ## Context 4: MetStat
 Filter studies by analytical and biological parameters using semicolon-delimited format.
 ### Format
 ```
 /metstat/ANALYSIS_TYPE;POLARITY;CHROMATOGRAPHY;SPECIES;SAMPLE_SOURCE;DISEASE;KEGG_ID;REFMET_NAME
 ```
 ### Parameters
 | Position | Parameter | Options |
 |----------|-----------|---------|
 | 1 | Analysis Type | LCMS, GCMS, NMR, MS, ICPMS |
 | 2 | Polarity | POSITIVE, NEGATIVE |
 | 3 | Chromatography | HILIC, RP (Reverse Phase), GC, IC |
 | 4 | Species | Human, Mouse, Rat, etc. |
 | 5 | Sample Source | Blood, Plasma, Serum, Urine, Liver, etc. |
 | 6 | Disease | Diabetes, Cancer, Alzheimer, etc. |
 | 7 | KEGG ID | C00031, etc. |
 | 8 | RefMet Name | Glucose, Tyrosine, etc. |
 **Note**: Use empty positions (consecutive semicolons) to skip parameters. All parameters are optional.
 ### Example Requests
 ```bash
 # Human blood diabetes studies with LC-MS HILIC positive mode
 curl "https://www.metabolomicsworkbench.org/rest/metstat/LCMS;POSITIVE;HILIC;Human;Blood;Diabetes/json"
 # All human blood studies containing tyrosine
 curl "https://www.metabolomicsworkbench.org/rest/metstat/;;;Human;Blood;;;Tyrosine/json"
 # All GC-MS studies regardless of other parameters
 curl "https://www.metabolomicsworkbench.org/rest/metstat/GCMS;;;;;;/json"
 # Mouse liver studies
 curl "https://www.metabolomicsworkbench.org/rest/metstat/;;;Mouse;Liver;;/json"
 # All studies measuring glucose
 curl "https://www.metabolomicsworkbench.org/rest/metstat/;;;;;;;Glucose/json"
 ```
 ## Context 5: Moverz
 Perform mass spectrometry precursor ion searches by m/z value.
 ### Format for m/z Search
 ```
 /moverz/DATABASE/mass/adduct/tolerance/format
 ```
 - **DATABASE**: MB (Metabolomics Workbench), LIPIDS, REFMET
 - **mass**: m/z value (e.g., 635.52)
 - **adduct**: Ion adduct type (see table below)
 - **tolerance**: Mass tolerance in Daltons (e.g., 0.5)
 - **format**: json or txt
 ### Format for Exact Mass Calculation
 ```
 /moverz/exactmass/metabolite_name/adduct/format
 ```
 ### Ion Adduct Types
 #### Positive Mode Adducts
 | Adduct | Description | Example Use |
 |--------|-------------|-------------|
 | `M+H` | Protonated molecule | Most common positive ESI |
 | `M+Na` | Sodium adduct | Common in ESI |
 | `M+K` | Potassium adduct | Less common ESI |
 | `M+NH4` | Ammonium adduct | Common with ammonium salts |
 | `M+2H` | Doubly protonated | Multiply charged ions |
 | `M+H-H2O` | Dehydrated protonated | Loss of water |
 | `M+2Na-H` | Disodium minus hydrogen | Multiple sodium |
 | `M+CH3OH+H` | Methanol adduct | Methanol in mobile phase |
 | `M+ACN+H` | Acetonitrile adduct | ACN in mobile phase |
 | `M+ACN+Na` | ACN + sodium | ACN and sodium |
 #### Negative Mode Adducts
 | Adduct | Description | Example Use |
 |--------|-------------|-------------|
 | `M-H` | Deprotonated molecule | Most common negative ESI |
 | `M+Cl` | Chloride adduct | Chlorinated mobile phases |
 | `M+FA-H` | Formate adduct | Formic acid in mobile phase |
 | `M+HAc-H` | Acetate adduct | Acetic acid in mobile phase |
 | `M-H-H2O` | Deprotonated minus water | Water loss |
 | `M-2H` | Doubly deprotonated | Multiply charged ions |
 | `M+Na-2H` | Sodium minus two protons | Mixed charge states |
 #### Uncharged
 | Adduct | Description | Example Use |
 |--------|-------------|-------------|
 | `M` | Uncharged molecule | Direct ionization methods |
 ### Example Requests
 ```bash
 # Search for compounds with m/z 635.52 (M+H) in MB database
 curl "https://www.metabolomicsworkbench.org/rest/moverz/MB/635.52/M+H/0.5/json"
 # Search in RefMet with negative mode
 curl "https://www.metabolomicsworkbench.org/rest/moverz/REFMET/200.15/M-H/0.3/json"
 # Search lipids database
 curl "https://www.metabolomicsworkbench.org/rest/moverz/LIPIDS/760.59/M+Na/0.5/json"
 # Calculate exact mass for known metabolite
 curl "https://www.metabolomicsworkbench.org/rest/moverz/exactmass/PC(34:1)/M+H/json"
 # High-resolution MS search (tight tolerance)
 curl "https://www.metabolomicsworkbench.org/rest/moverz/MB/180.0634/M+H/0.01/json"
 ```
 ## Context 6: Gene
 Access gene information from the Metabolome Gene/Protein (MGP) database.
 ### Input Items
 | Input Item | Description | Example |
 |------------|-------------|---------|
 | `mgp_id` | MGP database ID | MGP001 |
 | `gene_id` | NCBI Gene ID | 31 |
 | `gene_name` | Full gene name | acetyl-CoA carboxylase |
 | `gene_symbol` | Gene symbol | ACACA |
 | `taxid` | Taxonomy ID | 9606 (human) |
 ### Output Items
 | Output Item | Description |
 |-------------|-------------|
 | `all` | All gene information |
 | `mgp_id` | MGP identifier |
 | `gene_id` | NCBI Gene ID |
 | `gene_name` | Full gene name |
 | `gene_symbol` | Gene symbol |
 | `gene_synonyms` | Alternative names |
 | `alt_names` | Alternative nomenclature |
 | `chromosome` | Chromosomal location |
 | `map_location` | Genetic map position |
 | `summary` | Gene description |
 | `taxid` | Taxonomy ID |
 | `species` | Species short name |
 | `species_long` | Full species name |
 ### Example Requests
 ```bash
 # Get gene information by symbol
 curl "https://www.metabolomicsworkbench.org/rest/gene/gene_symbol/ACACA/all/json"
 # Get gene by NCBI Gene ID
 curl "https://www.metabolomicsworkbench.org/rest/gene/gene_id/31/all/json"
 # Search by gene name
 curl "https://www.metabolomicsworkbench.org/rest/gene/gene_name/carboxylase/summary/json"
 ```
 ## Context 7: Protein
 Retrieve protein sequence and annotation data.
 ### Input Items
 | Input Item | Description | Example |
 |------------|-------------|---------|
 | `mgp_id` | MGP database ID | MGP001 |
 | `gene_id` | NCBI Gene ID | 31 |
 | `gene_name` | Gene name | acetyl-CoA carboxylase |
 | `gene_symbol` | Gene symbol | ACACA |
 | `taxid` | Taxonomy ID | 9606 |
 | `mrna_id` | mRNA identifier | NM_001093.3 |
 | `refseq_id` | RefSeq ID | NP_001084 |
 | `protein_gi` | GenInfo Identifier | 4557237 |
 | `uniprot_id` | UniProt ID | Q13085 |
 | `protein_entry` | Protein entry name | ACACA_HUMAN |
 | `protein_name` | Protein name | Acetyl-CoA carboxylase |
 ### Output Items
 | Output Item | Description |
 |-------------|-------------|
 | `all` | All protein information |
 | `mgp_id` | MGP identifier |
 | `gene_id` | NCBI Gene ID |
 | `gene_name` | Gene name |
 | `gene_symbol` | Gene symbol |
 | `taxid` | Taxonomy ID |
 | `species` | Species short name |
 | `species_long` | Full species name |
 | `mrna_id` | mRNA identifier |
 | `refseq_id` | RefSeq protein ID |
 | `protein_gi` | GenInfo Identifier |
 | `uniprot_id` | UniProt accession |
 | `protein_entry` | Protein entry name |
 | `protein_name` | Full protein name |
 | `seqlength` | Sequence length |
 | `seq` | Amino acid sequence |
 | `is_identical_to` | Identical sequences |
 ### Example Requests
 ```bash
 # Get protein information by UniProt ID
 curl "https://www.metabolomicsworkbench.org/rest/protein/uniprot_id/Q13085/all/json"
 # Get protein by gene symbol
 curl "https://www.metabolomicsworkbench.org/rest/protein/gene_symbol/ACACA/all/json"
 # Get protein sequence
 curl "https://www.metabolomicsworkbench.org/rest/protein/uniprot_id/Q13085/seq/json"
 # Search by RefSeq ID
 curl "https://www.metabolomicsworkbench.org/rest/protein/refseq_id/NP_001084/all/json"
 ```
 ## Error Handling
 The API returns appropriate HTTP status codes:
 - **200 OK**: Successful request
 - **400 Bad Request**: Invalid parameters or malformed request
 - **404 Not Found**: Resource not found
 - **500 Internal Server Error**: Server-side error
 When no results are found, the API typically returns an empty array or object rather than an error code.
 ## Rate Limiting
 As of 2025, the Metabolomics Workbench REST API does not enforce strict rate limits for reasonable use. However, best practices include:
 - Implementing delays between bulk requests
 - Caching frequently accessed reference data
 - Using appropriate batch sizes for large-scale queries
 ## Additional Resources
 - **Interactive REST URL Creator**: https://www.metabolomicsworkbench.org/tools/mw_rest.php
 - **Official API Specification**: https://www.metabolomicsworkbench.org/tools/MWRestAPIv1.1.pdf
 - **Python Library**: mwtab package for Python users
 - **R Package**: metabolomicsWorkbenchR (Bioconductor)
 - **Julia Package**: MetabolomicsWorkbenchAPI.jl
 ## Python Example: Complete Workflow
 ```python
 import requests
 import json
 # 1. Standardize metabolite name using RefMet
 metabolite = "citrate"
 response = requests.get(f'https://www.metabolomicsworkbench.org/rest/refmet/match/{metabolite}/name/json')
 standardized_name = response.json()['name']
 # 2. Search for studies containing this metabolite
 response = requests.get(f'https://www.metabolomicsworkbench.org/rest/study/refmet_name/{standardized_name}/summary/json')
 studies = response.json()
 # 3. Get detailed data from a specific study
 study_id = studies[0]['study_id']
 response = requests.get(f'https://www.metabolomicsworkbench.org/rest/study/study_id/{study_id}/data/json')
 data = response.json()
 # 4. Perform m/z search for compound identification
 mz_value = 180.06
 response = requests.get(f'https://www.metabolomicsworkbench.org/rest/moverz/MB/{mz_value}/M+H/0.5/json')
 matches = response.json()
 # 5. Get compound structure
 regno = matches[0]['regno']
 response = requests.get(f'https://www.metabolomicsworkbench.org/rest/compound/regno/{regno}/png')
 with open('structure.png', 'wb') as f:
    f.write(response.content)
 ```
--- a/scientific-databases/reactome-database/reactome-database/SKILL.md
+++ b/scientific-databases/reactome-database/reactome-database/SKILL.md
@@ -0,0 +1,261 @@
 ---
 name: reactome-database
 description: Work with Reactome pathway database for analyzing biological pathways, performing pathway enrichment analysis, querying molecular interactions, and analyzing gene expression data. This skill should be used when working with biological pathways, performing overrepresentation analysis, mapping gene identifiers to pathways, analyzing gene expression datasets, or exploring disease-related pathways. Supports both direct REST API access and the reactome2py Python package.
 ---
 # Reactome Database
 ## Overview
 This skill enables interaction with Reactome, a free, open-source, curated and peer-reviewed pathway database. Reactome provides comprehensive biological pathway data for research, genome analysis, modeling, and systems biology. The database contains thousands of human pathways, reactions, proteins, small molecules, and drugs, all supported by extensive literature references.
 ## Core Capabilities
 Reactome provides two main API services and a Python client library:
 ### 1. Content Service - Data Retrieval
 Query and retrieve biological pathway data, molecular interactions, and entity information.
 **Common operations:**
 - Retrieve pathway information and hierarchies
 - Query specific entities (proteins, reactions, complexes)
 - Get participating molecules in pathways
 - Access database version and metadata
 - Explore pathway compartments and locations
 **API Base URL:** `https://reactome.org/ContentService`
 ### 2. Analysis Service - Pathway Analysis
 Perform computational analysis on gene lists and expression data.
 **Analysis types:**
 - **Overrepresentation Analysis**: Identify statistically significant pathways from gene/protein lists
 - **Expression Data Analysis**: Analyze gene expression datasets to find relevant pathways
 - **Species Comparison**: Compare pathway data across different organisms
 **API Base URL:** `https://reactome.org/AnalysisService`
 ### 3. reactome2py Python Package
 Python client library that wraps Reactome API calls for easier programmatic access.
 **Installation:**
 ```bash
 pip install reactome2py
 ```
 **Note:** The reactome2py package (version 3.0.0, released January 2021) is functional but not actively maintained. For the most up-to-date functionality, consider using direct REST API calls.
 ## Querying Pathway Data
 ### Using Content Service REST API
 The Content Service uses REST protocol and returns data in JSON or plain text formats.
 **Get database version:**
 ```python
 import requests
 response = requests.get("https://reactome.org/ContentService/data/database/version")
 version = response.text
 print(f"Reactome version: {version}")
 ```
 **Query a specific entity:**
 ```python
 import requests
 entity_id = "R-HSA-69278"  # Example pathway ID
 response = requests.get(f"https://reactome.org/ContentService/data/query/{entity_id}")
 data = response.json()
 ```
 **Get participating molecules in a pathway:**
 ```python
 import requests
 event_id = "R-HSA-69278"
 response = requests.get(
    f"https://reactome.org/ContentService/data/event/{event_id}/participatingPhysicalEntities"
 )
 molecules = response.json()
 ```
 ### Using reactome2py Package
 ```python
 import reactome2py
 from reactome2py import content
 # Query pathway information
 pathway_info = content.query_by_id("R-HSA-69278")
 # Get database version
 version = content.get_database_version()
 ```
 **For detailed API endpoints and parameters**, refer to `references/api_reference.md` in this skill.
 ## Performing Pathway Analysis
 ### Overrepresentation Analysis
 Submit a list of gene/protein identifiers to find enriched pathways.
 **Using REST API:**
 ```python
 import requests
 # Prepare identifier list
 identifiers = ["TP53", "BRCA1", "EGFR", "MYC"]
 data = "\n".join(identifiers)
 # Submit analysis
 response = requests.post(
    "https://reactome.org/AnalysisService/identifiers/",
    headers={"Content-Type": "text/plain"},
    data=data
 )
 result = response.json()
 token = result["summary"]["token"]  # Save token to retrieve results later
 # Access pathways
 for pathway in result["pathways"]:
    print(f"{pathway['stId']}: {pathway['name']} (p-value: {pathway['entities']['pValue']})")
 ```
 **Retrieve analysis by token:**
 ```python
 # Token is valid for 7 days
 response = requests.get(f"https://reactome.org/AnalysisService/token/{token}")
 results = response.json()
 ```
 ### Expression Data Analysis
 Analyze gene expression datasets with quantitative values.
 **Input format (TSV with header starting with #):**
 ```
 #Gene	Sample1	Sample2	Sample3
 TP53	2.5	3.1	2.8
 BRCA1	1.2	1.5	1.3
 EGFR	4.5	4.2	4.8
 ```
 **Submit expression data:**
 ```python
 import requests
 # Read TSV file
 with open("expression_data.tsv", "r") as f:
    data = f.read()
 response = requests.post(
    "https://reactome.org/AnalysisService/identifiers/",
    headers={"Content-Type": "text/plain"},
    data=data
 )
 result = response.json()
 ```
 ### Species Projection
 Project identifiers from other species onto their human equivalents, so that results are reported against human pathways only, using the `/projection/` endpoint:
 ```python
 response = requests.post(
    "https://reactome.org/AnalysisService/identifiers/projection/",
    headers={"Content-Type": "text/plain"},
    data=data
 )
 ```
 ## Visualizing Results
 Analysis results can be visualized in the Reactome Pathway Browser by constructing URLs with the analysis token:
 ```python
 token = result["summary"]["token"]
 pathway_id = "R-HSA-69278"
 url = f"https://reactome.org/PathwayBrowser/#{pathway_id}&DTAB=AN&ANALYSIS={token}"
 print(f"View results: {url}")
 ```
 ## Working with Analysis Tokens
 - Analysis tokens are valid for **7 days**
 - Tokens allow retrieval of previously computed results without re-submission
 - Store tokens to access results across sessions
 - Use `GET /token/{TOKEN}` endpoint to retrieve results
 ## Data Formats and Identifiers
 ### Supported Identifier Types
 Reactome accepts various identifier formats:
 - UniProt accessions (e.g., P04637)
 - Gene symbols (e.g., TP53)
 - Ensembl IDs (e.g., ENSG00000141510)
 - EntrezGene IDs (e.g., 7157)
 - ChEBI IDs for small molecules
 The system automatically detects identifier types.
 ### Input Format Requirements
 **For overrepresentation analysis:**
 - Plain text list of identifiers (one per line)
 - OR single column in TSV format
 **For expression analysis:**
 - TSV format with mandatory header row starting with "#"
 - Column 1: identifiers
 - Columns 2+: numeric expression values
 - Use period (.) as decimal separator
 ### Output Format
 Analysis Service responses are JSON containing:
 - `pathways`: Array of enriched pathways with statistical metrics
 - `summary`: Analysis metadata and token
 - `entities`: Matched and unmapped identifiers
 - Statistical values: pValue, FDR (false discovery rate)
 ## Helper Scripts
 This skill includes `scripts/reactome_query.py`, a helper script for common Reactome operations:
 ```bash
 # Query pathway information
 python scripts/reactome_query.py query R-HSA-69278
 # Perform overrepresentation analysis
 python scripts/reactome_query.py analyze gene_list.txt
 # Get database version
 python scripts/reactome_query.py version
 ```
 ## Additional Resources
 - **API Documentation**: https://reactome.org/dev
 - **User Guide**: https://reactome.org/userguide
 - **Documentation Portal**: https://reactome.org/documentation
 - **Data Downloads**: https://reactome.org/download-data
 - **reactome2py Docs**: https://reactome.github.io/reactome2py/
 For comprehensive API endpoint documentation, see `references/api_reference.md` in this skill.
 ## Current Database Statistics (Version 94, September 2025)
 - 2,825 human pathways
 - 16,002 reactions
 - 11,630 proteins
 - 2,176 small molecules
 - 1,070 drugs
 - 41,373 literature references
--- a/scientific-databases/reactome-database/reactome-database/references/api_reference.md
+++ b/scientific-databases/reactome-database/reactome-database/references/api_reference.md
@@ -0,0 +1,465 @@
 # Reactome API Reference
 This document provides comprehensive reference information for Reactome's REST APIs.
 ## Base URLs
 - **Content Service**: `https://reactome.org/ContentService`
 - **Analysis Service**: `https://reactome.org/AnalysisService`
 ## Content Service API
 The Content Service provides access to Reactome's curated pathway data through REST endpoints.
 ### Database Information
 #### Get Database Version
 ```
 GET /data/database/version
 ```
 **Response:** Plain text containing the database version number
 **Example:**
 ```python
 import requests
 response = requests.get("https://reactome.org/ContentService/data/database/version")
 print(response.text)  # e.g., "94"
 ```
 #### Get Database Name
 ```
 GET /data/database/name
 ```
 **Response:** Plain text containing the database name
 ### Entity Queries
 #### Query Entity by ID
 ```
 GET /data/query/{id}
 ```
 **Parameters:**
 - `id` (path): Stable identifier or database ID (e.g., "R-HSA-69278")
 **Response:** JSON object containing full entity information including:
 - `stId`: Stable identifier
 - `displayName`: Human-readable name
 - `schemaClass`: Entity type (Pathway, Reaction, Complex, etc.)
 - `species`: Array of species information
 - Additional type-specific fields
 **Example:**
 ```python
 import requests
 response = requests.get("https://reactome.org/ContentService/data/query/R-HSA-69278")
 pathway = response.json()
 print(f"Pathway: {pathway['displayName']}")
 print(f"Species: {pathway['species'][0]['displayName']}")
 ```
 #### Query Entity Attribute
 ```
 GET /data/query/{id}/{attribute}
 ```
 **Parameters:**
 - `id` (path): Entity identifier
 - `attribute` (path): Specific attribute name (e.g., "displayName", "compartment")
 **Response:** JSON or plain text depending on attribute type
 **Example:**
 ```python
 response = requests.get("https://reactome.org/ContentService/data/query/R-HSA-69278/displayName")
 name = response.text
 ```
 ### Pathway Queries
 #### Get Pathway Entities
 ```
 GET /data/event/{id}/participatingPhysicalEntities
 ```
 **Parameters:**
 - `id` (path): Pathway or reaction stable identifier
 **Response:** JSON array of physical entities (proteins, complexes, small molecules) participating in the pathway
 **Example:**
 ```python
 response = requests.get(
    "https://reactome.org/ContentService/data/event/R-HSA-69278/participatingPhysicalEntities"
 )
 entities = response.json()
 for entity in entities:
    print(f"{entity['stId']}: {entity['displayName']} ({entity['schemaClass']})")
 ```
 #### Get Contained Events
 ```
 GET /data/pathway/{id}/containedEvents
 ```
 **Parameters:**
 - `id` (path): Pathway stable identifier
 **Response:** JSON array of events (reactions, subpathways) contained within the pathway
 ### Search Queries
 #### Search by Name
 ```
 GET /data/query?name={query}
 ```
 **Parameters:**
 - `name` (query): Search term
 **Response:** JSON array of matching entities
 **Example:**
 ```python
 response = requests.get(
    "https://reactome.org/ContentService/data/query",
    params={"name": "glycolysis"}
 )
 results = response.json()
 ```
 ## Analysis Service API
 The Analysis Service performs pathway enrichment and expression analysis.
 ### Submit Analysis
 #### Submit Identifiers (POST)
 ```
 POST /identifiers/
 POST /identifiers/projection/  # Map to human pathways only
 ```
 **Headers:**
 - `Content-Type: text/plain`
 **Body:**
 - For overrepresentation: Plain text list of identifiers (one per line)
 - For expression analysis: TSV format with header starting with "#"
 **Expression data format:**
 ```
 #Gene	Sample1	Sample2	Sample3
 TP53	2.5	3.1	2.8
 BRCA1	1.2	1.5	1.3
 ```
 **Response:** JSON object containing:
 ```json
 {
  "summary": {
    "token": "MzUxODM3NTQzMDAwMDA1ODI4MA==",
    "type": "OVERREPRESENTATION",
    "species": "9606",
    "sampleName": null,
    "fileName": null,
    "text": true
  },
  "pathways": [
    {
      "stId": "R-HSA-69278",
      "name": "Cell Cycle, Mitotic",
      "species": {
        "name": "Homo sapiens",
        "taxId": "9606"
      },
      "entities": {
        "found": 15,
        "total": 450,
        "pValue": 0.0000234,
        "fdr": 0.00156
      },
      "reactions": {
        "found": 12,
        "total": 342
      }
    }
  ],
  "resourceSummary": [
    {
      "resource": "TOTAL",
      "pathways": 25
    }
  ]
 }
 ```
 **Example:**
 ```python
 import requests
 # Overrepresentation analysis
 identifiers = ["TP53", "BRCA1", "EGFR", "MYC", "CDK1"]
 data = "\n".join(identifiers)
 response = requests.post(
    "https://reactome.org/AnalysisService/identifiers/",
    headers={"Content-Type": "text/plain"},
    data=data
 )
 result = response.json()
 token = result["summary"]["token"]
 # Process pathways
 for pathway in result["pathways"]:
    print(f"Pathway: {pathway['name']}")
    print(f"  Found: {pathway['entities']['found']}/{pathway['entities']['total']}")
    print(f"  p-value: {pathway['entities']['pValue']:.6f}")
    print(f"  FDR: {pathway['entities']['fdr']:.6f}")
 ```
 #### Submit File (Form Upload)
 ```
 POST /identifiers/form/
 ```
 **Content-Type:** `multipart/form-data`
 **Parameters:**
 - `file`: File containing identifiers or expression data
 #### Submit URL
 ```
 POST /identifiers/url/
 ```
 **Parameters:**
 - `url`: URL pointing to data file
 ### Retrieve Analysis Results
 #### Get Results by Token
 ```
 GET /token/{token}
 GET /token/{token}/projection/  # With species projection
 ```
 **Parameters:**
 - `token` (path): Analysis token returned from submission
 **Response:** Same structure as initial analysis response
 **Example:**
 ```python
 token = "MzUxODM3NTQzMDAwMDA1ODI4MA=="
 response = requests.get(f"https://reactome.org/AnalysisService/token/{token}")
 results = response.json()
 ```
 **Note:** Tokens are valid for 7 days
 #### Filter Results
 ```
 GET /token/{token}/filter/pathways?resource={resource}
 ```
 **Parameters:**
 - `token` (path): Analysis token
 - `resource` (query): Resource filter (e.g., "TOTAL", "UNIPROT", "ENSEMBL")
 ### Download Results
 #### Download as CSV
 ```
 GET /download/{token}/pathways/{resource}/result.csv
 ```
 #### Download Mapping
 ```
 GET /download/{token}/entities/found/{resource}/mapping.tsv
 ```
 ## Supported Identifiers
 Reactome automatically detects and processes various identifier types:
 ### Proteins and Genes
 - **UniProt**: P04637
 - **Gene Symbol**: TP53
 - **Ensembl**: ENSG00000141510
 - **EntrezGene**: 7157
 - **RefSeq**: NM_000546
 - **OMIM**: 191170
 ### Small Molecules
 - **ChEBI**: CHEBI:15377
 - **KEGG Compound**: C00031
 - **PubChem**: 702
 ### Other
 - **miRBase**: hsa-miR-21
 - **InterPro**: IPR011616
 ## Response Formats
 ### JSON Objects
 Entity objects contain standardized fields:
 ```json
 {
  "stId": "R-HSA-69278",
  "displayName": "Cell Cycle, Mitotic",
  "schemaClass": "Pathway",
  "species": [
    {
      "dbId": 48887,
      "displayName": "Homo sapiens",
      "taxId": "9606"
    }
  ],
  "isInDisease": false
 }
 ```
 ### TSV Format
 For bulk queries, TSV returns:
 ```
 stId	displayName	schemaClass
 R-HSA-69278	Cell Cycle, Mitotic	Pathway
 R-HSA-69306	DNA Replication	Pathway
 ```
 ## Error Responses
 ### HTTP Status Codes
 - `200`: Success
 - `400`: Bad Request (invalid parameters)
 - `404`: Not Found (invalid ID)
 - `415`: Unsupported Media Type
 - `500`: Internal Server Error
 ### Error JSON Structure
 ```json
 {
  "code": 404,
  "reason": "NOT_FOUND",
  "messages": ["Pathway R-HSA-INVALID not found"]
 }
 ```
 ## Rate Limiting
 Reactome does not currently enforce strict rate limits, but consider:
 - Implementing reasonable delays between requests
 - Using batch operations when available
 - Caching results when appropriate
 - Respecting the 7-day token validity period
 ## Best Practices
 ### 1. Use Analysis Tokens
 Store and reuse analysis tokens to avoid redundant computation:
 ```python
 # Store token after analysis
 token = result["summary"]["token"]
 save_token(token)  # Save to file or database
 # Retrieve results later
 result = requests.get(f"https://reactome.org/AnalysisService/token/{token}").json()
 ```
 ### 2. Batch Queries
 Submit multiple identifiers in a single request rather than individual queries:
 ```python
 # Good: Single batch request
 identifiers = ["TP53", "BRCA1", "EGFR"]
 result = analyze_batch(identifiers)
 # Avoid: Multiple individual requests
 # for gene in genes:
 #     result = analyze_single(gene)  # Don't do this
 ```
 ### 3. Handle Species Appropriately
 Use `/projection/` endpoints to map non-human identifiers to human pathways:
 ```python
 # For mouse genes, project to human pathways
 response = requests.post(
    "https://reactome.org/AnalysisService/identifiers/projection/",
    headers={"Content-Type": "text/plain"},
    data=mouse_genes
 )
 ```
 ### 4. Process Large Result Sets
 For analyses returning many pathways, filter by significance:
 ```python
 significant_pathways = [
    p for p in result["pathways"]
    if p["entities"]["fdr"] < 0.05
 ]
 ```
 ## Integration Examples
 ### Complete Analysis Workflow
 ```python
 import requests
 import json
 def analyze_gene_list(genes, output_file="analysis_results.json"):
    """
    Perform pathway enrichment analysis on a list of genes.

    Submits the identifiers to the Analysis Service, writes the pathways with
    FDR < 0.05 (together with the analysis token and pathway counts) to
    ``output_file`` as JSON, and prints a Pathway Browser link for the first
    significant pathway.

    Args:
        genes: Iterable of identifier strings; they are sent one per line.
        output_file: Path of the JSON summary file to write.

    Returns:
        The full decoded analysis response.

    Raises:
        Exception: If the service answers with a non-200 status code.
        requests.Timeout: If the service does not answer within the timeout.
    """
    # Submit analysis
    data = "\n".join(genes)
    response = requests.post(
        "https://reactome.org/AnalysisService/identifiers/",
        headers={"Content-Type": "text/plain"},
        data=data,
        # Without an explicit timeout the call can block indefinitely on an
        # unresponsive server.
        timeout=30,
    )
    if response.status_code != 200:
        raise Exception(f"Analysis failed: {response.text}")
    result = response.json()
    token = result["summary"]["token"]
    # Filter significant pathways (FDR < 0.05)
    significant = [
        p for p in result["pathways"]
        if p["entities"]["fdr"] < 0.05
    ]
    # Save results
    with open(output_file, "w") as f:
        json.dump({
            "token": token,
            "total_pathways": len(result["pathways"]),
            "significant_pathways": len(significant),
            "pathways": significant
        }, f, indent=2)
    # Generate browser URL for top pathway
    # NOTE(review): assumes the service returns pathways ordered by
    # significance, so the first entry is the "top" one - confirm.
    if significant:
        top_pathway = significant[0]
        url = f"https://reactome.org/PathwayBrowser/#{top_pathway['stId']}&DTAB=AN&ANALYSIS={token}"
        print(f"View top result: {url}")
    return result
 # Usage
 genes = ["TP53", "BRCA1", "BRCA2", "CDK1", "CDK2"]
 result = analyze_gene_list(genes)
 ```
 ## Additional Resources
 - **Interactive API Documentation**: https://reactome.org/dev/content-service
 - **Analysis Service Docs**: https://reactome.org/dev/analysis
 - **User Guide**: https://reactome.org/userguide
 - **Data Downloads**: https://reactome.org/download-data
--- a/scientific-databases/reactome-database/reactome-database/scripts/reactome_query.py
+++ b/scientific-databases/reactome-database/reactome-database/scripts/reactome_query.py
@@ -0,0 +1,286 @@
 #!/usr/bin/env python3
 """
 Reactome Database Query Helper Script
 This script provides convenient command-line access to common Reactome operations.
 Usage:
    python reactome_query.py version
    python reactome_query.py query <pathway_id>
    python reactome_query.py analyze <gene_list_file>
    python reactome_query.py search <term>
    python reactome_query.py entities <pathway_id>
 Examples:
    python reactome_query.py version
    python reactome_query.py query R-HSA-69278
    python reactome_query.py analyze genes.txt
    python reactome_query.py search "cell cycle"
    python reactome_query.py entities R-HSA-69278
 """
 import sys
 import json
 import requests
 from typing import List, Dict, Optional
 class ReactomeClient:
    """Client for interacting with Reactome REST APIs.

    Each method issues a single HTTP request, raises ``requests.HTTPError``
    for error status codes (via ``raise_for_status``) and returns the decoded
    response body.

    Args:
        timeout: Seconds passed to ``requests`` as the per-request timeout, so
            an unresponsive server cannot block the caller indefinitely.
    """

    CONTENT_BASE = "https://reactome.org/ContentService"
    ANALYSIS_BASE = "https://reactome.org/AnalysisService"
    # Default per-request timeout in seconds; `requests` applies no timeout
    # unless one is passed explicitly.
    DEFAULT_TIMEOUT = 30

    def __init__(self, timeout: float = DEFAULT_TIMEOUT):
        self.timeout = timeout

    def get_version(self) -> str:
        """Return the Reactome database version as stripped plain text."""
        response = requests.get(
            f"{self.CONTENT_BASE}/data/database/version",
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.text.strip()

    def query_pathway(self, pathway_id: str) -> Dict:
        """Return the decoded JSON record for the entity with ``pathway_id``."""
        response = requests.get(
            f"{self.CONTENT_BASE}/data/query/{pathway_id}",
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def get_pathway_entities(self, pathway_id: str) -> List[Dict]:
        """Return the physical entities participating in ``pathway_id``."""
        response = requests.get(
            f"{self.CONTENT_BASE}/data/event/{pathway_id}/participatingPhysicalEntities",
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def search_pathways(self, term: str) -> List[Dict]:
        """Search for pathways by name and return the decoded JSON response."""
        # NOTE(review): verify that the Content Service really exposes
        # `/data/query?name=`; its documented full-text search may live under
        # a different path with a different response shape - confirm.
        response = requests.get(
            f"{self.CONTENT_BASE}/data/query",
            params={"name": term},
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def analyze_genes(self, gene_list: List[str]) -> Dict:
        """Submit ``gene_list`` (one identifier per line) for pathway analysis."""
        data = "\n".join(gene_list)
        response = requests.post(
            f"{self.ANALYSIS_BASE}/identifiers/",
            headers={"Content-Type": "text/plain"},
            data=data,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def get_analysis_by_token(self, token: str) -> Dict:
        """Retrieve previously computed analysis results by their token."""
        response = requests.get(
            f"{self.ANALYSIS_BASE}/token/{token}",
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()
 def print_json(data):
    """Write ``data`` to stdout as JSON indented by two spaces."""
    rendered = json.dumps(data, indent=2)
    print(rendered)
 def command_version():
    """Fetch the Reactome database version and print it.

    If the request fails (HTTP error status, connection problem or timeout)
    an ``Error: ...`` line is printed and the process exits with status 1
    instead of ending in a traceback.
    """
    client = ReactomeClient()
    try:
        version = client.get_version()
    except requests.RequestException as e:
        # RequestException is the base class of HTTPError, ConnectionError
        # and Timeout, so every transport-level failure is reported here.
        print(f"Error: {e}")
        sys.exit(1)
    print(f"Reactome Database Version: {version}")
 def command_query(pathway_id: str):
    """Query one entity by identifier and print a summary plus the raw JSON.

    Args:
        pathway_id: Identifier passed to ``ReactomeClient.query_pathway``.

    On failure an ``Error: ...`` line is printed and the process exits with
    status 1. A 404 response is reported as "not found".
    """
    client = ReactomeClient()
    try:
        pathway = client.query_pathway(pathway_id)
    except requests.HTTPError as e:
        if e.response is not None and e.response.status_code == 404:
            print(f"Error: Pathway '{pathway_id}' not found")
        else:
            print(f"Error: {e}")
        sys.exit(1)
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection failures and timeouts;
        # ValueError covers a response body that is not valid JSON.
        print(f"Error: {e}")
        sys.exit(1)
    # Use .get() so a record lacking one of these fields is still displayed
    # instead of aborting with KeyError.
    print(f"Pathway: {pathway.get('displayName', 'N/A')}")
    print(f"ID: {pathway.get('stId', 'N/A')}")
    print(f"Type: {pathway.get('schemaClass', 'N/A')}")
    if pathway.get('species'):
        species = pathway['species'][0].get('displayName', 'N/A')
        print(f"Species: {species}")
    if pathway.get('summation'):
        summation = pathway['summation'][0].get('text', '')
        print(f"\nDescription: {summation}")
    print("\nFull JSON response:")
    print_json(pathway)
 def command_entities(pathway_id: str):
    """Print the entities participating in a pathway, grouped by type.

    Args:
        pathway_id: Identifier passed to ``ReactomeClient.get_pathway_entities``.

    At most ten entities are listed per type; the remainder is summarised as
    a count. On failure an ``Error: ...`` line is printed and the process
    exits with status 1. A 404 response is reported as "not found".
    """
    client = ReactomeClient()
    try:
        entities = client.get_pathway_entities(pathway_id)
    except requests.HTTPError as e:
        if e.response is not None and e.response.status_code == 404:
            print(f"Error: Pathway '{pathway_id}' not found")
        else:
            print(f"Error: {e}")
        sys.exit(1)
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection failures and timeouts;
        # ValueError covers a response body that is not valid JSON.
        print(f"Error: {e}")
        sys.exit(1)
    print(f"Entities in pathway {pathway_id}: {len(entities)} total\n")
    # Group by type
    by_type = {}
    for entity in entities:
        entity_type = entity.get('schemaClass', 'Unknown')
        by_type.setdefault(entity_type, []).append(entity)
    # Display by type, in alphabetical order of the type name
    for entity_type in sorted(by_type):
        entities_list = by_type[entity_type]
        print(f"{entity_type} ({len(entities_list)}):")
        for entity in entities_list[:10]:  # Show first 10
            print(f"  - {entity.get('stId', 'N/A')}: {entity.get('displayName', 'N/A')}")
        if len(entities_list) > 10:
            print(f"  ... and {len(entities_list) - 10} more")
        print()
 def command_search(term: str):
    """Search by name and print up to the first twenty matches.

    Args:
        term: Search text passed to ``ReactomeClient.search_pathways``.

    On failure an ``Error: ...`` line is printed and the process exits with
    status 1.
    """
    client = ReactomeClient()
    try:
        results = client.search_pathways(term)
    except (requests.RequestException, ValueError) as e:
        # RequestException covers HTTP error statuses, connection failures
        # and timeouts; ValueError covers a body that is not valid JSON.
        print(f"Error: {e}")
        sys.exit(1)
    print(f"Search results for '{term}': {len(results)} found\n")
    for result in results[:20]:  # Show first 20
        # Use .get() so one incomplete record does not abort the listing.
        print(f"{result.get('stId', 'N/A')}: {result.get('displayName', 'N/A')}")
        if result.get('species'):
            species = result['species'][0].get('displayName', 'N/A')
            print(f"  Species: {species}")
        print(f"  Type: {result.get('schemaClass', 'N/A')}")
        print()
    if len(results) > 20:
        print(f"... and {len(results) - 20} more results")
 def command_analyze(gene_file: str):
    """Run pathway enrichment analysis on the identifiers listed in a file.

    Args:
        gene_file: Path to a text file with one identifier per line; blank
            lines are ignored.

    Prints a summary, the first ten pathways and a Pathway Browser link, then
    saves the full response next to the input as ``<name>_results.json``,
    where ``<name>`` is ``gene_file`` without its extension. The output path
    therefore never equals the input path, whatever the input's extension.

    Prints an ``Error: ...`` line and exits with status 1 when the file
    cannot be read, contains no identifiers, or the request fails.
    """
    import os  # local import: `os` is not imported at module level

    # Read gene list
    try:
        with open(gene_file, 'r') as f:
            genes = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Error: File '{gene_file}' not found")
        sys.exit(1)
    except OSError as e:
        # e.g. the path is a directory or is not readable
        print(f"Error: Cannot read '{gene_file}': {e}")
        sys.exit(1)
    if not genes:
        # Nothing to analyse: do not send an empty request to the service.
        print(f"Error: File '{gene_file}' contains no identifiers")
        sys.exit(1)
    print(f"Analyzing {len(genes)} genes...")
    client = ReactomeClient()
    try:
        result = client.analyze_genes(genes)
    except (requests.RequestException, ValueError) as e:
        # RequestException covers HTTP error statuses, connection failures
        # and timeouts; ValueError covers a body that is not valid JSON.
        print(f"Error: {e}")
        sys.exit(1)
    # Display summary
    summary = result['summary']
    print(f"\nAnalysis Type: {summary['type']}")
    print(f"Token: {summary['token']} (valid for 7 days)")
    print(f"Species: {summary.get('species', 'N/A')}")
    # Display pathways
    pathways = result.get('pathways', [])
    print(f"\nEnriched Pathways: {len(pathways)} found")
    # Show significant pathways (FDR < 0.05)
    significant = [p for p in pathways if p['entities']['fdr'] < 0.05]
    print(f"Significant (FDR < 0.05): {len(significant)}\n")
    # Display top 10 pathways
    print("Top 10 Pathways:")
    for i, pathway in enumerate(pathways[:10], 1):
        print(f"\n{i}. {pathway['name']}")
        print(f"   ID: {pathway['stId']}")
        entities = pathway['entities']
        print(f"   Found: {entities['found']}/{entities['total']} entities")
        print(f"   p-value: {entities['pValue']:.6e}")
        print(f"   FDR: {entities['fdr']:.6e}")
    # Generate browser URL for top pathway
    if pathways:
        token = summary['token']
        top_pathway = pathways[0]['stId']
        url = f"https://reactome.org/PathwayBrowser/#{top_pathway}&DTAB=AN&ANALYSIS={token}"
        print("\nView top result in browser:")
        print(url)
    # Save full results. Derive the name from the path without its extension:
    # a plain str.replace('.txt', ...) returns the input path unchanged when
    # it has no '.txt', which would overwrite the gene list itself.
    root, _ext = os.path.splitext(gene_file)
    output_file = f"{root}_results.json"
    with open(output_file, 'w') as f:
        json.dump(result, f, indent=2)
    print(f"\nFull results saved to: {output_file}")
 def print_usage():
    """Show the command-line help, which is this module's docstring."""
    help_text = __doc__
    print(help_text)
 def main():
    """Dispatch the command named in ``sys.argv[1]`` to its handler.

    The command name is matched case-insensitively. A missing command, an
    unknown command or a missing required argument prints a message and
    exits with status 1.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print_usage()
        sys.exit(1)
    command = cli_args[0].lower()
    operands = cli_args[1:]
    # The only command that takes no argument.
    if command == "version":
        command_version()
        return
    # command -> (what is missing, placeholder shown in the usage line)
    required_argument = {
        "query": ("pathway_id", "<pathway_id>"),
        "entities": ("pathway_id", "<pathway_id>"),
        "search": ("search term", "<term>"),
        "analyze": ("gene list file", "<gene_list_file>"),
    }
    if command not in required_argument:
        print(f"Error: Unknown command '{command}'")
        print_usage()
        sys.exit(1)
    if not operands:
        missing, placeholder = required_argument[command]
        print(f"Error: {missing} required")
        print(f"Usage: python reactome_query.py {command} {placeholder}")
        sys.exit(1)
    if command == "query":
        command_query(operands[0])
    elif command == "entities":
        command_entities(operands[0])
    elif command == "search":
        # A multi-word search term may arrive as several arguments.
        command_search(" ".join(operands))
    else:
        command_analyze(operands[0])
 # Run the command-line interface only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == "__main__":
    main()