From 0943c243bb72f4b378c34e99fc29b3e750c052c5 Mon Sep 17 00:00:00 2001 From: Timothy Kassis Date: Wed, 22 Oct 2025 14:59:27 -0700 Subject: [PATCH] Added full support for ToolUniverse. Claude Code can now use the tools it provides as it needs without the MCP being a gateway. --- .claude-plugin/marketplace.json | 3 +- README.md | 13 +- docs/scientific-packages.md | 3 + scientific-packages/tooluniverse/SKILL.md | 290 +++++++++++++++++ .../tooluniverse/references/api_reference.md | 298 ++++++++++++++++++ .../tooluniverse/references/domains.md | 272 ++++++++++++++++ .../tooluniverse/references/installation.md | 89 ++++++ .../references/tool-composition.md | 249 +++++++++++++++ .../tooluniverse/references/tool-discovery.md | 126 ++++++++ .../tooluniverse/references/tool-execution.md | 177 +++++++++++ .../scripts/example_tool_search.py | 91 ++++++ .../tooluniverse/scripts/example_workflow.py | 219 +++++++++++++ 12 files changed, 1823 insertions(+), 7 deletions(-) create mode 100644 scientific-packages/tooluniverse/SKILL.md create mode 100644 scientific-packages/tooluniverse/references/api_reference.md create mode 100644 scientific-packages/tooluniverse/references/domains.md create mode 100644 scientific-packages/tooluniverse/references/installation.md create mode 100644 scientific-packages/tooluniverse/references/tool-composition.md create mode 100644 scientific-packages/tooluniverse/references/tool-discovery.md create mode 100644 scientific-packages/tooluniverse/references/tool-execution.md create mode 100755 scientific-packages/tooluniverse/scripts/example_tool_search.py create mode 100755 scientific-packages/tooluniverse/scripts/example_workflow.py diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 091adbb..6da2f97 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -7,7 +7,7 @@ }, "metadata": { "description": "Claude scientific skills from K-Dense Inc", - "version": "1.52.0" + "version": "1.53.0" }, 
"plugins": [ { @@ -55,6 +55,7 @@ "./scientific-packages/statsmodels", "./scientific-packages/torch_geometric", "./scientific-packages/torchdrug", + "./scientific-packages/tooluniverse", "./scientific-packages/transformers", "./scientific-packages/umap-learn", "./scientific-packages/zarr-python" diff --git a/README.md b/README.md index f04df97..fd16882 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![License: PolyForm Noncommercial 1.0.0](https://img.shields.io/badge/License-PolyForm%20Noncommercial-blue.svg)](LICENSE.md) [![GitHub Stars](https://img.shields.io/github/stars/K-Dense-AI/claude-scientific-skills?style=social)](https://github.com/K-Dense-AI/claude-scientific-skills) -[![Skills](https://img.shields.io/badge/Skills-72%2B-brightgreen.svg)](#what-s-included) +[![Skills](https://img.shields.io/badge/Skills-73%2B-brightgreen.svg)](#what-s-included) [![Workflows](https://img.shields.io/badge/Workflows-122-orange.svg)](#what-s-included) A comprehensive collection of ready-to-use scientific skills for Claude, curated by the K-Dense team. @@ -45,7 +45,7 @@ These skills enable Claude to work with specialized scientific libraries and dat | Category | Count | Description | |----------|-------|-------------| | 📊 **Scientific Databases** | 24 | PubMed, PubChem, UniProt, ChEMBL, COSMIC, AlphaFold DB, and more | -| 🔬 **Scientific Packages** | 42 | BioPython, RDKit, PyTorch, Scanpy, and specialized tools | +| 🔬 **Scientific Packages** | 43 | BioPython, RDKit, PyTorch, Scanpy, and specialized tools | | 🔌 **Scientific Integrations** | 6 | Benchling, DNAnexus, Opentrons, LabArchives, LatchBio, OMERO | | 🎯 **Context Initialization** | 1 | Auto-invoked skill to ensure Claude uses existing skills effectively | | 📚 **Documented Workflows** | 122 | Ready-to-use examples and reference materials | @@ -78,7 +78,7 @@ Then, to install a specific set of skills: 2. Select **claude-scientific-skills** 3. 
Choose from: - `scientific-databases` - Access to 24 scientific databases - - `scientific-packages` - 40 specialized Python packages + - `scientific-packages` - 43 specialized Python packages - `scientific-thinking` - Analysis tools and document processing - `scientific-integrations` - Lab automation and platform integrations - `scientific-context-initialization` - Ensures Claude searches for and uses existing skills @@ -247,7 +247,7 @@ network visualizations. Finally, search GEO for similar expression patterns acro --- ### 🔬 Scientific Packages -**42 specialized Python packages** organized by domain. +**43 specialized Python packages** organized by domain. 📖 **[Full Package Documentation →](docs/scientific-packages.md)** @@ -296,11 +296,12 @@ network visualizations. Finally, search GEO for similar expression patterns acro
-Additional Packages (5 packages) +Additional Packages (6 packages) - BIOMNI (Multi-omics), ETE Toolkit (Phylogenetics) - Paper-2-Web (Academic paper dissemination and presentation) -- scikit-bio (Sequence analysis), Zarr (Array storage) +- scikit-bio (Sequence analysis), ToolUniverse (600+ scientific tool ecosystem) +- Zarr (Array storage)
diff --git a/docs/scientific-packages.md b/docs/scientific-packages.md index a1ae182..10fb7e4 100644 --- a/docs/scientific-packages.md +++ b/docs/scientific-packages.md @@ -62,4 +62,7 @@ ## Scientific Communication & Publishing - **Paper-2-Web** - Autonomous pipeline for transforming academic papers into multiple promotional formats using the Paper2All system. Converts LaTeX or PDF papers into: (1) Paper2Web - interactive, layout-aware academic homepages with responsive design, interactive figures, and mobile support; (2) Paper2Video - professional presentation videos with slides, narration, cursor movements, and optional talking-head generation using Hallo2; (3) Paper2Poster - print-ready conference posters with custom dimensions, professional layouts, and institution branding. Supports GPT-4/GPT-4.1 models, batch processing, QR code generation, multi-language content, and quality assessment metrics. Use cases: conference materials, video abstracts, preprint enhancement, research promotion, poster sessions, and academic website creation +## Tool Discovery & Research Platforms +- **ToolUniverse** - Unified ecosystem providing standardized access to 600+ scientific tools, models, datasets, and APIs across bioinformatics, cheminformatics, genomics, structural biology, and proteomics. Enables AI agents to function as research scientists through: (1) Tool Discovery - natural language, semantic, and keyword-based search for finding relevant scientific tools (Tool_Finder, Tool_Finder_LLM, Tool_Finder_Keyword); (2) Tool Execution - standardized AI-Tool Interaction Protocol for running tools with consistent interfaces; (3) Tool Composition - sequential and parallel workflow chaining for multi-step research pipelines; (4) Model Context Protocol (MCP) integration for Claude Desktop/Code. 
Supports drug discovery workflows (disease→targets→structures→screening→candidates), genomics analysis (expression→differential analysis→pathways), clinical genomics (variants→annotation→pathogenicity→disease associations), and cross-domain research. Use cases: accessing scientific databases (OpenTargets, PubChem, UniProt, PDB, ChEMBL, KEGG), protein structure prediction (AlphaFold), molecular docking, pathway enrichment, variant annotation, literature searches, and automated scientific workflows + diff --git a/scientific-packages/tooluniverse/SKILL.md b/scientific-packages/tooluniverse/SKILL.md new file mode 100644 index 0000000..110dfd6 --- /dev/null +++ b/scientific-packages/tooluniverse/SKILL.md @@ -0,0 +1,290 @@ +--- +name: tooluniverse +description: Use this skill when working with scientific research tools and workflows across bioinformatics, cheminformatics, genomics, structural biology, proteomics, and drug discovery. This skill provides access to 600+ scientific tools including machine learning models, datasets, APIs, and analysis packages. Use when searching for scientific tools, executing computational biology workflows, composing multi-step research pipelines, accessing databases like OpenTargets/PubChem/UniProt/PDB/ChEMBL, performing tool discovery for research tasks, or integrating scientific computational resources into LLM workflows. +--- + +# ToolUniverse + +## Overview + +ToolUniverse is a unified ecosystem that enables AI agents to function as research scientists by providing standardized access to 600+ scientific resources. Use this skill to discover, execute, and compose scientific tools across multiple research domains including bioinformatics, cheminformatics, genomics, structural biology, proteomics, and drug discovery. 
+ +**Key Capabilities:** +- Access 600+ scientific tools, models, datasets, and APIs +- Discover tools using natural language, semantic search, or keywords +- Execute tools through standardized AI-Tool Interaction Protocol +- Compose multi-step workflows for complex research problems +- Integration with Claude Desktop/Code via Model Context Protocol (MCP) + +## When to Use This Skill + +Use this skill when: +- Searching for scientific tools by function or domain (e.g., "find protein structure prediction tools") +- Executing computational biology workflows (e.g., disease target identification, drug discovery, genomics analysis) +- Accessing scientific databases (OpenTargets, PubChem, UniProt, PDB, ChEMBL, KEGG, etc.) +- Composing multi-step research pipelines (e.g., target discovery → structure prediction → virtual screening) +- Working with bioinformatics, cheminformatics, or structural biology tasks +- Analyzing gene expression, protein sequences, molecular structures, or clinical data +- Performing literature searches, pathway enrichment, or variant annotation +- Building automated scientific research workflows + +## Quick Start + +### Basic Setup +```python +from tooluniverse import ToolUniverse + +# Initialize and load tools +tu = ToolUniverse() +tu.load_tools() # Loads 600+ scientific tools + +# Discover tools +tools = tu.run({ + "name": "Tool_Finder_Keyword", + "arguments": { + "description": "disease target associations", + "limit": 10 + } +}) + +# Execute a tool +result = tu.run({ + "name": "OpenTargets_get_associated_targets_by_disease_efoId", + "arguments": {"efoId": "EFO_0000537"} # Hypertension +}) +``` + +### Model Context Protocol (MCP) +For Claude Desktop/Code integration: +```bash +tooluniverse-smcp +``` + +## Core Workflows + +### 1. 
Tool Discovery + +Find relevant tools for your research task: + +**Three discovery methods:** +- `Tool_Finder` - Embedding-based semantic search (requires GPU) +- `Tool_Finder_LLM` - LLM-based semantic search (no GPU required) +- `Tool_Finder_Keyword` - Fast keyword search + +**Example:** +```python +# Search by natural language description +tools = tu.run({ + "name": "Tool_Finder_LLM", + "arguments": { + "description": "Find tools for RNA sequencing differential expression analysis", + "limit": 10 + } +}) + +# Review available tools +for tool in tools: + print(f"{tool['name']}: {tool['description']}") +``` + +**See `references/tool-discovery.md` for:** +- Detailed discovery methods and search strategies +- Domain-specific keyword suggestions +- Best practices for finding tools + +### 2. Tool Execution + +Execute individual tools through the standardized interface: + +**Example:** +```python +# Execute disease-target lookup +targets = tu.run({ + "name": "OpenTargets_get_associated_targets_by_disease_efoId", + "arguments": {"efoId": "EFO_0000616"} # Breast cancer +}) + +# Get protein structure +structure = tu.run({ + "name": "AlphaFold_get_structure", + "arguments": {"uniprot_id": "P12345"} +}) + +# Calculate molecular properties +properties = tu.run({ + "name": "RDKit_calculate_descriptors", + "arguments": {"smiles": "CCO"} # Ethanol +}) +``` + +**See `references/tool-execution.md` for:** +- Real-world execution examples across domains +- Tool parameter handling and validation +- Result processing and error handling +- Best practices for production use + +### 3. Tool Composition and Workflows + +Compose multiple tools for complex research workflows: + +**Drug Discovery Example:** +```python +# 1. Find disease targets +targets = tu.run({ + "name": "OpenTargets_get_associated_targets_by_disease_efoId", + "arguments": {"efoId": "EFO_0000616"} +}) + +# 2. 
Get protein structures +structures = [] +for target in targets[:5]: + structure = tu.run({ + "name": "AlphaFold_get_structure", + "arguments": {"uniprot_id": target['uniprot_id']} + }) + structures.append(structure) + +# 3. Screen compounds +hits = [] +for structure in structures: + compounds = tu.run({ + "name": "ZINC_virtual_screening", + "arguments": { + "structure": structure, + "library": "lead-like", + "top_n": 100 + } + }) + hits.extend(compounds) + +# 4. Evaluate drug-likeness +drug_candidates = [] +for compound in hits: + props = tu.run({ + "name": "RDKit_calculate_drug_properties", + "arguments": {"smiles": compound['smiles']} + }) + if props['lipinski_pass']: + drug_candidates.append(compound) +``` + +**See `references/tool-composition.md` for:** +- Complete workflow examples (drug discovery, genomics, clinical) +- Sequential and parallel tool composition patterns +- Output processing hooks +- Workflow best practices + +## Scientific Domains + +ToolUniverse supports 600+ tools across major scientific domains: + +**Bioinformatics:** +- Sequence analysis, alignment, BLAST +- Gene expression (RNA-seq, DESeq2) +- Pathway enrichment (KEGG, Reactome, GO) +- Variant annotation (VEP, ClinVar) + +**Cheminformatics:** +- Molecular descriptors and fingerprints +- Drug discovery and virtual screening +- ADMET prediction and drug-likeness +- Chemical databases (PubChem, ChEMBL, ZINC) + +**Structural Biology:** +- Protein structure prediction (AlphaFold) +- Structure retrieval (PDB) +- Binding site detection +- Protein-protein interactions + +**Proteomics:** +- Mass spectrometry analysis +- Protein databases (UniProt, STRING) +- Post-translational modifications + +**Genomics:** +- Genome assembly and annotation +- Copy number variation +- Clinical genomics workflows + +**Medical/Clinical:** +- Disease databases (OpenTargets, OMIM) +- Clinical trials and FDA data +- Variant classification + +**See `references/domains.md` for:** +- Complete domain categorization +- Tool 
examples by discipline +- Cross-domain applications +- Search strategies by domain + +## Reference Documentation + +This skill includes comprehensive reference files that provide detailed information for specific aspects: + +- **`references/installation.md`** - Installation, setup, MCP configuration, platform integration +- **`references/tool-discovery.md`** - Discovery methods, search strategies, listing tools +- **`references/tool-execution.md`** - Execution patterns, real-world examples, error handling +- **`references/tool-composition.md`** - Workflow composition, complex pipelines, parallel execution +- **`references/domains.md`** - Tool categorization by domain, use case examples +- **`references/api_reference.md`** - Python API documentation, hooks, protocols + +**Workflow:** When helping with specific tasks, reference the appropriate file for detailed instructions. For example, if searching for tools, consult `references/tool-discovery.md` for search strategies. + +## Example Scripts + +Two executable example scripts demonstrate common use cases: + +**`scripts/example_tool_search.py`** - Demonstrates all three discovery methods: +- Keyword-based search +- LLM-based search +- Domain-specific searches +- Getting detailed tool information + +**`scripts/example_workflow.py`** - Complete workflow examples: +- Drug discovery pipeline (disease → targets → structures → screening → candidates) +- Genomics analysis (expression data → differential analysis → pathways) + +Run examples to understand typical usage patterns and workflow composition. + +## Best Practices + +1. **Tool Discovery:** + - Start with broad searches, then refine based on results + - Use `Tool_Finder_Keyword` for fast searches with known terms + - Use `Tool_Finder_LLM` for complex semantic queries + - Set appropriate `limit` parameter (default: 10) + +2. 
**Tool Execution:** + - Always verify tool parameters before execution + - Implement error handling for production workflows + - Validate input data formats (SMILES, UniProt IDs, gene symbols) + - Check result types and structures + +3. **Workflow Composition:** + - Test each step individually before composing full workflows + - Implement checkpointing for long workflows + - Consider rate limits for remote APIs + - Use parallel execution when tools are independent + +4. **Integration:** + - Initialize ToolUniverse once and reuse the instance + - Call `load_tools()` once at startup + - Cache frequently used tool information + - Enable logging for debugging + +## Key Terminology + +- **Tool**: A scientific resource (model, dataset, API, package) accessible through ToolUniverse +- **Tool Discovery**: Finding relevant tools using search methods (Finder, LLM, Keyword) +- **Tool Execution**: Running a tool with specific arguments via `tu.run()` +- **Tool Composition**: Chaining multiple tools for multi-step workflows +- **MCP**: Model Context Protocol for integration with Claude Desktop/Code +- **AI-Tool Interaction Protocol**: Standardized interface for LLM-tool communication + +## Resources + +- **Official Website**: https://aiscientist.tools +- **GitHub**: https://github.com/mims-harvard/ToolUniverse +- **Documentation**: https://zitniklab.hms.harvard.edu/ToolUniverse/ +- **Installation**: `uv pip install tooluniverse` +- **MCP Server**: `tooluniverse-smcp` diff --git a/scientific-packages/tooluniverse/references/api_reference.md b/scientific-packages/tooluniverse/references/api_reference.md new file mode 100644 index 0000000..b86655a --- /dev/null +++ b/scientific-packages/tooluniverse/references/api_reference.md @@ -0,0 +1,298 @@ +# ToolUniverse Python API Reference + +## Core Classes + +### ToolUniverse + +Main class for interacting with the ToolUniverse ecosystem. 
+ +```python +from tooluniverse import ToolUniverse + +tu = ToolUniverse() +``` + +#### Methods + +##### `load_tools()` +Load all available tools into the ToolUniverse instance. + +```python +tu.load_tools() +``` + +**Returns:** None + +**Side effects:** Loads 600+ tools into memory for discovery and execution. + +--- + +##### `run(tool_config)` +Execute a tool with specified arguments. + +**Parameters:** +- `tool_config` (dict): Configuration dictionary with keys: + - `name` (str): Tool name to execute + - `arguments` (dict): Tool-specific arguments + +**Returns:** Tool-specific output (dict, list, str, or other types) + +**Example:** +```python +result = tu.run({ + "name": "OpenTargets_get_associated_targets_by_disease_efoId", + "arguments": { + "efoId": "EFO_0000537" + } +}) +``` + +--- + +##### `list_tools(limit=None)` +List all available tools or a subset. + +**Parameters:** +- `limit` (int, optional): Maximum number of tools to return. If None, returns all tools. + +**Returns:** List of tool dictionaries + +**Example:** +```python +# List all tools +all_tools = tu.list_tools() + +# List first 20 tools +tools = tu.list_tools(limit=20) +``` + +--- + +##### `get_tool_info(tool_name)` +Get detailed information about a specific tool. + +**Parameters:** +- `tool_name` (str): Name of the tool + +**Returns:** Dictionary containing tool metadata, parameters, and documentation + +**Example:** +```python +info = tu.get_tool_info("AlphaFold_get_structure") +print(info['description']) +print(info['parameters']) +``` + +--- + +## Built-in Discovery Tools + +These are special tools that help find other tools in the ecosystem. + +### Tool_Finder + +Embedding-based semantic search for tools. Requires GPU. 
+ +```python +tools = tu.run({ + "name": "Tool_Finder", + "arguments": { + "description": "protein structure prediction", + "limit": 10 + } +}) +``` + +**Parameters:** +- `description` (str): Natural language description of desired functionality +- `limit` (int): Maximum number of tools to return + +**Returns:** List of relevant tools with similarity scores + +--- + +### Tool_Finder_LLM + +LLM-based semantic search for tools. No GPU required. + +```python +tools = tu.run({ + "name": "Tool_Finder_LLM", + "arguments": { + "description": "Find tools for RNA sequencing analysis", + "limit": 10 + } +}) +``` + +**Parameters:** +- `description` (str): Natural language query +- `limit` (int): Maximum number of tools to return + +**Returns:** List of relevant tools + +--- + +### Tool_Finder_Keyword + +Fast keyword-based search through tool names and descriptions. + +```python +tools = tu.run({ + "name": "Tool_Finder_Keyword", + "arguments": { + "description": "pathway enrichment", + "limit": 10 + } +}) +``` + +**Parameters:** +- `description` (str): Keywords to search for +- `limit` (int): Maximum number of tools to return + +**Returns:** List of matching tools + +--- + +## Tool Output Hooks + +Post-processing hooks for tool results. + +### Summarization Hook +```python +result = tu.run({ + "name": "some_tool", + "arguments": {"param": "value"} +}, +hooks={ + "summarize": { + "format": "brief" # or "detailed" + } +}) +``` + +### File Saving Hook +```python +result = tu.run({ + "name": "some_tool", + "arguments": {"param": "value"} +}, +hooks={ + "save_to_file": { + "filename": "output.json", + "format": "json" # or "csv", "txt" + } +}) +``` + +--- + +## Model Context Protocol (MCP) + +### Starting MCP Server + +Command-line interface: +```bash +tooluniverse-smcp +``` + +This launches an MCP server that exposes all ToolUniverse tools through the Model Context Protocol. 
+ +**Configuration:** +- Default port: Automatically assigned +- Protocol: MCP standard +- Authentication: None required for local use + +--- + +## Integration Modules + +### OpenRouter Integration + +Access 100+ LLMs through OpenRouter API: + +```python +from tooluniverse import OpenRouterClient + +client = OpenRouterClient(api_key="your_key") +response = client.chat("Analyze this protein sequence", model="anthropic/claude-3-5-sonnet") +``` + +--- + +## AI-Tool Interaction Protocol + +ToolUniverse uses a standardized protocol for LLM-tool communication: + +**Request Format:** +```json +{ + "name": "tool_name", + "arguments": { + "param1": "value1", + "param2": "value2" + } +} +``` + +**Response Format:** +```json +{ + "status": "success", + "data": { ... }, + "metadata": { + "execution_time": 1.23, + "tool_version": "1.0.0" + } +} +``` + +--- + +## Error Handling + +```python +try: + result = tu.run({ + "name": "some_tool", + "arguments": {"param": "value"} + }) +except ToolNotFoundError as e: + print(f"Tool not found: {e}") +except InvalidArgumentError as e: + print(f"Invalid arguments: {e}") +except ToolExecutionError as e: + print(f"Execution failed: {e}") +``` + +--- + +## Type Hints + +```python +from typing import Dict, List, Any, Optional + +def run_tool( + tu: ToolUniverse, + tool_name: str, + arguments: Dict[str, Any] +) -> Any: + """Execute a tool with type-safe arguments.""" + return tu.run({ + "name": tool_name, + "arguments": arguments + }) +``` + +--- + +## Best Practices + +1. **Initialize Once**: Create a single ToolUniverse instance and reuse it +2. **Load Tools Early**: Call `load_tools()` once at startup +3. **Cache Tool Info**: Store frequently used tool information +4. **Error Handling**: Always wrap tool execution in try-except blocks +5. **Type Validation**: Validate argument types before execution +6. **Resource Management**: Consider rate limits for remote APIs +7. 
**Logging**: Enable logging for production environments diff --git a/scientific-packages/tooluniverse/references/domains.md b/scientific-packages/tooluniverse/references/domains.md new file mode 100644 index 0000000..ad0ef03 --- /dev/null +++ b/scientific-packages/tooluniverse/references/domains.md @@ -0,0 +1,272 @@ +# ToolUniverse Tool Domains and Categories + +## Overview + +ToolUniverse integrates 600+ scientific tools across multiple research domains. This document categorizes tools by scientific discipline and use case. + +## Major Scientific Domains + +### Bioinformatics + +**Sequence Analysis:** +- Sequence alignment and comparison +- Multiple sequence alignment (MSA) +- BLAST and homology searches +- Motif finding and pattern matching + +**Genomics:** +- Gene expression analysis +- RNA-seq data processing +- Variant calling and annotation +- Genome assembly and annotation +- Copy number variation analysis + +**Functional Analysis:** +- Gene Ontology (GO) enrichment +- Pathway analysis (KEGG, Reactome) +- Gene set enrichment analysis (GSEA) +- Protein domain analysis + +**Example Tools:** +- GEO data download and analysis +- DESeq2 differential expression +- KEGG pathway enrichment +- UniProt sequence retrieval +- VEP variant annotation + +### Cheminformatics + +**Molecular Descriptors:** +- Chemical property calculation +- Molecular fingerprints +- SMILES/InChI conversion +- 3D conformer generation + +**Drug Discovery:** +- Virtual screening +- Molecular docking +- ADMET prediction +- Drug-likeness assessment (Lipinski's Rule of Five) +- Toxicity prediction + +**Chemical Databases:** +- PubChem compound search +- ChEMBL bioactivity data +- ZINC compound libraries +- DrugBank drug information + +**Example Tools:** +- RDKit molecular descriptors +- AutoDock molecular docking +- ZINC library screening +- ChEMBL target-compound associations + +### Structural Biology + +**Protein Structure:** +- AlphaFold structure prediction +- PDB structure retrieval +- 
Structure alignment and comparison +- Binding site prediction +- Protein-protein interaction prediction + +**Structure Analysis:** +- Secondary structure prediction +- Solvent accessibility calculation +- Structure quality assessment +- Ramachandran plot analysis + +**Example Tools:** +- AlphaFold structure prediction +- PDB structure download +- Fpocket binding site detection +- DSSP secondary structure assignment + +### Proteomics + +**Protein Analysis:** +- Mass spectrometry data analysis +- Protein identification +- Post-translational modification analysis +- Protein quantification + +**Protein Databases:** +- UniProt protein information +- STRING protein interactions +- IntAct interaction databases + +**Example Tools:** +- UniProt data retrieval +- STRING interaction networks +- Mass spec peak analysis + +### Machine Learning + +**Model Types:** +- Classification models +- Regression models +- Clustering algorithms +- Neural networks +- Deep learning models + +**Applications:** +- Predictive modeling +- Feature selection +- Dimensionality reduction +- Pattern recognition +- Biomarker discovery + +**Example Tools:** +- Scikit-learn models +- TensorFlow/PyTorch models +- XGBoost predictors +- Random forest classifiers + +### Medical/Clinical + +**Disease Databases:** +- OpenTargets disease-target associations +- OMIM genetic disorders +- ClinVar pathogenic variants +- DisGeNET disease-gene associations + +**Clinical Data:** +- Electronic health records analysis +- Clinical trial data +- Diagnostic tools +- Treatment recommendations + +**Example Tools:** +- OpenTargets disease queries +- ClinVar variant classification +- OMIM disease lookup +- FDA drug approval data + +### Neuroscience + +**Brain Imaging:** +- fMRI data analysis +- Brain atlas mapping +- Connectivity analysis +- Neuroimaging pipelines + +**Neural Data:** +- Electrophysiology analysis +- Spike train analysis +- Neural network simulation + +### Image Processing + +**Biomedical Imaging:** +- 
Microscopy image analysis +- Cell segmentation +- Object detection +- Image enhancement +- Feature extraction + +**Image Analysis:** +- ImageJ/Fiji tools +- CellProfiler pipelines +- Deep learning segmentation + +### Systems Biology + +**Network Analysis:** +- Biological network construction +- Network topology analysis +- Module identification +- Hub gene identification + +**Modeling:** +- Systems biology models +- Metabolic network modeling +- Signaling pathway simulation + +## Tool Categories by Use Case + +### Literature and Knowledge + +**Literature Search:** +- PubMed article search +- Article summarization +- Citation analysis +- Knowledge extraction + +**Knowledge Bases:** +- Ontology queries (GO, DO, HPO) +- Database cross-referencing +- Entity recognition + +### Data Access + +**Public Repositories:** +- GEO (Gene Expression Omnibus) +- SRA (Sequence Read Archive) +- PDB (Protein Data Bank) +- ChEMBL (Bioactivity database) + +**API Access:** +- RESTful API clients +- Database query tools +- Batch data retrieval + +### Visualization + +**Plot Generation:** +- Heatmaps +- Volcano plots +- Manhattan plots +- Network graphs +- Molecular structures + +### Utilities + +**Data Processing:** +- Format conversion +- Data normalization +- Statistical analysis +- Quality control + +**Workflow Management:** +- Pipeline construction +- Task orchestration +- Result aggregation + +## Finding Tools by Domain + +Use domain-specific keywords with Tool_Finder: + +```python +# Bioinformatics +tools = tu.run({ + "name": "Tool_Finder_Keyword", + "arguments": {"description": "RNA-seq genomics", "limit": 10} +}) + +# Cheminformatics +tools = tu.run({ + "name": "Tool_Finder_Keyword", + "arguments": {"description": "molecular docking SMILES", "limit": 10} +}) + +# Structural biology +tools = tu.run({ + "name": "Tool_Finder_Keyword", + "arguments": {"description": "protein structure PDB", "limit": 10} +}) + +# Clinical +tools = tu.run({ + "name": "Tool_Finder_Keyword", + 
"arguments": {"description": "disease clinical variants", "limit": 10} +}) +``` + +## Cross-Domain Applications + +Many scientific problems require tools from multiple domains: + +- **Precision Medicine**: Genomics + Clinical + Proteomics +- **Drug Discovery**: Cheminformatics + Structural Biology + Machine Learning +- **Cancer Research**: Genomics + Pathways + Literature +- **Neurodegenerative Diseases**: Genomics + Proteomics + Imaging diff --git a/scientific-packages/tooluniverse/references/installation.md b/scientific-packages/tooluniverse/references/installation.md new file mode 100644 index 0000000..17063a3 --- /dev/null +++ b/scientific-packages/tooluniverse/references/installation.md @@ -0,0 +1,89 @@ +# ToolUniverse Installation and Setup + +## Installation + +### Using uv (Recommended) +```bash +uv pip install tooluniverse +``` + +### Using pip +```bash +pip install tooluniverse +``` + +## Basic Setup + +### Python SDK +```python +from tooluniverse import ToolUniverse + +# Initialize ToolUniverse +tu = ToolUniverse() + +# Load all available tools (600+ scientific tools) +tu.load_tools() +``` + +## Model Context Protocol (MCP) Setup + +ToolUniverse provides native MCP support for integration with Claude Desktop, Claude Code, and other MCP-compatible systems. + +### Starting MCP Server +```bash +tooluniverse-smcp +``` + +This launches an MCP server that exposes ToolUniverse's 600+ tools through the Model Context Protocol. + +### Claude Desktop Integration + +Add to Claude Desktop configuration (~/.config/Claude/claude_desktop_config.json): +```json +{ + "mcpServers": { + "tooluniverse": { + "command": "tooluniverse-smcp" + } + } +} +``` + +### Claude Code Integration + +ToolUniverse MCP server works natively with Claude Code through the MCP protocol. 
+ +## Integration with Other Platforms + +### OpenRouter Integration +ToolUniverse integrates with OpenRouter for access to 100+ LLMs through a single API: +- GPT-5, Claude, Gemini +- Qwen, Deepseek +- Open-source models + +### Supported LLM Platforms +- Claude Desktop and Claude Code +- Gemini CLI +- Qwen Code +- ChatGPT API +- GPT Codex CLI + +## Requirements + +- Python 3.8+ +- For Tool_Finder (embedding-based search): GPU recommended +- For Tool_Finder_LLM: No GPU required (uses LLM-based search) + +## Verification + +Test installation: +```python +from tooluniverse import ToolUniverse + +tu = ToolUniverse() +tu.load_tools() + +# List first 5 tools to verify setup +tools = tu.list_tools(limit=5) +print(f"Loaded {len(tools)} tools successfully") +``` diff --git a/scientific-packages/tooluniverse/references/tool-composition.md b/scientific-packages/tooluniverse/references/tool-composition.md new file mode 100644 index 0000000..f6beee1 --- /dev/null +++ b/scientific-packages/tooluniverse/references/tool-composition.md @@ -0,0 +1,249 @@ +# Tool Composition and Workflows in ToolUniverse + +## Overview + +ToolUniverse enables chaining multiple tools together to create complex scientific workflows. Tools can be composed sequentially or in parallel to solve multi-step research problems. + +## Sequential Tool Composition + +Execute tools in sequence where each tool's output feeds into the next tool. 
+ +### Basic Pattern +```python +from tooluniverse import ToolUniverse + +tu = ToolUniverse() +tu.load_tools() + +# Step 1: Get disease-associated targets +targets = tu.run({ + "name": "OpenTargets_get_associated_targets_by_disease_efoId", + "arguments": {"efoId": "EFO_0000537"} # Hypertension +}) + +# Step 2: For each target, get protein structure +structures = [] +for target in targets[:5]: # First 5 targets + structure = tu.run({ + "name": "AlphaFold_get_structure", + "arguments": {"uniprot_id": target['uniprot_id']} + }) + structures.append(structure) + +# Step 3: Analyze structures +for structure in structures: + analysis = tu.run({ + "name": "ProteinAnalysis_calculate_properties", + "arguments": {"structure": structure} + }) +``` + +## Complex Workflow Examples + +### Drug Discovery Workflow + +Complete workflow from disease to drug candidates: + +```python +# 1. Find disease-associated targets +print("Finding disease targets...") +targets = tu.run({ + "name": "OpenTargets_get_associated_targets_by_disease_efoId", + "arguments": {"efoId": "EFO_0000616"} # Breast cancer +}) + +# 2. Get target protein sequences +print("Retrieving protein sequences...") +sequences = [] +for target in targets[:10]: + seq = tu.run({ + "name": "UniProt_get_sequence", + "arguments": {"uniprot_id": target['uniprot_id']} + }) + sequences.append(seq) + +# 3. Predict protein structures +print("Predicting structures...") +structures = [] +for seq in sequences: + structure = tu.run({ + "name": "AlphaFold_get_structure", + "arguments": {"sequence": seq} + }) + structures.append(structure) + +# 4. Find binding sites +print("Identifying binding sites...") +binding_sites = [] +for structure in structures: + sites = tu.run({ + "name": "Fpocket_find_binding_sites", + "arguments": {"structure": structure} + }) + binding_sites.append(sites) + +# 5. 
Screen compound libraries +print("Screening compounds...") +hits = [] +for site in binding_sites: + compounds = tu.run({ + "name": "ZINC_virtual_screening", + "arguments": { + "binding_site": site, + "library": "lead-like", + "top_n": 100 + } + }) + hits.extend(compounds) + +# 6. Calculate drug-likeness +print("Evaluating drug-likeness...") +drug_candidates = [] +for compound in hits: + properties = tu.run({ + "name": "RDKit_calculate_drug_properties", + "arguments": {"smiles": compound['smiles']} + }) + if properties['lipinski_pass']: + drug_candidates.append(compound) + +print(f"Found {len(drug_candidates)} drug candidates") +``` + +### Genomics Analysis Workflow + +```python +# 1. Download gene expression data +expression_data = tu.run({ + "name": "GEO_download_dataset", + "arguments": {"geo_id": "GSE12345"} +}) + +# 2. Perform differential expression analysis +de_genes = tu.run({ + "name": "DESeq2_differential_expression", + "arguments": { + "data": expression_data, + "condition1": "control", + "condition2": "treated" + } +}) + +# 3. Pathway enrichment analysis +pathways = tu.run({ + "name": "KEGG_pathway_enrichment", + "arguments": { + "gene_list": de_genes['significant_genes'], + "organism": "hsa" + } +}) + +# 4. Find relevant literature +papers = tu.run({ + "name": "PubMed_search", + "arguments": { + "query": f"{pathways[0]['pathway_name']} AND cancer", + "max_results": 20 + } +}) + +# 5. Summarize findings +summary = tu.run({ + "name": "LLM_summarize", + "arguments": { + "text": papers, + "focus": "therapeutic implications" + } +}) +``` + +### Clinical Genomics Workflow + +```python +# 1. Load patient variants +variants = tu.run({ + "name": "VCF_parse", + "arguments": {"vcf_file": "patient_001.vcf"} +}) + +# 2. Annotate variants +annotated = tu.run({ + "name": "VEP_annotate_variants", + "arguments": {"variants": variants} +}) + +# 3. 
Filter pathogenic variants +pathogenic = tu.run({ + "name": "ClinVar_filter_pathogenic", + "arguments": {"variants": annotated} +}) + +# 4. Find disease associations +diseases = tu.run({ + "name": "OMIM_disease_lookup", + "arguments": {"genes": pathogenic['affected_genes']} +}) + +# 5. Generate clinical report +report = tu.run({ + "name": "Report_generator", + "arguments": { + "variants": pathogenic, + "diseases": diseases, + "format": "clinical" + } +}) +``` + +## Parallel Tool Execution + +Execute multiple tools simultaneously when they don't depend on each other: + +```python +import concurrent.futures + +def run_tool(tu, tool_config): + return tu.run(tool_config) + +# Define parallel tasks +tasks = [ + {"name": "PubMed_search", "arguments": {"query": "cancer", "max_results": 10}}, + {"name": "OpenTargets_get_diseases", "arguments": {"therapeutic_area": "oncology"}}, + {"name": "ChEMBL_search_compounds", "arguments": {"target": "EGFR"}} +] + +# Execute in parallel +with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: + futures = [executor.submit(run_tool, tu, task) for task in tasks] + results = [future.result() for future in concurrent.futures.as_completed(futures)] +``` + +## Output Processing Hooks + +ToolUniverse supports post-processing hooks for: +- Summarization +- File saving +- Data transformation +- Visualization + +```python +# Example: Save results to file +result = tu.run({ + "name": "some_tool", + "arguments": {"param": "value"} +}, +hooks={ + "save_to_file": {"filename": "results.json"}, + "summarize": {"format": "brief"} +}) +``` + +## Best Practices + +1. **Error Handling**: Implement try-except blocks for each tool in workflow +2. **Data Validation**: Verify output from each step before passing to next tool +3. **Checkpointing**: Save intermediate results for long workflows +4. **Logging**: Track progress through complex workflows +5. **Resource Management**: Consider rate limits and computational resources +6. 
**Modularity**: Break complex workflows into reusable functions +7. **Testing**: Test each step individually before composing full workflow diff --git a/scientific-packages/tooluniverse/references/tool-discovery.md b/scientific-packages/tooluniverse/references/tool-discovery.md new file mode 100644 index 0000000..a0a5cab --- /dev/null +++ b/scientific-packages/tooluniverse/references/tool-discovery.md @@ -0,0 +1,126 @@ +# Tool Discovery in ToolUniverse + +## Overview + +ToolUniverse provides multiple methods to discover and search through 600+ scientific tools using natural language, keywords, or embeddings. + +## Discovery Methods + +### 1. Tool_Finder (Embedding-Based Search) + +Uses semantic embeddings to find relevant tools. **Requires GPU** for optimal performance. + +```python +from tooluniverse import ToolUniverse + +tu = ToolUniverse() +tu.load_tools() + +# Search by natural language description +tools = tu.run({ + "name": "Tool_Finder", + "arguments": { + "description": "protein structure prediction", + "limit": 10 + } +}) + +print(tools) +``` + +**When to use:** +- Natural language queries +- Semantic similarity search +- When GPU is available + +### 2. Tool_Finder_LLM (LLM-Based Search) + +Alternative to embedding-based search that uses LLM reasoning. **No GPU required**. + +```python +tools = tu.run({ + "name": "Tool_Finder_LLM", + "arguments": { + "description": "Find tools for analyzing gene expression data", + "limit": 10 + } +}) +``` + +**When to use:** +- When GPU is not available +- Complex queries requiring reasoning +- Semantic understanding needed + +### 3. Tool_Finder_Keyword (Keyword Search) + +Fast keyword-based search through tool names and descriptions. 
+ +```python +tools = tu.run({ + "name": "Tool_Finder_Keyword", + "arguments": { + "description": "disease target associations", + "limit": 10 + } +}) +``` + +**When to use:** +- Fast searches +- Known keywords +- Exact term matching + +## Listing Available Tools + +### List All Tools +```python +all_tools = tu.list_tools() +print(f"Total tools available: {len(all_tools)}") +``` + +### List Tools with Limit +```python +tools = tu.list_tools(limit=20) +for tool in tools: + print(f"{tool['name']}: {tool['description']}") +``` + +## Tool Information + +### Get Tool Details +```python +# After finding a tool, inspect its details +tool_info = tu.get_tool_info("OpenTargets_get_associated_targets_by_disease_efoId") +print(tool_info) +``` + +## Search Strategies + +### By Domain +Use domain-specific keywords: +- Bioinformatics: "sequence alignment", "genomics", "RNA-seq" +- Cheminformatics: "molecular dynamics", "drug design", "SMILES" +- Machine Learning: "classification", "prediction", "neural network" +- Structural Biology: "protein structure", "PDB", "crystallography" + +### By Functionality +Search by what you want to accomplish: +- "Find disease-gene associations" +- "Predict protein interactions" +- "Analyze clinical trial data" +- "Generate molecular descriptors" + +### By Data Source +Search for specific databases or APIs: +- "OpenTargets", "PubChem", "UniProt" +- "AlphaFold", "ChEMBL", "PDB" +- "KEGG", "Reactome", "STRING" + +## Best Practices + +1. **Start Broad**: Begin with general terms, then refine +2. **Use Multiple Methods**: Try different discovery methods if results aren't satisfactory +3. **Set Appropriate Limits**: Use `limit` parameter to control result size (default: 10) +4. **Check Tool Descriptions**: Review returned tool descriptions to verify relevance +5. 
**Iterate**: Refine search terms based on initial results diff --git a/scientific-packages/tooluniverse/references/tool-execution.md b/scientific-packages/tooluniverse/references/tool-execution.md new file mode 100644 index 0000000..9d36703 --- /dev/null +++ b/scientific-packages/tooluniverse/references/tool-execution.md @@ -0,0 +1,177 @@ +# Tool Execution in ToolUniverse + +## Overview + +Execute individual tools through ToolUniverse's standardized interface using the `run()` method. + +## Basic Tool Execution + +### Standard Pattern +```python +from tooluniverse import ToolUniverse + +tu = ToolUniverse() +tu.load_tools() + +# Execute a tool +result = tu.run({ + "name": "tool_name_here", + "arguments": { + "param1": "value1", + "param2": "value2" + } +}) + +print(result) +``` + +## Real-World Examples + +### Example 1: Disease-Target Associations (OpenTargets) +```python +# Find targets associated with hypertension +result = tu.run({ + "name": "OpenTargets_get_associated_targets_by_disease_efoId", + "arguments": { + "efoId": "EFO_0000537" # Hypertension + } +}) + +print(f"Found {len(result)} targets associated with hypertension") +``` + +### Example 2: Protein Structure Prediction +```python +# Get AlphaFold structure prediction +result = tu.run({ + "name": "AlphaFold_get_structure", + "arguments": { + "uniprot_id": "P12345" + } +}) +``` + +### Example 3: Chemical Property Calculation +```python +# Calculate molecular descriptors +result = tu.run({ + "name": "RDKit_calculate_descriptors", + "arguments": { + "smiles": "CCO" # Ethanol + } +}) +``` + +### Example 4: Gene Expression Analysis +```python +# Analyze differential gene expression +result = tu.run({ + "name": "GeneExpression_differential_analysis", + "arguments": { + "dataset_id": "GSE12345", + "condition1": "control", + "condition2": "treatment" + } +}) +``` + +## Tool Execution Workflow + +### 1. 
Discover the Tool +```python +# Find relevant tools +tools = tu.run({ + "name": "Tool_Finder_Keyword", + "arguments": { + "description": "pathway enrichment", + "limit": 5 + } +}) + +# Review available tools +for tool in tools: + print(f"Name: {tool['name']}") + print(f"Description: {tool['description']}") + print(f"Parameters: {tool['parameters']}") + print("---") +``` + +### 2. Check Tool Parameters +```python +# Get detailed tool information +tool_info = tu.get_tool_info("KEGG_pathway_enrichment") +print(tool_info['parameters']) +``` + +### 3. Execute with Proper Arguments +```python +# Execute the tool +result = tu.run({ + "name": "KEGG_pathway_enrichment", + "arguments": { + "gene_list": ["TP53", "BRCA1", "EGFR"], + "organism": "hsa" # Homo sapiens + } +}) +``` + +## Handling Tool Results + +### Check Result Type +```python +result = tu.run({ + "name": "some_tool", + "arguments": {"param": "value"} +}) + +# Results can be various types +if isinstance(result, dict): + print("Dictionary result") +elif isinstance(result, list): + print(f"List with {len(result)} items") +elif isinstance(result, str): + print("String result") +``` + +### Process Results +```python +# Example: Processing multiple results +results = tu.run({ + "name": "PubMed_search", + "arguments": { + "query": "cancer immunotherapy", + "max_results": 10 + } +}) + +for idx, paper in enumerate(results, 1): + print(f"{idx}. {paper['title']}") + print(f" PMID: {paper['pmid']}") + print(f" Authors: {', '.join(paper['authors'][:3])}") + print() +``` + +## Error Handling + +```python +try: + result = tu.run({ + "name": "some_tool", + "arguments": {"param": "value"} + }) +except Exception as e: + print(f"Tool execution failed: {e}") + # Check if tool exists + # Verify parameter names and types + # Review tool documentation +``` + +## Best Practices + +1. **Verify Tool Parameters**: Always check required parameters before execution +2. **Start Simple**: Test with simple cases before complex workflows +3. 
**Handle Results Appropriately**: Check result type and structure +4. **Error Recovery**: Implement try-except blocks for production code +5. **Documentation**: Review tool descriptions for parameter requirements and output formats +6. **Rate Limiting**: Be aware of API rate limits for remote tools +7. **Data Validation**: Validate input data format (e.g., SMILES, UniProt IDs, gene symbols) diff --git a/scientific-packages/tooluniverse/scripts/example_tool_search.py b/scientific-packages/tooluniverse/scripts/example_tool_search.py new file mode 100755 index 0000000..2c4a4e9 --- /dev/null +++ b/scientific-packages/tooluniverse/scripts/example_tool_search.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +""" +Example script demonstrating tool discovery in ToolUniverse. + +This script shows how to search for tools using different methods: +- Embedding-based search (Tool_Finder) +- LLM-based search (Tool_Finder_LLM) +- Keyword-based search (Tool_Finder_Keyword) +""" + +from tooluniverse import ToolUniverse + + +def main(): + # Initialize ToolUniverse + print("Initializing ToolUniverse...") + tu = ToolUniverse() + tu.load_tools() + print(f"Loaded {len(tu.list_tools())} tools\n") + + # Example 1: Keyword-based search (fastest) + print("=" * 60) + print("Example 1: Keyword Search for Disease-Target Tools") + print("=" * 60) + + tools = tu.run({ + "name": "Tool_Finder_Keyword", + "arguments": { + "description": "disease target associations", + "limit": 5 + } + }) + + print(f"Found {len(tools)} tools:") + for idx, tool in enumerate(tools, 1): + print(f"\n{idx}. 
{tool['name']}") + print(f" Description: {tool['description']}") + + # Example 2: LLM-based search (no GPU required) + print("\n" + "=" * 60) + print("Example 2: LLM Search for Protein Structure Tools") + print("=" * 60) + + tools = tu.run({ + "name": "Tool_Finder_LLM", + "arguments": { + "description": "Find tools for predicting protein structures from sequences", + "limit": 5 + } + }) + + print(f"Found {len(tools)} tools:") + for idx, tool in enumerate(tools, 1): + print(f"\n{idx}. {tool['name']}") + print(f" Description: {tool['description']}") + + # Example 3: Search by specific domain + print("\n" + "=" * 60) + print("Example 3: Search for Cheminformatics Tools") + print("=" * 60) + + tools = tu.run({ + "name": "Tool_Finder_Keyword", + "arguments": { + "description": "molecular docking SMILES compound", + "limit": 5 + } + }) + + print(f"Found {len(tools)} tools:") + for idx, tool in enumerate(tools, 1): + print(f"\n{idx}. {tool['name']}") + print(f" Description: {tool['description']}") + + # Example 4: Get detailed tool information + print("\n" + "=" * 60) + print("Example 4: Get Tool Details") + print("=" * 60) + + if tools: + tool_name = tools[0]['name'] + print(f"Getting details for: {tool_name}") + + tool_info = tu.get_tool_info(tool_name) + print(f"\nTool: {tool_info['name']}") + print(f"Description: {tool_info['description']}") + print(f"Parameters: {tool_info.get('parameters', 'No parameters listed')}") + + +if __name__ == "__main__": + main() diff --git a/scientific-packages/tooluniverse/scripts/example_workflow.py b/scientific-packages/tooluniverse/scripts/example_workflow.py new file mode 100755 index 0000000..de8fb0f --- /dev/null +++ b/scientific-packages/tooluniverse/scripts/example_workflow.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +""" +Example workflow demonstrating tool composition in ToolUniverse. + +This script shows a complete drug discovery workflow: +1. Find disease-associated targets +2. Retrieve protein sequences +3. 
Get structure predictions +4. Screen compound libraries +5. Calculate drug-likeness properties +""" + +from tooluniverse import ToolUniverse + + +def drug_discovery_workflow(disease_efo_id: str, max_targets: int = 3): + """ + Execute a drug discovery workflow for a given disease. + + Args: + disease_efo_id: EFO ID for the disease (e.g., "EFO_0000537" for hypertension) + max_targets: Maximum number of targets to process + """ + tu = ToolUniverse() + tu.load_tools() + + print("=" * 70) + print("DRUG DISCOVERY WORKFLOW") + print("=" * 70) + + # Step 1: Find disease-associated targets + print(f"\nStep 1: Finding targets for disease {disease_efo_id}...") + targets = tu.run({ + "name": "OpenTargets_get_associated_targets_by_disease_efoId", + "arguments": {"efoId": disease_efo_id} + }) + print(f"✓ Found {len(targets)} disease-associated targets") + + # Process top targets + top_targets = targets[:max_targets] + print(f" Processing top {len(top_targets)} targets:") + for idx, target in enumerate(top_targets, 1): + print(f" {idx}. 
{target.get('target_name', 'Unknown')} ({target.get('uniprot_id', 'N/A')})") + + # Step 2: Get protein sequences + print(f"\nStep 2: Retrieving protein sequences...") + sequences = [] + for target in top_targets: + try: + seq = tu.run({ + "name": "UniProt_get_sequence", + "arguments": {"uniprot_id": target['uniprot_id']} + }) + sequences.append({ + "target": target, + "sequence": seq + }) + print(f" ✓ Retrieved sequence for {target.get('target_name', 'Unknown')}") + except Exception as e: + print(f" ✗ Failed to get sequence: {e}") + + # Step 3: Predict protein structures + print(f"\nStep 3: Predicting protein structures...") + structures = [] + for seq_data in sequences: + try: + structure = tu.run({ + "name": "AlphaFold_get_structure", + "arguments": {"uniprot_id": seq_data['target']['uniprot_id']} + }) + structures.append({ + "target": seq_data['target'], + "structure": structure + }) + print(f" ✓ Predicted structure for {seq_data['target'].get('target_name', 'Unknown')}") + except Exception as e: + print(f" ✗ Failed to predict structure: {e}") + + # Step 4: Find binding sites + print(f"\nStep 4: Identifying binding sites...") + binding_sites = [] + for struct_data in structures: + try: + sites = tu.run({ + "name": "Fpocket_find_binding_sites", + "arguments": {"structure": struct_data['structure']} + }) + binding_sites.append({ + "target": struct_data['target'], + "sites": sites + }) + print(f" ✓ Found {len(sites)} binding sites for {struct_data['target'].get('target_name', 'Unknown')}") + except Exception as e: + print(f" ✗ Failed to find binding sites: {e}") + + # Step 5: Virtual screening (simplified) + print(f"\nStep 5: Screening compound libraries...") + all_hits = [] + for site_data in binding_sites: + for site in site_data['sites'][:1]: # Top site only + try: + compounds = tu.run({ + "name": "ZINC_virtual_screening", + "arguments": { + "binding_site": site, + "library": "lead-like", + "top_n": 10 + } + }) + all_hits.extend(compounds) + print(f" ✓ Found 
{len(compounds)} hit compounds for {site_data['target'].get('target_name', 'Unknown')}") + except Exception as e: + print(f" ✗ Screening failed: {e}") + + # Step 6: Calculate drug-likeness + print(f"\nStep 6: Evaluating drug-likeness...") + drug_candidates = [] + for compound in all_hits: + try: + properties = tu.run({ + "name": "RDKit_calculate_drug_properties", + "arguments": {"smiles": compound['smiles']} + }) + + if properties.get('lipinski_pass', False): + drug_candidates.append({ + "compound": compound, + "properties": properties + }) + except Exception as e: + print(f" ✗ Property calculation failed: {e}") + + print(f"\n ✓ Identified {len(drug_candidates)} drug candidates passing Lipinski's Rule of Five") + + # Summary + print("\n" + "=" * 70) + print("WORKFLOW SUMMARY") + print("=" * 70) + print(f"Disease targets processed: {len(top_targets)}") + print(f"Protein structures predicted: {len(structures)}") + print(f"Binding sites identified: {sum(len(s['sites']) for s in binding_sites)}") + print(f"Compounds screened: {len(all_hits)}") + print(f"Drug candidates identified: {len(drug_candidates)}") + print("=" * 70) + + return drug_candidates + + +def genomics_workflow(geo_id: str): + """ + Execute a genomics analysis workflow. 
+ + Args: + geo_id: GEO dataset ID (e.g., "GSE12345") + """ + tu = ToolUniverse() + tu.load_tools() + + print("=" * 70) + print("GENOMICS ANALYSIS WORKFLOW") + print("=" * 70) + + # Step 1: Download gene expression data + print(f"\nStep 1: Downloading dataset {geo_id}...") + try: + expression_data = tu.run({ + "name": "GEO_download_dataset", + "arguments": {"geo_id": geo_id} + }) + print(f" ✓ Downloaded expression data") + except Exception as e: + print(f" ✗ Failed: {e}") + return + + # Step 2: Differential expression analysis + print(f"\nStep 2: Performing differential expression analysis...") + try: + de_genes = tu.run({ + "name": "DESeq2_differential_expression", + "arguments": { + "data": expression_data, + "condition1": "control", + "condition2": "treated" + } + }) + print(f" ✓ Found {len(de_genes.get('significant_genes', []))} differentially expressed genes") + except Exception as e: + print(f" ✗ Failed: {e}") + return + + # Step 3: Pathway enrichment + print(f"\nStep 3: Running pathway enrichment analysis...") + try: + pathways = tu.run({ + "name": "KEGG_pathway_enrichment", + "arguments": { + "gene_list": de_genes['significant_genes'], + "organism": "hsa" + } + }) + print(f" ✓ Found {len(pathways)} enriched pathways") + if pathways: + print(f" Top pathway: {pathways[0].get('pathway_name', 'Unknown')}") + except Exception as e: + print(f" ✗ Failed: {e}") + + print("\n" + "=" * 70) + + +if __name__ == "__main__": + # Example 1: Drug discovery workflow for hypertension + print("EXAMPLE 1: Drug Discovery for Hypertension") + candidates = drug_discovery_workflow("EFO_0000537", max_targets=2) + + print("\n\n") + + # Example 2: Genomics workflow + print("EXAMPLE 2: Genomics Analysis") + genomics_workflow("GSE12345")