Merge pull request #5 from K-Dense-AI/refactor_skills

Refactor skills into a single installable plugin
Timothy Kassis
2025-11-13 19:21:02 -08:00
committed by GitHub
793 changed files with 591 additions and 801 deletions


@@ -2,172 +2,137 @@
{
"name": "claude-scientific-skills",
"owner": {
"name": "Timothy Kassis",
"email": "timothy.kassis@k-dense.ai"
"name": "K-Dense Inc.",
"email": "contact@k-dense.ai"
},
"metadata": {
"description": "Claude scientific skills from K-Dense Inc",
"version": "1.80.0"
"version": "2.0.0"
},
"plugins": [
{
"name": "scientific-packages",
"description": "Collection of python scientific packages",
"name": "scientific-skills",
"description": "Collection of scientific skills",
"source": "./",
"strict": false,
"skills": [
"./scientific-packages/aeon",
"./scientific-packages/anndata",
"./scientific-packages/arboreto",
"./scientific-packages/astropy",
"./scientific-packages/biomni",
"./scientific-packages/biopython",
"./scientific-packages/bioservices",
"./scientific-packages/cellxgene-census",
"./scientific-packages/cobrapy",
"./scientific-packages/dask",
"./scientific-packages/datacommons-client",
"./scientific-packages/denario",
"./scientific-packages/datamol",
"./scientific-packages/deepchem",
"./scientific-packages/deeptools",
"./scientific-packages/diffdock",
"./scientific-packages/esm",
"./scientific-packages/etetoolkit",
"./scientific-packages/flowio",
"./scientific-packages/geniml",
"./scientific-packages/gget",
"./scientific-packages/gtars",
"./scientific-packages/hypogenic",
"./scientific-packages/histolab",
"./scientific-packages/lamindb",
"./scientific-packages/markitdown",
"./scientific-packages/matchms",
"./scientific-packages/matplotlib",
"./scientific-packages/medchem",
"./scientific-packages/modal",
"./scientific-packages/molfeat",
"./scientific-packages/neurokit2",
"./scientific-packages/networkx",
"./scientific-packages/paper-2-web",
"./scientific-packages/pathml",
"./scientific-packages/polars",
"./scientific-packages/pydeseq2",
"./scientific-packages/pydicom",
"./scientific-packages/pyhealth",
"./scientific-packages/pymatgen",
"./scientific-packages/pymc",
"./scientific-packages/pylabrobot",
"./scientific-packages/pymoo",
"./scientific-packages/pufferlib",
"./scientific-packages/pyopenms",
"./scientific-packages/pysam",
"./scientific-packages/pytdc",
"./scientific-packages/pytorch-lightning",
"./scientific-packages/rdkit",
"./scientific-packages/reportlab",
"./scientific-packages/scanpy",
"./scientific-packages/scvi-tools",
"./scientific-packages/scikit-bio",
"./scientific-packages/scikit-learn",
"./scientific-packages/scikit-survival",
"./scientific-packages/seaborn",
"./scientific-packages/shap",
"./scientific-packages/simpy",
"./scientific-packages/stable-baselines3",
"./scientific-packages/statsmodels",
"./scientific-packages/sympy",
"./scientific-packages/torch_geometric",
"./scientific-packages/torchdrug",
"./scientific-packages/tooluniverse",
"./scientific-packages/transformers",
"./scientific-packages/umap-learn",
"./scientific-packages/vaex",
"./scientific-packages/zarr-python"
"./scientific-skills/aeon",
"./scientific-skills/anndata",
"./scientific-skills/arboreto",
"./scientific-skills/astropy",
"./scientific-skills/biomni",
"./scientific-skills/biopython",
"./scientific-skills/bioservices",
"./scientific-skills/cellxgene-census",
"./scientific-skills/cobrapy",
"./scientific-skills/dask",
"./scientific-skills/datacommons-client",
"./scientific-skills/denario",
"./scientific-skills/datamol",
"./scientific-skills/deepchem",
"./scientific-skills/deeptools",
"./scientific-skills/diffdock",
"./scientific-skills/esm",
"./scientific-skills/etetoolkit",
"./scientific-skills/flowio",
"./scientific-skills/geniml",
"./scientific-skills/gget",
"./scientific-skills/gtars",
"./scientific-skills/hypogenic",
"./scientific-skills/histolab",
"./scientific-skills/lamindb",
"./scientific-skills/markitdown",
"./scientific-skills/matchms",
"./scientific-skills/matplotlib",
"./scientific-skills/medchem",
"./scientific-skills/modal",
"./scientific-skills/molfeat",
"./scientific-skills/neurokit2",
"./scientific-skills/networkx",
"./scientific-skills/paper-2-web",
"./scientific-skills/pathml",
"./scientific-skills/polars",
"./scientific-skills/pydeseq2",
"./scientific-skills/pydicom",
"./scientific-skills/pyhealth",
"./scientific-skills/pymatgen",
"./scientific-skills/pymc",
"./scientific-skills/pylabrobot",
"./scientific-skills/pymoo",
"./scientific-skills/pufferlib",
"./scientific-skills/pyopenms",
"./scientific-skills/pysam",
"./scientific-skills/pytdc",
"./scientific-skills/pytorch-lightning",
"./scientific-skills/rdkit",
"./scientific-skills/reportlab",
"./scientific-skills/scanpy",
"./scientific-skills/scvi-tools",
"./scientific-skills/scikit-bio",
"./scientific-skills/scikit-learn",
"./scientific-skills/scikit-survival",
"./scientific-skills/seaborn",
"./scientific-skills/shap",
"./scientific-skills/simpy",
"./scientific-skills/stable-baselines3",
"./scientific-skills/statsmodels",
"./scientific-skills/sympy",
"./scientific-skills/torch_geometric",
"./scientific-skills/torchdrug",
"./scientific-skills/tooluniverse",
"./scientific-skills/transformers",
"./scientific-skills/umap-learn",
"./scientific-skills/vaex",
"./scientific-skills/zarr-python",
"./scientific-skills/alphafold-database",
"./scientific-skills/biorxiv-database",
"./scientific-skills/chembl-database",
"./scientific-skills/clinpgx-database",
"./scientific-skills/clinvar-database",
"./scientific-skills/clinicaltrials-database",
"./scientific-skills/cosmic-database",
"./scientific-skills/drugbank-database",
"./scientific-skills/ena-database",
"./scientific-skills/ensembl-database",
"./scientific-skills/fda-database",
"./scientific-skills/gene-database",
"./scientific-skills/geo-database",
"./scientific-skills/gwas-database",
"./scientific-skills/hmdb-database",
"./scientific-skills/kegg-database",
"./scientific-skills/metabolomics-workbench-database",
"./scientific-skills/opentargets-database",
"./scientific-skills/pdb-database",
"./scientific-skills/pubchem-database",
"./scientific-skills/pubmed-database",
"./scientific-skills/reactome-database",
"./scientific-skills/string-database",
"./scientific-skills/uniprot-database",
"./scientific-skills/uspto-database",
"./scientific-skills/zinc-database",
"./scientific-skills/exploratory-data-analysis",
"./scientific-skills/hypothesis-generation",
"./scientific-skills/literature-review",
"./scientific-skills/peer-review",
"./scientific-skills/scholar-evaluation",
"./scientific-skills/scientific-brainstorming",
"./scientific-skills/scientific-critical-thinking",
"./scientific-skills/scientific-writing",
"./scientific-skills/statistical-analysis",
"./scientific-skills/scientific-visualization",
"./scientific-skills/document-skills/docx",
"./scientific-skills/document-skills/pdf",
"./scientific-skills/document-skills/pptx",
"./scientific-skills/document-skills/xlsx",
"./scientific-skills/benchling-integration",
"./scientific-skills/dnanexus-integration",
"./scientific-skills/labarchive-integration",
"./scientific-skills/latchbio-integration",
"./scientific-skills/omero-integration",
"./scientific-skills/opentrons-integration",
"./scientific-skills/protocolsio-integration",
"./scientific-skills/get-available-resources"
]
},
{
"name": "scientific-databases",
"description": "Collection of scientific databases",
"source": "./",
"strict": false,
"skills": [
"./scientific-databases/alphafold-database",
"./scientific-databases/biorxiv-database",
"./scientific-databases/chembl-database",
"./scientific-databases/clinpgx-database",
"./scientific-databases/clinvar-database",
"./scientific-databases/clinicaltrials-database",
"./scientific-databases/cosmic-database",
"./scientific-databases/drugbank-database",
"./scientific-databases/ena-database",
"./scientific-databases/ensembl-database",
"./scientific-databases/fda-database",
"./scientific-databases/gene-database",
"./scientific-databases/geo-database",
"./scientific-databases/gwas-database",
"./scientific-databases/hmdb-database",
"./scientific-databases/kegg-database",
"./scientific-databases/metabolomics-workbench-database",
"./scientific-databases/opentargets-database",
"./scientific-databases/pdb-database",
"./scientific-databases/pubchem-database",
"./scientific-databases/pubmed-database",
"./scientific-databases/reactome-database",
"./scientific-databases/string-database",
"./scientific-databases/uniprot-database",
"./scientific-databases/uspto-database",
"./scientific-databases/zinc-database"
]
},
{
"name": "scientific-thinking",
"description": "Collection of scientific thinking methodologies",
"source": "./",
"strict": false,
"skills": [
"./scientific-thinking/exploratory-data-analysis",
"./scientific-thinking/hypothesis-generation",
"./scientific-thinking/literature-review",
"./scientific-thinking/peer-review",
"./scientific-thinking/scholar-evaluation",
"./scientific-thinking/scientific-brainstorming",
"./scientific-thinking/scientific-critical-thinking",
"./scientific-thinking/scientific-writing",
"./scientific-thinking/statistical-analysis",
"./scientific-thinking/scientific-visualization",
"./scientific-thinking/document-skills/docx",
"./scientific-thinking/document-skills/pdf",
"./scientific-thinking/document-skills/pptx",
"./scientific-thinking/document-skills/xlsx"
]
},
{
"name": "scientific-integrations",
"description": "Collection of scientific platform integrations",
"source": "./",
"strict": false,
"skills": [
"./scientific-integrations/benchling-integration",
"./scientific-integrations/dnanexus-integration",
"./scientific-integrations/labarchive-integration",
"./scientific-integrations/latchbio-integration",
"./scientific-integrations/omero-integration",
"./scientific-integrations/opentrons-integration",
"./scientific-integrations/protocolsio-integration"
]
},
{
"name": "scientific-context-initialization",
"description": "Always Auto-invoked skill that creates/updates workspace AGENT.md to instruct the agent to always search for existing skills before attempting any scientific task",
"source": "./scientific-helpers/scientific-context-initialization",
"strict": false
},
{
"name": "get-available-resources",
"description": "Detects and reports available system resources (CPU cores, GPUs, memory, disk space) to inform computational approach decisions",
"source": "./scientific-helpers/get-available-resources",
"strict": false
}
]
}
}

README.md

@@ -1,18 +1,26 @@
# Claude Scientific Skills
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE.md)
[![Skills](https://img.shields.io/badge/Skills-117-brightgreen.svg)](#what-s-included)
[![Equivalent Tools](https://img.shields.io/badge/Equivalent_Tools-1002-blue.svg)](#what-s-included)
[![Skills](https://img.shields.io/badge/Skills-117-brightgreen.svg)](#whats-included)
A comprehensive collection of ready-to-use scientific skills for Claude, curated by the K-Dense team.
A comprehensive collection of **117+ ready-to-use scientific skills** for Claude, created by the K-Dense team. Transform Claude into your AI research assistant capable of executing complex multi-step scientific workflows across biology, chemistry, medicine, and beyond.
These skills enable Claude to work with specialized scientific libraries and databases across multiple scientific domains:
- 🧬 Bioinformatics & Genomics
- 🧪 Cheminformatics & Drug Discovery
- 🔬 Proteomics & Mass Spectrometry
- 🤖 Machine Learning & AI
- 🔮 Materials Science & Chemistry
- 📊 Data Analysis & Visualization
These skills enable Claude to seamlessly work with specialized scientific libraries, databases, and tools across multiple scientific domains:
- 🧬 Bioinformatics & Genomics - Sequence analysis, single-cell RNA-seq, gene regulatory networks, variant annotation, phylogenetic analysis
- 🧪 Cheminformatics & Drug Discovery - Molecular property prediction, virtual screening, ADMET analysis, molecular docking, lead optimization
- 🔬 Proteomics & Mass Spectrometry - LC-MS/MS processing, peptide identification, spectral matching, protein quantification
- 🏥 Clinical Research & Precision Medicine - Clinical trials, pharmacogenomics, variant interpretation, drug safety, precision therapeutics
- 🧠 Healthcare AI & Clinical ML - EHR analysis, physiological signal processing, medical imaging, clinical prediction models
- 🖼️ Medical Imaging & Digital Pathology - DICOM processing, whole slide image analysis, computational pathology, radiology workflows
- 🤖 Machine Learning & AI - Deep learning, reinforcement learning, time series analysis, model interpretability, Bayesian methods
- 🔮 Materials Science & Chemistry - Crystal structure analysis, phase diagrams, metabolic modeling, computational chemistry
- 🌌 Physics & Astronomy - Astronomical data analysis, coordinate transformations, cosmological calculations, symbolic mathematics, physics computations
- ⚙️ Engineering & Simulation - Discrete-event simulation, multi-objective optimization, metabolic engineering, systems modeling, process optimization
- 📊 Data Analysis & Visualization - Statistical analysis, network analysis, time series, publication-quality figures, large-scale data processing
- 🧪 Laboratory Automation - Liquid handling protocols, lab equipment control, workflow automation, LIMS integration
- 📚 Scientific Communication - Literature review, peer review, scientific writing, document processing, publication workflows
- 🔬 Multi-omics & Systems Biology - Multi-modal data integration, pathway analysis, network biology, systems-level insights
- 🧬 Protein Engineering & Design - Protein language models, structure prediction, sequence design, function annotation
**Transform Claude Code into an 'AI Scientist' on your desktop!**
@@ -22,13 +30,32 @@ These skills enable Claude to work with specialized scientific libraries and dat
---
## 📦 What's Included
This repository provides **117+ scientific skills** organized into the following categories:
- **25+ Scientific Databases** - Direct API access to PubMed, ChEMBL, UniProt, COSMIC, ClinicalTrials.gov, and more
- **50+ Python Packages** - RDKit, Scanpy, PyTorch Lightning, scikit-learn, BioPython, and others
- **15+ Scientific Integrations** - Benchling, DNAnexus, LatchBio, OMERO, Protocols.io, and more
- **20+ Analysis & Communication Tools** - Literature review, scientific writing, peer review, document processing
Each skill includes:
- ✅ Comprehensive documentation (`SKILL.md`)
- ✅ Practical code examples
- ✅ Use cases and best practices
- ✅ Integration guides
- ✅ Reference materials
---
## 📋 Table of Contents
- [What's Included](#what-s-included)
- [What's Included](#whats-included)
- [Why Use This?](#why-use-this)
- [Getting Started](#getting-started)
- [Claude Code](#claude-code)
- [Any MCP Client](#any-mcp-client-including-chatgpt-cursor-google-adk-openai-agent-sdk-etc)
- [Claude Code](#claude-code-recommended)
- [Cursor IDE](#cursor-ide)
- [Any MCP Client](#any-mcp-client)
- [Prerequisites](#prerequisites)
- [Quick Examples](#quick-examples)
- [Use Cases](#use-cases)
@@ -43,38 +70,39 @@ These skills enable Claude to work with specialized scientific libraries and dat
---
## 📦 What's Included
| Category | Count | Description |
|----------|-------|-------------|
| 📊 **Scientific Databases** | 26 | PubMed, PubChem, UniProt, ChEMBL, COSMIC, DrugBank, AlphaFold DB, bioRxiv, and more |
| 🔬 **Scientific Packages** | 68 | BioPython, RDKit, PyTorch, Scanpy, scvi-tools, ESM, NetworkX, SimPy, pydicom, PyHealth, Data Commons, histolab, LaminDB, PathML, PyLabRobot, HypoGeniC, MarkItDown, Modal, PufferLib, Stable Baselines3, Vaex, Denario, geniml, gtars, and more |
| 🔌 **Scientific Integrations** | 7 | Benchling, DNAnexus, Opentrons, LabArchives, LatchBio, OMERO, Protocols.io |
| 🛠️ **Scientific Helpers** | 2 | Context initialization and resource detection utilities |
| 📚 **Documented Workflows** | 122 | Ready-to-use examples and reference materials |
---
## 🚀 Why Use This?
**Save Time** - Skip days of API documentation research and integration work
**Best Practices** - Curated workflows following scientific computing standards
**Production Ready** - Tested and validated code examples
**Regular Updates** - Maintained and expanded by K-Dense team
**Comprehensive** - Coverage across major scientific domains
**Enterprise Support** - Commercial offerings available for advanced needs
### ⚡ **Accelerate Your Research**
- **Save Days of Work** - Skip API documentation research and integration setup
- **Production-Ready Code** - Tested, validated examples following scientific best practices
- **Multi-Step Workflows** - Execute complex pipelines with a single prompt
### 🎯 **Comprehensive Coverage**
- **117+ Skills** - Extensive coverage across all major scientific domains
- **25+ Databases** - Direct access to PubMed, ChEMBL, UniProt, COSMIC, and more
- **50+ Python Packages** - RDKit, Scanpy, PyTorch Lightning, scikit-learn, and others
### 🔧 **Easy Integration**
- **One-Click Setup** - Install via Claude Code or MCP server
- **Automatic Discovery** - Claude automatically finds and uses relevant skills
- **Well Documented** - Each skill includes examples, use cases, and best practices
### 🌟 **Maintained & Supported**
- **Regular Updates** - Continuously maintained and expanded by K-Dense team
- **Community Driven** - Open source with active community contributions
- **Enterprise Ready** - Commercial support available for advanced needs
---
## 🎯 Getting Started
### Claude Code
Choose your preferred platform to get started:
### 🖥️ Claude Code (Recommended)
> 📚 **New to Claude Code?** Check out the [Claude Code Quickstart Guide](https://docs.claude.com/en/docs/claude-code/quickstart) to get started.
#### Installation
Install Claude Code on your system:
**Step 1: Install Claude Code**
**macOS:**
```bash
@@ -86,33 +114,27 @@ curl -fsSL https://claude.ai/install.sh | bash
irm https://claude.ai/install.ps1 | iex
```
#### Setup
Register this repository as a Claude Code Plugin marketplace by running:
**Step 2: Register the Marketplace**
```bash
/plugin marketplace add K-Dense-AI/claude-scientific-skills
```
Then, to install a specific set of skills:
**Step 3: Install Skills**
1. Select **Browse and install plugins**
2. Select **claude-scientific-skills**
3. Choose from:
- `scientific-databases` - Access to 26 scientific databases
- `scientific-packages` - 64 specialized Python packages
- `scientific-thinking` - Analysis tools and document processing
- `scientific-integrations` - Lab automation and platform integrations
- `scientific-context-initialization` - Ensures Claude searches for and uses existing skills
4. Select **Install now**
1. Open Claude Code
2. Select **Browse and install plugins**
3. Choose **claude-scientific-skills**
4. Select **scientific-skills**
5. Click **Install now**
After installation, simply mention the skill or describe your task - Claude Code will automatically use the appropriate skills!
**That's it!** Claude will automatically use the appropriate skills when you describe your scientific tasks. Be sure to keep the plugin up to date!
> 💡 **Tip**: If you find that Claude isn't utilizing the installed skills as much as you'd like, install the `scientific-context-initialization` skill. It automatically creates/updates an `AGENTS.md` file in your workspace that instructs Claude to always search for and use existing skills before attempting any scientific task. This ensures Claude leverages documented patterns, authentication methods, working examples, and best practices from the repository.
---
### Cursor
### ⌨️ Cursor IDE
For Cursor users, we now offer a hosted MCP server for one-click installation:
One-click installation via our hosted MCP server:
<a href="https://cursor.com/en-US/install-mcp?name=claude-scientific-skills&config=eyJ1cmwiOiJodHRwczovL21jcC5rLWRlbnNlLmFpL2NsYXVkZS1zY2llbnRpZmljLXNraWxscy9tY3AifQ%3D%3D">
<picture>
@@ -122,11 +144,19 @@ For Cursor users, we now offer a hosted MCP server for one-click installation:
</picture>
</a>
### Any MCP Client (including ChatGPT, Google ADK, OpenAI Agent SDK, etc.)
Use our MCP server to access the complete skills collection in any MCP-compatible client:
---
🔗 **[claude-skills-mcp](https://github.com/K-Dense-AI/claude-skills-mcp)** - Self-hosted MCP server
🔗 **Hosted MCP**: Available at `https://mcp.k-dense.ai/claude-scientific-skills/mcp`
### 🔌 Any MCP Client
Access all skills via our MCP server in any MCP-compatible client (ChatGPT, Google ADK, OpenAI Agent SDK, etc.):
**Option 1: Hosted MCP Server** (Easiest)
```
https://mcp.k-dense.ai/claude-scientific-skills/mcp
```
**Option 2: Self-Hosted** (More Control)
🔗 **[claude-skills-mcp](https://github.com/K-Dense-AI/claude-skills-mcp)** - Deploy your own MCP server
---
@@ -141,101 +171,95 @@ Use our MCP server to access the complete skills collection in any MCP-compatibl
## 💡 Quick Examples
Once you've installed the skills, you can ask Claude to execute complex multi-step scientific workflows:
Once you've installed the skills, you can ask Claude to execute complex multi-step scientific workflows. Here are some example prompts:
### End-to-End Drug Discovery Pipeline
### 🧪 Drug Discovery Pipeline
**Goal**: Find novel EGFR inhibitors for lung cancer treatment
**Prompt**:
```
"Always use available 'skills' when possible. Keep the output organized.
I need to find novel EGFR inhibitors for lung cancer treatment. Query ChEMBL for existing
EGFR inhibitors with IC50 < 50nM, analyze their structure-activity relationships using RDKit,
generate similar molecules with improved properties using datamol, perform virtual screening
with DiffDock against the AlphaFold-predicted EGFR structure, and search PubMed for recent
papers on resistance mechanisms to prioritize scaffolds. Finally, check COSMIC for common
EGFR mutations and assess how our candidates might interact with mutant forms.
Create useful visualizations in the form of scientific figures as you go (if needed).
When done, create a comprehensive README.md and a well formatted pdf summarizing the methodology,
results, conclusions and providing recommendations."
Use available skills you have access to whenever possible. Query ChEMBL for EGFR inhibitors (IC50 < 50nM), analyze structure-activity relationships
with RDKit, generate improved analogs with datamol, perform virtual screening with DiffDock
against AlphaFold EGFR structure, search PubMed for resistance mechanisms, check COSMIC for
mutations, and create visualizations and a comprehensive report.
```
### Comprehensive Single-Cell Analysis Workflow
```
"Always use available 'skills' when possible. Keep the output organized.
**Skills Used**: ChEMBL, RDKit, datamol, DiffDock, AlphaFold DB, PubMed, COSMIC, scientific visualization
Load this 10X Genomics dataset using Scanpy, perform quality control and doublet removal,
integrate with public data from Cellxgene Census for the same tissue type, identify cell
populations using known markers from NCBI Gene, perform differential expression analysis
with PyDESeq2, run gene regulatory network inference with Arboreto, query Reactome and
KEGG for pathway enrichment, and create publication-quality visualizations with matplotlib.
Then cross-reference top dysregulated genes with Open Targets to identify potential
therapeutic targets.
Create useful visualizations in the form of scientific figures as you go (if needed).
When done, create a comprehensive README.md and a well formatted pdf summarizing the methodology,
results, conclusions and providing recommendations."
---
### 🔬 Single-Cell RNA-seq Analysis
**Goal**: Comprehensive analysis of 10X Genomics data with public data integration
**Prompt**:
```
Use available skills you have access to whenever possible. Load 10X dataset with Scanpy, perform QC and doublet removal, integrate with Cellxgene
Census data, identify cell types using NCBI Gene markers, run differential expression with
PyDESeq2, infer gene regulatory networks with Arboreto, enrich pathways via Reactome/KEGG,
and identify therapeutic targets with Open Targets.
```
### Multi-Omics Integration for Biomarker Discovery
```
"Always use available 'skills' when possible. Keep the output organized.
**Skills Used**: Scanpy, Cellxgene Census, NCBI Gene, PyDESeq2, Arboreto, Reactome, KEGG, Open Targets
I have RNA-seq, proteomics, and metabolomics data from cancer patients. Use PyDESeq2 for
differential expression, pyOpenMS to analyze mass spec data, and integrate metabolite
information from HMDB and Metabolomics Workbench. Map proteins to pathways using UniProt
and KEGG, identify protein-protein interactions via STRING, correlate multi-omics layers
using statsmodels, and build a machine learning model with scikit-learn to predict patient
outcomes. Search ClinicalTrials.gov for ongoing trials targeting the top candidates.
Create useful visualizations in the form of scientific figures as you go (if needed).
When done, create a comprehensive README.md and a well formatted pdf summarizing the methodology,
results, conclusions and providing recommendations."
---
### 🧬 Multi-Omics Biomarker Discovery
**Goal**: Integrate RNA-seq, proteomics, and metabolomics to predict patient outcomes
**Prompt**:
```
Use available skills you have access to whenever possible. Analyze RNA-seq with PyDESeq2, process mass spec with pyOpenMS, integrate metabolites from
HMDB/Metabolomics Workbench, map proteins to pathways (UniProt/KEGG), find interactions via
STRING, correlate omics layers with statsmodels, build predictive model with scikit-learn,
and search ClinicalTrials.gov for relevant trials.
```
### Structure-Based Virtual Screening Campaign
```
"Always use available 'skills' when possible. Keep the output organized.
**Skills Used**: PyDESeq2, pyOpenMS, HMDB, Metabolomics Workbench, UniProt, KEGG, STRING, statsmodels, scikit-learn, ClinicalTrials.gov
I want to discover allosteric modulators for a protein-protein interaction. Retrieve the
AlphaFold structure for both proteins, identify the interaction interface using BioPython,
search ZINC15 for molecules with suitable properties for allosteric binding (MW 300-500,
logP 2-4), filter for drug-likeness using RDKit, perform molecular docking with DiffDock
to identify potential allosteric sites, rank candidates using DeepChem's property prediction
models, check PubChem for suppliers, and search USPTO patents to assess freedom to operate.
Finally, generate analogs with MedChem and molfeat for lead optimization.
Create useful visualizations in the form of scientific figures as you go (if needed).
When done, create a comprehensive README.md and a well formatted pdf summarizing the methodology,
results, conclusions and providing recommendations."
---
### 🎯 Virtual Screening Campaign
**Goal**: Discover allosteric modulators for protein-protein interactions
**Prompt**:
```
Use available skills you have access to whenever possible. Retrieve AlphaFold structures, identify interaction interface with BioPython, search ZINC
for allosteric candidates (MW 300-500, logP 2-4), filter with RDKit, dock with DiffDock,
rank with DeepChem, check PubChem suppliers, search USPTO patents, and optimize leads with
MedChem/molfeat.
```
### Clinical Genomics Variant Interpretation Pipeline
```
"Always use available 'skills' when possible. Keep the output organized.
**Skills Used**: AlphaFold DB, BioPython, ZINC, RDKit, DiffDock, DeepChem, PubChem, USPTO, MedChem, molfeat
Analyze this VCF file from a patient with suspected hereditary cancer. Use pysam to parse
variants, annotate with Ensembl for functional consequences, query ClinVar for known
pathogenic variants, check COSMIC for somatic mutations in cancer, retrieve gene information
from NCBI Gene, analyze protein impact using UniProt, search PubMed for case reports of
similar variants, query ClinPGx for pharmacogenomic implications, and generate a clinical
report with ReportLab. Then search ClinicalTrials.gov for precision medicine trials matching
the patient's profile.
Create useful visualizations in the form of scientific figures as you go (if needed).
When done, create a comprehensive README.md and a well formatted pdf summarizing the methodology,
results, conclusions and providing recommendations."
---
### 🏥 Clinical Variant Interpretation
**Goal**: Analyze VCF file for hereditary cancer risk assessment
**Prompt**:
```
Use available skills you have access to whenever possible. Parse VCF with pysam, annotate variants with Ensembl VEP, query ClinVar for pathogenicity,
check COSMIC for cancer mutations, retrieve gene info from NCBI Gene, analyze protein impact
with UniProt, search PubMed for case reports, check ClinPGx for pharmacogenomics, generate
clinical report with ReportLab, and find matching trials on ClinicalTrials.gov.
```
### Systems Biology Network Analysis
```
"Always use available 'skills' when possible. Keep the output organized.
**Skills Used**: pysam, Ensembl, ClinVar, COSMIC, NCBI Gene, UniProt, PubMed, ClinPGx, ReportLab, ClinicalTrials.gov
Starting with a list of differentially expressed genes from my RNA-seq experiment, query
NCBI Gene for detailed annotations, retrieve protein sequences from UniProt, identify
protein-protein interactions using STRING, map to biological pathways in Reactome and KEGG,
analyze network topology with Torch Geometric, identify hub genes and bottleneck proteins,
perform gene regulatory network reconstruction with Arboreto, integrate with Open Targets
for druggability assessment, use PyMC for Bayesian network modeling, and create interactive
network visualizations. Finally, search GEO for similar expression patterns across diseases.
Create useful visualizations in the form of scientific figures as you go (if needed).
When done, create a comprehensive README.md and a well formatted pdf summarizing the methodology,
results, conclusions and providing recommendations."
---
### 🌐 Systems Biology Network Analysis
**Goal**: Analyze gene regulatory networks from RNA-seq data
**Prompt**:
```
Use available skills you have access to whenever possible. Query NCBI Gene for annotations, retrieve sequences from UniProt, identify interactions via
STRING, map to Reactome/KEGG pathways, analyze topology with Torch Geometric, reconstruct
GRNs with Arboreto, assess druggability with Open Targets, model with PyMC, visualize
networks, and search GEO for similar patterns.
```
**Skills Used**: NCBI Gene, UniProt, STRING, Reactome, KEGG, Torch Geometric, Arboreto, Open Targets, PyMC, GEO
> 📖 **Want more examples?** Check out [docs/examples.md](docs/examples.md) for comprehensive workflow examples and detailed use cases across all scientific domains.
@@ -243,190 +267,141 @@ results, conclusions and providing recommendations."
## 🔬 Use Cases
### Drug Discovery Research
- Screen compound libraries from PubChem and ZINC
- Analyze bioactivity data from ChEMBL
- Predict molecular properties with RDKit and DeepChem
- Perform molecular docking with DiffDock
### 🧪 Drug Discovery & Medicinal Chemistry
- **Virtual Screening**: Screen millions of compounds from PubChem/ZINC against protein targets
- **Lead Optimization**: Analyze structure-activity relationships with RDKit, generate analogs with datamol
- **ADMET Prediction**: Predict absorption, distribution, metabolism, excretion, and toxicity with DeepChem
- **Molecular Docking**: Predict binding poses and affinities with DiffDock
- **Bioactivity Mining**: Query ChEMBL for known inhibitors and analyze SAR patterns
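As a hedged illustration of the drug-likeness filtering these skills support, here is a minimal sketch assuming RDKit is installed (the SMILES string is a gefitinib-like placeholder, not a vetted candidate):

```python
from rdkit import Chem
from rdkit.Chem import Descriptors, QED

# Placeholder SMILES for a gefitinib-like EGFR inhibitor
mol = Chem.MolFromSmiles("COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1OCCCN1CCOCC1")

# Lipinski-style properties plus quantitative drug-likeness (QED)
print({
    "MW": Descriptors.MolWt(mol),          # molecular weight
    "LogP": Descriptors.MolLogP(mol),      # lipophilicity
    "HBD": Descriptors.NumHDonors(mol),    # hydrogen-bond donors
    "HBA": Descriptors.NumHAcceptors(mol), # hydrogen-bond acceptors
    "QED": QED.qed(mol),                   # 0-1 drug-likeness score
})
```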
### Bioinformatics Analysis
- Process genomic sequences with BioPython
- Analyze single-cell RNA-seq data with Scanpy
- Query gene information from Ensembl and NCBI Gene
- Identify protein-protein interactions via STRING
### 🧬 Bioinformatics & Genomics
- **Sequence Analysis**: Process DNA/RNA/protein sequences with BioPython and pysam
- **Single-Cell Analysis**: Analyze 10X Genomics data with Scanpy, identify cell types, infer GRNs with Arboreto
- **Variant Annotation**: Annotate VCF files with Ensembl VEP, query ClinVar for pathogenicity
- **Gene Discovery**: Query NCBI Gene, UniProt, and Ensembl for comprehensive gene information
- **Network Analysis**: Identify protein-protein interactions via STRING, map to pathways (KEGG, Reactome)
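For instance, a minimal BioPython sketch of the sequence operations listed above (the sequence is a toy example; real pipelines would read FASTA/BAM/VCF with `Bio.SeqIO` or pysam):

```python
from Bio.Seq import Seq

cds = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")  # toy coding sequence

print(cds.reverse_complement())  # reverse-complement of the DNA strand
print(cds.translate())           # translated protein ('*' marks stop codons)
```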
### Materials Science
- Analyze crystal structures with Pymatgen
- Predict material properties
- Design novel compounds and materials
### 🏥 Clinical Research & Precision Medicine
- **Clinical Trials**: Search ClinicalTrials.gov for relevant studies, analyze eligibility criteria
- **Variant Interpretation**: Annotate variants with ClinVar, COSMIC, and ClinPGx for pharmacogenomics
- **Drug Safety**: Query FDA databases for adverse events, drug interactions, and recalls
- **Precision Therapeutics**: Match patient variants to targeted therapies and clinical trials
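As one hedged sketch of the trial-search pattern, using the public ClinicalTrials.gov API v2 (query parameter and response field names follow the public docs and may need adjusting):

```python
import requests

resp = requests.get(
    "https://clinicaltrials.gov/api/v2/studies",
    params={
        "query.cond": "non-small cell lung cancer",  # condition filter
        "query.term": "EGFR",                        # free-text term
        "pageSize": 5,
    },
    timeout=30,
)
for study in resp.json().get("studies", []):
    ident = study["protocolSection"]["identificationModule"]
    print(ident["nctId"], "-", ident.get("briefTitle", ""))
```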
### Clinical Research
- Search clinical trials on ClinicalTrials.gov
- Analyze genetic variants in ClinVar
- Review pharmacogenomic data from ClinPGx
- Access cancer mutations from COSMIC
### 🔬 Multi-Omics & Systems Biology
- **Multi-Omics Integration**: Combine RNA-seq, proteomics, and metabolomics data
- **Pathway Analysis**: Enrich differentially expressed genes in KEGG/Reactome pathways
- **Network Biology**: Reconstruct gene regulatory networks, identify hub genes
- **Biomarker Discovery**: Integrate multi-omics layers to predict patient outcomes
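A minimal sketch of the interaction-lookup step, using STRING's public REST API (no key required; `9606` is the NCBI taxon ID for human):

```python
import requests

resp = requests.get(
    "https://string-db.org/api/json/interaction_partners",
    params={"identifiers": "TP53", "species": 9606, "limit": 5},
    timeout=30,
)
for hit in resp.json():
    # preferredName_B is the partner; score is STRING's combined confidence
    print(hit["preferredName_B"], hit["score"])
```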
### Academic Research
- Literature searches via PubMed
- Patent landscape analysis using USPTO
- Data visualization for publications
- Statistical analysis and hypothesis testing
### 📊 Data Analysis & Visualization
- **Statistical Analysis**: Perform hypothesis testing, power analysis, and experimental design
- **Publication Figures**: Create publication-quality visualizations with matplotlib and seaborn
- **Network Visualization**: Visualize biological networks with NetworkX
- **Report Generation**: Generate comprehensive PDF reports with ReportLab
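A minimal matplotlib/seaborn sketch of a publication-style figure (the `tips` demo dataset is illustrative and downloads on first use):

```python
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style="whitegrid", context="paper")  # journal-friendly defaults

tips = sns.load_dataset("tips")
fig, ax = plt.subplots(figsize=(4, 3))
sns.boxplot(data=tips, x="day", y="total_bill", ax=ax)
ax.set_ylabel("Total bill (USD)")
fig.tight_layout()
fig.savefig("figure1.pdf", dpi=300)  # vector output for publication
```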
### 🧪 Laboratory Automation
- **Protocol Design**: Create Opentrons protocols for automated liquid handling
- **LIMS Integration**: Integrate with Benchling and LabArchives for data management
- **Workflow Automation**: Automate multi-step laboratory workflows
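As a hedged sketch of an Opentrons Python Protocol API v2 protocol (labware and instrument load names follow the standard Opentrons library; verify with `opentrons_simulate` before running on hardware):

```python
from opentrons import protocol_api

metadata = {"apiLevel": "2.13", "protocolName": "Minimal transfer sketch"}

def run(protocol: protocol_api.ProtocolContext):
    tips = protocol.load_labware("opentrons_96_tiprack_300ul", 1)
    plate = protocol.load_labware("corning_96_wellplate_360ul_flat", 2)
    p300 = protocol.load_instrument("p300_single_gen2", "right",
                                    tip_racks=[tips])

    # Transfer 100 uL from well A1 to well A2
    p300.transfer(100, plate["A1"], plate["A2"])
```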
---
## 📚 Available Skills
### 🗄️ Scientific Databases
**26 comprehensive databases** including PubMed, PubChem, UniProt, ChEMBL, DrugBank, AlphaFold DB, bioRxiv, COSMIC, Ensembl, KEGG, and more.
This repository contains **117+ scientific skills** organized across multiple domains. Each skill provides comprehensive documentation, code examples, and best practices for working with scientific libraries, databases, and tools.
📖 **[Full Database Documentation →](docs/scientific-databases.md)**
### Skill Categories
<details>
<summary><strong>View all databases</strong></summary>
#### 🧬 **Bioinformatics & Genomics** (15+ skills)
- Sequence analysis: BioPython, pysam, scikit-bio
- Single-cell analysis: Scanpy, AnnData, scvi-tools, Arboreto, Cellxgene Census
- Genomic tools: gget, geniml, gtars, deepTools, FlowIO, Zarr
- Phylogenetics: ETE Toolkit
- **AlphaFold DB** - AI-predicted protein structures (200M+ predictions)
- **bioRxiv** - Life sciences preprint server with medRxiv integration
- **ChEMBL** - Bioactive molecules and drug-like properties
- **ClinPGx** - Clinical pharmacogenomics and gene-drug interactions
- **ClinVar** - Genomic variants and clinical significance
- **ClinicalTrials.gov** - Global clinical studies registry
- **COSMIC** - Somatic cancer mutations database
- **DrugBank** - Comprehensive drug and drug target information
- **ENA** - European Nucleotide Archive
- **Ensembl** - Genome browser and annotations
- **FDA Databases** - Drug approvals, adverse events, recalls
- **GEO** - Gene expression and functional genomics
- **GWAS Catalog** - Genome-wide association studies
- **HMDB** - Human metabolome database
- **KEGG** - Biological pathways and molecular interactions
- **Metabolomics Workbench** - NIH metabolomics data
- **NCBI Gene** - Gene information and annotations
- **Open Targets** - Therapeutic target identification
- **PDB** - Protein structure database
- **PubChem** - Chemical compound data (110M+ compounds)
- **PubMed** - Biomedical literature database
- **Reactome** - Curated biological pathways
- **STRING** - Protein-protein interaction networks
- **UniProt** - Protein sequences and annotations
- **USPTO** - Patent and trademark data
- **ZINC** - Commercially-available compounds for screening
#### 🧪 **Cheminformatics & Drug Discovery** (10+ skills)
- Molecular manipulation: RDKit, Datamol, Molfeat
- Deep learning: DeepChem, TorchDrug
- Docking & screening: DiffDock
- Drug-likeness: MedChem
- Benchmarks: PyTDC
</details>
#### 🔬 **Proteomics & Mass Spectrometry** (2 skills)
- Spectral processing: matchms, pyOpenMS
---
#### 🏥 **Clinical Research & Precision Medicine** (8+ skills)
- Clinical databases: ClinicalTrials.gov, ClinVar, ClinPGx, COSMIC, FDA Databases
- Healthcare AI: PyHealth, NeuroKit2
- Variant analysis: Ensembl, NCBI Gene
### 🔬 Scientific Packages
**68 specialized Python packages** organized by domain.
#### 🖼️ **Medical Imaging & Digital Pathology** (3 skills)
- DICOM processing: pydicom
- Whole slide imaging: histolab, PathML
📖 **[Full Package Documentation →](docs/scientific-packages.md)**
#### 🤖 **Machine Learning & AI** (15+ skills)
- Deep learning: PyTorch Lightning, Transformers, Stable Baselines3, PufferLib
- Classical ML: scikit-learn, scikit-survival, SHAP
- Time series: aeon
- Bayesian methods: PyMC
- Optimization: PyMOO
- Graph ML: Torch Geometric
- Dimensionality reduction: UMAP-learn
- Statistical modeling: statsmodels
<details>
<summary><strong>Bioinformatics & Genomics (14 packages)</strong></summary>
#### 🔮 **Materials Science & Chemistry** (3 skills)
- Materials: Pymatgen
- Metabolic modeling: COBRApy
- Astronomy: Astropy
- AnnData, Arboreto, BioPython, BioServices, Cellxgene Census
- deepTools, FlowIO, gget, geniml, gtars, pysam, PyDESeq2, Scanpy, scvi-tools
#### ⚙️ **Engineering & Simulation** (2 skills)
- Discrete-event simulation: SimPy
- Data processing: Dask, Polars, Vaex
</details>
#### 📊 **Data Analysis & Visualization** (8+ skills)
- Visualization: Matplotlib, Seaborn
- Network analysis: NetworkX
- Symbolic math: SymPy
- PDF generation: ReportLab
- Data access: Data Commons
<details>
<summary><strong>Cheminformatics & Drug Discovery (8 packages)</strong></summary>
#### 🧪 **Laboratory Automation** (3 skills)
- Liquid handling: PyLabRobot
- Protocol management: Protocols.io
- LIMS integration: Benchling, LabArchives
- Datamol, DeepChem, DiffDock, MedChem, Molfeat, PyTDC, RDKit, TorchDrug
#### 🔬 **Multi-omics & Systems Biology** (5+ skills)
- Pathway analysis: KEGG, Reactome, STRING
- Multi-omics: BIOMNI, Denario, HypoGeniC
- Data management: LaminDB
</details>
#### 🧬 **Protein Engineering & Design** (1 skill)
- Protein language models: ESM
<details>
<summary><strong>Proteomics & Mass Spectrometry (2 packages)</strong></summary>
#### 📚 **Scientific Communication** (7+ skills)
- Literature: PubMed, Literature Review
- Writing: Scientific Writing, Peer Review
- Document processing: DOCX, PDF, PPTX, XLSX, MarkItDown
- Publishing: Paper-2-Web
- matchms, pyOpenMS
#### 🔬 **Scientific Databases** (25+ skills)
- Protein: UniProt, PDB, AlphaFold DB
- Chemical: PubChem, ChEMBL, DrugBank, ZINC, HMDB
- Genomic: Ensembl, NCBI Gene, GEO, ENA, GWAS Catalog
- Clinical: ClinVar, COSMIC, ClinicalTrials.gov, ClinPGx, FDA Databases
- Pathways: KEGG, Reactome, STRING
- Targets: Open Targets
- Metabolomics: Metabolomics Workbench
- Patents: USPTO
</details>
#### 🔧 **Infrastructure & Platforms** (5+ skills)
- Cloud compute: Modal
- Genomics platforms: DNAnexus, LatchBio
- Microscopy: OMERO
- Automation: Opentrons
- Tool discovery: ToolUniverse
<details>
<summary><strong>Machine Learning & Deep Learning (13 packages)</strong></summary>
> 📖 **For complete details on all skills**, see [docs/scientific-skills.md](docs/scientific-skills.md)
- aeon, PufferLib, PyMC, PyMOO, PyTorch Lightning, scikit-learn, scikit-survival, SHAP
- Stable Baselines3, statsmodels, Torch Geometric, Transformers, UMAP-learn
</details>
<details>
<summary><strong>Materials Science & Chemistry (3 packages)</strong></summary>
- Astropy, COBRApy, Pymatgen
</details>
<details>
<summary><strong>Data Analysis & Visualization (9 packages)</strong></summary>
- Dask, Matplotlib, NetworkX, Polars, ReportLab, Seaborn, SimPy, SymPy, Vaex
</details>
<details>
<summary><strong>Additional Packages (13 packages)</strong></summary>
- BIOMNI (Multi-omics), ETE Toolkit (Phylogenetics)
- histolab (Digital pathology WSI processing and tile extraction)
- HypoGeniC (Automated hypothesis generation and testing)
- LaminDB (Data framework for biology with FAIR data management, lineage tracking, and ontology integration)
- MarkItDown (Document format conversion to Markdown for LLM processing)
- Modal (Serverless cloud platform for Python with GPUs, autoscaling, and batch processing)
- Paper-2-Web (Academic paper dissemination and presentation)
- PathML (Computational pathology and whole-slide image analysis)
- PyLabRobot (Laboratory automation for liquid handlers, plate readers, and lab equipment)
- scikit-bio (Sequence analysis), ToolUniverse (600+ scientific tool ecosystem)
- Zarr (Array storage)
</details>
---
### 🧠 Scientific Thinking & Analysis
**Comprehensive analysis tools** and document processing capabilities.
📖 **[Full Thinking & Analysis Documentation →](docs/scientific-thinking.md)**
**Analysis & Methodology:**
- Exploratory Data Analysis (automated statistics and insights)
- Hypothesis Generation (structured frameworks)
- Literature Review (systematic search and citation management)
- Peer Review (comprehensive evaluation toolkit)
- Scholar Evaluation (systematic framework for evaluating scholarly and research work)
- Scientific Brainstorming (ideation workflows)
- Scientific Critical Thinking (rigorous reasoning)
- Scientific Visualization (publication-quality figures)
- Scientific Writing (IMRAD format, citation styles)
- Statistical Analysis (testing and experimental design)
**Document Processing:**
- DOCX, PDF, PPTX, XLSX manipulation and analysis
- Tracked changes, comments, and formatting preservation
- Text extraction, table parsing, and data analysis
---
### 🔌 Scientific Integrations
**7 platform integrations** for lab automation and workflow management.
📖 **[Full Integration Documentation →](docs/scientific-integrations.md)**
- **Benchling** - R&D platform and LIMS integration
- **DNAnexus** - Cloud genomics and biomedical data analysis
- **LabArchives** - Electronic Lab Notebook (ELN) integration
- **LatchBio** - Workflow platform and cloud execution
- **OMERO** - Microscopy and bio-image data management
- **Opentrons** - Laboratory automation protocols
- **Protocols.io** - Scientific protocol management and sharing platform
---
### 🛠️ Scientific Helpers
**2 helper utilities** for enhanced scientific computing capabilities.
- **scientific-context-initialization** - Auto-invoked skill that creates/updates workspace AGENTS.md to instruct Claude to search for and use existing skills before attempting any scientific task
- **get-available-resources** - Detects available system resources (CPU cores, GPUs, memory, disk space) and generates strategic recommendations for computational approaches (parallel processing, out-of-core computing, GPU acceleration)
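A stdlib-only sketch of the kind of detection `get-available-resources` performs (the skill's actual output format is not shown here; GPU detection assumes `nvidia-smi` is on the PATH, and memory checks would need e.g. psutil):

```python
import os
import shutil
import subprocess

resources = {
    "cpu_cores": os.cpu_count(),
    "disk_free_gb": round(shutil.disk_usage("/").free / 1e9, 1),
}

if shutil.which("nvidia-smi"):  # crude NVIDIA GPU check
    out = subprocess.run(
        ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
        capture_output=True, text=True,
    )
    resources["gpus"] = out.stdout.strip().splitlines()

print(resources)
```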
> 💡 **Looking for practical examples?** Check out [docs/examples.md](docs/examples.md) for comprehensive workflow examples across all scientific domains.
---
@@ -480,26 +455,19 @@ Contributors are recognized in our community and may be featured in:
Your contributions help make scientific computing more accessible and enable researchers to leverage AI tools more effectively!
📖 **[Contributing Guidelines →](CONTRIBUTING.md)** *(coming soon)*
---
## 🔧 Troubleshooting
### Common Issues
**Problem: Claude not using installed skills**
- Solution: Install the `scientific-context-initialization` skill
- This creates an `AGENTS.md` file that instructs Claude to search for and use existing skills before attempting tasks
- After installation, Claude will automatically leverage documented patterns, examples, and best practices
**Problem: Skills not loading in Claude Code**
- Solution: Ensure you've installed the latest version of Claude Code
- Try reinstalling the plugin: `/plugin marketplace add K-Dense-AI/claude-scientific-skills`
**Problem: Missing Python dependencies**
- Solution: Check the specific `SKILL.md` file for required packages
- Install dependencies: `pip install package-name`
- Install dependencies: `uv pip install package-name`
**Problem: API rate limits**
- Solution: Many databases have rate limits. Review the specific database documentation
@@ -517,29 +485,41 @@ Your contributions help make scientific computing more accessible and enable res
## ❓ FAQ
### General Questions
**Q: Is this free to use?**
A: Yes, for any purpose including commercial use. This project is MIT licensed.
A: Yes! This project is MIT licensed, allowing free use for any purpose including commercial projects.
**Q: Do I need all the Python packages installed?**
A: No, only install the packages you need. Each skill specifies its requirements.
**Q: Can I use this with other AI models?**
A: The skills are designed for Claude but can be adapted for other models with MCP support.
**Q: How often is this updated?**
A: We regularly update skills to reflect the latest versions of packages and APIs.
**Q: Why are all skills grouped into one plugin instead of separate plugins?**
A: We believe good science in the age of AI is inherently interdisciplinary. Bundling all skills into a single plugin makes it trivial for you (and Claude) to bridge across fields—e.g., combining genomics, cheminformatics, clinical data, and machine learning in one workflow—without worrying about which individual skills to install or wire together.
**Q: Can I use this for commercial projects?**
A: Yes! The MIT License allows both commercial and noncommercial use without restrictions.
A: Absolutely! The MIT License allows both commercial and noncommercial use without restrictions.
**Q: How often is this updated?**
A: We regularly update skills to reflect the latest versions of packages and APIs. Major updates are announced in release notes.
**Q: Can I use this with other AI models?**
A: The skills are optimized for Claude but can be adapted for other models with MCP support. The MCP server works with any MCP-compatible client.
### Installation & Setup
**Q: Do I need all the Python packages installed?**
A: No! Only install the packages you need. Each skill specifies its requirements in its `SKILL.md` file.
**Q: What if a skill doesn't work?**
A: First check the troubleshooting section, then file an issue on GitHub with details.
**Q: Can I contribute my own skills?**
A: Absolutely! See the [Contributing](#contributing) section for guidelines.
A: First check the [Troubleshooting](#troubleshooting) section. If the issue persists, file an issue on GitHub with detailed reproduction steps.
**Q: Do the skills work offline?**
A: Database skills require internet access. Package skills work offline once dependencies are installed.
A: Database skills require internet access to query APIs. Package skills work offline once Python dependencies are installed.
### Contributing
**Q: Can I contribute my own skills?**
A: Absolutely! We welcome contributions. See the [Contributing](#contributing) section for guidelines and best practices.
**Q: How do I report bugs or suggest features?**
A: Open an issue on GitHub with a clear description. For bugs, include reproduction steps and expected vs actual behavior.
---



@@ -1,29 +0,0 @@
# Scientific Databases
- **AlphaFold DB** - AI-predicted protein structure database with 200M+ predictions, confidence metrics (pLDDT, PAE), and Google Cloud bulk access
- **ChEMBL** - Bioactive molecule database with drug-like properties (2M+ compounds, 19M+ activities, 13K+ targets)
- **ClinPGx** - Clinical pharmacogenomics database (successor to PharmGKB) providing gene-drug interactions, CPIC clinical guidelines, allele functions, drug labels, and pharmacogenomic annotations for precision medicine and personalized pharmacotherapy (consolidates PharmGKB, CPIC, and PharmCAT resources)
- **ClinVar** - NCBI's public archive of genomic variants and their clinical significance with standardized classifications (pathogenic, benign, VUS), E-utilities API access, and bulk FTP downloads for variant interpretation and precision medicine research
- **ClinicalTrials.gov** - Comprehensive registry of clinical studies conducted worldwide (maintained by U.S. National Library of Medicine) with API v2 access for searching trials by condition, intervention, location, sponsor, study status, and phase; retrieve detailed trial information including eligibility criteria, outcomes, contacts, and locations; export to CSV/JSON formats for analysis (public API, no authentication required, ~50 req/min rate limit)
- **COSMIC** - Catalogue of Somatic Mutations in Cancer, the world's largest database of somatic cancer mutations (millions of mutations across thousands of cancer types, Cancer Gene Census, mutational signatures, structural variants, and drug resistance data)
- **DrugBank** - Comprehensive bioinformatics and cheminformatics database containing detailed drug and drug target information (9,591+ drug entries including 2,037 FDA-approved small molecules, 241 biotech drugs, 96 nutraceuticals, 6,000+ experimental compounds) with 200+ data fields per entry covering chemical structures (SMILES, InChI), pharmacology (mechanism of action, pharmacodynamics, ADME), drug-drug interactions, protein targets (enzymes, transporters, carriers), biological pathways, external identifiers (PubChem, ChEMBL, UniProt), and physicochemical properties for drug discovery, pharmacology research, interaction analysis, target identification, chemical similarity searches, and ADMET predictions
- **ENA (European Nucleotide Archive)** - Comprehensive public repository for nucleotide sequence data and metadata with REST APIs for accessing sequences, assemblies, samples, studies, and reads; supports advanced search, taxonomy lookups, and bulk downloads via FTP/Aspera (rate limit: 50 req/sec)
- **Ensembl** - Genome browser and bioinformatics database providing genomic annotations, sequences, variants, and comparative genomics data for 250+ vertebrate species (Release 115, 2025) with comprehensive REST API for gene lookups, sequence retrieval, variant effect prediction (VEP), ortholog finding, assembly mapping (GRCh37/GRCh38), and region analysis
- **FDA Databases** - Comprehensive access to all FDA (Food and Drug Administration) regulatory databases through openFDA API covering drugs (adverse events, labeling, NDC, recalls, approvals, shortages), medical devices (adverse events, 510k clearances, PMA, UDI, classifications), foods (recalls, adverse events, allergen tracking), animal/veterinary medicines (species-specific adverse events), and substances (UNII/CAS lookup, chemical structures, molecular data) for drug safety research, pharmacovigilance, regulatory compliance, and scientific analysis
- **GEO (Gene Expression Omnibus)** - High-throughput gene expression and functional genomics data repository (264K+ studies, 8M+ samples) with microarray, RNA-seq, and expression profile access
- **GWAS Catalog** - NHGRI-EBI catalog of published genome-wide association studies with curated SNP-trait associations (thousands of studies, genome-wide significant associations p≤5×10⁻⁸), full summary statistics, REST API access for variant/trait/gene queries, and FTP downloads for genetic epidemiology and precision medicine research
- **HMDB (Human Metabolome Database)** - Comprehensive metabolomics resource with 220K+ metabolite entries, detailed chemical/biological data, concentration ranges, disease associations, pathways, and spectral data for metabolite identification and biomarker discovery
- **KEGG** - Kyoto Encyclopedia of Genes and Genomes for biological pathway analysis, gene-to-pathway mapping, compound searches, and molecular interaction networks (pathway enrichment, metabolic pathways, gene annotations, drug-drug interactions, ID conversion)
- **Metabolomics Workbench** - NIH Common Fund metabolomics data repository with 4,200+ processed studies, standardized nomenclature (RefMet), mass spectrometry searches, and comprehensive REST API for accessing metabolite structures, study metadata, experimental results, and gene/protein-metabolite associations
- **Open Targets** - Comprehensive therapeutic target identification and validation platform integrating genetics, omics, and chemical data (200M+ evidence strings, target-disease associations with scoring, tractability assessments, safety liabilities, known drugs from ChEMBL, GraphQL API) for drug target discovery, prioritization, evidence evaluation, drug repurposing, competitive intelligence, and mechanism research
- **NCBI Gene** - NCBI's gene database for searching, retrieving, and analyzing gene information including nomenclature, sequences, variations, phenotypes, and pathways via E-utilities and the Datasets API
- **Protein Data Bank (PDB)** - Access 3D structural data of proteins, nucleic acids, and biological macromolecules (200K+ structures) with search, retrieval, and analysis capabilities
- **PubChem** - Access chemical compound data from the world's largest free chemical database (110M+ compounds, 270M+ bioactivities)
- **PubMed** - Access to PubMed literature database with advanced search capabilities
- **Reactome** - Curated pathway database for biological processes and molecular interactions (2,825+ human pathways, 16K+ reactions, 11K+ proteins) with pathway enrichment analysis, expression data analysis, and species comparison using Content Service and Analysis Service APIs
- **STRING** - Protein-protein interaction network database (5000+ genomes, 59.3M proteins, 20B+ interactions) with functional enrichment analysis, interaction partner discovery, and network visualization from experimental data, computational prediction, and text-mining
- **UniProt** - Universal Protein Resource for protein sequences, annotations, and functional information (UniProtKB/Swiss-Prot reviewed entries, TrEMBL unreviewed entries) with REST API access for search, retrieval, ID mapping, and batch operations across 200+ databases
- **USPTO** - United States Patent and Trademark Office data access including patent searches, trademark lookups, patent examination history (PEDS), office actions, assignments, citations, and litigation records; supports PatentSearch API (ElasticSearch-based patent search), TSDR (Trademark Status & Document Retrieval), Patent/Trademark Assignment APIs, and additional specialized APIs for comprehensive IP analysis
- **ZINC** - Free database of commercially-available compounds for virtual screening and drug discovery (230M+ purchasable compounds in ready-to-dock 3D formats)
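Most of the entries above are reachable through plain REST calls. As one hedged illustration against UniProt's public API (response field names follow the current JSON format and may change):

```python
import requests

resp = requests.get(
    "https://rest.uniprot.org/uniprotkb/search",
    params={
        "query": "gene:EGFR AND organism_id:9606 AND reviewed:true",
        "format": "json",
        "size": 3,
    },
    timeout=30,
)
for entry in resp.json()["results"]:
    name = entry["proteinDescription"]["recommendedName"]["fullName"]["value"]
    print(entry["primaryAccession"], name)
```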


@@ -1,24 +0,0 @@
# Scientific Integrations
## Laboratory Information Management Systems (LIMS) & R&D Platforms
- **Benchling Integration** - Toolkit for integrating with Benchling's R&D platform, providing programmatic access to laboratory data management including registry entities (DNA sequences, proteins), inventory systems (samples, containers, locations), electronic lab notebooks (entries, protocols), workflows (tasks, automation), and data exports using Python SDK and REST API
## Cloud Platforms for Genomics & Biomedical Data
- **DNAnexus Integration** - Comprehensive toolkit for working with the DNAnexus cloud platform for genomics and biomedical data analysis. Covers building and deploying apps/applets (Python/Bash), managing data objects (files, records, databases), running analyses and workflows, using the dxpy Python SDK, and configuring app metadata and dependencies (dxapp.json setup, system packages, Docker, assets). Enables processing of FASTQ/BAM/VCF files, bioinformatics pipelines, job execution, workflow orchestration, and platform operations including project management and permissions
## Laboratory Automation
- **Opentrons Integration** - Toolkit for creating, editing, and debugging Opentrons Python Protocol API v2 protocols for laboratory automation using Flex and OT-2 robots. Enables automated liquid handling, pipetting workflows, hardware module control (thermocycler, temperature, magnetic, heater-shaker, absorbance plate reader), labware management, and complex protocol development for biological and chemical experiments
## Electronic Lab Notebooks (ELN)
- **LabArchives Integration** - Toolkit for interacting with LabArchives Electronic Lab Notebook (ELN) REST API. Provides programmatic access to notebooks (backup, retrieval, management), entries (creation, comments, attachments), user authentication, site reports and analytics, and third-party integrations (Protocols.io, GraphPad Prism, SnapGene, Geneious, Jupyter, REDCap). Includes Python scripts for configuration setup, notebook operations, and entry management. Supports multi-regional API endpoints (US, UK, Australia) and OAuth authentication
## Workflow Platforms & Cloud Execution
- **LatchBio Integration** - Integration with the Latch platform for building, deploying, and executing bioinformatics workflows. Provides comprehensive support for creating serverless bioinformatics pipelines using Python decorators, deploying Nextflow/Snakemake pipelines, managing cloud data (LatchFile, LatchDir) and structured Registry (Projects, Tables, Records), configuring computational resources (CPU, GPU, memory, storage), and using pre-built Latch Verified workflows (RNA-seq, AlphaFold, DESeq2, single-cell analysis, CRISPR editing). Enables automatic containerization, UI generation, workflow versioning, and execution on scalable cloud infrastructure with comprehensive data management
## Microscopy & Bio-image Data
- **OMERO Integration** - Toolkit for interacting with OMERO microscopy data management systems using Python. Provides comprehensive access to microscopy images stored in OMERO servers, including dataset and screening data retrieval, pixel data analysis, annotation and metadata management, regions of interest (ROIs) creation and analysis, batch processing, OMERO.scripts development, and OMERO.tables for structured data storage. Essential for researchers working with high-content screening data, multi-dimensional microscopy datasets, or collaborative image repositories
## Protocol Management & Sharing
- **Protocols.io Integration** - Integration with protocols.io API for managing scientific protocols. Enables programmatic access to protocol discovery (search by keywords, DOI, category), protocol lifecycle management (create, update, publish with DOI), step-by-step procedure documentation, collaborative development with workspaces and discussions, file management (upload data, images, documents), experiment tracking and documentation, and data export. Supports OAuth authentication, protocol PDF generation, materials management, threaded comments, workspace permissions, and institutional protocol repositories. Essential for protocol standardization, reproducibility, lab knowledge management, and scientific collaboration

View File

@@ -1,105 +0,0 @@
# Scientific Packages
## Bioinformatics & Genomics
- **AnnData** - Annotated data matrices for single-cell genomics and h5ad files
- **Arboreto** - Gene regulatory network inference using GRNBoost2 and GENIE3
- **BioPython** - Sequence manipulation, NCBI database access, BLAST searches, alignments, and phylogenetics
- **BioServices** - Programmatic access to 40+ biological web services (KEGG, UniProt, ChEBI, ChEMBL)
- **Cellxgene Census** - Query and analyze large-scale single-cell RNA-seq data
- **gget** - Efficient genomic database queries (Ensembl, UniProt, NCBI, PDB, COSMIC)
- **geniml** - Genomic interval machine learning toolkit providing unsupervised methods for building ML models on BED files. Key capabilities include Region2Vec (word2vec-style embeddings of genomic regions and region sets using tokenization and neural language modeling), BEDspace (joint embeddings of regions and metadata labels using StarSpace for cross-modal queries), scEmbed (Region2Vec applied to single-cell ATAC-seq data generating cell-level embeddings for clustering and annotation with scanpy integration), consensus peak building (four statistical methods CC/CCF/ML/HMM for creating reference universes from BED collections), and comprehensive utilities (BBClient for BED caching, BEDshift for genomic randomization preserving context, evaluation metrics for embedding quality, Text2BedNN for neural search backends). Part of BEDbase ecosystem. Supports Python API and CLI workflows, pre-trained models on Hugging Face, and integration with gtars for tokenization. Use cases: region similarity searches, dimension reduction of chromatin accessibility data, scATAC-seq clustering and cell-type annotation, metadata-aware genomic queries, universe construction for standardized references, and any ML task requiring genomic region feature vectors
- **gtars** - High-performance Rust toolkit for genomic interval analysis providing specialized tools for overlap detection using IGD (Integrated Genome Database) indexing, coverage track generation (uniwig module for WIG/BigWig formats), genomic tokenization for machine learning applications (TreeTokenizer for deep learning models), reference sequence management (refget protocol compliance), fragment processing for single-cell genomics (barcode-based splitting and cluster analysis), and fragment scoring against reference datasets. Offers Python bindings with NumPy integration, command-line tools (gtars-cli), and Rust library. Key modules include: tokenizers (convert genomic regions to ML tokens), overlaprs (efficient overlap computation), uniwig (ATAC-seq/ChIP-seq/RNA-seq coverage profiles), refget (GA4GH-compliant sequence digests), bbcache (BEDbase.org integration), scoring (fragment enrichment metrics), and fragsplit (single-cell fragment manipulation). Supports parallel processing, memory-mapped files, streaming for large datasets, and serves as foundation for geniml genomic ML package. Ideal for genomic ML preprocessing, regulatory element analysis, variant annotation, chromatin accessibility profiling, and computational genomics workflows
- **pysam** - Read, write, and manipulate genomic data files (SAM/BAM/CRAM alignments, VCF/BCF variants, FASTA/FASTQ sequences) with pileup analysis, coverage calculations, and bioinformatics workflows
- **PyDESeq2** - Differential gene expression analysis for bulk RNA-seq data
- **Scanpy** - Single-cell RNA-seq analysis with clustering, marker genes, and UMAP/t-SNE visualization
- **scvi-tools** - Probabilistic deep learning models for single-cell omics analysis. PyTorch-based framework providing variational autoencoders (VAEs) for dimensionality reduction, batch correction, differential expression, and data integration across modalities. Includes 25+ models: scVI/scANVI (RNA-seq integration and cell type annotation), totalVI (CITE-seq protein+RNA), MultiVI (multiome RNA+ATAC integration), PeakVI (ATAC-seq analysis), DestVI/Stereoscope/Tangram (spatial transcriptomics deconvolution), MethylVI (methylation), CytoVI (flow/mass cytometry), VeloVI (RNA velocity), contrastiveVI (perturbation studies), and Solo (doublet detection). Supports seamless integration with Scanpy/AnnData ecosystem, GPU acceleration, reference mapping (scArches), and probabilistic differential expression with uncertainty quantification
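
A minimal sketch of the scvi-tools workflow described above: train scVI on an AnnData object and hand the latent space back to Scanpy. The input file and `batch` column are hypothetical, and the calls follow current scVI tutorials, so exact arguments may differ across versions.

```python
# Sketch only: assumes an AnnData file with raw counts in .X and a
# "batch" column in .obs; names are illustrative.
import scanpy as sc
import scvi

adata = sc.read_h5ad("counts.h5ad")                  # hypothetical input
scvi.model.SCVI.setup_anndata(adata, batch_key="batch")

model = scvi.model.SCVI(adata)                       # default VAE architecture
model.train()                                        # GPU-accelerated if available

# Batch-corrected latent space, reused by the Scanpy ecosystem
adata.obsm["X_scVI"] = model.get_latent_representation()
sc.pp.neighbors(adata, use_rep="X_scVI")
sc.tl.umap(adata)
```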
## Data Management & Infrastructure
- **LaminDB** - Open-source data framework for biology that makes data queryable, traceable, reproducible, and FAIR (Findable, Accessible, Interoperable, Reusable). Provides unified platform combining lakehouse architecture, lineage tracking, feature stores, biological ontologies (via Bionty plugin with 20+ ontologies: genes, proteins, cell types, tissues, diseases, pathways), LIMS, and ELN capabilities through a single Python API. Key features include: automatic data lineage tracking (code, inputs, outputs, environment), versioned artifacts (DataFrame, AnnData, SpatialData, Parquet, Zarr), schema validation and data curation with standardization/synonym mapping, queryable metadata with feature-based filtering, cross-registry traversal, and streaming for large datasets. Supports integrations with workflow managers (Nextflow, Snakemake, Redun), MLOps platforms (Weights & Biases, MLflow, HuggingFace, scVI-tools), cloud storage (S3, GCS, S3-compatible), array stores (TileDB-SOMA, DuckDB), and visualization (Vitessce). Deployment options: local SQLite, cloud storage with SQLite, or cloud storage with PostgreSQL for production. Use cases: scRNA-seq standardization and analysis, flow cytometry/spatial data management, multi-modal dataset integration, computational workflow tracking with reproducibility, biological ontology-based annotation, data lakehouse construction for unified queries, ML pipeline integration with experiment tracking, and FAIR-compliant dataset publishing
- **Modal** - Serverless cloud platform for running Python code with minimal configuration, specialized for AI/ML workloads and scientific computing. Execute functions on powerful GPUs (T4, L4, A10, A100, L40S, H100, H200, B200), scale automatically from zero to thousands of containers, and pay only for compute used. Key features include: declarative container image building with uv/pip/apt package management, automatic autoscaling with configurable limits and buffer containers, GPU acceleration with multi-GPU support (up to 8 GPUs per container), persistent storage via Volumes for model weights and datasets, secret management for API keys and credentials, scheduled jobs with cron expressions, web endpoints for deploying serverless APIs, parallel execution with `.map()` for batch processing, input concurrency for I/O-bound workloads, and resource configuration (CPU cores, memory, disk). Supports custom Docker images, integration with Hugging Face/Weights & Biases, FastAPI for web endpoints, and distributed training. Free tier includes $30/month credits. Use cases: ML model deployment and inference (LLMs, image generation, embeddings), GPU-accelerated training, batch processing large datasets in parallel, scheduled compute-intensive jobs, serverless API deployment with autoscaling, scientific computing requiring distributed compute or specialized hardware, and data pipeline automation
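
As a concrete illustration of the Modal entry above, a minimal sketch of a remotely executed function; the app name, image contents, and GPU type are arbitrary choices, and the `modal.App` style assumes a recent client version.

```python
# Sketch only: run a Python function in Modal's cloud on a GPU container.
import modal

app = modal.App("example-app")                       # hypothetical app name
image = modal.Image.debian_slim().pip_install("numpy")

@app.function(image=image, gpu="T4")
def square_sum(n: int) -> float:
    import numpy as np                               # available inside the container
    return float((np.arange(n) ** 2).sum())

@app.local_entrypoint()
def main():
    print(square_sum.remote(1_000_000))              # .map() would fan out over inputs
```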
## Cheminformatics & Drug Discovery
- **Datamol** - Molecular manipulation and featurization with enhanced RDKit workflows
- **DeepChem** - Molecular machine learning, graph neural networks, and MoleculeNet benchmarks
- **DiffDock** - Diffusion-based molecular docking for protein-ligand binding prediction
- **MedChem** - Medicinal chemistry analysis, ADMET prediction, and drug-likeness assessment
- **Molfeat** - 100+ molecular featurizers including fingerprints, descriptors, and pretrained models
- **PyTDC** - Therapeutics Data Commons for drug discovery datasets and benchmarks
- **RDKit** - Cheminformatics toolkit for molecular I/O, descriptors, fingerprints, and SMARTS (see the usage sketch after this list)
- **TorchDrug** - PyTorch-based machine learning platform for drug discovery with 40+ datasets, 20+ GNN models for molecular property prediction, protein modeling, knowledge graph reasoning, molecular generation, and retrosynthesis planning
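
To make the cheminformatics entries concrete, here is the RDKit usage sketch referenced above: SMILES parsing, descriptors, a SMARTS match, and fingerprint similarity. The molecules are arbitrary examples.

```python
# Sketch only: basic RDKit operations on example molecules.
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

mol = Chem.MolFromSmiles("CC(=O)Oc1ccccc1C(=O)O")    # aspirin
print(Descriptors.MolWt(mol), Descriptors.MolLogP(mol))

# SMARTS substructure query: carboxylic acid
print(mol.HasSubstructMatch(Chem.MolFromSmarts("C(=O)[OH]")))

# Morgan fingerprint (ECFP4-like) Tanimoto similarity against ethanol
fp1 = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
fp2 = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles("CCO"), 2, nBits=2048)
print(DataStructs.TanimotoSimilarity(fp1, fp2))
```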
## Proteomics & Mass Spectrometry
- **matchms** - Processing and similarity matching of mass spectrometry data with 40+ filters, spectral library matching (Cosine, Modified Cosine, Neutral Losses), metadata harmonization, molecular fingerprint comparison, and support for multiple file formats (MGF, MSP, mzML, JSON)
- **pyOpenMS** - Comprehensive mass spectrometry data analysis for proteomics and metabolomics (LC-MS/MS processing, peptide identification, feature detection, quantification, chemical calculations, and integration with search engines like Comet, Mascot, MSGF+)
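
A minimal sketch of the pyOpenMS entry above: load an mzML run and inspect its spectra. The file path is hypothetical.

```python
# Sketch only: read an mzML file and look at the first MS1 spectrum.
import pyopenms as oms

exp = oms.MSExperiment()
oms.MzMLFile().load("sample.mzML", exp)              # hypothetical path

print("spectra:", exp.getNrSpectra())
for spectrum in exp.getSpectra():
    if spectrum.getMSLevel() == 1:                   # survey scans only
        mz, intensity = spectrum.get_peaks()
        print("RT:", spectrum.getRT(), "peaks:", mz.size)
        break
```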
## Medical Imaging & Digital Pathology
- **histolab** - Digital pathology toolkit for whole slide image (WSI) processing and analysis. Provides automated tissue detection, tile extraction for deep learning pipelines, and preprocessing for gigapixel histopathology images. Key features include: multi-format WSI support (SVS, TIFF, NDPI), three tile extraction strategies (RandomTiler for sampling, GridTiler for complete coverage, ScoreTiler for quality-driven selection), automated tissue masks with customizable filters, built-in scorers (NucleiScorer, CellularityScorer), pyramidal image handling, visualization tools (thumbnails, mask overlays, tile previews), and H&E stain decomposition. Supports multiple tissue sections, artifact removal, pen annotation exclusion, and reproducible extraction with seeding. Use cases: creating training datasets for computational pathology, extracting informative tiles for tumor classification, whole-slide tissue characterization, quality assessment of histology samples, automated nuclei density analysis, and preprocessing for digital pathology deep learning workflows
- **PathML** - Comprehensive computational pathology toolkit for whole slide image analysis, tissue segmentation, and machine learning on pathology data. Provides end-to-end workflows for digital pathology research including data loading, preprocessing, feature extraction, and model deployment
- **pydicom** - Pure Python package for working with DICOM (Digital Imaging and Communications in Medicine) files. Provides comprehensive support for reading, writing, and manipulating medical imaging data from CT, MRI, X-ray, ultrasound, PET scans and other modalities. Key features include: pixel data extraction and manipulation with automatic decompression (JPEG/JPEG 2000/RLE), metadata access and modification with 1000+ standardized DICOM tags, image format conversion (PNG/JPEG/TIFF), anonymization tools for removing Protected Health Information (PHI), windowing and display transformations (VOI LUT application), multi-frame and 3D volume processing, DICOM sequence handling, and support for multiple transfer syntaxes. Use cases: medical image analysis, PACS system integration, radiology workflows, research data processing, DICOM anonymization, format conversion, image preprocessing for machine learning, multi-slice volume reconstruction, and clinical imaging pipelines
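
A minimal pydicom sketch matching the entry above: read a file, inspect a few tags, pull the pixel array, and blank a direct identifier. The paths are hypothetical and the tags present depend on the file.

```python
# Sketch only: basic DICOM read/modify/write round trip.
import pydicom

ds = pydicom.dcmread("slice.dcm")                    # hypothetical input
print(ds.Modality, ds.get("PatientID", "<missing>"))

pixels = ds.pixel_array                              # NumPy array, decompressed if needed
print(pixels.dtype, pixels.shape)

ds.PatientName = "ANON"                              # crude single-tag anonymization
ds.save_as("slice_anon.dcm")
```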
## Healthcare AI & Clinical Machine Learning
- **NeuroKit2** - Comprehensive biosignal processing toolkit for analyzing physiological data including ECG, EEG, EDA, RSP, PPG, EMG, and EOG signals. Use this skill when processing cardiovascular signals, brain activity, electrodermal responses, respiratory patterns, muscle activity, or eye movements. Key features include: automated signal processing pipelines (cleaning, peak detection, delineation, quality assessment), heart rate variability analysis across time/frequency/nonlinear domains (SDNN, RMSSD, LF/HF, DFA, entropy measures), EEG analysis (frequency band power, microstates, source localization), autonomic nervous system assessment (sympathetic indices, respiratory sinus arrhythmia), comprehensive complexity measures (25+ entropy types, 15+ fractal dimensions, Lyapunov exponents), event-related and interval-related analysis modes, epoch creation and averaging for stimulus-locked responses, multi-signal integration with unified workflows, and extensive signal processing utilities (filtering, decomposition, peak correction, spectral analysis). Includes modular reference documentation across 12 specialized domains. Use cases: heart rate variability for cardiovascular health assessment, EEG microstates for consciousness studies, electrodermal activity for emotion research, respiratory variability analysis, psychophysiology experiments, affective computing, stress monitoring, sleep staging, autonomic dysfunction assessment, biofeedback applications, and multi-modal physiological signal integration for comprehensive human state monitoring. A minimal ECG-processing sketch follows this list
- **PyHealth** - Comprehensive healthcare AI toolkit for developing, testing, and deploying machine learning models with clinical data. Provides specialized tools for electronic health records (EHR), physiological signals, medical imaging, and clinical text analysis. Key features include: 10+ healthcare datasets (MIMIC-III/IV, eICU, OMOP, sleep EEG, COVID-19 CXR), 20+ predefined clinical prediction tasks (mortality, hospital readmission, length of stay, drug recommendation, sleep staging, EEG analysis), 33+ models (Logistic Regression, MLP, CNN, RNN, Transformer, GNN, plus healthcare-specific models like RETAIN, SafeDrug, GAMENet, StageNet), comprehensive data processing (sequence processors, signal processors, medical code translation between ICD-9/10, NDC, RxNorm, ATC systems), training/evaluation utilities (Trainer class, fairness metrics, calibration, uncertainty quantification), and interpretability tools (attention visualization, SHAP, ChEFER). 3x faster than pandas for healthcare data processing. Use cases: ICU mortality prediction, hospital readmission risk assessment, safe medication recommendation with drug-drug interaction constraints, sleep disorder diagnosis from EEG signals, medical code standardization and translation, clinical text to ICD coding, length of stay estimation, and any clinical ML application requiring interpretability, fairness assessment, and calibrated predictions for healthcare deployment
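
The ECG-processing sketch referenced in the NeuroKit2 entry above; it uses a simulated signal so it runs without data files, and the parameters are illustrative.

```python
# Sketch only: simulate an ECG, run the default pipeline, compute HRV.
import neurokit2 as nk

ecg = nk.ecg_simulate(duration=60, sampling_rate=250, heart_rate=70)
signals, info = nk.ecg_process(ecg, sampling_rate=250)

print(len(info["ECG_R_Peaks"]), "R-peaks detected")

# Time-domain HRV indices from the detected R-peaks
hrv = nk.hrv(info["ECG_R_Peaks"], sampling_rate=250)
print(hrv[["HRV_SDNN", "HRV_RMSSD"]])
```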
## Protein Engineering & Design
- **ESM (Evolutionary Scale Modeling)** - State-of-the-art protein language models from EvolutionaryScale for protein design, structure prediction, and representation learning. Includes ESM3 (1.4B-98B parameter multimodal generative models for simultaneous reasoning across sequence, structure, and function with chain-of-thought generation, inverse folding, and function-conditioned design) and ESM C (300M-6B parameter efficient embedding models 3x faster than ESM2 for similarity analysis, classification, and feature extraction). Supports local inference with open weights and cloud-based Forge API for scalable batch processing. Use cases: novel protein design, structure prediction from sequence, sequence design from structure, protein embeddings, function annotation, variant generation, and directed evolution workflows
## Machine Learning & Deep Learning
- **aeon** - Comprehensive scikit-learn compatible Python toolkit for time series machine learning providing state-of-the-art algorithms across 7 domains: classification (13 algorithm categories including ROCKET variants, deep learning with InceptionTime/ResNet/FCN, distance-based with DTW/ERP/LCSS, shapelet-based, dictionary methods like BOSS/WEASEL, and hybrid ensembles HIVECOTE), regression (9 categories mirroring classification approaches), clustering (k-means/k-medoids with temporal distances, deep learning autoencoders, spectral methods), forecasting (ARIMA, ETS, Theta, Threshold Autoregressive, TCN, DeepAR), anomaly detection (STOMP/MERLIN matrix profile, clustering-based CBLOF/KMeans, isolation methods, copula-based COPOD), segmentation (ClaSP, FLUSS, HMM, binary segmentation), and similarity search (MASS algorithm, STOMP motif discovery, approximate nearest neighbors). Includes 40+ distance metrics (elastic: DTW/DDTW/WDTW/Shape-DTW, edit-based: ERP/EDR/LCSS/TWE/MSM, lock-step: Euclidean/Manhattan), extensive transformations (ROCKET/MiniRocket/MultiRocket for features, Catch22/TSFresh for statistics, SAX/PAA for symbolic representation, shapelet transforms, wavelets, matrix profile), 20+ deep learning architectures (FCN, ResNet, InceptionTime, TCN, autoencoders with attention mechanisms), comprehensive benchmarking tools (UCR/UEA archives with 100+ datasets, published results repository, statistical testing), and performance-optimized implementations using numba. Features progressive model complexity from fast baselines (MiniRocket: <1 second training, 0.95+ accuracy on many benchmarks) to state-of-the-art ensembles (HIVECOTE V2), GPU acceleration support, and extensive visualization utilities. Use cases: physiological signal classification (ECG, EEG), industrial sensor monitoring, financial forecasting, change point detection, pattern discovery, activity recognition from wearables, predictive maintenance, climate time series analysis, and any sequential data requiring specialized temporal modeling beyond standard ML
- **PufferLib** - High-performance reinforcement learning library achieving 1M-4M steps/second through optimized vectorization, native multi-agent support, and efficient PPO training (PuffeRL). Use this skill for RL training on any environment (Gymnasium, PettingZoo, Atari, Procgen), creating custom PufferEnv environments, developing policies (CNN, LSTM, multi-input architectures), optimizing parallel simulation performance, or scaling multi-agent systems. Includes Ocean suite (20+ environments), seamless framework integration with automatic space flattening, zero-copy vectorization with shared memory buffers, distributed training support, and comprehensive reference guides for training workflows, environment development, vectorization optimization, policy architectures, and third-party integrations
- **PyMC** - Bayesian statistical modeling and probabilistic programming
- **PyMOO** - Multi-objective optimization with evolutionary algorithms
- **PyTorch Lightning** - Deep learning framework that organizes PyTorch code to eliminate boilerplate while maintaining full flexibility. Automates training workflows (40+ tasks including epoch/batch iteration, optimizer steps, gradient management, checkpointing), supports multi-GPU/TPU training with DDP/FSDP/DeepSpeed strategies, includes LightningModule for model organization, Trainer for automation, LightningDataModule for data pipelines, callbacks for extensibility, and integrations with TensorBoard, Wandb, MLflow for experiment tracking
- **scikit-learn** - Industry-standard Python library for classical machine learning providing comprehensive supervised learning (classification: Logistic Regression, SVM, Decision Trees, Random Forests with 17+ variants, Gradient Boosting with LightGBM-inspired HistGradientBoosting, Naive Bayes, KNN, Neural Networks/MLP; regression: Linear, Ridge, Lasso, ElasticNet, SVR, ensemble methods), unsupervised learning (clustering: K-Means, DBSCAN, HDBSCAN, OPTICS, Agglomerative/Hierarchical, Spectral, Gaussian Mixture Models, BIRCH, MeanShift; dimensionality reduction: PCA, Kernel PCA, t-SNE, Isomap, LLE, NMF, TruncatedSVD, FastICA, LDA; outlier detection: IsolationForest, LocalOutlierFactor, OneClassSVM), data preprocessing (scaling: StandardScaler, MinMaxScaler, RobustScaler; encoding: OneHotEncoder, OrdinalEncoder, LabelEncoder; imputation: SimpleImputer, KNNImputer, IterativeImputer; feature engineering: PolynomialFeatures, KBinsDiscretizer, text vectorization with CountVectorizer/TfidfVectorizer), model evaluation (cross-validation: KFold, StratifiedKFold, TimeSeriesSplit, GroupKFold; hyperparameter tuning: GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV; metrics: 30+ evaluation metrics for classification/regression/clustering including accuracy, precision, recall, F1, ROC-AUC, MSE, R², silhouette score), and Pipeline/ColumnTransformer for production-ready workflows. Features consistent API (fit/predict/transform), extensive documentation, integration with NumPy/pandas/SciPy, joblib persistence, and scikit-learn-compatible ecosystem (XGBoost, LightGBM, CatBoost, imbalanced-learn). Optimized implementations using Cython/OpenMP for performance. Use cases: predictive modeling, customer segmentation, anomaly detection, feature engineering, model selection/validation, text classification, image classification (with feature extraction), time series forecasting (with preprocessing), medical diagnosis, fraud detection, recommendation systems, and any tabular data ML task requiring interpretable models or established algorithms
- **scikit-survival** - Survival analysis and time-to-event modeling with censored data. Built on scikit-learn, provides Cox proportional hazards models (CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis with elastic net regularization), ensemble methods (Random Survival Forests, Gradient Boosting), Survival Support Vector Machines (linear and kernel), non-parametric estimators (Kaplan-Meier, Nelson-Aalen), competing risks analysis, and specialized evaluation metrics (concordance index, time-dependent AUC, Brier score). Handles right-censored data, integrates with scikit-learn pipelines, and supports feature selection and hyperparameter tuning via cross-validation
- **SHAP** - Model interpretability and explainability using Shapley values from game theory. Provides unified approach to explain any ML model with TreeExplainer (fast exact explanations for XGBoost/LightGBM/Random Forest), DeepExplainer (TensorFlow/PyTorch neural networks), KernelExplainer (model-agnostic), and LinearExplainer. Includes comprehensive visualizations (waterfall plots for individual predictions, beeswarm plots for global importance, scatter plots for feature relationships, bar/force/heatmap plots), supports model debugging, fairness analysis, feature engineering guidance, and production deployment
- **Stable Baselines3** - PyTorch-based reinforcement learning library providing reliable implementations of RL algorithms (PPO, SAC, DQN, TD3, DDPG, A2C, HER, RecurrentPPO). Use this skill for training RL agents on standard or custom Gymnasium environments, implementing callbacks for monitoring and control, using vectorized environments for parallel training, creating custom environments with proper Gymnasium API implementation, and integrating with deep RL workflows. Includes comprehensive training templates, evaluation utilities, algorithm selection guidance (on-policy vs off-policy, continuous vs discrete actions), support for multi-input policies (dict observations), goal-conditioned learning with HER, and integration with TensorBoard for experiment tracking
- **statsmodels** - Statistical modeling and econometrics (OLS, GLM, logit/probit, ARIMA, time series forecasting, hypothesis testing, diagnostics)
- **Torch Geometric** - Graph Neural Networks for molecular and geometric data
- **Transformers** - State-of-the-art machine learning models for NLP, computer vision, audio, and multimodal tasks. Provides 1M+ pre-trained models accessible via pipelines (text-classification, NER, QA, summarization, translation, text-generation, image-classification, object-detection, ASR, VQA), comprehensive training via Trainer API with distributed training and mixed precision, flexible text generation with multiple decoding strategies (greedy, beam search, sampling), and Auto classes for automatic architecture selection (BERT, GPT, T5, ViT, BART, etc.)
- **UMAP-learn** - Dimensionality reduction and manifold learning
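
A minimal sketch pairing scikit-learn preprocessing and clustering with the UMAP embedding described above; the digits dataset is just a convenient built-in example.

```python
# Sketch only: scale -> embed with UMAP -> cluster in the embedding.
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
import umap

X, y = load_digits(return_X_y=True)
X_scaled = StandardScaler().fit_transform(X)

# n_neighbors/min_dist are the main knobs for local vs global structure
embedding = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42).fit_transform(X_scaled)

labels = KMeans(n_clusters=10, n_init=10, random_state=42).fit_predict(embedding)
print(embedding.shape, labels[:10])
```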
## Materials Science & Chemistry
- **Astropy** - Comprehensive Python library for astronomy and astrophysics providing core functionality for astronomical research and data analysis. Includes coordinate system transformations (ICRS, Galactic, FK5, AltAz), physical units and quantities with automatic dimensional consistency, FITS file operations (reading, writing, manipulating headers and data), cosmological calculations (luminosity distance, lookback time, Hubble parameter, Planck/WMAP models), precise time handling across multiple time scales (UTC, TAI, TT, TDB) and formats (JD, MJD, ISO), table operations with unit support (FITS, CSV, HDF5, VOTable), WCS transformations between pixel and world coordinates, astronomical constants, modeling framework, visualization tools, and statistical functions. Use for celestial coordinate transformations, unit conversions, FITS image/table processing, cosmological distance calculations, barycentric time corrections, catalog cross-matching, and astronomical data analysis
- **COBRApy** - Constraint-based metabolic modeling and flux balance analysis
- **Pymatgen** - Materials structure analysis, phase diagrams, and electronic structure
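
A minimal pymatgen sketch for the entry above: build rock-salt NaCl from its space group and query basic properties. The lattice constant is approximate, and the symmetry call assumes spglib is available (it ships with pymatgen).

```python
# Sketch only: construct a structure and inspect derived properties.
from pymatgen.core import Lattice, Structure

structure = Structure.from_spacegroup(
    "Fm-3m", Lattice.cubic(5.64), ["Na", "Cl"],
    [[0, 0, 0], [0.5, 0.5, 0.5]],
)
print(structure.composition.reduced_formula)         # NaCl
print(round(structure.density, 2), "g/cm^3")
print(structure.get_space_group_info())              # ("Fm-3m", 225)
```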
## Data Analysis & Visualization
- **Dask** - Parallel computing for larger-than-memory datasets with distributed DataFrames, Arrays, Bags, and Futures
- **Data Commons** - Programmatic access to public statistical data from global sources including census bureaus, health organizations, and environmental agencies. Provides unified Python API for querying demographic data, economic indicators, health statistics, and environmental datasets through a knowledge graph interface. Features three main endpoints: Observation (statistical time-series queries for population, GDP, unemployment rates, disease prevalence), Node (knowledge graph exploration for entity relationships and hierarchies), and Resolve (entity identification from names, coordinates, or Wikidata IDs). Seamless Pandas integration for DataFrames, relation expressions for hierarchical queries, data source filtering for consistency, and support for custom Data Commons instances
- **Matplotlib** - Publication-quality plotting and visualization
- **NetworkX** - Comprehensive toolkit for creating, analyzing, and visualizing complex networks and graphs. Supports four graph types (Graph, DiGraph, MultiGraph, MultiDiGraph) with nodes as any hashable objects and rich edge attributes. Provides 100+ algorithms including shortest paths (Dijkstra, Bellman-Ford, A*), centrality measures (degree, betweenness, closeness, eigenvector, PageRank), clustering (coefficients, triangles, transitivity), community detection (modularity-based, label propagation, Girvan-Newman), connectivity analysis (components, cuts, flows), tree algorithms (MST, spanning trees), matching, graph coloring, isomorphism, and traversal (DFS, BFS). Includes 50+ graph generators for classic (complete, cycle, wheel), random (Erdős-Rényi, Barabási-Albert, Watts-Strogatz, stochastic block model), lattice (grid, hexagonal, hypercube), and specialized networks. Supports I/O across formats (edge lists, GraphML, GML, JSON, Pajek, GEXF, DOT) with Pandas/NumPy/SciPy integration. Visualization capabilities include 8+ layout algorithms (spring/force-directed, circular, spectral, Kamada-Kawai), customizable node/edge appearance, interactive visualizations with Plotly/PyVis, and publication-quality figure generation. Use cases: social network analysis, biological networks (protein-protein interactions, gene regulatory networks, metabolic pathways), transportation systems, citation networks, knowledge graphs, web structure analysis, infrastructure networks, and any domain involving pairwise relationships requiring structural analysis or graph-based modeling
- **Polars** - High-performance DataFrame operations with lazy evaluation
- **Seaborn** - Statistical data visualization with dataset-oriented interface, automatic confidence intervals, publication-quality themes, colorblind-safe palettes, and comprehensive support for exploratory analysis, distribution comparisons, correlation matrices, regression plots, and multi-panel figures
- **SimPy** - Process-based discrete-event simulation framework for modeling systems with processes, queues, and resource contention (manufacturing, service operations, network traffic, logistics). Supports generator-based process definition, multiple resource types (Resource, PriorityResource, PreemptiveResource, Container, Store), event-driven scheduling, process interaction mechanisms (signaling, interruption, parallel/sequential execution), real-time simulation synchronized with wall-clock time, and comprehensive monitoring capabilities for utilization, wait times, and queue statistics
- **SymPy** - Symbolic mathematics in Python for exact computation using mathematical symbols rather than numerical approximations. Provides comprehensive support for symbolic algebra (simplification, expansion, factorization), calculus (derivatives, integrals, limits, series), equation solving (algebraic, differential, systems of equations), matrices and linear algebra (eigenvalues, decompositions, solving linear systems), physics (classical mechanics with Lagrangian/Hamiltonian formulations, quantum mechanics, vector analysis, units), number theory (primes, factorization, modular arithmetic, Diophantine equations), geometry (2D/3D analytic geometry), combinatorics (permutations, combinations, partitions, group theory), logic and sets, statistics (probability distributions, random variables), special functions (gamma, Bessel, orthogonal polynomials), and code generation (lambdify to NumPy/SciPy functions, C/Fortran code generation, LaTeX output for documentation). Emphasizes exact arithmetic using rational numbers and symbolic representations, supports assumptions for improved simplification (positive, real, integer), integrates seamlessly with NumPy/SciPy through lambdify for fast numerical evaluation, and enables symbolic-to-numeric pipelines for scientific computing workflows
- **Vaex** - High-performance Python library for lazy, out-of-core DataFrames to process and visualize tabular datasets larger than available RAM. Processes over a billion rows per second through memory-mapped files (HDF5, Apache Arrow), lazy evaluation, and virtual columns (zero memory overhead). Provides instant file opening, efficient aggregations across billions of rows, interactive visualizations without sampling, machine learning pipelines with transformers (scalers, encoders, PCA), and seamless integration with pandas/NumPy/Arrow. Includes comprehensive ML framework (vaex.ml) with feature scaling, categorical encoding, dimensionality reduction, and integration with scikit-learn/XGBoost/LightGBM/CatBoost. Supports distributed computing via Dask, asynchronous operations, and state management for production deployment. Use cases: processing gigabyte to terabyte datasets, fast statistical aggregations on massive data, visualizing billion-row datasets, ML pipelines on big data, converting between data formats, and working with astronomical, financial, or scientific large-scale datasets
- **ReportLab** - Programmatic PDF generation for reports and documents
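
A minimal ReportLab sketch for the entry above: a one-page PDF with a title and a few lines of text. The output path and contents are illustrative.

```python
# Sketch only: low-level canvas API; Platypus offers higher-level layout.
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

c = canvas.Canvas("report.pdf", pagesize=letter)     # hypothetical output
width, height = letter

c.setFont("Helvetica-Bold", 16)
c.drawString(72, height - 72, "Analysis Report")

c.setFont("Helvetica", 10)
for i, line in enumerate(["sample_id: S-001", "n_cells: 4812", "status: PASS"]):
    c.drawString(72, height - 110 - 14 * i, line)

c.showPage()
c.save()
```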
## Phylogenetics & Trees
- **ETE Toolkit** - Phylogenetic tree manipulation, visualization, and analysis
## Genomics Tools
- **deepTools** - NGS data analysis (ChIP-seq, RNA-seq, ATAC-seq) with BAM/bigWig files
- **FlowIO** - Flow Cytometry Standard (FCS) file reading and manipulation
- **scikit-bio** - Bioinformatics sequence analysis and diversity metrics
- **Zarr** - Chunked, compressed N-dimensional array storage
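
A minimal Zarr sketch for the entry above: create a chunked on-disk array and read a slice back. Paths and shapes are illustrative; the v3 release reorganized parts of the API, so details may vary with the installed version.

```python
# Sketch only: chunked array I/O; only touched chunks hit the disk.
import numpy as np
import zarr

z = zarr.open("example.zarr", mode="w",
              shape=(10_000, 10_000), chunks=(1_000, 1_000), dtype="f4")
z[0, :] = np.arange(10_000, dtype="f4")

z2 = zarr.open("example.zarr", mode="r")
print(z2.shape, z2.chunks, z2[0, :5])
```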
## Multi-omics & AI Agent Frameworks
- **BIOMNI** - Autonomous biomedical AI agent framework from Stanford SNAP lab for executing complex research tasks across genomics, drug discovery, molecular biology, and clinical analysis. Combines LLM reasoning with code execution and ~11GB of integrated biomedical databases (Ensembl, NCBI Gene, UniProt, PDB, AlphaFold, ClinVar, OMIM, HPO, PubMed, KEGG, Reactome, GO). Supports multiple LLM providers (Claude, GPT-4, Gemini, Groq, Bedrock). Includes A1 agent class for autonomous task decomposition, BiomniEval1 benchmark framework, and MCP server integration. Use cases: CRISPR screening design, single-cell RNA-seq analysis, ADMET prediction, GWAS interpretation, rare disease diagnosis, protein structure analysis, literature synthesis, and multi-omics integration
- **Denario** - Multiagent AI system for scientific research assistance that automates complete research workflows from data analysis through publication. Built on AG2 and LangGraph frameworks, orchestrates specialized agents for hypothesis generation, methodology development, computational analysis, and LaTeX paper writing. Supports multiple LLM providers (Google Vertex AI, OpenAI) with flexible pipeline stages allowing manual or automated inputs. Key features include: end-to-end research automation (data description → idea generation → methodology → results → paper), journal-specific formatting (APS and others), GUI interface via Streamlit, Docker deployment with LaTeX environment, reproducible research with version-controlled outputs, literature search integration, and integration with scientific Python stack (pandas, sklearn, scipy). Provides both programmatic Python API and web-based interface. Use cases: automated hypothesis generation from datasets, research methodology development, computational experiment execution with visualization, publication-ready manuscript generation, time-series analysis research, machine learning experiment automation, and accelerating the complete scientific research lifecycle from ideation to publication
- **HypoGeniC** - Automated hypothesis generation and testing using large language models to accelerate scientific discovery. Provides three frameworks: HypoGeniC (data-driven hypothesis generation from observational data), HypoRefine (synergistic approach combining literature insights with empirical patterns through an agentic system), and Union methods (mechanistic combination of literature and data-driven hypotheses). Features iterative refinement that improves hypotheses by learning from challenging examples, Redis caching for API cost reduction, and customizable YAML-based prompt templates. Includes command-line tools for generation (hypogenic_generation) and testing (hypogenic_inference). Research applications have demonstrated 14.19% accuracy improvement in AI-content detection and 7.44% in deception detection. Use cases: deception detection in reviews, AI-generated content identification, mental stress detection, exploratory research without existing literature, hypothesis-driven analysis in novel domains, and systematic exploration of competing explanations
## Scientific Communication & Publishing
- **Paper-2-Web** - Autonomous pipeline for transforming academic papers into multiple promotional formats using the Paper2All system. Converts LaTeX or PDF papers into: (1) Paper2Web - interactive, layout-aware academic homepages with responsive design, interactive figures, and mobile support; (2) Paper2Video - professional presentation videos with slides, narration, cursor movements, and optional talking-head generation using Hallo2; (3) Paper2Poster - print-ready conference posters with custom dimensions, professional layouts, and institution branding. Supports GPT-4/GPT-4.1 models, batch processing, QR code generation, multi-language content, and quality assessment metrics. Use cases: conference materials, video abstracts, preprint enhancement, research promotion, poster sessions, and academic website creation
## Document Processing & Conversion
- **MarkItDown** - Python utility for converting 20+ file formats to Markdown optimized for LLM processing. Converts Office documents (PDF, DOCX, PPTX, XLSX), images with OCR, audio with transcription, web content (HTML, YouTube transcripts, EPUB), and structured data (CSV, JSON, XML) while preserving document structure (headings, lists, tables, hyperlinks). Key features include: Azure Document Intelligence integration for enhanced PDF table extraction, LLM-powered image descriptions using GPT-4o, batch processing with ZIP archive support, modular installation for specific formats, streaming approach without temporary files, and plugin system for custom converters. Supports Python 3.10+. Use cases: preparing documents for RAG systems, extracting text from PDFs and Office files, transcribing audio to text, performing OCR on images and scanned documents, converting YouTube videos to searchable text, processing HTML and EPUB books, converting structured data to readable format, document analysis pipelines, and LLM training data preparation
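
A minimal MarkItDown sketch for the entry above; the input path is hypothetical and format support depends on the optional extras installed.

```python
# Sketch only: convert a document to Markdown for LLM consumption.
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("paper.pdf")                     # hypothetical input
print(result.text_content[:500])
```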
## Laboratory Automation & Equipment Control
- **PyLabRobot** - Hardware-agnostic, pure Python SDK for automated and autonomous laboratories. Provides unified interface for controlling liquid handling robots (Hamilton STAR/STARlet, Opentrons OT-2, Tecan EVO), plate readers (BMG CLARIOstar), heater shakers, incubators, centrifuges, pumps, and scales. Key features include: modular resource management system for plates, tips, and containers with hierarchical deck layouts and JSON serialization; comprehensive liquid handling operations (aspirate, dispense, transfer, serial dilutions, plate replication) with automatic tip and volume tracking; backend abstraction enabling hardware-agnostic protocols that work across different robots; ChatterboxBackend for protocol simulation and testing without hardware; browser-based visualizer for real-time 3D deck state visualization; cross-platform support (Windows, macOS, Linux, Raspberry Pi); and integration capabilities for multi-device workflows combining liquid handlers, analytical equipment, and material handling devices. Use cases: automated sample preparation, high-throughput screening, serial dilution protocols, plate reading workflows, laboratory protocol development and validation, robotic liquid handling automation, and reproducible laboratory automation with state tracking and persistence
## Tool Discovery & Research Platforms
- **ToolUniverse** - Unified ecosystem providing standardized access to 600+ scientific tools, models, datasets, and APIs across bioinformatics, cheminformatics, genomics, structural biology, and proteomics. Enables AI agents to function as research scientists through: (1) Tool Discovery - natural language, semantic, and keyword-based search for finding relevant scientific tools (Tool_Finder, Tool_Finder_LLM, Tool_Finder_Keyword); (2) Tool Execution - standardized AI-Tool Interaction Protocol for running tools with consistent interfaces; (3) Tool Composition - sequential and parallel workflow chaining for multi-step research pipelines; (4) Model Context Protocol (MCP) integration for Claude Desktop/Code. Supports drug discovery workflows (disease→targets→structures→screening→candidates), genomics analysis (expression→differential analysis→pathways), clinical genomics (variants→annotation→pathogenicity→disease associations), and cross-domain research. Use cases: accessing scientific databases (OpenTargets, PubChem, UniProt, PDB, ChEMBL, KEGG), protein structure prediction (AlphaFold), molecular docking, pathway enrichment, variant annotation, literature searches, and automated scientific workflows

176
docs/scientific-skills.md Normal file
View File

@@ -0,0 +1,176 @@
# Scientific Skills
## Scientific Databases
- **AlphaFold DB** - Comprehensive AI-predicted protein structure database from DeepMind providing 200M+ high-confidence protein structure predictions covering UniProt reference proteomes and beyond. Includes confidence metrics (pLDDT for per-residue confidence, PAE for pairwise accuracy estimates), structure quality assessment, predicted aligned error matrices, and multiple structure formats (PDB, mmCIF, AlphaFold DB format). Supports programmatic access via REST API, bulk downloads through Google Cloud Storage, and integration with structural analysis tools. Enables structure-based drug discovery, protein function prediction, structural genomics, comparative modeling, and structural bioinformatics research without experimental structure determination
- **ChEMBL** - Comprehensive manually curated database of bioactive molecules with drug-like properties maintained by EMBL-EBI. Contains 2M+ unique compounds, 19M+ bioactivity measurements, 13K+ protein targets, and 1.1M+ assays from 90K+ publications. Provides detailed compound information including chemical structures (SMILES, InChI), bioactivity data (IC50, EC50, Ki, Kd values), target information (protein families, pathways), ADMET properties, drug indications, clinical trial data, and patent information. Features REST API access, web interface, downloadable data files, and integration with other databases (UniProt, PubChem, DrugBank). Use cases: drug discovery, target identification, lead optimization, bioactivity prediction, chemical biology research, and drug repurposing
- **ClinPGx** - Clinical pharmacogenomics database (successor to PharmGKB) providing gene-drug interactions, CPIC clinical guidelines, allele functions, drug labels, and pharmacogenomic annotations for precision medicine and personalized pharmacotherapy (consolidates PharmGKB, CPIC, and PharmCAT resources)
- **ClinVar** - NCBI's public archive of genomic variants and their clinical significance with standardized classifications (pathogenic, benign, VUS), E-utilities API access, and bulk FTP downloads for variant interpretation and precision medicine research
- **ClinicalTrials.gov** - Comprehensive registry of clinical studies conducted worldwide (maintained by U.S. National Library of Medicine) with API v2 access for searching trials by condition, intervention, location, sponsor, study status, and phase; retrieve detailed trial information including eligibility criteria, outcomes, contacts, and locations; export to CSV/JSON formats for analysis (public API, no authentication required, ~50 req/min rate limit)
- **COSMIC** - Catalogue of Somatic Mutations in Cancer, the world's largest database of somatic cancer mutations (millions of mutations across thousands of cancer types, Cancer Gene Census, mutational signatures, structural variants, and drug resistance data)
- **DrugBank** - Comprehensive bioinformatics and cheminformatics database containing detailed drug and drug target information (9,591+ drug entries including 2,037 FDA-approved small molecules, 241 biotech drugs, 96 nutraceuticals, 6,000+ experimental compounds) with 200+ data fields per entry covering chemical structures (SMILES, InChI), pharmacology (mechanism of action, pharmacodynamics, ADME), drug-drug interactions, protein targets (enzymes, transporters, carriers), biological pathways, external identifiers (PubChem, ChEMBL, UniProt), and physicochemical properties for drug discovery, pharmacology research, interaction analysis, target identification, chemical similarity searches, and ADMET predictions
- **ENA (European Nucleotide Archive)** - Comprehensive public repository for nucleotide sequence data and metadata with REST APIs for accessing sequences, assemblies, samples, studies, and reads; supports advanced search, taxonomy lookups, and bulk downloads via FTP/Aspera (rate limit: 50 req/sec)
- **Ensembl** - Genome browser and bioinformatics database providing genomic annotations, sequences, variants, and comparative genomics data for 250+ vertebrate species (Release 115, 2025) with comprehensive REST API for gene lookups, sequence retrieval, variant effect prediction (VEP), ortholog finding, assembly mapping (GRCh37/GRCh38), and region analysis
- **FDA Databases** - Comprehensive access to all FDA (Food and Drug Administration) regulatory databases through openFDA API covering drugs (adverse events, labeling, NDC, recalls, approvals, shortages), medical devices (adverse events, 510k clearances, PMA, UDI, classifications), foods (recalls, adverse events, allergen tracking), animal/veterinary medicines (species-specific adverse events), and substances (UNII/CAS lookup, chemical structures, molecular data) for drug safety research, pharmacovigilance, regulatory compliance, and scientific analysis
- **GEO (Gene Expression Omnibus)** - NCBI's comprehensive public repository for high-throughput gene expression and functional genomics data. Contains 264K+ studies, 8M+ samples, and petabytes of data from microarray, RNA-seq, ChIP-seq, ATAC-seq, and other high-throughput experiments. Provides standardized data submission formats (MINIML, SOFT), programmatic access via Entrez Programming Utilities (E-utilities) and GEOquery R package, bulk FTP downloads, and web-based search and retrieval. Supports data mining, meta-analysis, differential expression analysis, and cross-study comparisons. Includes curated datasets, series records with experimental design, platform annotations, and sample metadata. Use cases: gene expression analysis, biomarker discovery, disease mechanism research, drug response studies, and functional genomics research
- **GWAS Catalog** - NHGRI-EBI catalog of published genome-wide association studies with curated SNP-trait associations (thousands of studies, genome-wide significant associations p≤5×10⁻⁸), full summary statistics, REST API access for variant/trait/gene queries, and FTP downloads for genetic epidemiology and precision medicine research
- **HMDB (Human Metabolome Database)** - Comprehensive metabolomics resource with 220K+ metabolite entries, detailed chemical/biological data, concentration ranges, disease associations, pathways, and spectral data for metabolite identification and biomarker discovery
- **KEGG** - Kyoto Encyclopedia of Genes and Genomes, comprehensive database resource integrating genomic, chemical, and systemic functional information. Provides pathway databases (KEGG PATHWAY with 500+ reference pathways, metabolic pathways, signaling pathways, disease pathways), genome databases (KEGG GENES with gene catalogs from 5,000+ organisms), chemical databases (KEGG COMPOUND, KEGG DRUG, KEGG GLYCAN), and disease/drug databases (KEGG DISEASE, KEGG DRUG). Features pathway enrichment analysis, gene-to-pathway mapping, compound searches, molecular interaction networks, ortholog identification (KO - KEGG Orthology), ID conversion across databases, and visualization tools. Supports REST API access, KEGG Mapper for pathway mapping, and integration with bioinformatics tools. Use cases: pathway enrichment analysis, metabolic pathway reconstruction, drug target identification, comparative genomics, systems biology, and functional annotation of genes
- **Metabolomics Workbench** - NIH Common Fund metabolomics data repository with 4,200+ processed studies, standardized nomenclature (RefMet), mass spectrometry searches, and comprehensive REST API for accessing metabolite structures, study metadata, experimental results, and gene/protein-metabolite associations
- **Open Targets** - Comprehensive therapeutic target identification and validation platform integrating genetics, omics, and chemical data (200M+ evidence strings, target-disease associations with scoring, tractability assessments, safety liabilities, known drugs from ChEMBL, GraphQL API) for drug target discovery, prioritization, evidence evaluation, drug repurposing, competitive intelligence, and mechanism research
- **NCBI Gene** - Comprehensive gene-specific database from NCBI providing curated information about genes from 500+ organisms. Contains gene nomenclature (official symbols, aliases, full names), genomic locations (chromosomal positions, exons, introns), sequences (genomic, mRNA, protein), gene function and phenotypes, pathways and interactions, orthologs and paralogs, variation data (SNPs, mutations), expression data, and cross-references to 200+ external databases (UniProt, Ensembl, HGNC, OMIM, Reactome). Supports programmatic access via E-utilities API (Entrez Programming Utilities) and NCBI Datasets API, bulk downloads, and web interface. Enables gene annotation, comparative genomics, variant interpretation, pathway analysis, and integration with other NCBI resources (PubMed, dbSNP, ClinVar). Use cases: gene information retrieval, variant annotation, functional genomics, disease gene discovery, and bioinformatics workflows
- **Protein Data Bank (PDB)** - Worldwide repository for 3D structural data of proteins, nucleic acids, and biological macromolecules. Contains 200K+ experimentally determined structures from X-ray crystallography, NMR spectroscopy, and cryo-electron microscopy. Provides comprehensive structure information including atomic coordinates, experimental data, structure quality metrics, ligand binding sites, protein-protein interfaces, and metadata (authors, methods, citations). Features advanced search capabilities (by sequence, structure similarity, ligand, organism, resolution), REST API and FTP access, structure visualization tools, and integration with analysis software. Supports structure comparison, homology modeling, drug design, structural biology research, and educational use. Maintained by wwPDB consortium (RCSB PDB, PDBe, PDBj, BMRB). Use cases: structural biology research, drug discovery, protein engineering, molecular modeling, and structural bioinformatics
- **PubChem** - World's largest free chemical information database maintained by NCBI. Contains 110M+ unique chemical compounds, 270M+ bioactivity test results, 300M+ chemical structures, and 1M+ patents. Provides comprehensive compound information including chemical structures (2D/3D structures, SMILES, InChI), physicochemical properties (molecular weight, logP, H-bond donors/acceptors), bioactivity data (assays, targets, pathways), safety and toxicity data, literature references, and vendor information. Features REST API (PUG REST, PUG SOAP, PUG View), web interface with advanced search, bulk downloads, and integration with other NCBI resources. Supports chemical similarity searches, substructure searches, property-based filtering, and cheminformatics analysis. Use cases: drug discovery, chemical biology, lead identification, ADMET prediction, chemical database mining, and molecular property analysis
- **PubMed** - NCBI's comprehensive biomedical literature database containing 35M+ citations from MEDLINE, life science journals, and online books. Provides access to abstracts, full-text articles (when available), MeSH (Medical Subject Headings) terms, author information, publication dates, and citation networks. Features advanced search capabilities with Boolean operators, field tags (author, title, journal, MeSH terms, publication date), filters (article type, species, language, publication date range), and saved searches with email alerts. Supports programmatic access via E-utilities API (Entrez Programming Utilities), bulk downloads, citation export in multiple formats (RIS, BibTeX, MEDLINE), and integration with reference management software. Includes PubMed Central (PMC) for open-access full-text articles. Use cases: literature searches, systematic reviews, citation analysis, research discovery, and staying current with scientific publications
- **Reactome** - Curated pathway database for biological processes and molecular interactions (2,825+ human pathways, 16K+ reactions, 11K+ proteins) with pathway enrichment analysis, expression data analysis, and species comparison using Content Service and Analysis Service APIs
- **STRING** - Protein-protein interaction network database (5,000+ genomes, 59.3M proteins, 20B+ interactions) with functional enrichment analysis, interaction partner discovery, and network visualization from experimental data, computational prediction, and text mining
- **UniProt** - Universal Protein Resource for protein sequences, annotations, and functional information (UniProtKB/Swiss-Prot reviewed entries, TrEMBL unreviewed entries) with REST API access for search, retrieval, ID mapping, and batch operations across 200+ databases (see the REST query sketch after this list)
- **USPTO** - United States Patent and Trademark Office data access including patent searches, trademark lookups, patent examination history (PEDS), office actions, assignments, citations, and litigation records; supports PatentSearch API (Elasticsearch-based patent search), TSDR (Trademark Status & Document Retrieval), Patent/Trademark Assignment APIs, and additional specialized APIs for comprehensive IP analysis
- **ZINC** - Free database of commercially available compounds for virtual screening and drug discovery maintained by UCSF. Contains 230M+ purchasable compounds from 100+ vendors in ready-to-dock 3D formats (SDF, MOL2) with pre-computed conformers. Provides compound information including chemical structures, vendor information and pricing, physicochemical properties (molecular weight, logP, H-bond donors/acceptors, rotatable bonds), drug-likeness filters (Lipinski's Rule of Five, Veber rules), and substructure search capabilities. Features multiple compound subsets (drug-like, lead-like, fragment-like, natural products), downloadable subsets for specific screening campaigns, and integration with molecular docking software (AutoDock, DOCK, Glide). Supports structure-based and ligand-based virtual screening workflows. Use cases: virtual screening campaigns, lead identification, compound library design, high-throughput docking, and drug discovery research
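
The REST query sketch referenced in the UniProt entry above: search UniProtKB for reviewed human TP53 entries. The query string and field list are a small illustrative subset of what the API accepts.

```python
# Sketch only: query the public UniProtKB search endpoint.
import requests

resp = requests.get(
    "https://rest.uniprot.org/uniprotkb/search",
    params={
        "query": "gene:TP53 AND organism_id:9606 AND reviewed:true",
        "format": "json",
        "fields": "accession,protein_name,length",
        "size": 5,
    },
    timeout=30,
)
resp.raise_for_status()
for entry in resp.json()["results"]:
    print(entry["primaryAccession"])
```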
## Scientific Integrations
### Laboratory Information Management Systems (LIMS) & R&D Platforms
- **Benchling Integration** - Toolkit for integrating with Benchling's R&D platform, providing programmatic access to laboratory data management including registry entities (DNA sequences, proteins), inventory systems (samples, containers, locations), electronic lab notebooks (entries, protocols), workflows (tasks, automation), and data exports using Python SDK and REST API
### Cloud Platforms for Genomics & Biomedical Data
- **DNAnexus Integration** - Comprehensive toolkit for working with the DNAnexus cloud platform for genomics and biomedical data analysis. Covers building and deploying apps/applets (Python/Bash), managing data objects (files, records, databases), running analyses and workflows, using the dxpy Python SDK, and configuring app metadata and dependencies (dxapp.json setup, system packages, Docker, assets). Enables processing of FASTQ/BAM/VCF files, bioinformatics pipelines, job execution, workflow orchestration, and platform operations including project management and permissions
### Laboratory Automation
- **Opentrons Integration** - Toolkit for creating, editing, and debugging Opentrons Python Protocol API v2 protocols for laboratory automation using Flex and OT-2 robots. Enables automated liquid handling, pipetting workflows, hardware module control (thermocycler, temperature, magnetic, heater-shaker, absorbance plate reader), labware management, and complex protocol development for biological and chemical experiments. A minimal protocol sketch follows this list
### Electronic Lab Notebooks (ELN)
- **LabArchives Integration** - Toolkit for interacting with LabArchives Electronic Lab Notebook (ELN) REST API. Provides programmatic access to notebooks (backup, retrieval, management), entries (creation, comments, attachments), user authentication, site reports and analytics, and third-party integrations (Protocols.io, GraphPad Prism, SnapGene, Geneious, Jupyter, REDCap). Includes Python scripts for configuration setup, notebook operations, and entry management. Supports multi-regional API endpoints (US, UK, Australia) and OAuth authentication
### Workflow Platforms & Cloud Execution
- **LatchBio Integration** - Integration with the Latch platform for building, deploying, and executing bioinformatics workflows. Provides comprehensive support for creating serverless bioinformatics pipelines using Python decorators, deploying Nextflow/Snakemake pipelines, managing cloud data (LatchFile, LatchDir) and structured Registry (Projects, Tables, Records), configuring computational resources (CPU, GPU, memory, storage), and using pre-built Latch Verified workflows (RNA-seq, AlphaFold, DESeq2, single-cell analysis, CRISPR editing). Enables automatic containerization, UI generation, workflow versioning, and execution on scalable cloud infrastructure
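A sketch of the decorator pattern, assuming the import paths shown in the Latch SDK docs (`small_task`, `workflow`, `LatchFile`); verify the names against your installed SDK version:

```python
# Define a one-task Latch workflow that counts lines in an input file.
from latch import small_task, workflow
from latch.types import LatchFile

@small_task
def count_lines(infile: LatchFile) -> int:
    # .local_path materializes the cloud object on the task's disk
    with open(infile.local_path) as fh:
        return sum(1 for _ in fh)

@workflow
def line_count_wf(infile: LatchFile) -> int:
    """Count lines in an input file."""
    return count_lines(infile=infile)
```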
### Microscopy & Bio-image Data
- **OMERO Integration** - Toolkit for interacting with OMERO microscopy data management systems using Python. Provides comprehensive access to microscopy images stored in OMERO servers, including dataset and screening data retrieval, pixel data analysis, annotation and metadata management, regions of interest (ROIs) creation and analysis, batch processing, OMERO.scripts development, and OMERO.tables for structured data storage. Essential for researchers working with high-content screening data, multi-dimensional microscopy datasets, or collaborative image repositories
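A minimal omero-py sketch; the host and credentials are placeholders:

```python
# Connect to an OMERO server and walk projects and their datasets.
from omero.gateway import BlitzGateway

conn = BlitzGateway("username", "password", host="omero.example.org", port=4064)
if not conn.connect():
    raise RuntimeError("OMERO login failed")
try:
    for project in conn.getObjects("Project"):
        print("Project:", project.getName())
        for dataset in project.listChildren():
            print("  Dataset:", dataset.getName())
finally:
    conn.close()
```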
### Protocol Management & Sharing
- **Protocols.io Integration** - Integration with protocols.io API for managing scientific protocols. Enables programmatic access to protocol discovery (search by keywords, DOI, category), protocol lifecycle management (create, update, publish with DOI), step-by-step procedure documentation, collaborative development with workspaces and discussions, file management (upload data, images, documents), experiment tracking and documentation, and data export. Supports OAuth authentication, protocol PDF generation, materials management, threaded comments, workspace permissions, and institutional protocol repositories. Essential for protocol standardization, reproducibility, lab knowledge management, and scientific collaboration
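A minimal sketch of a public-protocol keyword search, assuming the v3 REST endpoint layout from the public API docs; the access token is a placeholder and the `items` response key should be verified:

```python
# Search public protocols.io protocols for a keyword.
import requests

resp = requests.get(
    "https://www.protocols.io/api/v3/protocols",
    headers={"Authorization": "Bearer YOUR_ACCESS_TOKEN"},  # placeholder
    params={"filter": "public", "key": "CRISPR", "page_size": 5},
    timeout=30,
)
resp.raise_for_status()
for item in resp.json().get("items", []):
    print(item.get("title"), item.get("doi"))
```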
## Scientific Packages
### Bioinformatics & Genomics
- **AnnData** - Python package for handling annotated data matrices, specifically designed for single-cell genomics data. Provides efficient storage and manipulation of high-dimensional data with associated annotations (observations/cells and variables/genes). Key features include: HDF5-based h5ad file format for efficient I/O and compression, integration with pandas DataFrames for metadata, support for sparse matrices (scipy.sparse) for memory efficiency, layered data organization (X for main data matrix, obs for observation annotations, var for variable annotations, obsm/varm for multi-dimensional annotations, obsp/varp for pairwise matrices), and seamless integration with Scanpy, scvi-tools, and other single-cell analysis packages. Supports lazy loading, chunked operations, and conversion to/from other formats (CSV, HDF5, Zarr). Use cases: single-cell RNA-seq data management, multi-modal single-cell data (RNA+ATAC, CITE-seq), spatial transcriptomics, and any high-dimensional annotated data requiring efficient storage and manipulation
- **Arboreto** - Python package for efficient gene regulatory network (GRN) inference from single-cell RNA-seq data using ensemble tree-based methods. Implements GRNBoost2 (gradient boosting-based network inference) and GENIE3 (random forest-based inference) algorithms optimized for large-scale single-cell datasets. Key features include: parallel processing for scalability, support for sparse matrices and large datasets (millions of cells), integration with Scanpy/AnnData workflows, customizable hyperparameters, and output formats compatible with network analysis tools. Provides ranked lists of potential regulatory interactions (transcription factor-target gene pairs) with confidence scores. Use cases: identifying transcription factor-target relationships, reconstructing gene regulatory networks from single-cell data, understanding cell-type-specific regulatory programs, and inferring causal relationships in gene expression
- **BioPython** - Comprehensive Python library for computational biology and bioinformatics providing tools for sequence manipulation, database access, and biological data analysis. Key features include: sequence objects (Seq, SeqRecord, SeqIO) for DNA/RNA/protein sequences with biological alphabet validation, file format parsers (FASTA, FASTQ, GenBank, EMBL, Swiss-Prot, PDB, SAM/BAM, VCF, GFF), NCBI database access (Entrez Programming Utilities for PubMed, GenBank, BLAST, taxonomy), BLAST integration (running searches, parsing results), sequence alignment (pairwise and multiple sequence alignment with Bio.Align), phylogenetics (tree construction and manipulation with Bio.Phylo), population genetics (Hardy-Weinberg, F-statistics), protein structure analysis (PDB parsing, structure calculations), and statistical analysis tools. Supports integration with NumPy, pandas, and other scientific Python libraries. Use cases: sequence analysis, database queries, phylogenetic analysis, sequence alignment, file format conversion, and general bioinformatics workflows
- **BioServices** - Python library providing unified programmatic access to 40+ biological web services and databases. Supports major bioinformatics resources including KEGG (pathway and compound data), UniProt (protein sequences and annotations), ChEBI (chemical entities), ChEMBL (bioactive molecules), Reactome (pathways), IntAct (protein interactions), BioModels (biological models), and many others. Features consistent API across different services, automatic result caching, error handling and retry logic, support for both REST and SOAP web services, and conversion of results to Python objects (dictionaries, lists, BioPython objects). Handles authentication, rate limiting, and API versioning. Use cases: automated data retrieval from multiple biological databases, building bioinformatics pipelines, database integration workflows, and programmatic access to biological web resources without manual web browsing
- **Cellxgene Census** - Python package for querying and analyzing large-scale single-cell RNA-seq data from the CZ CELLxGENE Discover census. Provides access to 50M+ cells across 1,000+ datasets with standardized annotations and metadata. Key features include: efficient data access using TileDB-SOMA format for scalable queries, integration with AnnData and Scanpy for downstream analysis, cell metadata filtering and querying, gene expression retrieval, and support for both human and mouse data. Enables subsetting datasets by cell type, tissue, disease, or other metadata before downloading, reducing data transfer and memory requirements. Supports local caching and batch operations. Use cases: large-scale single-cell analysis, cell-type discovery, cross-dataset comparisons, reference dataset construction, and exploratory analysis of public single-cell data
- **gget** - Command-line tool and Python package for efficient querying of genomic databases with a simple, unified interface. Provides fast access to Ensembl (gene information, sequences, orthologs, variants), UniProt (protein sequences and annotations), NCBI (BLAST searches, gene information), PDB (protein structures), COSMIC (cancer mutations), and other databases. Features include: single-command queries without complex API setup, automatic result formatting, batch query support, integration with pandas DataFrames, and support for both command-line and Python API usage. Optimized for speed and ease of use, making database queries accessible to users without extensive bioinformatics experience. Use cases: quick gene lookups, sequence retrieval, variant annotation, protein structure access, and rapid database queries in bioinformatics workflows
- **geniml** - Genomic interval machine learning toolkit providing unsupervised methods for building ML models on BED files. Key capabilities include Region2Vec (word2vec-style embeddings of genomic regions and region sets using tokenization and neural language modeling), BEDspace (joint embeddings of regions and metadata labels using StarSpace for cross-modal queries), scEmbed (Region2Vec applied to single-cell ATAC-seq data generating cell-level embeddings for clustering and annotation with scanpy integration), consensus peak building (four statistical methods CC/CCF/ML/HMM for creating reference universes from BED collections), and comprehensive utilities (BBClient for BED caching, BEDshift for genomic randomization preserving context, evaluation metrics for embedding quality, Text2BedNN for neural search backends). Part of BEDbase ecosystem. Supports Python API and CLI workflows, pre-trained models on Hugging Face, and integration with gtars for tokenization. Use cases: region similarity searches, dimension reduction of chromatin accessibility data, scATAC-seq clustering and cell-type annotation, metadata-aware genomic queries, universe construction for standardized references, and any ML task requiring genomic region feature vectors
- **gtars** - High-performance Rust toolkit for genomic interval analysis providing specialized tools for overlap detection using IGD (Integrated Genome Database) indexing, coverage track generation (uniwig module for WIG/BigWig formats), genomic tokenization for machine learning applications (TreeTokenizer for deep learning models), reference sequence management (refget protocol compliance), fragment processing for single-cell genomics (barcode-based splitting and cluster analysis), and fragment scoring against reference datasets. Offers Python bindings with NumPy integration, command-line tools (gtars-cli), and Rust library. Key modules include: tokenizers (convert genomic regions to ML tokens), overlaprs (efficient overlap computation), uniwig (ATAC-seq/ChIP-seq/RNA-seq coverage profiles), refget (GA4GH-compliant sequence digests), bbcache (BEDbase.org integration), scoring (fragment enrichment metrics), and fragsplit (single-cell fragment manipulation). Supports parallel processing, memory-mapped files, streaming for large datasets, and serves as foundation for geniml genomic ML package. Ideal for genomic ML preprocessing, regulatory element analysis, variant annotation, chromatin accessibility profiling, and computational genomics workflows
- **pysam** - Read, write, and manipulate genomic data files (SAM/BAM/CRAM alignments, VCF/BCF variants, FASTA/FASTQ sequences) with pileup analysis, coverage calculations, and bioinformatics workflows
- **PyDESeq2** - Python implementation of the DESeq2 differential gene expression analysis method for bulk RNA-seq data. Provides statistical methods for determining differential expression between experimental conditions using negative binomial generalized linear models. Key features include: size factor estimation for library size normalization, dispersion estimation and shrinkage, hypothesis testing with Wald test or likelihood ratio test, multiple testing correction (Benjamini-Hochberg FDR), results filtering and ranking, and integration with pandas DataFrames. Handles complex experimental designs, batch effects, and replicates. Produces fold-change estimates, p-values, and adjusted p-values for each gene. Use cases: identifying differentially expressed genes between conditions, RNA-seq experiment analysis, biomarker discovery, and gene expression studies requiring rigorous statistical analysis
- **Scanpy** - Comprehensive Python toolkit for single-cell RNA-seq data analysis built on AnnData. Provides end-to-end workflows for preprocessing (quality control, normalization, log transformation), dimensionality reduction (PCA, UMAP, t-SNE, ForceAtlas2), clustering (Leiden, Louvain, hierarchical clustering), marker gene identification, trajectory inference (PAGA, diffusion maps), and visualization. Key features include: efficient handling of large datasets (millions of cells) using sparse matrices, integration with scvi-tools for advanced analysis, support for multi-modal data (RNA+ATAC, CITE-seq), batch correction methods, and publication-quality plotting functions. Includes extensive documentation, tutorials, and integration with other single-cell tools. Supports GPU acceleration for certain operations. Use cases: single-cell RNA-seq analysis, cell-type identification, trajectory analysis, batch correction, and comprehensive single-cell genomics workflows
- **scvi-tools** - Probabilistic deep learning models for single-cell omics analysis. PyTorch-based framework providing variational autoencoders (VAEs) for dimensionality reduction, batch correction, differential expression, and data integration across modalities. Includes 25+ models: scVI/scANVI (RNA-seq integration and cell type annotation), totalVI (CITE-seq protein+RNA), MultiVI (multiome RNA+ATAC integration), PeakVI (ATAC-seq analysis), DestVI/Stereoscope/Tangram (spatial transcriptomics deconvolution), MethylVI (methylation), CytoVI (flow/mass cytometry), VeloVI (RNA velocity), contrastiveVI (perturbation studies), and Solo (doublet detection). Supports seamless integration with Scanpy/AnnData ecosystem, GPU acceleration, reference mapping (scArches), and probabilistic differential expression with uncertainty quantification
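To ground the Scanpy/AnnData integration described above, a minimal scvi-tools sketch; it assumes an AnnData file with raw counts in `.X` and a `batch` column in `.obs`, and the file name is a placeholder:

```python
# Batch-aware integration with scVI, then export the latent space.
import scanpy as sc
import scvi

adata = sc.read_h5ad("pbmc.h5ad")        # placeholder input file
sc.pp.filter_genes(adata, min_counts=3)  # light gene filtering

scvi.model.SCVI.setup_anndata(adata, batch_key="batch")
model = scvi.model.SCVI(adata)
model.train()

# Batch-corrected latent representation for clustering/UMAP downstream.
adata.obsm["X_scVI"] = model.get_latent_representation()
```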
### Data Management & Infrastructure
- **LaminDB** - Open-source data framework for biology that makes data queryable, traceable, reproducible, and FAIR (Findable, Accessible, Interoperable, Reusable). Provides unified platform combining lakehouse architecture, lineage tracking, feature stores, biological ontologies (via Bionty plugin with 20+ ontologies: genes, proteins, cell types, tissues, diseases, pathways), LIMS, and ELN capabilities through a single Python API. Key features include: automatic data lineage tracking (code, inputs, outputs, environment), versioned artifacts (DataFrame, AnnData, SpatialData, Parquet, Zarr), schema validation and data curation with standardization/synonym mapping, queryable metadata with feature-based filtering, cross-registry traversal, and streaming for large datasets. Supports integrations with workflow managers (Nextflow, Snakemake, Redun), MLOps platforms (Weights & Biases, MLflow, HuggingFace, scVI-tools), cloud storage (S3, GCS, S3-compatible), array stores (TileDB-SOMA, DuckDB), and visualization (Vitessce). Deployment options: local SQLite, cloud storage with SQLite, or cloud storage with PostgreSQL for production. Use cases: scRNA-seq standardization and analysis, flow cytometry/spatial data management, multi-modal dataset integration, computational workflow tracking with reproducibility, biological ontology-based annotation, data lakehouse construction for unified queries, ML pipeline integration with experiment tracking, and FAIR-compliant dataset publishing
- **Modal** - Serverless cloud platform for running Python code with minimal configuration, specialized for AI/ML workloads and scientific computing. Execute functions on powerful GPUs (T4, L4, A10, A100, L40S, H100, H200, B200), scale automatically from zero to thousands of containers, and pay only for compute used. Key features include: declarative container image building with uv/pip/apt package management, automatic autoscaling with configurable limits and buffer containers, GPU acceleration with multi-GPU support (up to 8 GPUs per container), persistent storage via Volumes for model weights and datasets, secret management for API keys and credentials, scheduled jobs with cron expressions, web endpoints for deploying serverless APIs, parallel execution with `.map()` for batch processing, input concurrency for I/O-bound workloads, and resource configuration (CPU cores, memory, disk). Supports custom Docker images, integration with Hugging Face/Weights & Biases, FastAPI for web endpoints, and distributed training. Free tier includes $30/month credits. Use cases: ML model deployment and inference (LLMs, image generation, embeddings), GPU-accelerated training, batch processing large datasets in parallel, scheduled compute-intensive jobs, serverless API deployment with autoscaling, scientific computing requiring distributed compute or specialized hardware, and data pipeline automation
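A minimal Modal sketch of the declarative-image plus autoscaling pattern; the app name, package list, and GPU type are arbitrary illustrative choices:

```python
# Define a GPU function and fan it out with .map(); run via `modal run app.py`.
import modal

image = modal.Image.debian_slim().pip_install("numpy")
app = modal.App("example-app", image=image)

@app.function(gpu="A10G")
def square(x: int) -> int:
    return x * x

@app.local_entrypoint()
def main():
    print(list(square.map(range(10))))  # each call runs in its own container
```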
### Cheminformatics & Drug Discovery
- **Datamol** - Python library for molecular manipulation and featurization built on RDKit with enhanced workflows and performance optimizations. Provides utilities for molecular I/O (reading/writing SMILES, SDF, MOL files), molecular standardization and sanitization, molecular transformations (tautomer enumeration, stereoisomer generation), molecular featurization (descriptors, fingerprints, graph representations), parallel processing for large datasets, and integration with machine learning pipelines. Features include: optimized RDKit operations, caching for repeated computations, molecular filtering and preprocessing, and seamless integration with pandas DataFrames. Designed for drug discovery and cheminformatics workflows requiring efficient processing of large compound libraries. Use cases: molecular preprocessing for ML models, compound library management, molecular similarity searches, and cheminformatics data pipelines
- **DeepChem** - Deep learning framework for molecular machine learning and drug discovery built on TensorFlow and PyTorch. Provides implementations of graph neural networks (GCN, GAT, MPNN, AttentiveFP) for molecular property prediction, molecular featurization (molecular graphs, fingerprints, descriptors), pre-trained models, and MoleculeNet benchmark suite (50+ datasets for molecular property prediction, toxicity, ADMET). Key features include: support for both TensorFlow and PyTorch backends, distributed training, hyperparameter optimization, model interpretation tools, and integration with RDKit. Includes datasets for quantum chemistry, toxicity prediction, ADMET properties, and binding affinity prediction. Use cases: molecular property prediction, drug discovery, ADMET prediction, toxicity screening, and molecular machine learning research
- **DiffDock** - State-of-the-art diffusion-based molecular docking method for predicting protein-ligand binding poses and binding affinities. Uses diffusion models to generate diverse, high-quality binding poses without requiring exhaustive search. Key features include: fast inference compared to traditional docking methods, generation of multiple diverse poses, confidence scoring for predictions, and support for flexible ligand docking. Provides pre-trained models and Python API for integration into drug discovery pipelines. Achieves superior performance on standard benchmarks (PDBbind, CASF) compared to traditional docking methods. Use cases: virtual screening, lead optimization, binding pose prediction, structure-based drug design, and initial pose generation for refinement with more expensive methods
- **MedChem** - Python library for medicinal chemistry analysis and drug-likeness assessment. Provides tools for calculating molecular descriptors, ADMET (Absorption, Distribution, Metabolism, Excretion, Toxicity) property prediction, drug-likeness filters (Lipinski's Rule of Five, Veber rules, Egan rules, Muegge rules), molecular complexity metrics, and synthetic accessibility scoring. Features include: integration with RDKit, parallel processing for large datasets, and comprehensive property calculators. Supports filtering compound libraries based on drug-like properties, identifying potential ADMET issues early in drug discovery, and prioritizing compounds for further development. Use cases: lead optimization, compound library filtering, ADMET prediction, drug-likeness assessment, and medicinal chemistry analysis in drug discovery workflows
- **Molfeat** - Comprehensive Python library providing 100+ molecular featurizers for converting molecules into numerical representations suitable for machine learning. Includes molecular fingerprints (ECFP, MACCS, RDKit, Pharmacophore), molecular descriptors (2D/3D descriptors, constitutional, topological, electronic), graph-based representations (molecular graphs, line graphs), and pre-trained models (MolBERT, ChemBERTa, Uni-Mol embeddings). Features unified API across different featurizer types, caching for performance, parallel processing, and integration with popular ML frameworks (scikit-learn, PyTorch, TensorFlow). Supports both traditional cheminformatics descriptors and modern learned representations. Use cases: molecular property prediction, virtual screening, molecular similarity searches, and preparing molecular data for machine learning models
- **PyTDC** - Python library providing access to Therapeutics Data Commons (TDC), a collection of curated datasets and benchmarks for drug discovery and development. Includes datasets for ADMET prediction (absorption, distribution, metabolism, excretion, toxicity), drug-target interactions, drug-drug interactions, drug response prediction, molecular generation, and retrosynthesis. Features standardized data formats, data loaders with automatic preprocessing, benchmark tasks with evaluation metrics, leaderboards for model comparison, and integration with popular ML frameworks. Provides both single-molecule and drug-pair datasets, covering various stages of drug discovery from target identification to clinical outcomes. Use cases: benchmarking ML models for drug discovery, ADMET prediction model development, drug-target interaction prediction, and drug discovery research
- **RDKit** - Open-source cheminformatics toolkit for molecular informatics and drug discovery. Provides comprehensive functionality for molecular I/O (reading/writing SMILES, SDF, MOL, PDB files), molecular descriptors (200+ 2D and 3D descriptors), molecular fingerprints (Morgan, RDKit, MACCS, topological torsions), SMARTS pattern matching for substructure searches, molecular alignment and 3D coordinate generation, pharmacophore perception, reaction handling, and molecular drawing. Features high-performance C++ core with Python bindings, support for large molecule sets, and extensive documentation. Widely used in the pharmaceutical industry and academic research. Use cases: molecular property calculation, virtual screening, molecular similarity searches, substructure matching, molecular visualization, and general cheminformatics workflows (a short usage sketch follows this list)
- **TorchDrug** - PyTorch-based machine learning platform for drug discovery with 40+ datasets, 20+ GNN models for molecular property prediction, protein modeling, knowledge graph reasoning, molecular generation, and retrosynthesis planning
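The RDKit sketch referenced above, covering descriptors, fingerprints, and substructure matching:

```python
# Parse a SMILES, compute properties, and run a SMARTS substructure match.
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors

mol = Chem.MolFromSmiles("CC(=O)Oc1ccccc1C(=O)O")  # aspirin
print(Descriptors.MolWt(mol), Descriptors.MolLogP(mol))

fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
print(fp.GetNumOnBits())

pattern = Chem.MolFromSmarts("c1ccccc1")  # aromatic six-membered ring
print(mol.HasSubstructMatch(pattern))
```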
### Proteomics & Mass Spectrometry
- **matchms** - Processing and similarity matching of mass spectrometry data with 40+ filters, spectral library matching (Cosine, Modified Cosine, Neutral Losses), metadata harmonization, molecular fingerprint comparison, and support for multiple file formats (MGF, MSP, mzML, JSON)
- **pyOpenMS** - Comprehensive mass spectrometry data analysis for proteomics and metabolomics (LC-MS/MS processing, peptide identification, feature detection, quantification, chemical calculations, and integration with search engines like Comet, Mascot, MSGF+)
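A minimal pyOpenMS sketch; the mzML path is a placeholder:

```python
# Load an mzML file and report the first MS2 spectrum's peak count.
import pyopenms as oms

exp = oms.MSExperiment()
oms.MzMLFile().load("sample.mzML", exp)
print("spectra:", exp.getNrSpectra())

for spec in exp:
    if spec.getMSLevel() == 2:
        mz, intensity = spec.get_peaks()
        print("RT:", spec.getRT(), "peaks:", len(mz))
        break
```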
### Medical Imaging & Digital Pathology
- **histolab** - Digital pathology toolkit for whole slide image (WSI) processing and analysis. Provides automated tissue detection, tile extraction for deep learning pipelines, and preprocessing for gigapixel histopathology images. Key features include: multi-format WSI support (SVS, TIFF, NDPI), three tile extraction strategies (RandomTiler for sampling, GridTiler for complete coverage, ScoreTiler for quality-driven selection), automated tissue masks with customizable filters, built-in scorers (NucleiScorer, CellularityScorer), pyramidal image handling, visualization tools (thumbnails, mask overlays, tile previews), and H&E stain decomposition. Supports multiple tissue sections, artifact removal, pen annotation exclusion, and reproducible extraction with seeding. Use cases: creating training datasets for computational pathology, extracting informative tiles for tumor classification, whole-slide tissue characterization, quality assessment of histology samples, automated nuclei density analysis, and preprocessing for digital pathology deep learning workflows
- **PathML** - Comprehensive computational pathology toolkit for whole slide image analysis, tissue segmentation, and machine learning on pathology data. Provides end-to-end workflows for digital pathology research including data loading, preprocessing, feature extraction, and model deployment
- **pydicom** - Pure Python package for working with DICOM (Digital Imaging and Communications in Medicine) files. Provides comprehensive support for reading, writing, and manipulating medical imaging data from CT, MRI, X-ray, ultrasound, PET scans and other modalities. Key features include: pixel data extraction and manipulation with automatic decompression (JPEG/JPEG 2000/RLE), metadata access and modification with 1000+ standardized DICOM tags, image format conversion (PNG/JPEG/TIFF), anonymization tools for removing Protected Health Information (PHI), windowing and display transformations (VOI LUT application), multi-frame and 3D volume processing, DICOM sequence handling, and support for multiple transfer syntaxes. Use cases: medical image analysis, PACS system integration, radiology workflows, research data processing, DICOM anonymization, format conversion, image preprocessing for machine learning, multi-slice volume reconstruction, and clinical imaging pipelines
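A minimal pydicom sketch; the file path is a placeholder, and the tags shown assume a typical image dataset:

```python
# Read a DICOM file, inspect metadata, extract pixels, and anonymize a tag.
import pydicom

ds = pydicom.dcmread("slice001.dcm")
print(ds.Modality, ds.Rows, ds.Columns)

arr = ds.pixel_array  # decompressed automatically if a handler is installed
print(arr.shape, arr.dtype)

ds.PatientName = "ANONYMOUS"  # simple single-tag anonymization
ds.save_as("slice001_anon.dcm")
```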
### Healthcare AI & Clinical Machine Learning
- **NeuroKit2** - Comprehensive biosignal processing toolkit for analyzing physiological data including ECG, EEG, EDA, RSP, PPG, EMG, and EOG signals. Use this skill when processing cardiovascular signals, brain activity, electrodermal responses, respiratory patterns, muscle activity, or eye movements. Key features include: automated signal processing pipelines (cleaning, peak detection, delineation, quality assessment), heart rate variability analysis across time/frequency/nonlinear domains (SDNN, RMSSD, LF/HF, DFA, entropy measures), EEG analysis (frequency band power, microstates, source localization), autonomic nervous system assessment (sympathetic indices, respiratory sinus arrhythmia), comprehensive complexity measures (25+ entropy types, 15+ fractal dimensions, Lyapunov exponents), event-related and interval-related analysis modes, epoch creation and averaging for stimulus-locked responses, multi-signal integration with unified workflows, and extensive signal processing utilities (filtering, decomposition, peak correction, spectral analysis). Includes modular reference documentation across 12 specialized domains. Use cases: heart rate variability for cardiovascular health assessment, EEG microstates for consciousness studies, electrodermal activity for emotion research, respiratory variability analysis, psychophysiology experiments, affective computing, stress monitoring, sleep staging, autonomic dysfunction assessment, biofeedback applications, and multi-modal physiological signal integration for comprehensive human state monitoring (a minimal HRV sketch follows this list)
- **PyHealth** - Comprehensive healthcare AI toolkit for developing, testing, and deploying machine learning models with clinical data. Provides specialized tools for electronic health records (EHR), physiological signals, medical imaging, and clinical text analysis. Key features include: 10+ healthcare datasets (MIMIC-III/IV, eICU, OMOP, sleep EEG, COVID-19 CXR), 20+ predefined clinical prediction tasks (mortality, hospital readmission, length of stay, drug recommendation, sleep staging, EEG analysis), 33+ models (Logistic Regression, MLP, CNN, RNN, Transformer, GNN, plus healthcare-specific models like RETAIN, SafeDrug, GAMENet, StageNet), comprehensive data processing (sequence processors, signal processors, medical code translation between ICD-9/10, NDC, RxNorm, ATC systems), training/evaluation utilities (Trainer class, fairness metrics, calibration, uncertainty quantification), and interpretability tools (attention visualization, SHAP, ChEFER). 3x faster than pandas for healthcare data processing. Use cases: ICU mortality prediction, hospital readmission risk assessment, safe medication recommendation with drug-drug interaction constraints, sleep disorder diagnosis from EEG signals, medical code standardization and translation, clinical text to ICD coding, length of stay estimation, and any clinical ML application requiring interpretability, fairness assessment, and calibrated predictions for healthcare deployment
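The NeuroKit2 HRV sketch referenced above; it uses a simulated ECG so it runs without data files, and the sampling rate and duration are arbitrary:

```python
# Simulate an ECG, detect R-peaks, and compute time-domain HRV indices.
import neurokit2 as nk

ecg = nk.ecg_simulate(duration=60, sampling_rate=500, heart_rate=70)
peaks, info = nk.ecg_peaks(ecg, sampling_rate=500)
hrv = nk.hrv_time(peaks, sampling_rate=500)
print(hrv[["HRV_SDNN", "HRV_RMSSD"]])
```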
### Protein Engineering & Design
- **ESM (Evolutionary Scale Modeling)** - State-of-the-art protein language models from EvolutionaryScale for protein design, structure prediction, and representation learning. Includes ESM3 (1.4B-98B parameter multimodal generative models for simultaneous reasoning across sequence, structure, and function with chain-of-thought generation, inverse folding, and function-conditioned design) and ESM C (300M-6B parameter efficient embedding models 3x faster than ESM2 for similarity analysis, classification, and feature extraction). Supports local inference with open weights and cloud-based Forge API for scalable batch processing. Use cases: novel protein design, structure prediction from sequence, sequence design from structure, protein embeddings, function annotation, variant generation, and directed evolution workflows
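Since ESM3/ESM C use their own EvolutionaryScale SDK, the sketch below instead uses the earlier open-weights fair-esm API (ESM-2) to illustrate embedding extraction; treat it as a neighboring example rather than the ESM3 interface:

```python
# Mean-pooled per-protein embedding from ESM-2 (fair-esm package).
import torch
import esm

model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()

data = [("protein1", "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVAT")]
labels, strs, tokens = batch_converter(data)
with torch.no_grad():
    out = model(tokens, repr_layers=[33])

# Drop BOS/EOS tokens, then average over residues.
embedding = out["representations"][33][0, 1:-1].mean(dim=0)
print(embedding.shape)  # torch.Size([1280])
```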
### Machine Learning & Deep Learning
- **aeon** - Comprehensive scikit-learn compatible Python toolkit for time series machine learning providing state-of-the-art algorithms across 7 domains: classification (13 algorithm categories including ROCKET variants, deep learning with InceptionTime/ResNet/FCN, distance-based with DTW/ERP/LCSS, shapelet-based, dictionary methods like BOSS/WEASEL, and hybrid ensembles HIVECOTE), regression (9 categories mirroring classification approaches), clustering (k-means/k-medoids with temporal distances, deep learning autoencoders, spectral methods), forecasting (ARIMA, ETS, Theta, Threshold Autoregressive, TCN, DeepAR), anomaly detection (STOMP/MERLIN matrix profile, clustering-based CBLOF/KMeans, isolation methods, copula-based COPOD), segmentation (ClaSP, FLUSS, HMM, binary segmentation), and similarity search (MASS algorithm, STOMP motif discovery, approximate nearest neighbors). Includes 40+ distance metrics (elastic: DTW/DDTW/WDTW/Shape-DTW, edit-based: ERP/EDR/LCSS/TWE/MSM, lock-step: Euclidean/Manhattan), extensive transformations (ROCKET/MiniRocket/MultiRocket for features, Catch22/TSFresh for statistics, SAX/PAA for symbolic representation, shapelet transforms, wavelets, matrix profile), 20+ deep learning architectures (FCN, ResNet, InceptionTime, TCN, autoencoders with attention mechanisms), comprehensive benchmarking tools (UCR/UEA archives with 100+ datasets, published results repository, statistical testing), and performance-optimized implementations using numba. Features progressive model complexity from fast baselines (MiniRocket: <1 second training, 0.95+ accuracy on many benchmarks) to state-of-the-art ensembles (HIVECOTE V2), GPU acceleration support, and extensive visualization utilities. Use cases: physiological signal classification (ECG, EEG), industrial sensor monitoring, financial forecasting, change point detection, pattern discovery, activity recognition from wearables, predictive maintenance, climate time series analysis, and any sequential data requiring specialized temporal modeling beyond standard ML
- **PufferLib** - High-performance reinforcement learning library achieving 1M-4M steps/second through optimized vectorization, native multi-agent support, and efficient PPO training (PuffeRL). Use this skill for RL training on any environment (Gymnasium, PettingZoo, Atari, Procgen), creating custom PufferEnv environments, developing policies (CNN, LSTM, multi-input architectures), optimizing parallel simulation performance, or scaling multi-agent systems. Includes Ocean suite (20+ environments), seamless framework integration with automatic space flattening, zero-copy vectorization with shared memory buffers, distributed training support, and comprehensive reference guides for training workflows, environment development, vectorization optimization, policy architectures, and third-party integrations
- **PyMC** - Comprehensive Python library for Bayesian statistical modeling and probabilistic programming. Provides intuitive syntax for building probabilistic models, advanced MCMC sampling algorithms (NUTS, Metropolis-Hastings, Slice sampling), variational inference methods (ADVI, SVGD), Gaussian processes, time series models (ARIMA, state space models), and model comparison tools (WAIC, LOO). Features include: automatic differentiation via PyTensor (the maintained successor to Theano and Aesara), GPU acceleration support, parallel sampling, model diagnostics and convergence checking, and integration with ArviZ for visualization and analysis. Supports hierarchical models, mixture models, survival analysis, and custom distributions. Use cases: Bayesian data analysis, uncertainty quantification, A/B testing, time series forecasting, hierarchical modeling, and probabilistic machine learning
- **PyMOO** - Python framework for multi-objective optimization using evolutionary algorithms. Provides implementations of state-of-the-art algorithms including NSGA-II, NSGA-III, MOEA/D, SPEA2, and reference-point based methods. Features include: support for constrained and unconstrained optimization, multiple problem types (continuous, discrete, mixed-variable), performance indicators (hypervolume, IGD, GD), visualization tools (Pareto front plots, convergence plots), and parallel evaluation support. Supports custom problem definitions, algorithm configuration, and result analysis. Designed for engineering design, parameter optimization, and any problem requiring optimization of multiple conflicting objectives simultaneously. Use cases: multi-objective optimization problems, Pareto-optimal solution finding, engineering design optimization, and research in evolutionary computation
- **PyTorch Lightning** - Deep learning framework that organizes PyTorch code to eliminate boilerplate while maintaining full flexibility. Automates training workflows (40+ tasks including epoch/batch iteration, optimizer steps, gradient management, checkpointing), supports multi-GPU/TPU training with DDP/FSDP/DeepSpeed strategies, includes LightningModule for model organization, Trainer for automation, LightningDataModule for data pipelines, callbacks for extensibility, and integrations with TensorBoard, Wandb, MLflow for experiment tracking
- **scikit-learn** - Industry-standard Python library for classical machine learning providing comprehensive supervised learning (classification: Logistic Regression, SVM, Decision Trees, Random Forests with 17+ variants, Gradient Boosting including the fast histogram-based HistGradientBoosting inspired by LightGBM, Naive Bayes, KNN, Neural Networks/MLP; regression: Linear, Ridge, Lasso, ElasticNet, SVR, ensemble methods), unsupervised learning (clustering: K-Means, DBSCAN, HDBSCAN, OPTICS, Agglomerative/Hierarchical, Spectral, Gaussian Mixture Models, BIRCH, MeanShift; dimensionality reduction: PCA, Kernel PCA, t-SNE, Isomap, LLE, NMF, TruncatedSVD, FastICA, LDA; outlier detection: IsolationForest, LocalOutlierFactor, OneClassSVM), data preprocessing (scaling: StandardScaler, MinMaxScaler, RobustScaler; encoding: OneHotEncoder, OrdinalEncoder, LabelEncoder; imputation: SimpleImputer, KNNImputer, IterativeImputer; feature engineering: PolynomialFeatures, KBinsDiscretizer, text vectorization with CountVectorizer/TfidfVectorizer), model evaluation (cross-validation: KFold, StratifiedKFold, TimeSeriesSplit, GroupKFold; hyperparameter tuning: GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV; metrics: 30+ evaluation metrics for classification/regression/clustering including accuracy, precision, recall, F1, ROC-AUC, MSE, R², silhouette score), and Pipeline/ColumnTransformer for production-ready workflows. Features consistent API (fit/predict/transform), extensive documentation, integration with NumPy/pandas/SciPy, joblib persistence, and scikit-learn-compatible ecosystem (XGBoost, LightGBM, CatBoost, imbalanced-learn). Optimized implementations using Cython/OpenMP for performance. Use cases: predictive modeling, customer segmentation, anomaly detection, feature engineering, model selection/validation, text classification, image classification (with feature extraction), time series forecasting (with preprocessing), medical diagnosis, fraud detection, recommendation systems, and any tabular data ML task requiring interpretable models or established algorithms
- **scikit-survival** - Survival analysis and time-to-event modeling with censored data. Built on scikit-learn, provides Cox proportional hazards models (CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis with elastic net regularization), ensemble methods (Random Survival Forests, Gradient Boosting), Survival Support Vector Machines (linear and kernel), non-parametric estimators (Kaplan-Meier, Nelson-Aalen), competing risks analysis, and specialized evaluation metrics (concordance index, time-dependent AUC, Brier score). Handles right-censored data, integrates with scikit-learn pipelines, and supports feature selection and hyperparameter tuning via cross-validation
- **SHAP** - Model interpretability and explainability using Shapley values from game theory. Provides unified approach to explain any ML model with TreeExplainer (fast exact explanations for XGBoost/LightGBM/Random Forest), DeepExplainer (TensorFlow/PyTorch neural networks), KernelExplainer (model-agnostic), and LinearExplainer. Includes comprehensive visualizations (waterfall plots for individual predictions, beeswarm plots for global importance, scatter plots for feature relationships, bar/force/heatmap plots), supports model debugging, fairness analysis, feature engineering guidance, and production deployment
- **Stable Baselines3** - PyTorch-based reinforcement learning library providing reliable implementations of RL algorithms (PPO, SAC, DQN, TD3, DDPG, A2C, HER, RecurrentPPO). Use this skill for training RL agents on standard or custom Gymnasium environments, implementing callbacks for monitoring and control, using vectorized environments for parallel training, creating custom environments with proper Gymnasium API implementation, and integrating with deep RL workflows. Includes comprehensive training templates, evaluation utilities, algorithm selection guidance (on-policy vs off-policy, continuous vs discrete actions), support for multi-input policies (dict observations), goal-conditioned learning with HER, and integration with TensorBoard for experiment tracking
- **statsmodels** - Statistical modeling and econometrics (OLS, GLM, logit/probit, ARIMA, time series forecasting, hypothesis testing, diagnostics)
- **Torch Geometric** - PyTorch library (PyG) for deep learning on graphs and other irregular structures. Provides message-passing GNN layers (GCNConv, GATConv, GINConv, SchNet, and many more), efficient mini-batching of many small graphs, neighbor-sampling loaders for large graphs, heterogeneous graph support, and a large collection of benchmark datasets (Planetoid, TUDataset, QM9, ZINC). Use cases: molecular property prediction, protein and interaction networks, knowledge graphs, recommender systems, and geometric deep learning on point clouds and meshes
- **Transformers** - State-of-the-art machine learning models for NLP, computer vision, audio, and multimodal tasks. Provides 1M+ pre-trained models accessible via pipelines (text-classification, NER, QA, summarization, translation, text-generation, image-classification, object-detection, ASR, VQA), comprehensive training via Trainer API with distributed training and mixed precision, flexible text generation with multiple decoding strategies (greedy, beam search, sampling), and Auto classes for automatic architecture selection (BERT, GPT, T5, ViT, BART, etc.)
- **UMAP-learn** - Python implementation of Uniform Manifold Approximation and Projection (UMAP) for dimensionality reduction and manifold learning. Provides fast, scalable nonlinear dimensionality reduction that preserves both local and global structure of high-dimensional data. Key features include: support for both supervised and unsupervised dimensionality reduction, ability to handle mixed data types, integration with scikit-learn API, and efficient implementation using numba for performance. Produces low-dimensional embeddings (typically 2D or 3D) suitable for visualization and downstream analysis. Often outperforms t-SNE in preserving global structure while maintaining local neighborhoods. Use cases: data visualization, feature extraction, preprocessing for machine learning, single-cell data analysis, and exploratory data analysis of high-dimensional datasets
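A minimal umap-learn sketch on a standard dataset:

```python
# Embed the 64-dimensional digits dataset into 2D for visualization.
import umap
from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
embedding = reducer.fit_transform(X)
print(embedding.shape)  # (1797, 2)
```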
### Materials Science & Chemistry
- **Astropy** - Comprehensive Python library for astronomy and astrophysics providing core functionality for astronomical research and data analysis. Includes coordinate system transformations (ICRS, Galactic, FK5, AltAz), physical units and quantities with automatic dimensional consistency, FITS file operations (reading, writing, manipulating headers and data), cosmological calculations (luminosity distance, lookback time, Hubble parameter, Planck/WMAP models), precise time handling across multiple time scales (UTC, TAI, TT, TDB) and formats (JD, MJD, ISO), table operations with unit support (FITS, CSV, HDF5, VOTable), WCS transformations between pixel and world coordinates, astronomical constants, modeling framework, visualization tools, and statistical functions. Use for celestial coordinate transformations, unit conversions, FITS image/table processing, cosmological distance calculations, barycentric time corrections, catalog cross-matching, and astronomical data analysis
- **COBRApy** - Python package for constraint-based reconstruction and analysis (COBRA) of metabolic networks. Provides tools for building, manipulating, and analyzing genome-scale metabolic models (GEMs). Key features include: flux balance analysis (FBA) for predicting optimal metabolic fluxes, flux variability analysis (FVA), gene knockout simulations, pathway analysis, model validation, and integration with other COBRA Toolbox formats (SBML, JSON). Supports various optimization objectives (biomass production, ATP production, metabolite production), constraint handling (reaction bounds, gene-protein-reaction associations), and model comparison. Includes utilities for model construction, gap filling, and model refinement. Use cases: metabolic engineering, systems biology, biotechnology applications, understanding cellular metabolism, and predicting metabolic phenotypes
- **Pymatgen** - Python Materials Genomics (pymatgen) library for materials science computation and analysis. Provides comprehensive tools for crystal structure manipulation, phase diagram construction, electronic structure analysis, and materials property calculations. Key features include: structure objects with symmetry analysis, space group determination, structure matching and comparison, phase diagram generation from formation energies, band structure and density of states analysis, defect calculations, surface and interface analysis, and integration with DFT codes (VASP, Quantum ESPRESSO, ABINIT). Supports Materials Project database integration, structure file I/O (CIF, POSCAR, VASP), and high-throughput materials screening workflows. Use cases: materials discovery, crystal structure analysis, phase stability prediction, electronic structure calculations, and computational materials science research
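A minimal pymatgen sketch building rock-salt NaCl and querying its symmetry:

```python
# Build NaCl from its space group and report the detected symmetry.
from pymatgen.core import Lattice, Structure
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer

structure = Structure.from_spacegroup(
    "Fm-3m", Lattice.cubic(5.64), ["Na", "Cl"], [[0, 0, 0], [0.5, 0.5, 0.5]]
)
print(structure.composition.reduced_formula)                   # NaCl
print(SpacegroupAnalyzer(structure).get_space_group_symbol())  # Fm-3m
```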
### Data Analysis & Visualization
- **Dask** - Parallel computing for larger-than-memory datasets with distributed DataFrames, Arrays, Bags, and Futures
- **Data Commons** - Programmatic access to public statistical data from global sources including census bureaus, health organizations, and environmental agencies. Provides unified Python API for querying demographic data, economic indicators, health statistics, and environmental datasets through a knowledge graph interface. Features three main endpoints: Observation (statistical time-series queries for population, GDP, unemployment rates, disease prevalence), Node (knowledge graph exploration for entity relationships and hierarchies), and Resolve (entity identification from names, coordinates, or Wikidata IDs). Seamless Pandas integration for DataFrames, relation expressions for hierarchical queries, data source filtering for consistency, and support for custom Data Commons instances
- **Matplotlib** - Comprehensive Python plotting library for creating publication-quality static, animated, and interactive visualizations. Provides extensive customization options for creating figures, subplots, axes, and annotations. Key features include: support for multiple plot types (line, scatter, bar, histogram, contour, 3D, and many more), extensive customization (colors, fonts, styles, layouts), multiple backends (PNG, PDF, SVG, interactive backends), LaTeX integration for mathematical notation, and integration with NumPy and pandas. Includes specialized modules (pyplot for MATLAB-like interface, artist layer for fine-grained control, backend layer for rendering). Supports complex multi-panel figures, color maps, legends, and annotations. Use cases: scientific figure creation, data visualization, exploratory data analysis, publication graphics, and any application requiring high-quality plots
- **NetworkX** - Comprehensive toolkit for creating, analyzing, and visualizing complex networks and graphs. Supports four graph types (Graph, DiGraph, MultiGraph, MultiDiGraph) with nodes as any hashable objects and rich edge attributes. Provides 100+ algorithms including shortest paths (Dijkstra, Bellman-Ford, A*), centrality measures (degree, betweenness, closeness, eigenvector, PageRank), clustering (coefficients, triangles, transitivity), community detection (modularity-based, label propagation, Girvan-Newman), connectivity analysis (components, cuts, flows), tree algorithms (MST, spanning trees), matching, graph coloring, isomorphism, and traversal (DFS, BFS). Includes 50+ graph generators for classic (complete, cycle, wheel), random (Erdős-Rényi, Barabási-Albert, Watts-Strogatz, stochastic block model), lattice (grid, hexagonal, hypercube), and specialized networks. Supports I/O across formats (edge lists, GraphML, GML, JSON, Pajek, GEXF, DOT) with Pandas/NumPy/SciPy integration. Visualization capabilities include 8+ layout algorithms (spring/force-directed, circular, spectral, Kamada-Kawai), customizable node/edge appearance, interactive visualizations with Plotly/PyVis, and publication-quality figure generation. Use cases: social network analysis, biological networks (protein-protein interactions, gene regulatory networks, metabolic pathways), transportation systems, citation networks, knowledge graphs, web structure analysis, infrastructure networks, and any domain involving pairwise relationships requiring structural analysis or graph-based modeling
- **Polars** - High-performance DataFrame library written in Rust with Python bindings, designed for fast data manipulation and analysis. Provides lazy evaluation for query optimization, efficient memory usage, and parallel processing. Key features include: DataFrame operations (filtering, grouping, joining, aggregations), support for large datasets (larger than RAM), integration with pandas and NumPy, expression API for complex transformations, and support for multiple data formats (CSV, Parquet, JSON, Excel, Arrow). Features query optimization through lazy evaluation, automatic parallelization, and efficient memory management. Often 5-30x faster than pandas for many operations. Use cases: large-scale data processing, ETL pipelines, data analysis workflows, and high-performance data manipulation tasks
- **Seaborn** - Statistical data visualization with dataset-oriented interface, automatic confidence intervals, publication-quality themes, colorblind-safe palettes, and comprehensive support for exploratory analysis, distribution comparisons, correlation matrices, regression plots, and multi-panel figures
- **SimPy** - Process-based discrete-event simulation framework for modeling systems with processes, queues, and resource contention (manufacturing, service operations, network traffic, logistics). Supports generator-based process definition, multiple resource types (Resource, PriorityResource, PreemptiveResource, Container, Store), event-driven scheduling, process interaction mechanisms (signaling, interruption, parallel/sequential execution), real-time simulation synchronized with wall-clock time, and comprehensive monitoring capabilities for utilization, wait times, and queue statistics
- **SymPy** - Symbolic mathematics in Python for exact computation using mathematical symbols rather than numerical approximations. Provides comprehensive support for symbolic algebra (simplification, expansion, factorization), calculus (derivatives, integrals, limits, series), equation solving (algebraic, differential, systems of equations), matrices and linear algebra (eigenvalues, decompositions, solving linear systems), physics (classical mechanics with Lagrangian/Hamiltonian formulations, quantum mechanics, vector analysis, units), number theory (primes, factorization, modular arithmetic, Diophantine equations), geometry (2D/3D analytic geometry), combinatorics (permutations, combinations, partitions, group theory), logic and sets, statistics (probability distributions, random variables), special functions (gamma, Bessel, orthogonal polynomials), and code generation (lambdify to NumPy/SciPy functions, C/Fortran code generation, LaTeX output for documentation). Emphasizes exact arithmetic using rational numbers and symbolic representations, supports assumptions for improved simplification (positive, real, integer), integrates seamlessly with NumPy/SciPy through lambdify for fast numerical evaluation, and enables symbolic-to-numeric pipelines for scientific computing workflows
- **Vaex** - High-performance Python library for lazy, out-of-core DataFrames to process and visualize tabular datasets larger than available RAM. Processes over a billion rows per second through memory-mapped files (HDF5, Apache Arrow), lazy evaluation, and virtual columns (zero memory overhead). Provides instant file opening, efficient aggregations across billions of rows, interactive visualizations without sampling, machine learning pipelines with transformers (scalers, encoders, PCA), and seamless integration with pandas/NumPy/Arrow. Includes comprehensive ML framework (vaex.ml) with feature scaling, categorical encoding, dimensionality reduction, and integration with scikit-learn/XGBoost/LightGBM/CatBoost. Supports distributed computing via Dask, asynchronous operations, and state management for production deployment. Use cases: processing gigabyte to terabyte datasets, fast statistical aggregations on massive data, visualizing billion-row datasets, ML pipelines on big data, converting between data formats, and working with astronomical, financial, or scientific large-scale datasets
- **ReportLab** - Python library for programmatic PDF generation and document creation. Provides comprehensive tools for creating PDFs from scratch including text formatting, tables, graphics, images, charts, and complex layouts. Key features include: high-level Platypus framework for document layout, low-level canvas API for precise control, support for fonts (TrueType, Type 1), vector graphics, image embedding, page templates, headers/footers, and multi-page documents. Supports barcodes, forms, encryption, and digital signatures. Can generate reports, invoices, certificates, and complex documents programmatically. Use cases: automated report generation, document creation, invoice generation, certificate printing, and any application requiring programmatic PDF creation
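A minimal ReportLab sketch using the low-level canvas API:

```python
# Generate a one-page PDF with a title and a body line.
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

c = canvas.Canvas("report.pdf", pagesize=letter)
c.setFont("Helvetica-Bold", 16)
c.drawString(72, 720, "Analysis Report")
c.setFont("Helvetica", 11)
c.drawString(72, 700, "Generated programmatically with ReportLab.")
c.showPage()
c.save()
```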
### Phylogenetics & Trees
- **ETE Toolkit** - Python library for phylogenetic tree manipulation, visualization, and analysis. Provides comprehensive tools for working with phylogenetic trees including tree construction, manipulation (pruning, collapsing, rooting), tree comparison (Robinson-Foulds distance, tree reconciliation), annotation (node colors, labels, branch styles), and publication-quality visualization. Key features include: support for multiple tree formats (Newick, Nexus, PhyloXML), integration with phylogenetic software (PhyML, RAxML, FastTree), tree annotation with metadata, interactive tree visualization, and export to various image formats (PNG, PDF, SVG). Supports species trees, gene trees, and reconciliation analysis. Use cases: phylogenetic analysis, tree visualization, evolutionary biology research, comparative genomics, and teaching phylogenetics
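A minimal ete3 sketch covering parsing, tree comparison, and pruning:

```python
# Parse two Newick trees, compute their Robinson-Foulds distance, prune one.
from ete3 import Tree

t1 = Tree("((A:1,B:1):1,(C:1,D:1):1);")
t2 = Tree("((A:1,C:1):1,(B:1,D:1):1);")

print("RF distance:", t1.compare(t2)["rf"])

t1.prune(["A", "B", "C"])  # keep only these leaves
print(t1.get_ascii())
```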
### Genomics Tools
- **deepTools** - Comprehensive suite of Python tools for exploring and visualizing next-generation sequencing (NGS) data, particularly ChIP-seq, RNA-seq, and ATAC-seq experiments. Provides command-line tools and Python API for processing BAM and bigWig files. Key features include: quality control metrics (plotFingerprint, plotCorrelation), coverage track generation (bamCoverage for creating bigWig files), matrix generation for heatmaps (computeMatrix, plotHeatmap, plotProfile), comparative analysis (multiBigwigSummary, plotPCA), and efficient handling of large files. Supports normalization methods, binning options, and various visualization outputs. Designed for high-throughput analysis workflows and publication-quality figure generation. Use cases: ChIP-seq peak visualization, RNA-seq coverage analysis, ATAC-seq signal tracks, comparative genomics, and NGS data exploration
- **FlowIO** - Python library for reading and manipulating Flow Cytometry Standard (FCS) files, the standard format for flow cytometry data. Provides efficient parsing of FCS files (versions 2.0, 3.0, 3.1), access to event data (fluorescence intensities, scatter parameters), metadata extraction (keywords, parameters, acquisition settings), and conversion to pandas DataFrames or NumPy arrays. Features include: support for large FCS files, handling of multiple data segments, access to text segments and analysis segments, and integration with flow cytometry analysis workflows. Enables programmatic access to flow cytometry data for downstream analysis, visualization, and machine learning applications. Use cases: flow cytometry data analysis, high-throughput screening, immune cell profiling, and automated processing of FCS files
- **scikit-bio** - Python library for bioinformatics providing data structures, algorithms, and parsers for biological sequence analysis. Built on NumPy, SciPy, and pandas. Key features include: sequence objects (DNA, RNA, protein sequences) with biological alphabet validation, sequence alignment algorithms (local, global, semiglobal), phylogenetic tree manipulation, diversity metrics (alpha diversity, beta diversity, phylogenetic diversity), distance metrics for sequences and communities, file format parsers (FASTA, FASTQ, QIIME formats, Newick), and statistical analysis tools. Provides scikit-learn compatible transformers for machine learning workflows. Supports efficient processing of large sequence datasets. Use cases: sequence analysis, microbial ecology (16S rRNA analysis), metagenomics, phylogenetic analysis, and bioinformatics research requiring sequence manipulation and diversity calculations
- **Zarr** - Python library implementing the Zarr chunked, compressed N-dimensional array storage format. Provides efficient storage and access to large multi-dimensional arrays with chunking and compression. Key features include: support for NumPy-like arrays with chunked storage, multiple compression codecs (zlib, blosc, lz4, zstd), support for various data types, efficient partial array reading (only load needed chunks), support for both local filesystem and cloud storage (S3, GCS, Azure), and integration with NumPy, Dask, and Xarray. Enables working with arrays larger than available RAM through lazy loading and efficient chunk access. Supports parallel read/write operations and is optimized for cloud storage backends. Use cases: large-scale scientific data storage, cloud-based array storage, out-of-core array operations, and efficient storage of multi-dimensional datasets (genomics, imaging, climate data)
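A minimal Zarr sketch of chunked on-disk storage with partial reads:

```python
# Create a chunked float32 array on disk, then read one slice lazily.
import numpy as np
import zarr

z = zarr.open(
    "example.zarr", mode="w",
    shape=(10_000, 10_000), chunks=(1_000, 1_000), dtype="f4",
)
z[:1_000, :1_000] = np.random.rand(1_000, 1_000)

z_read = zarr.open("example.zarr", mode="r")
print(z_read[0, :10])  # loads only the chunks it touches
print(z_read.chunks, z_read.dtype)
```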
### Multi-omics & AI Agent Frameworks
- **BIOMNI** - Autonomous biomedical AI agent framework from Stanford SNAP lab for executing complex research tasks across genomics, drug discovery, molecular biology, and clinical analysis. Combines LLM reasoning with code execution and ~11GB of integrated biomedical databases (Ensembl, NCBI Gene, UniProt, PDB, AlphaFold, ClinVar, OMIM, HPO, PubMed, KEGG, Reactome, GO). Supports multiple LLM providers (Claude, GPT-4, Gemini, Groq, Bedrock). Includes A1 agent class for autonomous task decomposition, BiomniEval1 benchmark framework, and MCP server integration. Use cases: CRISPR screening design, single-cell RNA-seq analysis, ADMET prediction, GWAS interpretation, rare disease diagnosis, protein structure analysis, literature synthesis, and multi-omics integration
- **Denario** - Multiagent AI system for scientific research assistance that automates complete research workflows from data analysis through publication. Built on AG2 and LangGraph frameworks, orchestrates specialized agents for hypothesis generation, methodology development, computational analysis, and LaTeX paper writing. Supports multiple LLM providers (Google Vertex AI, OpenAI) with flexible pipeline stages allowing manual or automated inputs. Key features include: end-to-end research automation (data description → idea generation → methodology → results → paper), journal-specific formatting (APS and others), GUI interface via Streamlit, Docker deployment with LaTeX environment, reproducible research with version-controlled outputs, literature search integration, and integration with scientific Python stack (pandas, sklearn, scipy). Provides both programmatic Python API and web-based interface. Use cases: automated hypothesis generation from datasets, research methodology development, computational experiment execution with visualization, publication-ready manuscript generation, time-series analysis research, machine learning experiment automation, and accelerating the complete scientific research lifecycle from ideation to publication
- **HypoGeniC** - Automated hypothesis generation and testing using large language models to accelerate scientific discovery. Provides three frameworks: HypoGeniC (data-driven hypothesis generation from observational data), HypoRefine (synergistic approach combining literature insights with empirical patterns through an agentic system), and Union methods (mechanistic combination of literature and data-driven hypotheses). Features iterative refinement that improves hypotheses by learning from challenging examples, Redis caching for API cost reduction, and customizable YAML-based prompt templates. Includes command-line tools for generation (hypogenic_generation) and testing (hypogenic_inference). Research applications have demonstrated 14.19% accuracy improvement in AI-content detection and 7.44% in deception detection. Use cases: deception detection in reviews, AI-generated content identification, mental stress detection, exploratory research without existing literature, hypothesis-driven analysis in novel domains, and systematic exploration of competing explanations
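A minimal BIOMNI sketch, based on the entry point its documentation describes: the `A1` class name, constructor arguments, and model string here are assumptions drawn from the summary above, not verified API.

```python
from biomni.agent import A1

# Assumed entry point: A1 downloads its integrated data lake (~11 GB)
# into `path` on first run and routes reasoning through the chosen LLM.
agent = A1(path="./biomni_data", llm="claude-sonnet-4")  # model name is a placeholder
agent.go("Design a CRISPR screen for regulators of T-cell exhaustion")
```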
### Scientific Communication & Publishing
- **Paper-2-Web** - Autonomous pipeline for transforming academic papers into multiple promotional formats using the Paper2All system. Converts LaTeX or PDF papers into: (1) Paper2Web - interactive, layout-aware academic homepages with responsive design, interactive figures, and mobile support; (2) Paper2Video - professional presentation videos with slides, narration, cursor movements, and optional talking-head generation using Hallo2; (3) Paper2Poster - print-ready conference posters with custom dimensions, professional layouts, and institution branding. Supports GPT-4/GPT-4.1 models, batch processing, QR code generation, multi-language content, and quality assessment metrics. Use cases: conference materials, video abstracts, preprint enhancement, research promotion, poster sessions, and academic website creation
### Document Processing & Conversion
- **MarkItDown** - Python utility for converting 20+ file formats to Markdown optimized for LLM processing. Converts Office documents (PDF, DOCX, PPTX, XLSX), images with OCR, audio with transcription, web content (HTML, YouTube transcripts, EPUB), and structured data (CSV, JSON, XML) while preserving document structure (headings, lists, tables, hyperlinks). Key features include: Azure Document Intelligence integration for enhanced PDF table extraction, LLM-powered image descriptions using GPT-4o, batch processing with ZIP archive support, modular installation for specific formats, streaming approach without temporary files, and plugin system for custom converters. Supports Python 3.10+. Use cases: preparing documents for RAG systems, extracting text from PDFs and Office files, transcribing audio to text, performing OCR on images and scanned documents, converting YouTube videos to searchable text, processing HTML and EPUB books, converting structured data to readable format, document analysis pipelines, and LLM training data preparation
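A minimal MarkItDown sketch; the file name is hypothetical, and the same call handles any of the supported formats:

```python
from markitdown import MarkItDown

# Convert a document to Markdown suitable for LLM ingestion.
md = MarkItDown()
result = md.convert("quarterly_report.pdf")   # hypothetical input file
print(result.text_content)                    # Markdown string
```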
### Laboratory Automation & Equipment Control
- **PyLabRobot** - Hardware-agnostic, pure Python SDK for automated and autonomous laboratories. Provides unified interface for controlling liquid handling robots (Hamilton STAR/STARlet, Opentrons OT-2, Tecan EVO), plate readers (BMG CLARIOstar), heater shakers, incubators, centrifuges, pumps, and scales. Key features include: modular resource management system for plates, tips, and containers with hierarchical deck layouts and JSON serialization; comprehensive liquid handling operations (aspirate, dispense, transfer, serial dilutions, plate replication) with automatic tip and volume tracking; backend abstraction enabling hardware-agnostic protocols that work across different robots; ChatterboxBackend for protocol simulation and testing without hardware; browser-based visualizer for real-time 3D deck state visualization; cross-platform support (Windows, macOS, Linux, Raspberry Pi); and integration capabilities for multi-device workflows combining liquid handlers, analytical equipment, and material handling devices. Use cases: automated sample preparation, high-throughput screening, serial dilution protocols, plate reading workflows, laboratory protocol development and validation, robotic liquid handling automation, and reproducible laboratory automation with state tracking and persistence
### Tool Discovery & Research Platforms
- **ToolUniverse** - Unified ecosystem providing standardized access to 600+ scientific tools, models, datasets, and APIs across bioinformatics, cheminformatics, genomics, structural biology, and proteomics. Enables AI agents to function as research scientists through: (1) Tool Discovery - natural language, semantic, and keyword-based search for finding relevant scientific tools (Tool_Finder, Tool_Finder_LLM, Tool_Finder_Keyword); (2) Tool Execution - standardized AI-Tool Interaction Protocol for running tools with consistent interfaces; (3) Tool Composition - sequential and parallel workflow chaining for multi-step research pipelines; (4) Model Context Protocol (MCP) integration for Claude Desktop/Code. Supports drug discovery workflows (disease→targets→structures→screening→candidates), genomics analysis (expression→differential analysis→pathways), clinical genomics (variants→annotation→pathogenicity→disease associations), and cross-domain research. Use cases: accessing scientific databases (OpenTargets, PubChem, UniProt, PDB, ChEMBL, KEGG), protein structure prediction (AlphaFold), molecular docking, pathway enrichment, variant annotation, literature searches, and automated scientific workflows
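A sketch of the ToolUniverse calling convention implied above, in which tools are invoked through a function-call-style dict. The method names (`load_tools`, `run`) and the argument schema are assumptions here and should be verified against the project's README.

```python
from tooluniverse import ToolUniverse

# Assumed pattern: load the tool registry, then invoke a discovery tool
# with a function-call-style request.
tu = ToolUniverse()
tu.load_tools()
hits = tu.run({
    "name": "Tool_Finder_Keyword",
    "arguments": {"description": "protein structure prediction", "limit": 5},
})
print(hits)
```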
## Scientific Thinking & Analysis
### Analysis & Methodology
- **Exploratory Data Analysis** - Comprehensive EDA toolkit with automated statistics, visualizations, and insights for any tabular dataset
- **Hypothesis Generation** - Structured frameworks for generating and evaluating scientific hypotheses
- **Literature Review** - Systematic literature search and review toolkit with support for multiple scientific databases (PubMed, bioRxiv, Google Scholar), citation management with multiple citation styles (APA, AMA, Vancouver, Chicago, IEEE, Nature, Science), citation verification and deduplication, search strategies (Boolean operators, MeSH terms, field tags), PDF report generation with formatted references, and comprehensive templates for conducting systematic reviews following PRISMA guidelines
- **Peer Review** - Comprehensive toolkit for conducting high-quality scientific peer review with structured evaluation of methodology, statistics, reproducibility, ethics, and presentation across all scientific disciplines
- **Scientific Brainstorming** - Conversational brainstorming partner for generating novel research ideas, exploring connections, challenging assumptions, and developing creative approaches through structured ideation workflows
- **Scientific Critical Thinking** - Tools and approaches for rigorous scientific reasoning and evaluation
- **Scientific Visualization** - Best practices and templates for creating publication-quality scientific figures with matplotlib and seaborn, including statistical plots with automatic confidence intervals, colorblind-safe palettes, multi-panel figures, heatmaps, and journal-specific formatting
- **Scientific Writing** - Comprehensive toolkit for writing, structuring, and formatting scientific research papers using IMRAD format, multiple citation styles (APA, AMA, Vancouver, Chicago, IEEE), reporting guidelines (CONSORT, STROBE, PRISMA), effective figures and tables, field-specific terminology, venue-specific structure expectations, and core writing principles for clarity, conciseness, and accuracy across all scientific disciplines
- **Statistical Analysis** - Comprehensive statistical testing, power analysis, and experimental design (worked power-analysis example after this list)
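As a concrete instance of the power-analysis workflow this skill covers, a standard statsmodels computation (shown for illustration, not as part of the skill itself) looks like:

```python
from statsmodels.stats.power import TTestIndPower

# Sample size needed per group to detect a medium effect (d = 0.5)
# at alpha = 0.05 with 80% power in a two-sided two-sample t-test.
n_per_group = TTestIndPower().solve_power(effect_size=0.5, alpha=0.05, power=0.8)
print(round(n_per_group))  # ~64 per group
```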
### Document Processing
- **DOCX** - Comprehensive document creation, editing, and analysis with support for tracked changes, comments, formatting preservation, and text extraction
- **PDF** - PDF manipulation toolkit for extracting text and tables, creating new PDFs, merging/splitting documents, and handling forms
- **PPTX** - Presentation creation, editing, and analysis with support for layouts, comments, and speaker notes
- **XLSX** - Spreadsheet creation, editing, and analysis with support for formulas, formatting, data analysis, and visualization


@@ -1,20 +0,0 @@
# Scientific Thinking & Analysis
## Analysis & Methodology
- **Exploratory Data Analysis** - Comprehensive EDA toolkit with automated statistics, visualizations, and insights for any tabular dataset
- **Hypothesis Generation** - Structured frameworks for generating and evaluating scientific hypotheses
- **Literature Review** - Systematic literature search and review toolkit with support for multiple scientific databases (PubMed, bioRxiv, Google Scholar), citation management with multiple citation styles (APA, AMA, Vancouver, Chicago, IEEE, Nature, Science), citation verification and deduplication, search strategies (Boolean operators, MeSH terms, field tags), PDF report generation with formatted references, and comprehensive templates for conducting systematic reviews following PRISMA guidelines
- **Peer Review** - Comprehensive toolkit for conducting high-quality scientific peer review with structured evaluation of methodology, statistics, reproducibility, ethics, and presentation across all scientific disciplines
- **Scientific Brainstorming** - Conversational brainstorming partner for generating novel research ideas, exploring connections, challenging assumptions, and developing creative approaches through structured ideation workflows
- **Scientific Critical Thinking** - Tools and approaches for rigorous scientific reasoning and evaluation
- **Scientific Visualization** - Best practices and templates for creating publication-quality scientific figures with matplotlib and seaborn, including statistical plots with automatic confidence intervals, colorblind-safe palettes, multi-panel figures, heatmaps, and journal-specific formatting
- **Scientific Writing** - Comprehensive toolkit for writing, structuring, and formatting scientific research papers using IMRAD format, multiple citation styles (APA, AMA, Vancouver, Chicago, IEEE), reporting guidelines (CONSORT, STROBE, PRISMA), effective figures and tables, field-specific terminology, venue-specific structure expectations, and core writing principles for clarity, conciseness, and accuracy across all scientific disciplines
- **Statistical Analysis** - Comprehensive statistical testing, power analysis, and experimental design
## Document Processing
- **DOCX** - Comprehensive document creation, editing, and analysis with support for tracked changes, comments, formatting preservation, and text extraction
- **PDF** - PDF manipulation toolkit for extracting text and tables, creating new PDFs, merging/splitting documents, and handling forms
- **PPTX** - Presentation creation, editing, and analysis with support for layouts, comments, and speaker notes
- **XLSX** - Spreadsheet creation, editing, and analysis with support for formulas, formatting, data analysis, and visualization


@@ -1,80 +0,0 @@
---
name: context-initialization
description: "Always Auto-invoked skill that creates/updates workspace AGENTS.md to instruct the agent to always search for existing skills before attempting any scientific task."
---
# Context Initialization
## Overview
This skill automatically creates or updates an `AGENTS.md` file in the workspace root that instructs the agent to use existing skills before attempting to solve scientific tasks. This ensures the agent uses documented patterns, authentication methods, working examples, and best practices from the repository's skills rather than inventing solutions from scratch.
## When to Use This Skill
This skill is automatically invoked when:
- The agent initializes in this workspace
- User begins any scientific task (database access, package usage, platform integration, or methodology)
- User mentions specific databases, packages, platforms, or research methods
- Any scientific data retrieval, analysis, or research task is started
**No manual invocation required** - this skill runs automatically.
## What This Skill Does
Creates or updates `AGENTS.md` in the workspace root with instructions for the agent to:
1. **Search first**: Look for relevant skills across all skill categories before writing code
2. **Use existing patterns**: Apply documented API access patterns, workflows, and examples
3. **Follow best practices**: Use rate limits, authentication, configurations, and established methodologies
4. **Adapt examples**: Leverage working code examples from `scripts/` folders and reference documentation
**Important**: If `AGENTS.md` already exists in the workspace, this skill will update it intelligently rather than overwriting it. This preserves any custom instructions or modifications while ensuring the essential skill-search directives are present.
## Skill Categories
This unified context initialization covers four major skill categories:
### Database Access Tasks
- Search `scientific-databases/` for 24+ database skills
- Use documented API endpoints and authentication patterns
- Apply working code examples and best practices
- Follow rate limits and error handling patterns
### Scientific Package Usage
- Search `scientific-packages/` for 40+ Python package skills
- Use installation instructions and API usage examples
- Apply best practices and common patterns
- Leverage working scripts and reference documentation
### Laboratory Platform Integration
- Search `scientific-integrations/` for 6+ platform integration skills
- Use authentication and setup instructions
- Apply API access patterns and platform-specific best practices
- Leverage working integration examples
### Scientific Analysis & Research Methods
- Search `scientific-thinking/` for methodology skills
- Use established data analysis frameworks (EDA, statistical analysis)
- Apply research methodologies (hypothesis generation, brainstorming, critical thinking)
- Leverage communication skills (scientific writing, visualization, peer review)
- Use document processing skills (DOCX, PDF, PPTX, XLSX)
## Implementation
When invoked, this skill manages the workspace `AGENTS.md` file as follows:
- **If `AGENTS.md` does not exist**: Creates a new file using the complete template from `references/AGENTS.md`
- **If `AGENTS.md` already exists**: Updates the file to ensure the essential skill-search directives are present, while preserving any existing custom content or modifications
The file includes sections instructing the agent to search for and use existing skills across all scientific task categories.
The complete reference template is available in `references/AGENTS.md`.
## Benefits
By centralizing context initialization, this skill ensures:
- **Consistency**: The agent always uses the same approach across all skill types
- **Efficiency**: One initialization covers all scientific tasks
- **Maintainability**: Updates to the initialization strategy occur in one place
- **Completeness**: The agent is reminded to search across all available skill categories


@@ -1,73 +0,0 @@
# Reference: Complete Context Initialization Template
This is the complete reference template for what gets added to the workspace root `AGENTS.md` file.
---
# Agent Scientific Skills - Working Instructions
## IMPORTANT: Use Available Skills First
Before attempting any scientific task, use available skills.
---
## Database Access Tasks
**Before writing any database access code, use available skills in this repository.**
This repository contains skills for 24+ scientific databases. Each skill includes:
- API endpoints and authentication patterns
- Working code examples
- Best practices and rate limits
- Example scripts
Always use available database skills before writing custom database access code.
---
## Scientific Package Usage
**Before writing analysis code with scientific packages, use available skills in this repository.**
This repository contains skills for 40+ scientific Python packages. Each skill includes:
- Installation instructions
- Complete API usage examples
- Best practices and common patterns
- Working scripts and reference documentation
Always use available package skills before writing custom analysis code.
---
## Laboratory Platform Integration
**Before writing any platform integration code, use available skills in this repository.**
This repository contains skills for 6+ laboratory platforms and cloud services. Each skill includes:
- Authentication and setup instructions
- API access patterns
- Working integration examples
- Platform-specific best practices
Always use available integration skills before writing custom platform code.
---
## Scientific Analysis & Research Methods
**Before attempting any analysis, writing, or research task, use available methodology skills in this repository.**
This repository contains skills for scientific methodologies including:
- Data analysis frameworks (EDA, statistical analysis)
- Research methodologies (hypothesis generation, brainstorming, critical thinking)
- Communication skills (scientific writing, visualization, peer review)
- Document processing (DOCX, PDF, PPTX, XLSX)
Always use available methodology skills before attempting scientific analysis or writing tasks.
---
*This file is auto-generated by context-initialization skills. It ensures the agent uses available skills before attempting to solve scientific tasks from scratch.*
