Files
claude-scientific-skills/scientific-skills/scvelo/scripts/rna_velocity_workflow.py
huangkuanlin 7f94783fab Add scVelo RNA velocity analysis workflow and IQ-TREE reference documentation
- Introduced a comprehensive RNA velocity analysis pipeline using scVelo, including data loading, preprocessing, velocity estimation, and visualization.
- Added a script for running RNA velocity analysis with customizable parameters and output options.
- Created detailed documentation for IQ-TREE 2 phylogenetic inference, covering command syntax, model selection, bootstrapping methods, and output interpretation.
- Included references for velocity models and their mathematical framework, along with a comparison of different models.
- Enhanced the scVelo skill documentation with installation instructions, use cases, and best practices for RNA velocity analysis.
2026-03-03 07:15:36 -05:00

233 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
RNA Velocity Analysis Workflow using scVelo
===========================================
Complete pipeline from raw data to velocity visualization.
Usage:
python rna_velocity_workflow.py
Or import and use run_velocity_analysis() with your AnnData object.
"""
import scvelo as scv
import scanpy as sc
import numpy as np
import matplotlib
matplotlib.use('Agg') # Non-interactive backend
import matplotlib.pyplot as plt
import os
def run_velocity_analysis(
adata,
groupby="leiden",
n_top_genes=2000,
n_neighbors=30,
mode="dynamical",
n_jobs=4,
output_dir="velocity_results",
):
"""
Complete RNA velocity analysis workflow.
Parameters
----------
adata : AnnData
AnnData object with 'spliced' and 'unspliced' layers.
Should already have UMAP and cluster annotations.
groupby : str
Column in adata.obs for cell type labels.
n_top_genes : int
Number of top highly variable genes.
n_neighbors : int
Number of neighbors for moment computation.
mode : str
Velocity model: 'stochastic' (fast) or 'dynamical' (accurate).
n_jobs : int
Parallel jobs for dynamical model fitting.
output_dir : str
Directory for saving output figures.
Returns
-------
AnnData with velocity annotations.
"""
os.makedirs(output_dir, exist_ok=True)
# ── Settings ──────────────────────────────────────────────────────────────
scv.settings.verbosity = 2
scv.settings.figdir = output_dir
# ── Step 1: Check layers ───────────────────────────────────────────────────
assert "spliced" in adata.layers, "Missing 'spliced' layer. Run velocyto first."
assert "unspliced" in adata.layers, "Missing 'unspliced' layer. Run velocyto first."
print(f"Input: {adata.n_obs} cells × {adata.n_vars} genes")
# ── Step 2: Preprocessing ─────────────────────────────────────────────────
print("Step 1/5: Preprocessing...")
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=n_top_genes)
if "neighbors" not in adata.uns:
sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=30)
scv.pp.moments(adata, n_pcs=30, n_neighbors=n_neighbors)
print(f" {adata.n_vars} velocity genes selected")
# ── Step 3: Velocity estimation ────────────────────────────────────────────
print(f"Step 2/5: Fitting velocity model ({mode})...")
if mode == "dynamical":
scv.tl.recover_dynamics(adata, n_jobs=n_jobs)
scv.tl.velocity(adata, mode=mode)
scv.tl.velocity_graph(adata)
print(" Velocity graph computed")
# ── Step 4: Downstream analyses ────────────────────────────────────────────
print("Step 3/5: Computing latent time and confidence...")
scv.tl.velocity_confidence(adata)
scv.tl.velocity_pseudotime(adata)
if mode == "dynamical":
scv.tl.latent_time(adata)
if groupby in adata.obs.columns:
scv.tl.rank_velocity_genes(adata, groupby=groupby, min_corr=0.3)
# ── Step 5: Visualization ─────────────────────────────────────────────────
print("Step 4/5: Generating figures...")
# Stream plot
scv.pl.velocity_embedding_stream(
adata,
basis="umap",
color=groupby,
title="RNA Velocity",
save=f"{output_dir}/velocity_stream.png",
)
# Arrow plot
scv.pl.velocity_embedding(
adata,
arrow_length=3,
arrow_size=2,
color=groupby,
basis="umap",
save=f"{output_dir}/velocity_arrows.png",
)
# Pseudotime
scv.pl.scatter(
adata,
color="velocity_pseudotime",
cmap="gnuplot",
title="Velocity Pseudotime",
save=f"{output_dir}/pseudotime.png",
)
if mode == "dynamical" and "latent_time" in adata.obs:
scv.pl.scatter(
adata,
color="latent_time",
color_map="gnuplot",
title="Latent Time",
save=f"{output_dir}/latent_time.png",
)
# Speed and coherence
scv.pl.scatter(
adata,
c=["velocity_length", "velocity_confidence"],
cmap="coolwarm",
perc=[5, 95],
save=f"{output_dir}/velocity_quality.png",
)
# Top driver genes heatmap (dynamical only)
if mode == "dynamical" and "fit_likelihood" in adata.var:
top_genes = adata.var["fit_likelihood"].sort_values(ascending=False).index[:50]
scv.pl.heatmap(
adata,
var_names=top_genes,
sortby="latent_time",
col_color=groupby,
n_convolve=50,
save=f"{output_dir}/driver_gene_heatmap.png",
)
# ── Step 6: Save results ───────────────────────────────────────────────────
print("Step 5/5: Saving results...")
output_h5ad = os.path.join(output_dir, "adata_velocity.h5ad")
adata.write_h5ad(output_h5ad)
print(f" Saved to {output_h5ad}")
# Summary statistics
confidence = adata.obs["velocity_confidence"].dropna()
print("\nSummary:")
print(f" Velocity model: {mode}")
print(f" Cells: {adata.n_obs}")
print(f" Velocity genes: {adata.n_vars}")
print(f" Mean velocity confidence: {confidence.mean():.3f}")
print(f" High-confidence cells (>0.7): {(confidence > 0.7).sum()} ({(confidence > 0.7).mean():.1%})")
if mode == "dynamical" and "fit_likelihood" in adata.var:
good_genes = (adata.var["fit_likelihood"] > 0.1).sum()
print(f" Well-fit genes (likelihood>0.1): {good_genes}")
print(f"\nOutput files saved to: {output_dir}/")
return adata
def load_from_loom(loom_path, processed_h5ad=None):
"""
Load velocity data from velocyto loom file.
Args:
loom_path: Path to velocyto output loom file
processed_h5ad: Optional path to pre-processed Scanpy h5ad file
"""
adata_loom = scv.read(loom_path, cache=True)
if processed_h5ad:
adata_processed = sc.read_h5ad(processed_h5ad)
# Merge: keep processed metadata and add velocity layers
adata = scv.utils.merge(adata_processed, adata_loom)
else:
adata = adata_loom
# Run basic Scanpy pipeline
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=3000)
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=0.5)
return adata
if __name__ == "__main__":
# Example usage with simulated data (for testing)
print("scVelo RNA Velocity Workflow - Demo Mode")
print("=" * 50)
# Load example dataset
adata = scv.datasets.pancreas()
print(f"Loaded pancreas dataset: {adata}")
# Run analysis
adata = run_velocity_analysis(
adata,
groupby="clusters",
n_top_genes=2000,
mode="dynamical",
n_jobs=2,
output_dir="pancreas_velocity",
)
print("\nAnalysis complete!")
print(f"Key results:")
print(f" adata.layers['velocity']: velocity per gene per cell")
print(f" adata.obs['latent_time']: pseudotime from dynamics")
print(f" adata.obs['velocity_confidence']: per-cell confidence")
if "rank_velocity_genes" in adata.uns:
print(f" adata.uns['rank_velocity_genes']: driver genes per cluster")