Initial commit

2026-03-28 07:33:45 +08:00 · 2025-10-19 14:01:29 -07:00
parent d85386c32b
commit 152d0d54de
15 changed files with 4569 additions and 0 deletions
--- a/scientific-packages/anndata/references/workflows_best_practices.md
+++ b/scientific-packages/anndata/references/workflows_best_practices.md
@@ -0,0 +1,438 @@
+# AnnData Workflows and Best Practices
+
+## Common Workflows
+
+### 1. Single-Cell RNA-seq Analysis Workflow
+
+#### Loading Data
+```python
+import anndata as ad
+import numpy as np
+import pandas as pd
+
+# Load from 10X format
+adata = ad.read_mtx('matrix.mtx')
+adata.var_names = pd.read_csv('genes.tsv', sep='\t', header=None)[0]
+adata.obs_names = pd.read_csv('barcodes.tsv', sep='\t', header=None)[0]
+
+# Or load from pre-processed h5ad
+adata = ad.read_h5ad('preprocessed_data.h5ad')
+```
+
+#### Quality Control
+```python
+# Calculate QC metrics
+adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)
+adata.obs['total_counts'] = adata.X.sum(axis=1)
+
+# Filter cells
+adata = adata[adata.obs.n_genes > 200]
+adata = adata[adata.obs.total_counts < 10000]
+
+# Filter genes
+min_cells = 3
+adata = adata[:, (adata.X > 0).sum(axis=0) >= min_cells]
+```
+
+#### Normalization and Preprocessing
+```python
+# Store raw counts
+adata.layers['counts'] = adata.X.copy()
+
+# Normalize
+adata.X = adata.X / adata.obs.total_counts.values[:, None] * 1e4
+
+# Log transform
+adata.layers['log1p'] = np.log1p(adata.X)
+adata.X = adata.layers['log1p']
+
+# Identify highly variable genes
+gene_variance = adata.X.var(axis=0)
+adata.var['highly_variable'] = gene_variance > np.percentile(gene_variance, 90)
+```
+
+#### Dimensionality Reduction
+```python
+# PCA
+from sklearn.decomposition import PCA
+pca = PCA(n_components=50)
+adata.obsm['X_pca'] = pca.fit_transform(adata.X)
+
+# Store PCA variance
+adata.uns['pca'] = {'variance_ratio': pca.explained_variance_ratio_}
+
+# UMAP
+from umap import UMAP
+umap = UMAP(n_components=2)
+adata.obsm['X_umap'] = umap.fit_transform(adata.obsm['X_pca'])
+```
+
+#### Clustering
+```python
+# Store cluster assignments
+adata.obs['clusters'] = pd.Categorical(['cluster_0', 'cluster_1', ...])
+
+# Store cluster centroids
+centroids = np.array([...])
+adata.varm['cluster_centroids'] = centroids
+```
+
+#### Save Results
+```python
+# Save complete analysis
+adata.write('analyzed_data.h5ad', compression='gzip')
+```
+
+### 2. Batch Integration Workflow
+
+```python
+import anndata as ad
+
+# Load multiple batches
+batch1 = ad.read_h5ad('batch1.h5ad')
+batch2 = ad.read_h5ad('batch2.h5ad')
+batch3 = ad.read_h5ad('batch3.h5ad')
+
+# Concatenate with batch labels
+adata = ad.concat(
+    [batch1, batch2, batch3],
+    axis=0,
+    label='batch',
+    keys=['batch1', 'batch2', 'batch3'],
+    index_unique='-'
+)
+
+# Batch effect correction would go here
+# (using external tools like Harmony, Scanorama, etc.)
+
+# Store corrected embeddings
+adata.obsm['X_pca_corrected'] = corrected_pca
+adata.obsm['X_umap_corrected'] = corrected_umap
+```
+
+### 3. Memory-Efficient Large Dataset Workflow
+
+```python
+import anndata as ad
+
+# Read in backed mode
+adata = ad.read_h5ad('large_dataset.h5ad', backed='r')
+
+# Check backing status
+print(f"Is backed: {adata.isbacked}")
+print(f"File: {adata.filename}")
+
+# Work with chunks
+for chunk in adata.chunk_X(chunk_size=1000):
+    # Process chunk
+    result = process_chunk(chunk)
+
+# Close file when done
+adata.file.close()
+```
+
+### 4. Multi-Dataset Comparison Workflow
+
+```python
+import anndata as ad
+
+# Load datasets
+datasets = {
+    'study1': ad.read_h5ad('study1.h5ad'),
+    'study2': ad.read_h5ad('study2.h5ad'),
+    'study3': ad.read_h5ad('study3.h5ad')
+}
+
+# Outer join to keep all genes
+combined = ad.concat(
+    list(datasets.values()),
+    axis=0,
+    join='outer',
+    label='study',
+    keys=list(datasets.keys()),
+    merge='first'
+)
+
+# Handle missing data
+combined.X[np.isnan(combined.X)] = 0
+
+# Add dataset-specific metadata
+combined.uns['datasets'] = {
+    'study1': {'date': '2023-01', 'n_samples': datasets['study1'].n_obs},
+    'study2': {'date': '2023-06', 'n_samples': datasets['study2'].n_obs},
+    'study3': {'date': '2024-01', 'n_samples': datasets['study3'].n_obs}
+}
+```
+
+## Best Practices
+
+### Memory Management
+
+#### Use Sparse Matrices
+```python
+from scipy.sparse import csr_matrix
+
+# Convert to sparse if data is sparse
+if density < 0.3:  # Less than 30% non-zero
+    adata.X = csr_matrix(adata.X)
+```
+
+#### Use Backed Mode for Large Files
+```python
+# Read with backing
+adata = ad.read_h5ad('large_file.h5ad', backed='r')
+
+# Only load what you need
+subset = adata[:1000, :500].copy()  # Now in memory
+```
+
+#### Convert Strings to Categoricals
+```python
+# Efficient storage for repeated strings
+adata.strings_to_categoricals()
+
+# Or manually
+adata.obs['cell_type'] = pd.Categorical(adata.obs['cell_type'])
+```
+
+### Data Organization
+
+#### Use Layers for Different Representations
+```python
+# Store multiple versions of the data
+adata.layers['counts'] = raw_counts
+adata.layers['normalized'] = normalized_data
+adata.layers['log1p'] = log_transformed_data
+adata.layers['scaled'] = scaled_data
+```
+
+#### Use obsm/varm for Multi-Dimensional Annotations
+```python
+# Embeddings
+adata.obsm['X_pca'] = pca_coordinates
+adata.obsm['X_umap'] = umap_coordinates
+adata.obsm['X_tsne'] = tsne_coordinates
+
+# Gene loadings
+adata.varm['PCs'] = principal_components
+```
+
+#### Use uns for Analysis Metadata
+```python
+# Store parameters
+adata.uns['preprocessing'] = {
+    'normalization': 'TPM',
+    'min_genes': 200,
+    'min_cells': 3,
+    'date': '2024-01-15'
+}
+
+# Store analysis results
+adata.uns['differential_expression'] = {
+    'method': 't-test',
+    'p_value_threshold': 0.05
+}
+```
+
+### Subsetting and Views
+
+#### Understand View vs Copy
+```python
+# Subsetting returns a view
+subset = adata[adata.obs.cell_type == 'B cell']  # View
+print(subset.is_view)  # True
+
+# Views are memory efficient but modifications affect original
+subset.obs['new_column'] = value  # Modifies original adata
+
+# Create independent copy when needed
+subset_copy = adata[adata.obs.cell_type == 'B cell'].copy()
+```
+
+#### Chain Operations Efficiently
+```python
+# Bad - creates multiple intermediate views
+temp1 = adata[adata.obs.batch == 'batch1']
+temp2 = temp1[temp1.obs.n_genes > 200]
+result = temp2[:, temp2.var.highly_variable].copy()
+
+# Good - chain operations
+result = adata[
+    (adata.obs.batch == 'batch1') & (adata.obs.n_genes > 200),
+    adata.var.highly_variable
+].copy()
+```
+
+### File I/O
+
+#### Use Compression
+```python
+# Save with compression
+adata.write('data.h5ad', compression='gzip')
+```
+
+#### Choose the Right Format
+```python
+# H5AD for general use (good compression, fast)
+adata.write_h5ad('data.h5ad')
+
+# Zarr for cloud storage and parallel access
+adata.write_zarr('data.zarr')
+
+# Loom for compatibility with other tools
+adata.write_loom('data.loom')
+```
+
+#### Close File Connections
+```python
+# Use context manager pattern
+adata = ad.read_h5ad('file.h5ad', backed='r')
+try:
+    # Work with data
+    process(adata)
+finally:
+    adata.file.close()
+```
+
+### Concatenation
+
+#### Choose Appropriate Join Strategy
+```python
+# Inner join - only common features (safe, may lose data)
+combined = ad.concat([adata1, adata2], join='inner')
+
+# Outer join - all features (keeps all data, may introduce zeros)
+combined = ad.concat([adata1, adata2], join='outer')
+```
+
+#### Track Data Sources
+```python
+# Add source labels
+combined = ad.concat(
+    [adata1, adata2, adata3],
+    label='dataset',
+    keys=['exp1', 'exp2', 'exp3']
+)
+
+# Make indices unique
+combined = ad.concat(
+    [adata1, adata2, adata3],
+    index_unique='-'
+)
+```
+
+#### Handle Variable-Specific Metadata
+```python
+# Use merge strategy for var annotations
+combined = ad.concat(
+    [adata1, adata2],
+    merge='same',  # Keep only identical annotations
+    join='outer'
+)
+```
+
+### Naming Conventions
+
+#### Use Consistent Naming
+```python
+# Embeddings: X_<method>
+adata.obsm['X_pca']
+adata.obsm['X_umap']
+adata.obsm['X_tsne']
+
+# Layers: descriptive names
+adata.layers['counts']
+adata.layers['log1p']
+adata.layers['scaled']
+
+# Observations: snake_case
+adata.obs['cell_type']
+adata.obs['n_genes']
+adata.obs['total_counts']
+```
+
+#### Make Indices Unique
+```python
+# Ensure unique names
+adata.obs_names_make_unique()
+adata.var_names_make_unique()
+```
+
+### Error Handling
+
+#### Validate Data Structure
+```python
+# Check dimensions
+assert adata.n_obs > 0, "No observations in data"
+assert adata.n_vars > 0, "No variables in data"
+
+# Check for NaN values
+if np.isnan(adata.X).any():
+    print("Warning: NaN values detected")
+
+# Check for negative values in count data
+if (adata.X < 0).any():
+    print("Warning: Negative values in count data")
+```
+
+#### Handle Missing Data
+```python
+# Check for missing annotations
+if adata.obs['cell_type'].isna().any():
+    print("Warning: Missing cell type annotations")
+    # Fill or remove
+    adata = adata[~adata.obs['cell_type'].isna()]
+```
+
+## Common Pitfalls
+
+### 1. Forgetting to Copy Views
+```python
+# BAD - modifies original
+subset = adata[adata.obs.condition == 'treated']
+subset.X = transformed_data  # Changes original adata!
+
+# GOOD
+subset = adata[adata.obs.condition == 'treated'].copy()
+subset.X = transformed_data  # Only changes subset
+```
+
+### 2. Mixing Backed and In-Memory Operations
+```python
+# BAD - trying to modify backed data
+adata = ad.read_h5ad('file.h5ad', backed='r')
+adata.X[0, 0] = 100  # Error: can't modify backed data
+
+# GOOD - load to memory first
+adata = ad.read_h5ad('file.h5ad', backed='r')
+adata = adata.to_memory()
+adata.X[0, 0] = 100  # Works
+```
+
+### 3. Not Using Categoricals for Metadata
+```python
+# BAD - stores as strings (memory inefficient)
+adata.obs['cell_type'] = ['B cell', 'T cell', ...] * 1000
+
+# GOOD - use categorical
+adata.obs['cell_type'] = pd.Categorical(['B cell', 'T cell', ...] * 1000)
+```
+
+### 4. Incorrect Concatenation Axis
+```python
+# Concatenating observations (cells)
+combined = ad.concat([adata1, adata2], axis=0)  # Correct
+
+# Concatenating variables (genes) - rare
+combined = ad.concat([adata1, adata2], axis=1)  # Less common
+```
+
+### 5. Not Preserving Raw Data
+```python
+# BAD - loses original data
+adata.X = normalized_data
+
+# GOOD - preserve original
+adata.layers['counts'] = adata.X.copy()
+adata.X = normalized_data
+```