mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-27 07:09:27 +08:00
- Add comprehensive TileDB-VCF skill by Jeremy Leipzig - Covers open source TileDB-VCF for learning and moderate-scale work - Emphasizes TileDB-Cloud for large-scale production genomics (1000+ samples) - Includes detailed reference documentation: * ingestion.md - Dataset creation and VCF ingestion * querying.md - Efficient variant queries * export.md - Data export and format conversion * population_genomics.md - GWAS and population analysis workflows - Features accurate TileDB-Cloud API patterns from official repository - Highlights scale transition: open source → TileDB-Cloud for enterprise
523 lines
14 KiB
Markdown
523 lines
14 KiB
Markdown
# TileDB-VCF Querying Guide
|
|
|
|
Complete guide to efficiently querying variant data from TileDB-VCF datasets with optimal performance and memory usage.
|
|
|
|
## Basic Querying
|
|
|
|
### Opening a Dataset
|
|
```python
|
|
import tiledbvcf
|
|
import pandas as pd
|
|
|
|
# Open dataset for reading
|
|
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r")
|
|
|
|
# With custom configuration
|
|
config = tiledbvcf.ReadConfig(
|
|
memory_budget=2048,
|
|
tiledb_config={"sm.tile_cache_size": "1000000000"}
|
|
)
|
|
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r", cfg=config)
|
|
```
|
|
|
|
### Simple Region Query
|
|
```python
|
|
# Query single region
|
|
df = ds.read(
|
|
attrs=["sample_name", "pos_start", "pos_end", "alleles"],
|
|
regions=["chr1:1000000-2000000"]
|
|
)
|
|
print(df.head())
|
|
```
|
|
|
|
### Multi-Region Query
|
|
```python
|
|
# Query multiple regions efficiently
|
|
df = ds.read(
|
|
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
|
regions=[
|
|
"chr1:1000000-2000000",
|
|
"chr2:500000-1500000",
|
|
"chr22:20000000-21000000"
|
|
]
|
|
)
|
|
```
|
|
|
|
## Sample Filtering
|
|
|
|
### Query Specific Samples
|
|
```python
|
|
# Query subset of samples
|
|
df = ds.read(
|
|
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
|
regions=["chr1:1000000-2000000"],
|
|
samples=["SAMPLE_001", "SAMPLE_002", "SAMPLE_003"]
|
|
)
|
|
```
|
|
|
|
### Sample Pattern Matching
|
|
```python
|
|
# Get all sample names first
|
|
sample_names = ds.sample_names()
|
|
print(f"Total samples: {len(sample_names)}")
|
|
|
|
# Filter samples by pattern
|
|
case_samples = [s for s in sample_names if s.startswith("CASE_")]
|
|
control_samples = [s for s in sample_names if s.startswith("CTRL_")]
|
|
|
|
# Query case samples
|
|
case_df = ds.read(
|
|
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
|
regions=["chr1:1000000-2000000"],
|
|
samples=case_samples
|
|
)
|
|
```
|
|
|
|
## Attribute Selection
|
|
|
|
### Available Attributes
|
|
```python
|
|
# List all available attributes
|
|
attrs = ds.attributes()
|
|
print("Available attributes:")
|
|
for attr in attrs:
|
|
print(f" {attr}")
|
|
```
|
|
|
|
### Core Variant Attributes
|
|
```python
|
|
# Essential variant information
|
|
df = ds.read(
|
|
attrs=[
|
|
"sample_name", # Sample identifier
|
|
"contig", # Chromosome
|
|
"pos_start", # Start position (1-based)
|
|
"pos_end", # End position (1-based)
|
|
"alleles", # REF and ALT alleles
|
|
"id", # Variant ID (rs numbers, etc.)
|
|
"qual", # Quality score
|
|
"filters" # FILTER field values
|
|
],
|
|
regions=["chr1:1000000-2000000"]
|
|
)
|
|
```
|
|
|
|
### INFO Field Access
|
|
```python
|
|
# Access INFO fields (prefix with 'info_')
|
|
df = ds.read(
|
|
attrs=[
|
|
"sample_name", "pos_start", "alleles",
|
|
"info_AF", # Allele frequency
|
|
"info_AC", # Allele count
|
|
"info_AN", # Allele number
|
|
"info_DP", # Total depth
|
|
"info_ExcessHet" # Excess heterozygosity
|
|
],
|
|
regions=["chr1:1000000-2000000"]
|
|
)
|
|
```
|
|
|
|
### FORMAT Field Access
|
|
```python
|
|
# Access FORMAT fields (prefix with 'fmt_')
|
|
df = ds.read(
|
|
attrs=[
|
|
"sample_name", "pos_start", "alleles",
|
|
"fmt_GT", # Genotype
|
|
"fmt_DP", # Read depth
|
|
"fmt_GQ", # Genotype quality
|
|
"fmt_AD", # Allelic depths
|
|
"fmt_PL" # Phred-scaled likelihoods
|
|
],
|
|
regions=["chr1:1000000-2000000"],
|
|
samples=["SAMPLE_001", "SAMPLE_002"]
|
|
)
|
|
```
|
|
|
|
## Advanced Filtering
|
|
|
|
### Quality-Based Filtering
|
|
```python
|
|
# Query with quality information
|
|
df = ds.read(
|
|
attrs=["sample_name", "pos_start", "alleles", "qual", "fmt_GQ", "fmt_DP"],
|
|
regions=["chr1:1000000-2000000"]
|
|
)
|
|
|
|
# Filter high-quality variants in pandas
|
|
high_qual_df = df[
|
|
(df["qual"] >= 30) &
|
|
(df["fmt_GQ"] >= 20) &
|
|
(df["fmt_DP"] >= 10)
|
|
]
|
|
```
|
|
|
|
### Genotype Filtering
|
|
```python
|
|
# Query genotype data
|
|
df = ds.read(
|
|
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
|
regions=["chr1:1000000-2000000"]
|
|
)
|
|
|
|
# Filter for heterozygous variants
|
|
# fmt_GT format: [allele1, allele2] where -1 = missing
|
|
het_variants = df[
|
|
df["fmt_GT"].apply(lambda gt:
|
|
isinstance(gt, list) and len(gt) == 2 and
|
|
gt[0] != gt[1] and -1 not in gt
|
|
)
|
|
]
|
|
```
|
|
|
|
## Performance Optimization
|
|
|
|
### Memory-Efficient Streaming
|
|
```python
|
|
# For large result sets, use iterator pattern
|
|
def stream_variants(dataset, regions, chunk_size=100000):
|
|
"""Stream variants in chunks to manage memory"""
|
|
total_variants = 0
|
|
|
|
for region in regions:
|
|
print(f"Processing region: {region}")
|
|
|
|
# Query region
|
|
df = dataset.read(
|
|
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
|
regions=[region]
|
|
)
|
|
|
|
# Process in chunks
|
|
for i in range(0, len(df), chunk_size):
|
|
chunk = df.iloc[i:i+chunk_size]
|
|
yield chunk
|
|
total_variants += len(chunk)
|
|
|
|
if i + chunk_size < len(df):
|
|
print(f" Processed {i + chunk_size:,} variants...")
|
|
|
|
print(f"Total variants processed: {total_variants:,}")
|
|
|
|
# Usage
|
|
regions = ["chr1", "chr2", "chr3"]
|
|
for chunk in stream_variants(ds, regions):
|
|
# Process chunk
|
|
pass
|
|
```
|
|
|
|
### Parallel Region Processing
|
|
```python
|
|
import multiprocessing as mp
|
|
from functools import partial
|
|
|
|
def query_region(dataset_uri, region, samples=None):
|
|
"""Query single region - can be parallelized"""
|
|
ds = tiledbvcf.Dataset(uri=dataset_uri, mode="r")
|
|
|
|
df = ds.read(
|
|
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
|
regions=[region],
|
|
samples=samples
|
|
)
|
|
|
|
return region, df
|
|
|
|
def parallel_query(dataset_uri, regions, samples=None, n_processes=4):
|
|
"""Query multiple regions in parallel"""
|
|
query_func = partial(query_region, dataset_uri, samples=samples)
|
|
|
|
with mp.Pool(n_processes) as pool:
|
|
results = pool.map(query_func, regions)
|
|
|
|
# Combine results
|
|
combined_df = pd.concat([df for _, df in results], ignore_index=True)
|
|
return combined_df
|
|
|
|
# Usage
|
|
regions = [f"chr{i}:1000000-2000000" for i in range(1, 23)]
|
|
df = parallel_query("my_dataset", regions, n_processes=8)
|
|
```
|
|
|
|
### Query Optimization Strategies
|
|
```python
|
|
# Optimize tile cache for repeated region access
|
|
config = tiledbvcf.ReadConfig(
|
|
memory_budget=4096,
|
|
tiledb_config={
|
|
"sm.tile_cache_size": "2000000000", # 2GB cache
|
|
"sm.mem.reader.sparse_global_order.ratio_tile_ranges": "0.5",
|
|
"sm.mem.reader.sparse_unordered.ratio_coords": "0.5"
|
|
}
|
|
)
|
|
|
|
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r", cfg=config)
|
|
|
|
# Combine nearby regions to reduce query overhead
|
|
def optimize_regions(regions, max_gap=1000000):
|
|
"""Merge nearby regions to reduce query count"""
|
|
if not regions:
|
|
return regions
|
|
|
|
# Parse regions (simplified - assumes chr:start-end format)
|
|
parsed = []
|
|
for region in regions:
|
|
chrom, pos_range = region.split(":")
|
|
start, end = map(int, pos_range.split("-"))
|
|
parsed.append((chrom, start, end))
|
|
|
|
# Sort by chromosome and position
|
|
parsed.sort()
|
|
|
|
merged = []
|
|
current_chrom, current_start, current_end = parsed[0]
|
|
|
|
for chrom, start, end in parsed[1:]:
|
|
if chrom == current_chrom and start - current_end <= max_gap:
|
|
# Merge regions
|
|
current_end = max(current_end, end)
|
|
else:
|
|
# Add current region and start new one
|
|
merged.append(f"{current_chrom}:{current_start}-{current_end}")
|
|
current_chrom, current_start, current_end = chrom, start, end
|
|
|
|
# Add final region
|
|
merged.append(f"{current_chrom}:{current_start}-{current_end}")
|
|
|
|
return merged
|
|
```
|
|
|
|
## Complex Queries
|
|
|
|
### Population-Specific Queries
|
|
```python
|
|
# Query specific populations
|
|
def query_population(ds, regions, population_file):
|
|
"""Query variants for specific population"""
|
|
# Read population assignments
|
|
pop_df = pd.read_csv(population_file) # sample_id, population
|
|
|
|
populations = {}
|
|
for _, row in pop_df.iterrows():
|
|
pop = row['population']
|
|
if pop not in populations:
|
|
populations[pop] = []
|
|
populations[pop].append(row['sample_id'])
|
|
|
|
results = {}
|
|
for pop_name, samples in populations.items():
|
|
print(f"Querying {pop_name}: {len(samples)} samples")
|
|
|
|
df = ds.read(
|
|
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
|
regions=regions,
|
|
samples=samples
|
|
)
|
|
|
|
results[pop_name] = df
|
|
|
|
return results
|
|
|
|
# Usage
|
|
regions = ["chr1:1000000-2000000"]
|
|
pop_results = query_population(ds, regions, "population_assignments.csv")
|
|
```
|
|
|
|
### Allele Frequency Calculation
|
|
```python
|
|
def calculate_allele_frequencies(df):
|
|
"""Calculate allele frequencies from genotype data"""
|
|
af_results = []
|
|
|
|
# Group by variant position
|
|
for (contig, pos), group in df.groupby(['contig', 'pos_start']):
|
|
alleles = group['alleles'].iloc[0] # REF and ALT alleles
|
|
genotypes = group['fmt_GT'].values
|
|
|
|
# Count alleles
|
|
allele_counts = {}
|
|
total_alleles = 0
|
|
|
|
for gt in genotypes:
|
|
if isinstance(gt, list) and -1 not in gt: # Valid genotype
|
|
for allele_idx in gt:
|
|
allele_counts[allele_idx] = allele_counts.get(allele_idx, 0) + 1
|
|
total_alleles += 1
|
|
|
|
# Calculate frequencies
|
|
frequencies = {}
|
|
for allele_idx, count in allele_counts.items():
|
|
if allele_idx < len(alleles):
|
|
allele = alleles[allele_idx]
|
|
freq = count / total_alleles if total_alleles > 0 else 0
|
|
frequencies[allele] = freq
|
|
|
|
af_results.append({
|
|
'contig': contig,
|
|
'pos': pos,
|
|
'alleles': alleles,
|
|
'frequencies': frequencies,
|
|
'sample_count': len(group)
|
|
})
|
|
|
|
return af_results
|
|
|
|
# Usage
|
|
df = ds.read(
|
|
attrs=["contig", "pos_start", "alleles", "fmt_GT"],
|
|
regions=["chr1:1000000-1010000"],
|
|
samples=sample_list[:100] # Subset for faster calculation
|
|
)
|
|
|
|
af_results = calculate_allele_frequencies(df)
|
|
```
|
|
|
|
### Rare Variant Analysis
|
|
```python
|
|
def find_rare_variants(ds, regions, samples, max_af=0.01):
|
|
"""Find rare variants (MAF < threshold)"""
|
|
# Query with allele frequency info
|
|
df = ds.read(
|
|
attrs=["sample_name", "pos_start", "alleles", "info_AF", "fmt_GT"],
|
|
regions=regions,
|
|
samples=samples
|
|
)
|
|
|
|
# Filter for rare variants
|
|
rare_df = df[df["info_AF"] < max_af]
|
|
|
|
# Group by variant and count carriers
|
|
rare_summary = []
|
|
for (pos, alleles), group in rare_df.groupby(['pos_start', 'alleles']):
|
|
# Count carriers (samples with non-ref genotypes)
|
|
carriers = []
|
|
for _, row in group.iterrows():
|
|
gt = row['fmt_GT']
|
|
if isinstance(gt, list) and any(allele > 0 for allele in gt if allele != -1):
|
|
carriers.append(row['sample_name'])
|
|
|
|
rare_summary.append({
|
|
'pos': pos,
|
|
'alleles': alleles,
|
|
'af': group['info_AF'].iloc[0],
|
|
'carrier_count': len(carriers),
|
|
'carriers': carriers
|
|
})
|
|
|
|
return pd.DataFrame(rare_summary)
|
|
|
|
# Usage
|
|
rare_variants = find_rare_variants(
|
|
ds,
|
|
regions=["chr1:1000000-2000000"],
|
|
samples=sample_list,
|
|
max_af=0.005 # 0.5% frequency threshold
|
|
)
|
|
```
|
|
|
|
## Cloud Querying
|
|
|
|
### S3 Dataset Queries
|
|
```python
|
|
# Configure for S3 access
|
|
s3_config = tiledbvcf.ReadConfig(
|
|
memory_budget=2048,
|
|
tiledb_config={
|
|
"vfs.s3.region": "us-east-1",
|
|
"vfs.s3.use_virtual_addressing": "true",
|
|
"sm.tile_cache_size": "1000000000"
|
|
}
|
|
)
|
|
|
|
# Query S3-hosted dataset
|
|
ds = tiledbvcf.Dataset(
|
|
uri="s3://my-bucket/vcf-dataset",
|
|
mode="r",
|
|
cfg=s3_config
|
|
)
|
|
|
|
df = ds.read(
|
|
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
|
regions=["chr1:1000000-2000000"]
|
|
)
|
|
```
|
|
|
|
### Federated Queries
|
|
```python
|
|
def federated_query(dataset_uris, regions, samples):
|
|
"""Query multiple datasets and combine results"""
|
|
combined_results = []
|
|
|
|
for dataset_uri in dataset_uris:
|
|
print(f"Querying dataset: {dataset_uri}")
|
|
|
|
ds = tiledbvcf.Dataset(uri=dataset_uri, mode="r")
|
|
|
|
# Check which samples are available in this dataset
|
|
available_samples = set(ds.sample_names())
|
|
query_samples = [s for s in samples if s in available_samples]
|
|
|
|
if query_samples:
|
|
df = ds.read(
|
|
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
|
regions=regions,
|
|
samples=query_samples
|
|
)
|
|
df['dataset'] = dataset_uri
|
|
combined_results.append(df)
|
|
|
|
if combined_results:
|
|
return pd.concat(combined_results, ignore_index=True)
|
|
else:
|
|
return pd.DataFrame()
|
|
|
|
# Usage
|
|
datasets = [
|
|
"s3://data-bucket/study1-dataset",
|
|
"s3://data-bucket/study2-dataset",
|
|
"local_dataset"
|
|
]
|
|
results = federated_query(datasets, ["chr1:1000000-2000000"], ["SAMPLE_001"])
|
|
```
|
|
|
|
## Troubleshooting
|
|
|
|
### Common Query Issues
|
|
```python
|
|
# Handle empty results gracefully
|
|
def safe_query(ds, **kwargs):
|
|
"""Query with error handling"""
|
|
try:
|
|
df = ds.read(**kwargs)
|
|
if df.empty:
|
|
print("Query returned no results")
|
|
return pd.DataFrame()
|
|
return df
|
|
|
|
except Exception as e:
|
|
print(f"Query failed: {e}")
|
|
return pd.DataFrame()
|
|
|
|
# Check dataset contents before querying
|
|
def inspect_dataset(ds):
|
|
"""Inspect dataset properties"""
|
|
print(f"Sample count: {len(ds.sample_names())}")
|
|
print(f"Sample names (first 10): {ds.sample_names()[:10]}")
|
|
print(f"Available attributes: {ds.attributes()}")
|
|
|
|
# Try small test query
|
|
try:
|
|
test_df = ds.read(
|
|
attrs=["sample_name", "pos_start"],
|
|
regions=["chr1:1-1000"]
|
|
)
|
|
print(f"Test query returned {len(test_df)} variants")
|
|
except Exception as e:
|
|
print(f"Test query failed: {e}")
|
|
|
|
# Usage
|
|
inspect_dataset(ds)
|
|
```
|
|
|
|
This comprehensive querying guide covers all aspects of efficiently retrieving and filtering variant data from TileDB-VCF datasets, from basic queries to complex population genomics analyses. |