# TileDB-VCF Querying Guide

A guide to querying variant data from TileDB-VCF datasets efficiently, with attention to performance and memory usage.

## Basic Querying

### Opening a Dataset

```python
import numpy as np
import pandas as pd
import tiledbvcf
# Open dataset for reading
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r")
# With custom configuration
config = tiledbvcf.ReadConfig(
    memory_budget_mb=2048,
tiledb_config={"sm.tile_cache_size": "1000000000"}
)
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r", cfg=config)
```

### Simple Region Query

```python
# Query single region
df = ds.read(
attrs=["sample_name", "pos_start", "pos_end", "alleles"],
regions=["chr1:1000000-2000000"]
)
print(df.head())
```

### Multi-Region Query

```python
# Query multiple regions efficiently
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=[
"chr1:1000000-2000000",
"chr2:500000-1500000",
"chr22:20000000-21000000"
]
)
```

## Sample Filtering

### Query Specific Samples

```python
# Query subset of samples
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=["chr1:1000000-2000000"],
samples=["SAMPLE_001", "SAMPLE_002", "SAMPLE_003"]
)
```

### Sample Pattern Matching

```python
# Get all sample names first
sample_names = ds.sample_names()
print(f"Total samples: {len(sample_names)}")
# Filter samples by pattern
case_samples = [s for s in sample_names if s.startswith("CASE_")]
control_samples = [s for s in sample_names if s.startswith("CTRL_")]
# Query case samples
case_df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=["chr1:1000000-2000000"],
samples=case_samples
)
```

## Attribute Selection

### Available Attributes

```python
# List all available attributes
attrs = ds.attributes()
print("Available attributes:")
for attr in attrs:
print(f" {attr}")
```

### Core Variant Attributes

```python
# Essential variant information
df = ds.read(
attrs=[
"sample_name", # Sample identifier
"contig", # Chromosome
"pos_start", # Start position (1-based)
"pos_end", # End position (1-based)
"alleles", # REF and ALT alleles
"id", # Variant ID (rs numbers, etc.)
"qual", # Quality score
"filters" # FILTER field values
],
regions=["chr1:1000000-2000000"]
)
```

### INFO Field Access

```python
# Access INFO fields (prefix with 'info_')
df = ds.read(
attrs=[
"sample_name", "pos_start", "alleles",
"info_AF", # Allele frequency
"info_AC", # Allele count
"info_AN", # Allele number
"info_DP", # Total depth
"info_ExcessHet" # Excess heterozygosity
],
regions=["chr1:1000000-2000000"]
)
```

### FORMAT Field Access

```python
# Access FORMAT fields (prefix with 'fmt_')
df = ds.read(
attrs=[
"sample_name", "pos_start", "alleles",
"fmt_GT", # Genotype
"fmt_DP", # Read depth
"fmt_GQ", # Genotype quality
"fmt_AD", # Allelic depths
"fmt_PL" # Phred-scaled likelihoods
],
regions=["chr1:1000000-2000000"],
samples=["SAMPLE_001", "SAMPLE_002"]
)
```

## Advanced Filtering

### Quality-Based Filtering

```python
# Query with quality information
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "qual", "fmt_GQ", "fmt_DP"],
regions=["chr1:1000000-2000000"]
)
# Filter high-quality variants in pandas
high_qual_df = df[
(df["qual"] >= 30) &
(df["fmt_GQ"] >= 20) &
(df["fmt_DP"] >= 10)
]
```

### Genotype Filtering

```python
# Query genotype data
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=["chr1:1000000-2000000"]
)
# Filter for heterozygous variants.
# fmt_GT holds allele indexes as an array-like value (e.g. [0, 1]);
# -1 marks a missing allele. Values are typically NumPy arrays,
# so avoid isinstance(gt, list) checks.
het_variants = df[
    df["fmt_GT"].apply(lambda gt:
        gt is not None and len(gt) == 2 and
        gt[0] != gt[1] and -1 not in gt
    )
]
```
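The same pattern extends to other genotype classes. Below is a minimal sketch that buckets diploid genotypes into missing/hom-ref/het/hom-alt; the helper name `classify_gt` is ours, not part of the TileDB-VCF API:

```python
def classify_gt(gt):
    """Bucket a diploid genotype by its allele indexes."""
    if gt is None or len(gt) != 2 or -1 in gt:
        return "missing"
    if gt[0] == gt[1]:
        return "hom_ref" if gt[0] == 0 else "hom_alt"
    return "het"

# Summarize genotype classes across the queried region
df["gt_class"] = df["fmt_GT"].apply(classify_gt)
print(df["gt_class"].value_counts())
```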
## Performance Optimization

### Memory-Efficient Streaming

```python
# For large result sets, use iterator pattern
def stream_variants(dataset, regions, chunk_size=100000):
"""Stream variants in chunks to manage memory"""
total_variants = 0
for region in regions:
print(f"Processing region: {region}")
# Query region
df = dataset.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=[region]
)
# Process in chunks
for i in range(0, len(df), chunk_size):
chunk = df.iloc[i:i+chunk_size]
yield chunk
total_variants += len(chunk)
if i + chunk_size < len(df):
print(f" Processed {i + chunk_size:,} variants...")
print(f"Total variants processed: {total_variants:,}")
# Usage
regions = ["chr1", "chr2", "chr3"]
for chunk in stream_variants(ds, regions):
# Process chunk
    pass
```
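Note that the chunking above only splits a DataFrame that is already fully in memory. TileDB-VCF also exposes an incomplete-query mechanism driven by the memory budget; a minimal sketch of that pattern follows, where `process` is a placeholder for your own per-batch logic:

```python
# A small budget forces the read to complete over several batches
cfg = tiledbvcf.ReadConfig(memory_budget_mb=512)
ds_small = tiledbvcf.Dataset(uri="my_dataset", mode="r", cfg=cfg)

df = ds_small.read(
    attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
    regions=["chr1:1000000-2000000"]
)
process(df)  # placeholder: your per-batch logic

# Keep fetching batches until the query reports completion
while not ds_small.read_completed():
    df = ds_small.continue_read()
    process(df)
```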
### Parallel Region Processing

```python
import multiprocessing as mp
from functools import partial
def query_region(dataset_uri, region, samples=None):
"""Query single region - can be parallelized"""
ds = tiledbvcf.Dataset(uri=dataset_uri, mode="r")
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=[region],
samples=samples
)
return region, df
def parallel_query(dataset_uri, regions, samples=None, n_processes=4):
"""Query multiple regions in parallel"""
query_func = partial(query_region, dataset_uri, samples=samples)
with mp.Pool(n_processes) as pool:
results = pool.map(query_func, regions)
# Combine results
combined_df = pd.concat([df for _, df in results], ignore_index=True)
return combined_df
# Usage (wrap in `if __name__ == "__main__":` when the start method is "spawn")
regions = [f"chr{i}:1000000-2000000" for i in range(1, 23)]
df = parallel_query("my_dataset", regions, n_processes=8)
```

### Query Optimization Strategies

```python
# Optimize tile cache for repeated region access
config = tiledbvcf.ReadConfig(
    memory_budget_mb=4096,
tiledb_config={
"sm.tile_cache_size": "2000000000", # 2GB cache
"sm.mem.reader.sparse_global_order.ratio_tile_ranges": "0.5",
"sm.mem.reader.sparse_unordered.ratio_coords": "0.5"
}
)
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r", cfg=config)
# Combine nearby regions to reduce query overhead
def optimize_regions(regions, max_gap=1000000):
"""Merge nearby regions to reduce query count"""
if not regions:
return regions
# Parse regions (simplified - assumes chr:start-end format)
parsed = []
for region in regions:
chrom, pos_range = region.split(":")
start, end = map(int, pos_range.split("-"))
parsed.append((chrom, start, end))
# Sort by chromosome and position
parsed.sort()
merged = []
current_chrom, current_start, current_end = parsed[0]
for chrom, start, end in parsed[1:]:
if chrom == current_chrom and start - current_end <= max_gap:
# Merge regions
current_end = max(current_end, end)
else:
# Add current region and start new one
merged.append(f"{current_chrom}:{current_start}-{current_end}")
current_chrom, current_start, current_end = chrom, start, end
# Add final region
merged.append(f"{current_chrom}:{current_start}-{current_end}")
    return merged
```
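A quick usage sketch for `optimize_regions`, with made-up coordinates:

```python
regions = [
    "chr1:1000000-1200000",
    "chr1:1250000-1500000",  # within max_gap of the previous region
    "chr2:500000-600000",
]
print(optimize_regions(regions, max_gap=1000000))
# ['chr1:1000000-1500000', 'chr2:500000-600000']
```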
## Complex Queries

### Population-Specific Queries

```python
# Query specific populations
def query_population(ds, regions, population_file):
"""Query variants for specific population"""
# Read population assignments
pop_df = pd.read_csv(population_file) # sample_id, population
populations = {}
for _, row in pop_df.iterrows():
pop = row['population']
if pop not in populations:
populations[pop] = []
populations[pop].append(row['sample_id'])
results = {}
for pop_name, samples in populations.items():
print(f"Querying {pop_name}: {len(samples)} samples")
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=regions,
samples=samples
)
results[pop_name] = df
return results
# Usage
regions = ["chr1:1000000-2000000"]
pop_results = query_population(ds, regions, "population_assignments.csv")
```

### Allele Frequency Calculation

```python
def calculate_allele_frequencies(df):
"""Calculate allele frequencies from genotype data"""
af_results = []
# Group by variant position
for (contig, pos), group in df.groupby(['contig', 'pos_start']):
alleles = group['alleles'].iloc[0] # REF and ALT alleles
genotypes = group['fmt_GT'].values
# Count alleles
allele_counts = {}
total_alleles = 0
for gt in genotypes:
            if gt is not None and -1 not in gt:  # valid, non-missing genotype (array-like)
for allele_idx in gt:
allele_counts[allele_idx] = allele_counts.get(allele_idx, 0) + 1
total_alleles += 1
# Calculate frequencies
frequencies = {}
for allele_idx, count in allele_counts.items():
if allele_idx < len(alleles):
allele = alleles[allele_idx]
freq = count / total_alleles if total_alleles > 0 else 0
frequencies[allele] = freq
af_results.append({
'contig': contig,
'pos': pos,
'alleles': alleles,
'frequencies': frequencies,
'sample_count': len(group)
})
return af_results
# Usage
df = ds.read(
attrs=["contig", "pos_start", "alleles", "fmt_GT"],
regions=["chr1:1000000-1010000"],
    samples=sample_names[:100]  # subset (from ds.sample_names()) for a faster calculation
)
af_results = calculate_allele_frequencies(df)
```

### Rare Variant Analysis

```python
def find_rare_variants(ds, regions, samples, max_af=0.01):
"""Find rare variants (MAF < threshold)"""
# Query with allele frequency info
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "info_AF", "fmt_GT"],
regions=regions,
samples=samples
)
    # Filter for rare variants; info_AF may hold one value per ALT allele,
    # so compare against the first value
    rare_df = df[df["info_AF"].apply(
        lambda af: af is not None and np.size(af) > 0 and float(np.ravel(af)[0]) < max_af
    )].copy()
    # Group by variant and count carriers; allele arrays are converted to
    # tuples so they can serve as a hashable group key
    rare_df["alleles_key"] = rare_df["alleles"].apply(tuple)
    rare_summary = []
    for (pos, alleles), group in rare_df.groupby(['pos_start', 'alleles_key']):
# Count carriers (samples with non-ref genotypes)
carriers = []
for _, row in group.iterrows():
gt = row['fmt_GT']
            if gt is not None and any(allele > 0 for allele in gt if allele != -1):
carriers.append(row['sample_name'])
rare_summary.append({
'pos': pos,
'alleles': alleles,
'af': group['info_AF'].iloc[0],
'carrier_count': len(carriers),
'carriers': carriers
})
return pd.DataFrame(rare_summary)
# Usage
rare_variants = find_rare_variants(
ds,
regions=["chr1:1000000-2000000"],
    samples=sample_names,  # e.g. from ds.sample_names()
max_af=0.005 # 0.5% frequency threshold
)
```

## Cloud Querying

### S3 Dataset Queries

```python
# Configure for S3 access
s3_config = tiledbvcf.ReadConfig(
    memory_budget_mb=2048,
tiledb_config={
"vfs.s3.region": "us-east-1",
"vfs.s3.use_virtual_addressing": "true",
"sm.tile_cache_size": "1000000000"
}
)
# Query S3-hosted dataset
ds = tiledbvcf.Dataset(
uri="s3://my-bucket/vcf-dataset",
mode="r",
cfg=s3_config
)
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=["chr1:1000000-2000000"]
)
```
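If AWS credentials are not already available from the environment (instance profile, `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY` variables, etc.), they can be supplied through the same `tiledb_config` map; the values below are placeholders:

```python
s3_config = tiledbvcf.ReadConfig(
    tiledb_config={
        "vfs.s3.region": "us-east-1",
        "vfs.s3.aws_access_key_id": "YOUR_ACCESS_KEY_ID",    # placeholder
        "vfs.s3.aws_secret_access_key": "YOUR_SECRET_KEY",   # placeholder
    }
)
```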
### Federated Queries

```python
def federated_query(dataset_uris, regions, samples):
"""Query multiple datasets and combine results"""
combined_results = []
for dataset_uri in dataset_uris:
print(f"Querying dataset: {dataset_uri}")
ds = tiledbvcf.Dataset(uri=dataset_uri, mode="r")
# Check which samples are available in this dataset
available_samples = set(ds.sample_names())
query_samples = [s for s in samples if s in available_samples]
if query_samples:
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=regions,
samples=query_samples
)
df['dataset'] = dataset_uri
combined_results.append(df)
if combined_results:
return pd.concat(combined_results, ignore_index=True)
else:
return pd.DataFrame()
# Usage
datasets = [
"s3://data-bucket/study1-dataset",
"s3://data-bucket/study2-dataset",
"local_dataset"
]
results = federated_query(datasets, ["chr1:1000000-2000000"], ["SAMPLE_001"])
```

## Troubleshooting

### Common Query Issues

```python
# Handle empty results gracefully
def safe_query(ds, **kwargs):
"""Query with error handling"""
try:
df = ds.read(**kwargs)
if df.empty:
print("Query returned no results")
return pd.DataFrame()
return df
except Exception as e:
print(f"Query failed: {e}")
return pd.DataFrame()
# Check dataset contents before querying
def inspect_dataset(ds):
"""Inspect dataset properties"""
print(f"Sample count: {len(ds.sample_names())}")
print(f"Sample names (first 10): {ds.sample_names()[:10]}")
print(f"Available attributes: {ds.attributes()}")
# Try small test query
try:
test_df = ds.read(
attrs=["sample_name", "pos_start"],
regions=["chr1:1-1000"]
)
print(f"Test query returned {len(test_df)} variants")
except Exception as e:
print(f"Test query failed: {e}")
# Usage
inspect_dataset(ds)
```
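`safe_query` wraps any of the read patterns in this guide; for example:

```python
df = safe_query(
    ds,
    attrs=["sample_name", "pos_start", "alleles"],
    regions=["chr1:1000000-2000000"]
)
if not df.empty:
    print(df.head())
```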
This guide has covered the main techniques for retrieving and filtering variant data from TileDB-VCF datasets, from basic region queries to population-scale analyses.