mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-27 07:09:27 +08:00
- Add comprehensive TileDB-VCF skill by Jeremy Leipzig - Covers open source TileDB-VCF for learning and moderate-scale work - Emphasizes TileDB-Cloud for large-scale production genomics (1000+ samples) - Includes detailed reference documentation: * ingestion.md - Dataset creation and VCF ingestion * querying.md - Efficient variant queries * export.md - Data export and format conversion * population_genomics.md - GWAS and population analysis workflows - Features accurate TileDB-Cloud API patterns from official repository - Highlights scale transition: open source → TileDB-Cloud for enterprise
569 lines
16 KiB
Markdown
# TileDB-VCF Export Guide
|
|
|
|
Complete guide to exporting data from TileDB-VCF datasets in various formats for downstream analysis and integration with other genomics tools.
|
|
|
|
## VCF/BCF Export
|
|
|
|
### Basic VCF Export
|
|
```python
|
|
import tiledbvcf
|
|
|
|
# Open dataset for reading
|
|
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r")
|
|
|
|
# Export specific regions as VCF
|
|
ds.export_vcf(
|
|
uri="output.vcf.gz",
|
|
regions=["chr1:1000000-2000000"],
|
|
samples=["SAMPLE_001", "SAMPLE_002", "SAMPLE_003"]
|
|
)
|
|
```
|
|
|
|
### BCF Export (Binary VCF)
|
|
```python
|
|
# Export as compressed BCF for faster processing
|
|
ds.export_bcf(
|
|
uri="output.bcf",
|
|
regions=["chr1:1000000-2000000", "chr2:500000-1500000"],
|
|
samples=["SAMPLE_001", "SAMPLE_002"]
|
|
)
|
|
```
|
|
|
|
### Large-Scale Export
|
|
```python
|
|
# Export entire chromosomes efficiently
|
|
def export_chromosome(ds, chrom, output_dir, samples=None):
    """Export all variants on one chromosome to a BCF file.

    Args:
        ds: Open TileDB-VCF dataset (read mode).
        chrom: Chromosome number/name without the "chr" prefix.
        output_dir: Directory that receives the BCF file.
        samples: Optional list of sample names; None exports every sample.
    """
    destination = f"{output_dir}/chr{chrom}.bcf"

    print(f"Exporting chromosome {chrom}")
    # A bare "chrN" region string selects the entire chromosome.
    ds.export_bcf(uri=destination, regions=[f"chr{chrom}"], samples=samples)
    print(f"Exported to {destination}")
|
|
|
|
# Export all autosomes
|
|
for chrom in range(1, 23):
|
|
export_chromosome(ds, chrom, "exported_data")
|
|
```
|
|
|
|
## TSV Export
|
|
|
|
### Basic TSV Export
|
|
```python
|
|
# Export as tab-separated values
|
|
ds.export_tsv(
|
|
uri="variants.tsv",
|
|
regions=["chr1:1000000-2000000"],
|
|
samples=["SAMPLE_001", "SAMPLE_002"],
|
|
tsv_fields=["CHR", "POS", "REF", "ALT", "S:GT", "S:DP"]
|
|
)
|
|
```
|
|
|
|
### Custom Field Selection
|
|
```python
|
|
# Define custom TSV fields
|
|
custom_fields = [
|
|
"CHR", # Chromosome
|
|
"POS", # Position
|
|
"ID", # Variant ID
|
|
"REF", # Reference allele
|
|
"ALT", # Alternative allele
|
|
"QUAL", # Quality score
|
|
"FILTER", # Filter status
|
|
"I:AF", # INFO: Allele frequency
|
|
"I:AC", # INFO: Allele count
|
|
"I:AN", # INFO: Allele number
|
|
"S:GT", # Sample: Genotype
|
|
"S:DP", # Sample: Depth
|
|
"S:GQ" # Sample: Genotype quality
|
|
]
|
|
|
|
ds.export_tsv(
|
|
uri="detailed_variants.tsv",
|
|
regions=["chr1:1000000-2000000"],
|
|
samples=["SAMPLE_001", "SAMPLE_002"],
|
|
tsv_fields=custom_fields
|
|
)
|
|
```
|
|
|
|
### Population-Specific Exports
|
|
```python
|
|
def export_population_data(ds, regions, population_file, output_prefix):
    """Export variant data to one TSV file per population.

    Args:
        ds: Open TileDB-VCF dataset (read mode).
        regions: List of genomic region strings (e.g. "chr1:1-1000000").
        population_file: CSV file with 'sample_id' and 'population' columns.
        output_prefix: Prefix for the "<prefix>_<population>.tsv" outputs.
    """
    import pandas as pd

    # Read population assignments.
    pop_df = pd.read_csv(population_file)

    # Group sample IDs by population label in one pass (replaces the
    # original's manual dict-building loop).
    populations = pop_df.groupby('population')['sample_id'].apply(list)

    # Export each population separately.
    for pop_name, samples in populations.items():
        output_file = f"{output_prefix}_{pop_name}.tsv"

        print(f"Exporting {pop_name}: {len(samples)} samples")

        ds.export_tsv(
            uri=output_file,
            regions=regions,
            samples=samples,
            # NOTE(review): "S:AF" is unusual — AF is normally an INFO
            # field ("I:AF"); confirm the dataset defines a per-sample AF.
            tsv_fields=["CHR", "POS", "REF", "ALT", "S:GT", "S:AF"]
        )

        print(f"Exported {pop_name} data to {output_file}")
|
|
|
|
# Usage
|
|
export_population_data(
|
|
ds,
|
|
regions=["chr1:1000000-2000000"],
|
|
population_file="populations.csv",
|
|
output_prefix="population_variants"
|
|
)
|
|
```
|
|
|
|
## Pandas DataFrame Export
|
|
|
|
### Query to DataFrame
|
|
```python
|
|
# Export query results as pandas DataFrame for analysis
|
|
df = ds.read(
|
|
attrs=["sample_name", "contig", "pos_start", "alleles", "fmt_GT", "info_AF"],
|
|
regions=["chr1:1000000-2000000"],
|
|
samples=["SAMPLE_001", "SAMPLE_002", "SAMPLE_003"]
|
|
)
|
|
|
|
# Save DataFrame to various formats
|
|
df.to_csv("variants.csv", index=False)
|
|
df.to_parquet("variants.parquet")
|
|
df.to_pickle("variants.pkl")
|
|
```
|
|
|
|
### Processed Data Export
|
|
```python
|
|
def export_processed_variants(ds, regions, samples, output_file):
    """Export variants that pass basic quality filters, with derived columns.

    Adds a "variant_id" ("contig:position") and a "genotype_type"
    classification, keeps calls with DP >= 10, GQ >= 20 and a non-missing
    genotype, writes them to *output_file* as CSV, and returns the
    filtered DataFrame.
    """
    # Pull the raw attributes needed for filtering and annotation.
    df = ds.read(
        attrs=["sample_name", "contig", "pos_start", "pos_end",
               "alleles", "fmt_GT", "fmt_DP", "fmt_GQ", "info_AF"],
        regions=regions,
        samples=samples,
    )

    # "contig:position" identifier for each record.
    df['variant_id'] = df['contig'] + ':' + df['pos_start'].astype(str)

    def _classify(gt):
        """Label a diploid genotype; -1 marks a missing allele call."""
        if not (isinstance(gt, list) and len(gt) == 2):
            return "unknown"
        if -1 in gt:
            return "missing"
        return "homozygous" if gt[0] == gt[1] else "heterozygous"

    df['genotype_type'] = df['fmt_GT'].apply(_classify)

    # Keep calls with adequate depth/quality and a real genotype.
    quality_mask = (
        (df['fmt_DP'] >= 10)
        & (df['fmt_GQ'] >= 20)
        & (df['genotype_type'] != 'missing')
    )
    high_qual = df[quality_mask]

    high_qual.to_csv(output_file, index=False)
    print(f"Exported {len(high_qual)} high-quality variants to {output_file}")

    return high_qual
|
|
|
|
# Usage
|
|
processed_df = export_processed_variants(
|
|
ds,
|
|
regions=["chr1:1000000-2000000"],
|
|
samples=ds.sample_names()[:50], # First 50 samples
|
|
output_file="high_quality_variants.csv"
|
|
)
|
|
```
|
|
|
|
## Streaming Export for Large Datasets
|
|
|
|
### Chunked Export
|
|
```python
|
|
def streaming_export(ds, regions, samples, output_file, chunk_size=100000):
    """Export query results region-by-region in fixed-size row chunks.

    Reading one region at a time and writing in bounded chunks keeps peak
    memory flat even for genome-wide exports.

    Args:
        ds: Open TileDB-VCF dataset (read mode).
        regions: Genomic region strings to export, processed sequentially.
        samples: Sample names to include.
        output_file: Destination CSV path.
        chunk_size: Maximum number of rows written per chunk.
    """
    import csv

    total_variants = 0

    with open(output_file, 'w', newline='') as out:
        csv_writer = None  # created lazily so the header is written exactly once

        for region in regions:
            print(f"Processing region: {region}")

            frame = ds.read(
                attrs=["sample_name", "contig", "pos_start", "alleles", "fmt_GT"],
                regions=[region],
                samples=samples,
            )

            if frame.empty:
                continue

            for start in range(0, len(frame), chunk_size):
                chunk = frame.iloc[start:start + chunk_size]

                if csv_writer is None:
                    csv_writer = csv.writer(out)
                    csv_writer.writerow(chunk.columns)

                csv_writer.writerows(row.values for _, row in chunk.iterrows())
                total_variants += len(chunk)

                # Progress only for intermediate chunks (the last one is
                # covered by the summary line below).
                if start + chunk_size < len(frame):
                    print(f"  Processed {start + chunk_size:,} variants...")

    print(f"Exported {total_variants:,} variants to {output_file}")
|
|
|
|
# Usage
|
|
regions = [f"chr{i}" for i in range(1, 23)] # All autosomes
|
|
streaming_export(ds, regions, ds.sample_names(), "genome_wide_variants.csv")
|
|
```
|
|
|
|
### Parallel Export
|
|
```python
|
|
import multiprocessing as mp
|
|
import os
|
|
|
|
def export_region_chunk(args):
    """Worker: export a single region to TSV (runs in a separate process).

    Args:
        args: Tuple of (dataset_uri, region, samples, output_dir) — packed
            into one argument so the function can be used with Pool.map.

    Returns:
        (region, output_file) so the parent can track what was produced.
    """
    dataset_uri, region, samples, output_dir = args

    # Each worker opens its own handle; dataset objects are not shareable
    # across process boundaries.
    ds = tiledbvcf.Dataset(uri=dataset_uri, mode="r")

    # "chr1:100-200" -> "chr1_100_200" for a filesystem-safe file name.
    safe_name = region.replace(":", "_").replace("-", "_")
    output_file = os.path.join(output_dir, f"variants_{safe_name}.tsv")

    ds.export_tsv(
        uri=output_file,
        regions=[region],
        samples=samples,
        tsv_fields=["CHR", "POS", "REF", "ALT", "S:GT", "S:DP"],
    )

    return region, output_file
|
|
|
|
def parallel_export(dataset_uri, regions, samples, output_dir, n_processes=4):
    """Export multiple regions concurrently, one worker process per region.

    Args:
        dataset_uri: URI of the TileDB-VCF dataset (each worker opens its own).
        regions: Genomic region strings; each becomes one output file.
        samples: Sample names to include in every export.
        output_dir: Directory for the per-region TSV files (created if absent).
        n_processes: Size of the worker pool.

    Returns:
        List of output file paths, one per region.
    """
    os.makedirs(output_dir, exist_ok=True)

    # One packed argument tuple per region, as Pool.map expects.
    work_items = [
        (dataset_uri, region, samples, output_dir) for region in regions
    ]

    with mp.Pool(n_processes) as pool:
        results = pool.map(export_region_chunk, work_items)

    output_files = [path for _, path in results]
    print(f"Exported {len(output_files)} region files to {output_dir}")

    return output_files
|
|
|
|
# Usage
|
|
regions = [f"chr{i}:1-50000000" for i in range(1, 23)] # First half of each chromosome
|
|
output_files = parallel_export(
|
|
dataset_uri="my_dataset",
|
|
regions=regions,
|
|
samples=ds.sample_names()[:100],
|
|
output_dir="parallel_export",
|
|
n_processes=8
|
|
)
|
|
```
|
|
|
|
## Integration with Analysis Tools
|
|
|
|
### PLINK Format Export
|
|
```python
|
|
def export_for_plink(ds, regions, samples, output_prefix):
    """Export genotypes as PLINK-style .raw / .map text files.

    Genotypes are recoded as alternate-allele counts (0/1/2); multi-allelic
    sites count only ALT allele index 1. Missing cells in the sample-by-
    variant matrix are coded -9 (PLINK's missing-data convention).

    Args:
        ds: Open TileDB-VCF dataset (read mode).
        regions: Genomic region strings to export.
        samples: Sample names to include.
        output_prefix: Path prefix for "<prefix>.raw" and "<prefix>.map".
    """
    import pandas as pd  # fix: original referenced pd without importing it

    # Query variant data.
    df = ds.read(
        attrs=["sample_name", "contig", "pos_start", "id", "alleles", "fmt_GT"],
        regions=regions,
        samples=samples,
    )

    # Build one record per (sample, variant) with the recoded genotype.
    plink_data = []
    for _, row in df.iterrows():
        gt = row['fmt_GT']
        # Skip non-diploid or partially missing calls.
        if not (isinstance(gt, list) and len(gt) == 2 and -1 not in gt):
            continue
        alleles = row['alleles']
        if len(alleles) < 2:
            continue  # monomorphic record: nothing to recode

        # Count copies of the first ALT allele (index 1).
        alt_count = sum(1 for allele in gt if allele == 1)

        plink_data.append({
            'sample': row['sample_name'],
            'chr': row['contig'],
            'pos': row['pos_start'],
            'id': row['id'] if row['id'] else f"{row['contig']}_{row['pos_start']}",
            'ref': alleles[0],
            'alt': alleles[1],
            'genotype': alt_count,
        })

    plink_df = pd.DataFrame(plink_data)

    # Samples as rows, variants as columns — PLINK .raw layout.
    plink_matrix = plink_df.pivot_table(
        index='sample',
        columns=['chr', 'pos', 'id'],
        values='genotype',
        fill_value=-9,  # PLINK missing-data code
    )
    plink_matrix.to_csv(f"{output_prefix}.raw", sep='\t')

    # .map columns: chromosome, variant id, genetic distance, position.
    # .copy() avoids pandas chained-assignment warnings on the slice.
    map_data = plink_df[['chr', 'id', 'pos']].drop_duplicates().copy()
    map_data['genetic_distance'] = 0  # placeholder; no genetic map available
    map_data = map_data[['chr', 'id', 'genetic_distance', 'pos']]
    map_data.to_csv(f"{output_prefix}.map", sep='\t', header=False, index=False)

    print(f"Exported PLINK files: {output_prefix}.raw, {output_prefix}.map")
|
|
|
|
# Usage
|
|
export_for_plink(
|
|
ds,
|
|
regions=["chr22"], # Start with smaller chromosome
|
|
samples=ds.sample_names()[:100],
|
|
output_prefix="plink_data"
|
|
)
|
|
```
|
|
|
|
### VEP Annotation Preparation
|
|
```python
|
|
def export_for_vep(ds, regions, output_file):
    """Write variants in VEP's default (tab-separated) input format.

    Produces one row per ALT allele with the columns
    chr, start, end, ref/alt, strand, id.

    Args:
        ds: Open TileDB-VCF dataset (read mode).
        regions: Genomic region strings to export.
        output_file: Destination path for the VEP input file.
    """
    import pandas as pd  # fix: original referenced pd without importing it

    # Query essential variant information.
    df = ds.read(
        attrs=["contig", "pos_start", "pos_end", "alleles", "id"],
        regions=regions,
    )

    vep_rows = []
    for _, row in df.iterrows():
        alleles = row['alleles']
        if len(alleles) < 2:
            continue  # no ALT allele to annotate
        ref = alleles[0]
        # Multi-allelic sites expand to one VEP row per ALT allele.
        for alt in alleles[1:]:
            vep_rows.append({
                'chr': row['contig'],
                'start': row['pos_start'],
                'end': row['pos_end'],
                'allele': f"{ref}/{alt}",
                'strand': '+',
                'id': row['id'] if row['id'] else '.',
            })

    vep_df = pd.DataFrame(vep_rows)

    # VEP's default format: tab-separated, no header, fixed column order.
    vep_df.to_csv(
        output_file,
        sep='\t',
        header=False,
        index=False,
        columns=['chr', 'start', 'end', 'allele', 'strand', 'id'],
    )

    print(f"Exported {len(vep_df)} variants for VEP annotation to {output_file}")
|
|
|
|
# Usage
|
|
export_for_vep(ds, ["chr1:1000000-2000000"], "variants_for_vep.txt")
|
|
```
|
|
|
|
## Cloud Export
|
|
|
|
### S3 Export
|
|
```python
|
|
def export_to_s3(ds, regions, samples, s3_bucket, s3_prefix):
    """Export each region as a BCF file directly to an S3 prefix.

    TileDB's virtual filesystem handles "s3://" URIs natively, so no
    explicit S3 client is required (the original's unused `import boto3`
    has been removed).

    Args:
        ds: Open TileDB-VCF dataset (read mode).
        regions: Genomic region strings; each becomes one BCF object.
        samples: Sample names to include in every export.
        s3_bucket: Destination bucket name.
        s3_prefix: Key prefix under which the BCF objects are written.
    """
    # S3 tuning options for TileDB's virtual filesystem.
    # NOTE(review): this config is built but never applied — a ReadConfig
    # takes effect only when passed to the Dataset at open time; confirm
    # the intended usage against the tiledbvcf API.
    config = tiledbvcf.ReadConfig(
        tiledb_config={
            "vfs.s3.region": "us-east-1",
            "vfs.s3.multipart_part_size": "50MB",
        }
    )

    for i, region in enumerate(regions):
        safe_name = region.replace(":", "_").replace("-", "_")
        s3_uri = f"s3://{s3_bucket}/{s3_prefix}/region_{safe_name}.bcf"

        print(f"Exporting region {i+1}/{len(regions)}: {region}")

        ds.export_bcf(
            uri=s3_uri,
            regions=[region],
            samples=samples,
        )

        print(f"Exported to {s3_uri}")
|
|
|
|
# Usage
|
|
export_to_s3(
|
|
ds,
|
|
regions=["chr1:1000000-2000000", "chr2:500000-1500000"],
|
|
samples=ds.sample_names()[:50],
|
|
s3_bucket="my-genomics-bucket",
|
|
s3_prefix="exported_variants"
|
|
)
|
|
```
|
|
|
|
## Export Validation
|
|
|
|
### Data Integrity Checks
|
|
```python
|
|
def validate_export(original_ds, export_file, regions, samples):
    """Check that an exported file holds the same record count as the source.

    Args:
        original_ds: Open TileDB-VCF dataset the export was produced from.
        export_file: Path to the exported .vcf.gz/.bcf/.tsv/.csv file.
        regions: Region strings used for the original export.
        samples: Sample names used for the original export.

    Returns:
        True when counts match; False on mismatch, unknown format, or error.
    """
    import pandas as pd  # fix: original referenced pd without importing it

    # Record count straight from the TileDB-VCF dataset.
    original_df = original_ds.read(
        attrs=["sample_name", "pos_start"],
        regions=regions,
        samples=samples,
    )
    original_count = len(original_df)

    try:
        if export_file.endswith('.vcf.gz') or export_file.endswith('.bcf'):
            # Import lazily so TSV/CSV validation works without pysam
            # installed (original imported it unconditionally).
            import pysam
            vcf = pysam.VariantFile(export_file)
            export_count = sum(1 for _ in vcf)
            vcf.close()
        elif export_file.endswith('.tsv') or export_file.endswith('.csv'):
            separator = '\t' if export_file.endswith('.tsv') else ','
            export_df = pd.read_csv(export_file, sep=separator)
            export_count = len(export_df)
        else:
            print(f"Unknown file format: {export_file}")
            return False

        if original_count == export_count:
            print(f"✓ Export validation passed: {export_count} variants")
            return True
        print(f"✗ Export validation failed: {original_count} original vs {export_count} exported")
        return False

    except Exception as e:
        # Broad catch is deliberate: validation is best-effort reporting,
        # never a hard failure for the caller.
        print(f"✗ Export validation error: {e}")
        return False
|
|
|
|
# Usage
|
|
success = validate_export(
|
|
ds,
|
|
"output.bcf",
|
|
regions=["chr1:1000000-2000000"],
|
|
samples=["SAMPLE_001", "SAMPLE_002"]
|
|
)
|
|
```
|
|
|
|
## Best Practices
|
|
|
|
### Efficient Export Strategies
|
|
```python
|
|
# 1. Optimize for intended use case
|
|
def choose_export_format(use_case, file_size_mb):
    """Pick an export format suited to the intended downstream use.

    Args:
        use_case: One of "downstream_analysis", "data_sharing",
            "statistical_analysis", or "database_import"; anything else
            falls back to plain VCF.
        file_size_mb: Approximate export size in MiB; only consulted for
            "downstream_analysis" (large exports get binary BCF).

    Returns:
        Format name as a string.
    """
    if use_case == "downstream_analysis":
        # Compressed binary BCF pays off once files exceed ~1 GB.
        return "BCF" if file_size_mb > 1000 else "VCF"

    fixed_formats = {
        "data_sharing": "VCF.gz",       # standard compressed interchange
        "statistical_analysis": "TSV",  # easy to load into R/pandas
        "database_import": "CSV",       # universally ingestible
    }
    return fixed_formats.get(use_case, "VCF")
|
|
|
|
# 2. Batch processing for large exports
|
|
def batch_export_by_size(ds, regions, samples, max_variants_per_file=1000000):
    """Group regions into batches of roughly bounded variant count and export.

    Each region's variant count is estimated by probing it with a small
    sample subset and scaling up to the full cohort; regions are then
    packed greedily into batches of at most *max_variants_per_file*
    (estimated) variants.

    Args:
        ds: Open TileDB-VCF dataset (read mode).
        regions: Genomic region strings to pack into batches.
        samples: Sample names included in every export.
        max_variants_per_file: Estimated variant budget per output file.
    """
    current_batch = []
    current_count = 0
    batch_num = 1

    # Probe with up to 10 samples and scale by the *actual* probe size.
    # Fix: the original divided by a hard-coded 10, which underestimates
    # (down to zero) for cohorts with fewer than 10 samples.
    probe_samples = samples[:10]
    probe_size = max(len(probe_samples), 1)

    for region in regions:
        test_df = ds.read(
            attrs=["pos_start"],
            regions=[region],
            samples=probe_samples,
        )
        estimated_variants = len(test_df) * len(samples) // probe_size

        if current_count + estimated_variants > max_variants_per_file and current_batch:
            # Adding this region would overflow the budget: flush the
            # current batch and start a new one with this region.
            export_batch(ds, current_batch, samples, f"batch_{batch_num}.bcf")
            batch_num += 1
            current_batch = [region]
            current_count = estimated_variants
        else:
            current_batch.append(region)
            current_count += estimated_variants

    # Export whatever is left in the final batch.
    if current_batch:
        export_batch(ds, current_batch, samples, f"batch_{batch_num}.bcf")
|
|
|
|
def export_batch(ds, regions, samples, output_file):
    """Export a group of regions to a single BCF file."""
    print(f"Exporting batch to {output_file}")
    ds.export_bcf(
        uri=output_file,
        regions=regions,
        samples=samples,
    )
|
|
```
|
|
|
|
This comprehensive export guide covers all aspects of getting data out of TileDB-VCF in various formats optimized for different downstream analysis workflows.