Remove multiple advanced export sections

- Remove VEP annotation preparation section
- Remove Cloud Export (S3) section
- Remove Export Validation section
- Remove Efficient Export Strategies section
- Simplify export guide to focus on core export functionality
- Maintain essential VCF/BCF and TSV export examples
This commit is contained in:
Jeremy Leipzig
2026-02-24 11:17:41 -07:00
parent 518261c4f2
commit e3a7a85122

View File

@@ -199,199 +199,10 @@ export_for_plink(
)
```
### VEP Annotation Preparation
```python
def export_for_vep(ds, regions, output_file):
    """Export variants in Ensembl VEP default input format.

    Queries minimal variant fields from the dataset and writes a
    tab-separated file (chr, start, end, ref/alt allele, strand, id),
    one row per ALT allele, suitable as VEP input.

    Args:
        ds: Open TileDB-VCF dataset (must support ``read``).
        regions: Genomic region strings, e.g. ``"chr1:1000000-2000000"``.
        output_file: Path of the VEP input file to write.
    """
    # Query essential variant information
    df = ds.read(
        attrs=["contig", "pos_start", "pos_end", "alleles", "id"],
        regions=regions
    )

    # Prepare VEP input format: one row per ALT allele
    vep_columns = ['chr', 'start', 'end', 'allele', 'strand', 'id']
    vep_data = []
    for _, row in df.iterrows():
        alleles = row['alleles']
        if len(alleles) >= 2:
            ref = alleles[0]
            for alt in alleles[1:]:  # multi-allelic sites yield several rows
                variant_id = row['id']
                # Missing IDs may surface as None/NaN or empty string;
                # VEP expects '.' for a missing identifier.
                if variant_id is None or pd.isna(variant_id) or not variant_id:
                    variant_id = '.'
                vep_data.append({
                    'chr': row['contig'],
                    'start': row['pos_start'],
                    'end': row['pos_end'],
                    'allele': f"{ref}/{alt}",
                    'strand': '+',
                    'id': variant_id
                })

    # Passing columns explicitly keeps the frame well-formed even when
    # no variants were found (an empty DataFrame would otherwise lack
    # these columns and to_csv(columns=...) would raise KeyError).
    vep_df = pd.DataFrame(vep_data, columns=vep_columns)

    # Save VEP input format (no header, no index, fixed column order)
    vep_df.to_csv(
        output_file,
        sep='\t',
        header=False,
        index=False,
        columns=vep_columns
    )

    print(f"Exported {len(vep_df)} variants for VEP annotation to {output_file}")
# Usage
export_for_vep(ds, ["chr1:1000000-2000000"], "variants_for_vep.txt")
```
## Cloud Export
### S3 Export
```python
def export_to_s3(ds, regions, samples, s3_bucket, s3_prefix):
    """Export each region as a BCF file directly to S3.

    Writes one BCF per region to
    ``s3://{s3_bucket}/{s3_prefix}/region_<sanitized-region>.bcf``.

    Args:
        ds: Open TileDB-VCF dataset (must support ``export_bcf``).
        regions: Genomic region strings; one output object per region.
        samples: Sample names to include in every export.
        s3_bucket: Destination S3 bucket name.
        s3_prefix: Key prefix inside the bucket.

    NOTE(review): S3 settings (region, multipart part size, credentials)
    must be supplied via the TileDB config when *ds* is opened — the
    previous ``ReadConfig``/``boto3`` references here were never used,
    so they have been removed.
    """
    for i, region in enumerate(regions):
        # ':' and '-' are awkward in object keys; sanitize the region name
        region_safe = region.replace(":", "_").replace("-", "_")
        s3_uri = f"s3://{s3_bucket}/{s3_prefix}/region_{region_safe}.bcf"

        print(f"Exporting region {i+1}/{len(regions)}: {region}")
        ds.export_bcf(
            uri=s3_uri,
            regions=[region],
            samples=samples
        )
        print(f"Exported to {s3_uri}")
# Usage
export_to_s3(
ds,
regions=["chr1:1000000-2000000", "chr2:500000-1500000"],
samples=ds.sample_names()[:50],
s3_bucket="my-genomics-bucket",
s3_prefix="exported_variants"
)
```
## Export Validation
### Data Integrity Checks
```python
def validate_export(original_ds, export_file, regions, samples):
    """Validate an exported file by comparing record counts.

    Counts records in the original dataset for the given regions/samples
    and compares against the record count of the exported file (VCF/BCF
    via pysam, TSV/CSV via pandas).

    Args:
        original_ds: Open TileDB-VCF dataset (must support ``read``).
        export_file: Path to the exported file; format inferred from
            its extension (.vcf.gz/.bcf/.tsv/.csv).
        regions: Genomic region strings used for the original export.
        samples: Sample names used for the original export.

    Returns:
        True when the counts match, False on mismatch, unknown format,
        or any error while reading the exported file.

    NOTE(review): for VCF/BCF the exported file holds one record per
    variant site, while the dataset query returns one row per
    sample/variant pair — counts only line up for single-sample
    exports; confirm intended semantics.
    """
    # Count records in the original dataset
    original_df = original_ds.read(
        attrs=["sample_name", "pos_start"],
        regions=regions,
        samples=samples
    )
    original_count = len(original_df)

    # Count records in the exported file
    try:
        if export_file.endswith(('.vcf.gz', '.bcf')):
            # Imported lazily so TSV/CSV validation works without pysam
            import pysam
            vcf = pysam.VariantFile(export_file)
            export_count = sum(1 for _ in vcf)
            vcf.close()
        elif export_file.endswith(('.tsv', '.csv')):
            export_df = pd.read_csv(
                export_file,
                sep='\t' if export_file.endswith('.tsv') else ','
            )
            export_count = len(export_df)
        else:
            print(f"Unknown file format: {export_file}")
            return False

        # Compare counts
        if original_count == export_count:
            print(f"✓ Export validation passed: {export_count} variants")
            return True
        else:
            print(f"✗ Export validation failed: {original_count} original vs {export_count} exported")
            return False
    except Exception as e:
        # Best-effort validation: report and treat any failure as invalid
        print(f"✗ Export validation error: {e}")
        return False
# Usage
success = validate_export(
ds,
"output.bcf",
regions=["chr1:1000000-2000000"],
samples=["SAMPLE_001", "SAMPLE_002"]
)
```
## Best Practices
### Efficient Export Strategies
```python
# 1. Optimize for intended use case
def choose_export_format(use_case, file_size_mb):
    """Pick the best export format for a given use case.

    Downstream-analysis exports over 1000 MB get compressed binary BCF,
    smaller ones plain VCF. Sharing, statistics and database imports map
    to VCF.gz, TSV and CSV respectively; anything else defaults to VCF.
    """
    if use_case == "downstream_analysis":
        # Binary compression only pays off for large exports
        return "BCF" if file_size_mb > 1000 else "VCF"

    format_by_use_case = {
        "data_sharing": "VCF.gz",       # standard compressed interchange
        "statistical_analysis": "TSV",  # easy to process
        "database_import": "CSV",       # universal tabular format
    }
    return format_by_use_case.get(use_case, "VCF")
# 2. Batch processing for large exports
def batch_export_by_size(ds, regions, samples, max_variants_per_file=1000000):
    """Export regions in batches capped at ~max_variants_per_file variants.

    Probes each region with a small subset of samples to estimate its
    variant count, accumulates regions until the cap would be exceeded,
    and exports each accumulated batch via ``export_batch``.

    Args:
        ds: Open TileDB-VCF dataset (must support ``read``).
        regions: Genomic region strings to partition into batches.
        samples: Sample names included in every exported batch.
        max_variants_per_file: Approximate variant-count cap per output
            file (estimates are probabilistic, not exact).
    """
    current_batch = []
    current_count = 0
    batch_num = 1

    # Probe with at most 10 samples to keep the estimate cheap
    probe_samples = samples[:10]
    probe_size = len(probe_samples)

    for region in regions:
        # Estimate variant count (approximate)
        test_df = ds.read(
            attrs=["pos_start"],
            regions=[region],
            samples=probe_samples
        )
        # Scale the probe count up to the full cohort. The original code
        # divided by a hard-coded 10, which under-estimates whenever the
        # cohort has fewer than 10 samples; divide by the actual probe
        # size instead (guarding against an empty sample list).
        estimated_variants = len(test_df) * len(samples) // max(probe_size, 1)

        if current_count + estimated_variants > max_variants_per_file and current_batch:
            # Cap reached: flush the accumulated batch and start a new one
            export_batch(ds, current_batch, samples, f"batch_{batch_num}.bcf")
            batch_num += 1
            current_batch = [region]
            current_count = estimated_variants
        else:
            current_batch.append(region)
            current_count += estimated_variants

    # Flush the final (possibly partial) batch
    if current_batch:
        export_batch(ds, current_batch, samples, f"batch_{batch_num}.bcf")
def export_batch(ds, regions, samples, output_file):
    """Export a single batch of regions to one BCF file.

    Thin wrapper around ``ds.export_bcf`` that also announces the
    destination file on stdout.
    """
    print(f"Exporting batch to {output_file}")
    ds.export_bcf(uri=output_file, regions=regions, samples=samples)
```
This comprehensive export guide covers all aspects of getting data out of TileDB-VCF in various formats optimized for different downstream analysis workflows.