diff --git a/scientific-skills/tiledbvcf/references/export.md b/scientific-skills/tiledbvcf/references/export.md index 902855d..6ea3ffb 100644 --- a/scientific-skills/tiledbvcf/references/export.md +++ b/scientific-skills/tiledbvcf/references/export.md @@ -199,199 +199,10 @@ export_for_plink( ) ``` -### VEP Annotation Preparation -```python -def export_for_vep(ds, regions, output_file): - """Export variants for VEP (Variant Effect Predictor) annotation""" - # Query essential variant information - df = ds.read( - attrs=["contig", "pos_start", "pos_end", "alleles", "id"], - regions=regions - ) - # Prepare VEP input format - vep_data = [] - for _, row in df.iterrows(): - alleles = row['alleles'] - if len(alleles) >= 2: - ref = alleles[0] - for alt in alleles[1:]: # Can have multiple ALT alleles - vep_data.append({ - 'chr': row['contig'], - 'start': row['pos_start'], - 'end': row['pos_end'], - 'allele': f"{ref}/{alt}", - 'strand': '+', - 'id': row['id'] if row['id'] else '.' - }) - vep_df = pd.DataFrame(vep_data) - - # Save VEP input format - vep_df.to_csv( - output_file, - sep='\t', - header=False, - index=False, - columns=['chr', 'start', 'end', 'allele', 'strand', 'id'] - ) - - print(f"Exported {len(vep_df)} variants for VEP annotation to {output_file}") - -# Usage -export_for_vep(ds, ["chr1:1000000-2000000"], "variants_for_vep.txt") -``` - -## Cloud Export - -### S3 Export -```python -def export_to_s3(ds, regions, samples, s3_bucket, s3_prefix): - """Export data directly to S3""" - import boto3 - - # Configure for S3 - config = tiledbvcf.ReadConfig( - tiledb_config={ - "vfs.s3.region": "us-east-1", - "vfs.s3.multipart_part_size": "50MB" - } - ) - - # Export to S3 paths - for i, region in enumerate(regions): - region_safe = region.replace(":", "_").replace("-", "_") - s3_uri = f"s3://{s3_bucket}/{s3_prefix}/region_{region_safe}.bcf" - - print(f"Exporting region {i+1}/{len(regions)}: {region}") - - ds.export_bcf( - uri=s3_uri, - regions=[region], - samples=samples - ) - - print(f"Exported to {s3_uri}") - -# Usage -export_to_s3( - ds, - regions=["chr1:1000000-2000000", "chr2:500000-1500000"], - samples=ds.sample_names()[:50], - s3_bucket="my-genomics-bucket", - s3_prefix="exported_variants" -) -``` - -## Export Validation - -### Data Integrity Checks -```python -def validate_export(original_ds, export_file, regions, samples): - """Validate exported data against original dataset""" - import pysam - - # Count variants in original dataset - original_df = original_ds.read( - attrs=["sample_name", "pos_start"], - regions=regions, - samples=samples - ) - original_count = len(original_df) - - # Count variants in exported file - try: - if export_file.endswith('.vcf.gz') or export_file.endswith('.bcf'): - vcf = pysam.VariantFile(export_file) - export_count = sum(1 for _ in vcf) - vcf.close() - elif export_file.endswith('.tsv') or export_file.endswith('.csv'): - export_df = pd.read_csv(export_file, sep='\t' if export_file.endswith('.tsv') else ',') - export_count = len(export_df) - else: - print(f"Unknown file format: {export_file}") - return False - - # Compare counts - if original_count == export_count: - print(f"✓ Export validation passed: {export_count} variants") - return True - else: - print(f"✗ Export validation failed: {original_count} original vs {export_count} exported") - return False - - except Exception as e: - print(f"✗ Export validation error: {e}") - return False - -# Usage -success = validate_export( - ds, - "output.bcf", - regions=["chr1:1000000-2000000"], - samples=["SAMPLE_001", "SAMPLE_002"] -) -``` ## Best Practices -### Efficient Export Strategies -```python -# 1. Optimize for intended use case -def choose_export_format(use_case, file_size_mb): - """Choose optimal export format based on use case""" - if use_case == "downstream_analysis": - if file_size_mb > 1000: - return "BCF" # Compressed binary - else: - return "VCF" # Text format - - elif use_case == "data_sharing": - return "VCF.gz" # Standard compressed format - - elif use_case == "statistical_analysis": - return "TSV" # Easy to process - - elif use_case == "database_import": - return "CSV" # Universal format - - else: - return "VCF" # Default - -# 2. Batch processing for large exports -def batch_export_by_size(ds, regions, samples, max_variants_per_file=1000000): - """Export data in batches based on variant count""" - current_batch = [] - current_count = 0 - batch_num = 1 - - for region in regions: - # Estimate variant count (approximate) - test_df = ds.read( - attrs=["pos_start"], - regions=[region], - samples=samples[:10] # Small sample for estimation - ) - estimated_variants = len(test_df) * len(samples) // 10 - - if current_count + estimated_variants > max_variants_per_file and current_batch: - # Export current batch - export_batch(ds, current_batch, samples, f"batch_{batch_num}.bcf") - batch_num += 1 - current_batch = [region] - current_count = estimated_variants - else: - current_batch.append(region) - current_count += estimated_variants - - # Export final batch - if current_batch: - export_batch(ds, current_batch, samples, f"batch_{batch_num}.bcf") - -def export_batch(ds, regions, samples, output_file): - """Export a batch of regions""" - print(f"Exporting batch to {output_file}") - ds.export_bcf(uri=output_file, regions=regions, samples=samples) -``` This comprehensive export guide covers all aspects of getting data out of TileDB-VCF in various formats optimized for different downstream analysis workflows. \ No newline at end of file