Remove multiple advanced export sections

- Remove VEP annotation preparation section
- Remove Cloud Export (S3) section
- Remove Export Validation section
- Remove Efficient Export Strategies section
- Simplify export guide to focus on core export functionality
- Maintain essential VCF/BCF and TSV export examples
2026-03-27 07:09:27 +08:00 · 2026-02-24 11:17:41 -07:00
parent 518261c4f2
commit e3a7a85122
1 changed files with 0 additions and 189 deletions
--- a/scientific-skills/tiledbvcf/references/export.md
+++ b/scientific-skills/tiledbvcf/references/export.md
@@ -199,199 +199,10 @@ export_for_plink(
 )
 ```

-### VEP Annotation Preparation
-```python
-def export_for_vep(ds, regions, output_file):
-    """Export variants for VEP (Variant Effect Predictor) annotation"""
-    # Query essential variant information
-    df = ds.read(
-        attrs=["contig", "pos_start", "pos_end", "alleles", "id"],
-        regions=regions
-    )

-    # Prepare VEP input format
-    vep_data = []
-    for _, row in df.iterrows():
-        alleles = row['alleles']
-        if len(alleles) >= 2:
-            ref = alleles[0]
-            for alt in alleles[1:]:  # Can have multiple ALT alleles
-                vep_data.append({
-                    'chr': row['contig'],
-                    'start': row['pos_start'],
-                    'end': row['pos_end'],
-                    'allele': f"{ref}/{alt}",
-                    'strand': '+',
-                    'id': row['id'] if row['id'] else '.'
-                })

-    vep_df = pd.DataFrame(vep_data)
-
-    # Save VEP input format
-    vep_df.to_csv(
-        output_file,
-        sep='\t',
-        header=False,
-        index=False,
-        columns=['chr', 'start', 'end', 'allele', 'strand', 'id']
-    )
-
-    print(f"Exported {len(vep_df)} variants for VEP annotation to {output_file}")
-
-# Usage
-export_for_vep(ds, ["chr1:1000000-2000000"], "variants_for_vep.txt")
-```
-
-## Cloud Export
-
-### S3 Export
-```python
-def export_to_s3(ds, regions, samples, s3_bucket, s3_prefix):
-    """Export data directly to S3"""
-    import boto3
-
-    # Configure for S3
-    config = tiledbvcf.ReadConfig(
-        tiledb_config={
-            "vfs.s3.region": "us-east-1",
-            "vfs.s3.multipart_part_size": "50MB"
-        }
-    )
-
-    # Export to S3 paths
-    for i, region in enumerate(regions):
-        region_safe = region.replace(":", "_").replace("-", "_")
-        s3_uri = f"s3://{s3_bucket}/{s3_prefix}/region_{region_safe}.bcf"
-
-        print(f"Exporting region {i+1}/{len(regions)}: {region}")
-
-        ds.export_bcf(
-            uri=s3_uri,
-            regions=[region],
-            samples=samples
-        )
-
-        print(f"Exported to {s3_uri}")
-
-# Usage
-export_to_s3(
-    ds,
-    regions=["chr1:1000000-2000000", "chr2:500000-1500000"],
-    samples=ds.sample_names()[:50],
-    s3_bucket="my-genomics-bucket",
-    s3_prefix="exported_variants"
-)
-```
-
-## Export Validation
-
-### Data Integrity Checks
-```python
-def validate_export(original_ds, export_file, regions, samples):
-    """Validate exported data against original dataset"""
-    import pysam
-
-    # Count variants in original dataset
-    original_df = original_ds.read(
-        attrs=["sample_name", "pos_start"],
-        regions=regions,
-        samples=samples
-    )
-    original_count = len(original_df)
-
-    # Count variants in exported file
-    try:
-        if export_file.endswith('.vcf.gz') or export_file.endswith('.bcf'):
-            vcf = pysam.VariantFile(export_file)
-            export_count = sum(1 for _ in vcf)
-            vcf.close()
-        elif export_file.endswith('.tsv') or export_file.endswith('.csv'):
-            export_df = pd.read_csv(export_file, sep='\t' if export_file.endswith('.tsv') else ',')
-            export_count = len(export_df)
-        else:
-            print(f"Unknown file format: {export_file}")
-            return False
-
-        # Compare counts
-        if original_count == export_count:
-            print(f"✓ Export validation passed: {export_count} variants")
-            return True
-        else:
-            print(f"✗ Export validation failed: {original_count} original vs {export_count} exported")
-            return False
-
-    except Exception as e:
-        print(f"✗ Export validation error: {e}")
-        return False
-
-# Usage
-success = validate_export(
-    ds,
-    "output.bcf",
-    regions=["chr1:1000000-2000000"],
-    samples=["SAMPLE_001", "SAMPLE_002"]
-)
-```

 ## Best Practices

-### Efficient Export Strategies
-```python
-# 1. Optimize for intended use case
-def choose_export_format(use_case, file_size_mb):
-    """Choose optimal export format based on use case"""
-    if use_case == "downstream_analysis":
-        if file_size_mb > 1000:
-            return "BCF"  # Compressed binary
-        else:
-            return "VCF"  # Text format
-
-    elif use_case == "data_sharing":
-        return "VCF.gz"  # Standard compressed format
-
-    elif use_case == "statistical_analysis":
-        return "TSV"  # Easy to process
-
-    elif use_case == "database_import":
-        return "CSV"  # Universal format
-
-    else:
-        return "VCF"  # Default
-
-# 2. Batch processing for large exports
-def batch_export_by_size(ds, regions, samples, max_variants_per_file=1000000):
-    """Export data in batches based on variant count"""
-    current_batch = []
-    current_count = 0
-    batch_num = 1
-
-    for region in regions:
-        # Estimate variant count (approximate)
-        test_df = ds.read(
-            attrs=["pos_start"],
-            regions=[region],
-            samples=samples[:10]  # Small sample for estimation
-        )
-        estimated_variants = len(test_df) * len(samples) // 10
-
-        if current_count + estimated_variants > max_variants_per_file and current_batch:
-            # Export current batch
-            export_batch(ds, current_batch, samples, f"batch_{batch_num}.bcf")
-            batch_num += 1
-            current_batch = [region]
-            current_count = estimated_variants
-        else:
-            current_batch.append(region)
-            current_count += estimated_variants
-
-    # Export final batch
-    if current_batch:
-        export_batch(ds, current_batch, samples, f"batch_{batch_num}.bcf")
-
-def export_batch(ds, regions, samples, output_file):
-    """Export a batch of regions"""
-    print(f"Exporting batch to {output_file}")
-    ds.export_bcf(uri=output_file, regions=regions, samples=samples)
-```

 This comprehensive export guide covers all aspects of getting data out of TileDB-VCF in various formats optimized for different downstream analysis workflows.