Remove multiple advanced export sections

- Remove VEP annotation preparation section
- Remove Cloud Export (S3) section
- Remove Export Validation section
- Remove Efficient Export Strategies section
- Simplify export guide to focus on core export functionality
- Maintain essential VCF/BCF and TSV export examples
This commit is contained in:
Jeremy Leipzig
2026-02-24 11:17:41 -07:00
parent 518261c4f2
commit e3a7a85122

View File

@@ -199,199 +199,10 @@ export_for_plink(
)
```
### VEP Annotation Preparation
```python
def export_for_vep(ds, regions, output_file):
    """Export variants in Ensembl VEP default input format.

    Queries minimal variant fields from the dataset and writes a
    tab-separated file (chr, start, end, ref/alt allele, strand, id),
    one row per ALT allele, suitable as VEP input.

    Args:
        ds: Open TileDB-VCF dataset (must support ``read``).
        regions: Genomic region strings, e.g. ``"chr1:1000000-2000000"``.
        output_file: Path of the VEP input file to write.
    """
    # Query essential variant information
    df = ds.read(
        attrs=["contig", "pos_start", "pos_end", "alleles", "id"],
        regions=regions
    )

    # Prepare VEP input format: one row per ALT allele
    vep_columns = ['chr', 'start', 'end', 'allele', 'strand', 'id']
    vep_data = []
    for _, row in df.iterrows():
        alleles = row['alleles']
        if len(alleles) >= 2:
            ref = alleles[0]
            for alt in alleles[1:]:  # multi-allelic sites yield several rows
                variant_id = row['id']
                # Missing IDs may surface as None/NaN or empty string;
                # VEP expects '.' for a missing identifier.
                if variant_id is None or pd.isna(variant_id) or not variant_id:
                    variant_id = '.'
                vep_data.append({
                    'chr': row['contig'],
                    'start': row['pos_start'],
                    'end': row['pos_end'],
                    'allele': f"{ref}/{alt}",
                    'strand': '+',
                    'id': variant_id
                })

    # Passing columns explicitly keeps the frame well-formed even when
    # no variants were found (an empty DataFrame would otherwise lack
    # these columns and to_csv(columns=...) would raise KeyError).
    vep_df = pd.DataFrame(vep_data, columns=vep_columns)

    # Save VEP input format (no header, no index, fixed column order)
    vep_df.to_csv(
        output_file,
        sep='\t',
        header=False,
        index=False,
        columns=vep_columns
    )

    print(f"Exported {len(vep_df)} variants for VEP annotation to {output_file}")
# Usage
export_for_vep(ds, ["chr1:1000000-2000000"], "variants_for_vep.txt")
```
## Cloud Export
### S3 Export
```python
def export_to_s3(ds, regions, samples, s3_bucket, s3_prefix):
    """Export each region as a BCF file directly to S3.

    Writes one BCF per region to
    ``s3://{s3_bucket}/{s3_prefix}/region_<sanitized-region>.bcf``.

    Args:
        ds: Open TileDB-VCF dataset (must support ``export_bcf``).
        regions: Genomic region strings; one output object per region.
        samples: Sample names to include in every export.
        s3_bucket: Destination S3 bucket name.
        s3_prefix: Key prefix inside the bucket.

    NOTE(review): S3 settings (region, multipart part size, credentials)
    must be supplied via the TileDB config when *ds* is opened — the
    previous ``ReadConfig``/``boto3`` references here were never used,
    so they have been removed.
    """
    for i, region in enumerate(regions):
        # ':' and '-' are awkward in object keys; sanitize the region name
        region_safe = region.replace(":", "_").replace("-", "_")
        s3_uri = f"s3://{s3_bucket}/{s3_prefix}/region_{region_safe}.bcf"

        print(f"Exporting region {i+1}/{len(regions)}: {region}")
        ds.export_bcf(
            uri=s3_uri,
            regions=[region],
            samples=samples
        )
        print(f"Exported to {s3_uri}")
# Usage
export_to_s3(
ds,
regions=["chr1:1000000-2000000", "chr2:500000-1500000"],
samples=ds.sample_names()[:50],
s3_bucket="my-genomics-bucket",
s3_prefix="exported_variants"
)
```
## Export Validation
### Data Integrity Checks
```python
def validate_export(original_ds, export_file, regions, samples):
    """Validate an exported file by comparing record counts.

    Counts records in the original dataset for the given regions/samples
    and compares against the record count of the exported file (VCF/BCF
    via pysam, TSV/CSV via pandas).

    Args:
        original_ds: Open TileDB-VCF dataset (must support ``read``).
        export_file: Path to the exported file; format inferred from
            its extension (.vcf.gz/.bcf/.tsv/.csv).
        regions: Genomic region strings used for the original export.
        samples: Sample names used for the original export.

    Returns:
        True when the counts match, False on mismatch, unknown format,
        or any error while reading the exported file.

    NOTE(review): for VCF/BCF the exported file holds one record per
    variant site, while the dataset query returns one row per
    sample/variant pair — counts only line up for single-sample
    exports; confirm intended semantics.
    """
    # Count records in the original dataset
    original_df = original_ds.read(
        attrs=["sample_name", "pos_start"],
        regions=regions,
        samples=samples
    )
    original_count = len(original_df)

    # Count records in the exported file
    try:
        if export_file.endswith(('.vcf.gz', '.bcf')):
            # Imported lazily so TSV/CSV validation works without pysam
            import pysam
            vcf = pysam.VariantFile(export_file)
            export_count = sum(1 for _ in vcf)
            vcf.close()
        elif export_file.endswith(('.tsv', '.csv')):
            export_df = pd.read_csv(
                export_file,
                sep='\t' if export_file.endswith('.tsv') else ','
            )
            export_count = len(export_df)
        else:
            print(f"Unknown file format: {export_file}")
            return False

        # Compare counts
        if original_count == export_count:
            print(f"✓ Export validation passed: {export_count} variants")
            return True
        else:
            print(f"✗ Export validation failed: {original_count} original vs {export_count} exported")
            return False
    except Exception as e:
        # Best-effort validation: report and treat any failure as invalid
        print(f"✗ Export validation error: {e}")
        return False
# Usage
success = validate_export(
ds,
"output.bcf",
regions=["chr1:1000000-2000000"],
samples=["SAMPLE_001", "SAMPLE_002"]
)
```
## Best Practices
### Efficient Export Strategies
```python
# 1. Optimize for intended use case
def choose_export_format(use_case, file_size_mb):
    """Pick the best export format for a given use case.

    Downstream-analysis exports over 1000 MB get compressed binary BCF,
    smaller ones plain VCF. Sharing, statistics and database imports map
    to VCF.gz, TSV and CSV respectively; anything else defaults to VCF.
    """
    if use_case == "downstream_analysis":
        # Binary compression only pays off for large exports
        return "BCF" if file_size_mb > 1000 else "VCF"

    format_by_use_case = {
        "data_sharing": "VCF.gz",       # standard compressed interchange
        "statistical_analysis": "TSV",  # easy to process
        "database_import": "CSV",       # universal tabular format
    }
    return format_by_use_case.get(use_case, "VCF")
# 2. Batch processing for large exports
def batch_export_by_size(ds, regions, samples, max_variants_per_file=1000000):
    """Export regions in batches capped at ~max_variants_per_file variants.

    Probes each region with a small subset of samples to estimate its
    variant count, accumulates regions until the cap would be exceeded,
    and exports each accumulated batch via ``export_batch``.

    Args:
        ds: Open TileDB-VCF dataset (must support ``read``).
        regions: Genomic region strings to partition into batches.
        samples: Sample names included in every exported batch.
        max_variants_per_file: Approximate variant-count cap per output
            file (estimates are probabilistic, not exact).
    """
    current_batch = []
    current_count = 0
    batch_num = 1

    # Probe with at most 10 samples to keep the estimate cheap
    probe_samples = samples[:10]
    probe_size = len(probe_samples)

    for region in regions:
        # Estimate variant count (approximate)
        test_df = ds.read(
            attrs=["pos_start"],
            regions=[region],
            samples=probe_samples
        )
        # Scale the probe count up to the full cohort. The original code
        # divided by a hard-coded 10, which under-estimates whenever the
        # cohort has fewer than 10 samples; divide by the actual probe
        # size instead (guarding against an empty sample list).
        estimated_variants = len(test_df) * len(samples) // max(probe_size, 1)

        if current_count + estimated_variants > max_variants_per_file and current_batch:
            # Cap reached: flush the accumulated batch and start a new one
            export_batch(ds, current_batch, samples, f"batch_{batch_num}.bcf")
            batch_num += 1
            current_batch = [region]
            current_count = estimated_variants
        else:
            current_batch.append(region)
            current_count += estimated_variants

    # Flush the final (possibly partial) batch
    if current_batch:
        export_batch(ds, current_batch, samples, f"batch_{batch_num}.bcf")
def export_batch(ds, regions, samples, output_file):
    """Export a single batch of regions to one BCF file.

    Thin wrapper around ``ds.export_bcf`` that also announces the
    destination file on stdout.
    """
    print(f"Exporting batch to {output_file}")
    ds.export_bcf(uri=output_file, regions=regions, samples=samples)
```
This comprehensive export guide covers all aspects of getting data out of TileDB-VCF in various formats optimized for different downstream analysis workflows.