Mirror of https://github.com/K-Dense-AI/claude-scientific-skills.git (synced 2026-03-27 07:09:27 +08:00).

Commit: Remove references/export.md and references/ingestion.md

- Delete detailed export and ingestion reference documentation
- Update main skill to remove references to deleted files
- Simplify skill to focus on core querying and population genomics
- Keep querying.md and population_genomics.md reference files
This commit is contained in:
@@ -339,12 +339,8 @@ config = tiledbvcf.ReadConfig(

Detailed documentation for each major capability:

- **ingestion.md** - Complete guide to dataset creation and VCF/BCF ingestion, including parallel processing, memory optimization, and error handling
- **querying.md** - Complete guide to efficient variant queries, including region specification, attribute selection, filtering strategies, and performance optimization
- **export.md** - Complete guide to data export in various formats, including VCF/BCF export, TSV generation, and integration with downstream analysis tools
- **population_genomics.md** - Practical examples of population genomics workflows, including GWAS preparation, quality control, allele frequency analysis, and integration with analysis tools

## Getting Help
@@ -353,9 +349,7 @@ Detailed documentation for each major capability:

For detailed information on specific operations, refer to the appropriate reference document:

- Creating datasets or ingesting VCF files → `ingestion.md`
- Querying variant data efficiently → `querying.md`
- Exporting data or integrating with other tools → `export.md`
- Population genomics workflows → `population_genomics.md`

**Open Source Documentation:**
@@ -1,208 +0,0 @@
# TileDB-VCF Export Guide

Complete guide to exporting data from TileDB-VCF datasets in various formats for downstream analysis and integration with other genomics tools.

## VCF/BCF Export

### Basic VCF Export
```python
import tiledbvcf

# Open dataset for reading
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r")

# Export specific regions as VCF
ds.export_vcf(
    uri="output.vcf.gz",
    regions=["chr1:1000000-2000000"],
    samples=["SAMPLE_001", "SAMPLE_002", "SAMPLE_003"]
)
```
### BCF Export (Binary VCF)
```python
# Export as compressed BCF for faster processing
ds.export_bcf(
    uri="output.bcf",
    regions=["chr1:1000000-2000000", "chr2:500000-1500000"],
    samples=["SAMPLE_001", "SAMPLE_002"]
)
```
### Large-Scale Export
```python
# Export entire chromosomes efficiently
def export_chromosome(ds, chrom, output_dir, samples=None):
    """Export full chromosome data"""
    output_path = f"{output_dir}/chr{chrom}.bcf"

    print(f"Exporting chromosome {chrom}")
    ds.export_bcf(
        uri=output_path,
        regions=[f"chr{chrom}"],
        samples=samples
    )
    print(f"Exported to {output_path}")


# Export all autosomes
for chrom in range(1, 23):
    export_chromosome(ds, chrom, "exported_data")
```
## TSV Export

### Basic TSV Export
```python
# Export as tab-separated values
ds.export_tsv(
    uri="variants.tsv",
    regions=["chr1:1000000-2000000"],
    samples=["SAMPLE_001", "SAMPLE_002"],
    tsv_fields=["CHR", "POS", "REF", "ALT", "S:GT", "S:DP"]
)
```
## Pandas DataFrame Export

### Query to DataFrame
```python
# Export query results as pandas DataFrame for analysis
df = ds.read(
    attrs=["sample_name", "contig", "pos_start", "alleles", "fmt_GT", "info_AF"],
    regions=["chr1:1000000-2000000"],
    samples=["SAMPLE_001", "SAMPLE_002", "SAMPLE_003"]
)

# Save DataFrame to various formats
df.to_csv("variants.csv", index=False)
df.to_parquet("variants.parquet")
df.to_pickle("variants.pkl")
```
### Processed Data Export
```python
def export_processed_variants(ds, regions, samples, output_file):
    """Export processed variant data with calculated metrics"""
    # Query raw data
    df = ds.read(
        attrs=["sample_name", "contig", "pos_start", "pos_end",
               "alleles", "fmt_GT", "fmt_DP", "fmt_GQ", "info_AF"],
        regions=regions,
        samples=samples
    )

    # Add calculated columns
    df['variant_id'] = df['contig'] + ':' + df['pos_start'].astype(str)

    # Parse genotypes
    def parse_genotype(gt):
        if isinstance(gt, list) and len(gt) == 2:
            if -1 in gt:
                return "missing"
            elif gt[0] == gt[1]:
                return "homozygous"
            else:
                return "heterozygous"
        return "unknown"

    df['genotype_type'] = df['fmt_GT'].apply(parse_genotype)

    # Filter high-quality variants
    high_qual = df[
        (df['fmt_DP'] >= 10) &
        (df['fmt_GQ'] >= 20) &
        (df['genotype_type'] != 'missing')
    ]

    # Export processed data
    high_qual.to_csv(output_file, index=False)
    print(f"Exported {len(high_qual)} high-quality variants to {output_file}")

    return high_qual


# Usage
processed_df = export_processed_variants(
    ds,
    regions=["chr1:1000000-2000000"],
    samples=ds.sample_names()[:50],  # First 50 samples
    output_file="high_quality_variants.csv"
)
```
## Integration with Analysis Tools

### PLINK Format Export
```python
def export_for_plink(ds, regions, samples, output_prefix):
    """Export data in format suitable for PLINK analysis"""
    # Query variant data
    df = ds.read(
        attrs=["sample_name", "contig", "pos_start", "id", "alleles", "fmt_GT"],
        regions=regions,
        samples=samples
    )

    # Prepare PLINK-compatible data
    plink_data = []
    for _, row in df.iterrows():
        gt = row['fmt_GT']
        if isinstance(gt, list) and len(gt) == 2 and -1 not in gt:
            # Convert genotype to PLINK format (0/1/2)
            alleles = row['alleles']
            if len(alleles) >= 2:
                ref_allele = alleles[0]
                alt_allele = alleles[1]

                # Count alternative alleles
                alt_count = sum(1 for allele in gt if allele == 1)

                plink_data.append({
                    'sample': row['sample_name'],
                    'chr': row['contig'],
                    'pos': row['pos_start'],
                    'id': row['id'] if row['id'] else f"{row['contig']}_{row['pos_start']}",
                    'ref': ref_allele,
                    'alt': alt_allele,
                    'genotype': alt_count
                })

    # Save as PLINK-compatible format
    plink_df = pd.DataFrame(plink_data)

    # Pivot for PLINK .raw format
    plink_matrix = plink_df.pivot_table(
        index='sample',
        columns=['chr', 'pos', 'id'],
        values='genotype',
        fill_value=-9  # Missing data code
    )

    # Save files
    plink_matrix.to_csv(f"{output_prefix}.raw", sep='\t')

    # Create map file
    map_data = plink_df[['chr', 'id', 'pos']].drop_duplicates()
    map_data['genetic_distance'] = 0  # Placeholder
    map_data = map_data[['chr', 'id', 'genetic_distance', 'pos']]
    map_data.to_csv(f"{output_prefix}.map", sep='\t', header=False, index=False)

    print(f"Exported PLINK files: {output_prefix}.raw, {output_prefix}.map")


# Usage
export_for_plink(
    ds,
    regions=["chr22"],  # Start with smaller chromosome
    samples=ds.sample_names()[:100],
    output_prefix="plink_data"
)
```
## Best Practices

This comprehensive export guide covers all aspects of getting data out of TileDB-VCF in various formats optimized for different downstream analysis workflows.
@@ -1,431 +0,0 @@
# TileDB-VCF Ingestion Guide

Complete guide to creating TileDB-VCF datasets and ingesting VCF/BCF files with optimal performance and reliability.

## Important Requirements

**Before ingesting VCF files, ensure they meet these requirements:**

- **Single-sample VCFs only**: Multi-sample VCFs are not supported by TileDB-VCF
- **Index files required**: All VCF/BCF files must have corresponding index files:
  - `.csi` files (created with `bcftools index`)
  - `.tbi` files (created with `tabix`)

```bash
# Create indexes if they don't exist
bcftools index sample.vcf.gz  # Creates sample.vcf.gz.csi
# OR
tabix -p vcf sample.vcf.gz    # Creates sample.vcf.gz.tbi
```
## Dataset Creation

### Basic Dataset Creation
```python
import tiledbvcf

# Create a new dataset
ds = tiledbvcf.Dataset(uri="my_dataset", mode="w")
```

### Advanced Configuration
```python
# Custom configuration for large datasets
config = tiledbvcf.ReadConfig(
    memory_budget=4096,  # MB
    tiledb_config={
        "sm.tile_cache_size": "2000000000",  # 2GB tile cache
        "sm.mem.total_budget": "4000000000",  # 4GB total memory
        "vfs.file.posix_file_permissions": "644"
    }
)

ds = tiledbvcf.Dataset(
    uri="large_dataset",
    mode="w",
    cfg=config
)
```
### Cloud Dataset Creation
```python
# S3 dataset with credentials
config = tiledbvcf.ReadConfig(
    tiledb_config={
        "vfs.s3.aws_access_key_id": "YOUR_KEY",
        "vfs.s3.aws_secret_access_key": "YOUR_SECRET",
        "vfs.s3.region": "us-east-1"
    }
)

ds = tiledbvcf.Dataset(
    uri="s3://my-bucket/vcf-dataset",
    mode="w",
    cfg=config
)
```
## Single Sample Ingestion

### Basic Ingestion
```python
# Ingest a single VCF file
ds.ingest_samples(["sample1.vcf.gz"])

# Multiple files at once
ds.ingest_samples([
    "sample1.vcf.gz",
    "sample2.vcf.gz",
    "sample3.vcf.gz"
])
```

### Custom Sample Names
```python
# Override sample names from VCF headers
ds.ingest_samples(
    ["data/unknown_sample.vcf.gz"],
    sample_names=["SAMPLE_001"]
)
```
### Ingestion with Validation
```python
# Enable additional validation during ingestion
try:
    ds.ingest_samples(
        ["sample1.vcf.gz"],
        contig_fragment_merging=True,  # Merge fragments on same contig
        resume=False  # Start fresh (don't resume)
    )
except Exception as e:
    print(f"Ingestion failed: {e}")
```
## Parallel Ingestion

### Multi-threaded Ingestion
```python
# Configure for parallel ingestion
config = tiledbvcf.ReadConfig(
    tiledb_config={
        "sm.num_async_threads": "8",
        "sm.num_reader_threads": "4",
        "sm.num_writer_threads": "4"
    }
)

ds = tiledbvcf.Dataset(uri="dataset", mode="w", cfg=config)

# Ingest multiple files in parallel
file_list = [f"sample_{i}.vcf.gz" for i in range(1, 101)]
ds.ingest_samples(file_list)
```
### Batched Processing
```python
# Process files in batches to manage memory
import glob

vcf_files = glob.glob("*.vcf.gz")
batch_size = 10

for i in range(0, len(vcf_files), batch_size):
    batch = vcf_files[i:i+batch_size]
    print(f"Processing batch {i//batch_size + 1}: {len(batch)} files")

    try:
        ds.ingest_samples(batch)
        print(f"Successfully ingested batch {i//batch_size + 1}")
    except Exception as e:
        print(f"Error in batch {i//batch_size + 1}: {e}")
        # Continue with next batch
        continue
```
## Incremental Addition

### Adding New Samples
```python
# Open existing dataset and add new samples
ds = tiledbvcf.Dataset(uri="existing_dataset", mode="a")  # append mode

# Add new samples without affecting existing data
ds.ingest_samples(["new_sample1.vcf.gz", "new_sample2.vcf.gz"])
```

### Resuming Interrupted Ingestion
```python
# Resume a previously interrupted ingestion
ds.ingest_samples(
    ["large_sample.vcf.gz"],
    resume=True  # Continue from where it left off
)
```
## Memory Optimization

### Memory Budget Configuration
```python
# Configure memory usage based on system resources
import psutil

# Use 75% of available memory
available_memory = psutil.virtual_memory().available
memory_budget_mb = int((available_memory * 0.75) / (1024 * 1024))

config = tiledbvcf.ReadConfig(
    memory_budget=memory_budget_mb,
    tiledb_config={
        "sm.mem.total_budget": str(int(available_memory * 0.75))
    }
)
```
### Large File Handling
```python
# For very large VCF files (>10GB), use streaming ingestion
config = tiledbvcf.ReadConfig(
    memory_budget=2048,  # Conservative memory usage
    tiledb_config={
        "sm.tile_cache_size": "500000000",  # 500MB cache
        "sm.consolidation.buffer_size": "100000000"  # 100MB buffer
    }
)

# Process large files one at a time
large_files = ["huge_sample1.vcf.gz", "huge_sample2.vcf.gz"]
for vcf_file in large_files:
    print(f"Processing {vcf_file}")
    ds.ingest_samples([vcf_file])
    print(f"Completed {vcf_file}")
```
## Error Handling and Validation

### Comprehensive Error Handling
```python
import logging
import os

logging.basicConfig(level=logging.INFO)


def robust_ingestion(dataset_uri, vcf_files):
    config = tiledbvcf.ReadConfig(memory_budget=2048)

    with tiledbvcf.Dataset(uri=dataset_uri, mode="w", cfg=config) as ds:
        failed_files = []

        for vcf_file in vcf_files:
            try:
                # Validate file exists and is readable
                if not os.path.exists(vcf_file):
                    logging.error(f"File not found: {vcf_file}")
                    failed_files.append(vcf_file)
                    continue

                logging.info(f"Ingesting {vcf_file}")
                ds.ingest_samples([vcf_file])
                logging.info(f"Successfully ingested {vcf_file}")

            except Exception as e:
                logging.error(f"Failed to ingest {vcf_file}: {e}")
                failed_files.append(vcf_file)
                continue

        if failed_files:
            logging.warning(f"Failed to ingest {len(failed_files)} files: {failed_files}")

        return failed_files
```
### Pre-ingestion Validation
```python
import pysam


def validate_vcf_files(vcf_files):
    """Validate VCF files before ingestion"""
    valid_files = []
    invalid_files = []

    for vcf_file in vcf_files:
        try:
            # Basic validation using pysam
            vcf = pysam.VariantFile(vcf_file)

            # Check if file has variants
            try:
                next(iter(vcf))
                valid_files.append(vcf_file)
                print(f"✓ {vcf_file}: Valid")
            except StopIteration:
                print(f"⚠ {vcf_file}: No variants found")
                valid_files.append(vcf_file)  # Empty files are valid

            vcf.close()

        except Exception as e:
            print(f"✗ {vcf_file}: Invalid - {e}")
            invalid_files.append(vcf_file)

    return valid_files, invalid_files


# Use validation before ingestion
vcf_files = ["sample1.vcf.gz", "sample2.vcf.gz"]
valid_files, invalid_files = validate_vcf_files(vcf_files)

if valid_files:
    ds.ingest_samples(valid_files)
```
## Performance Optimization

### I/O Optimization
```python
# Optimize for different storage types
def get_optimized_config(storage_type="local"):
    base_config = {
        "sm.mem.total_budget": "4000000000",
        "sm.tile_cache_size": "1000000000"
    }

    if storage_type == "local":
        # Optimize for local SSD storage
        base_config.update({
            "sm.num_async_threads": "8",
            "vfs.file.enable_filelocks": "true"
        })
    elif storage_type == "s3":
        # Optimize for S3 storage
        base_config.update({
            "vfs.s3.multipart_part_size": "50MB",
            "vfs.s3.max_parallel_ops": "8",
            "vfs.s3.use_multipart_upload": "true"
        })
    elif storage_type == "azure":
        # Optimize for Azure Blob Storage
        base_config.update({
            "vfs.azure.max_parallel_ops": "8",
            "vfs.azure.block_list_block_size": "50MB"
        })

    return tiledbvcf.ReadConfig(
        memory_budget=4096,
        tiledb_config=base_config
    )
```
### Monitoring Ingestion Progress
```python
import time
from pathlib import Path


def ingest_with_progress(dataset, vcf_files):
    """Ingest files with progress monitoring"""
    start_time = time.time()
    total_files = len(vcf_files)

    for i, vcf_file in enumerate(vcf_files, 1):
        file_start = time.time()
        file_size = Path(vcf_file).stat().st_size / (1024*1024)  # MB

        print(f"[{i}/{total_files}] Processing {vcf_file} ({file_size:.1f} MB)")

        try:
            dataset.ingest_samples([vcf_file])
            file_duration = time.time() - file_start

            print(f"  ✓ Completed in {file_duration:.1f}s "
                  f"({file_size/file_duration:.1f} MB/s)")

        except Exception as e:
            print(f"  ✗ Failed: {e}")

    total_duration = time.time() - start_time
    print(f"\nIngestion complete: {total_duration:.1f}s total")
```
## Cloud Storage Patterns

### S3 Ingestion Pipeline
```python
import boto3


def ingest_from_s3_bucket(dataset_uri, bucket, prefix):
    """Ingest VCF files from S3 bucket"""
    s3 = boto3.client('s3')

    # List VCF files in bucket
    response = s3.list_objects_v2(
        Bucket=bucket,
        Prefix=prefix,
        MaxKeys=1000
    )

    vcf_files = [
        f"s3://{bucket}/{obj['Key']}"
        for obj in response.get('Contents', [])
        if obj['Key'].endswith(('.vcf.gz', '.vcf'))
    ]

    print(f"Found {len(vcf_files)} VCF files in s3://{bucket}/{prefix}")

    # Configure for S3
    config = get_optimized_config("s3")

    with tiledbvcf.Dataset(uri=dataset_uri, mode="w", cfg=config) as ds:
        ds.ingest_samples(vcf_files)


# Usage
ingest_from_s3_bucket(
    dataset_uri="s3://my-output-bucket/vcf-dataset",
    bucket="my-input-bucket",
    prefix="vcf_files/"
)
```
## Best Practices

### Dataset Organization
```python
# Organize datasets by study or cohort
study_datasets = {
    "ukb": "s3://genomics-data/ukb-dataset",
    "1kgp": "s3://genomics-data/1kgp-dataset",
    "gnomad": "s3://genomics-data/gnomad-dataset"
}


def create_study_dataset(study_name, vcf_files):
    """Create a dataset for a specific study"""
    dataset_uri = study_datasets[study_name]

    config = tiledbvcf.ReadConfig(
        memory_budget=4096,
        tiledb_config={
            "sm.consolidation.mode": "fragments",
            "sm.consolidation.buffer_size": "200000000"
        }
    )

    with tiledbvcf.Dataset(uri=dataset_uri, mode="w", cfg=config) as ds:
        ds.ingest_samples(vcf_files)
```
### Maintenance and Consolidation
```python
# Consolidate dataset after ingestion for optimal query performance
def consolidate_dataset(dataset_uri):
    """Consolidate dataset fragments for better query performance"""
    config = tiledbvcf.ReadConfig(
        tiledb_config={
            "sm.consolidation.mode": "fragments",
            "sm.consolidation.buffer_size": "1000000000"  # 1GB buffer
        }
    )

    # Note: Consolidation API varies by TileDB-VCF version
    # This is a conceptual example
    print(f"Consolidating dataset: {dataset_uri}")
    # Implementation depends on specific TileDB-VCF version
```

This comprehensive guide covers all aspects of TileDB-VCF ingestion from basic single-file ingestion to complex cloud-based parallel processing workflows. Use the patterns that best fit your data scale and infrastructure requirements.