mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-27 07:09:27 +08:00
Remove multiple advanced export sections
- Remove VEP annotation preparation section - Remove Cloud Export (S3) section - Remove Export Validation section - Remove Efficient Export Strategies section - Simplify export guide to focus on core export functionality - Maintain essential VCF/BCF and TSV export examples
This commit is contained in:
@@ -199,199 +199,10 @@ export_for_plink(
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
### VEP Annotation Preparation
|
|
||||||
```python
|
|
||||||
def export_for_vep(ds, regions, output_file):
    """Export variants in the VEP default input format.

    Writes one tab-separated line per ALT allele with the columns
    chr, start, end, allele (REF/ALT), strand, id -- no header row,
    as expected by Ensembl VEP.

    Args:
        ds: TileDB-VCF dataset object exposing ``read(attrs=..., regions=...)``.
        regions: List of genomic region strings (e.g. "chr1:1000000-2000000").
        output_file: Path of the tab-separated VEP input file to write.
    """
    # Query essential variant information
    df = ds.read(
        attrs=["contig", "pos_start", "pos_end", "alleles", "id"],
        regions=regions
    )

    # Prepare VEP input format: one output row per ALT allele
    vep_data = []
    for _, row in df.iterrows():
        alleles = row['alleles']
        if len(alleles) >= 2:
            ref = alleles[0]
            vid = row['id']
            # Treat missing IDs (None, empty string, pandas NaN) as '.'.
            # NaN is truthy, so the naive `vid if vid else '.'` check
            # would leak the literal "nan" into the output file.
            if not vid or (isinstance(vid, float) and pd.isna(vid)):
                vid = '.'
            for alt in alleles[1:]:  # Can have multiple ALT alleles
                vep_data.append({
                    'chr': row['contig'],
                    'start': row['pos_start'],
                    'end': row['pos_end'],
                    'allele': f"{ref}/{alt}",
                    'strand': '+',
                    'id': vid
                })

    # Pass the column list explicitly so an empty result still yields a
    # valid (empty) file instead of a KeyError in to_csv.
    columns = ['chr', 'start', 'end', 'allele', 'strand', 'id']
    vep_df = pd.DataFrame(vep_data, columns=columns)

    # Save VEP input format (tab-separated, no header/index per VEP spec)
    vep_df.to_csv(
        output_file,
        sep='\t',
        header=False,
        index=False,
        columns=columns
    )

    print(f"Exported {len(vep_df)} variants for VEP annotation to {output_file}")
|
|
||||||
|
|
||||||
# Usage
|
|
||||||
export_for_vep(ds, ["chr1:1000000-2000000"], "variants_for_vep.txt")
|
|
||||||
```
|
|
||||||
|
|
||||||
## Cloud Export
|
|
||||||
|
|
||||||
### S3 Export
|
|
||||||
```python
|
|
||||||
def export_to_s3(ds, regions, samples, s3_bucket, s3_prefix):
    """Export one BCF file per region directly to an S3 bucket.

    Args:
        ds: TileDB-VCF dataset object exposing ``export_bcf``.
        regions: Genomic region strings; each becomes its own BCF object.
        samples: Sample names included in every export.
        s3_bucket: Target S3 bucket name.
        s3_prefix: Key prefix under which the BCF files are written.
    """
    import boto3

    # S3-tuned read configuration.
    # NOTE(review): this config is constructed but never passed to the
    # dataset or to export_bcf -- confirm how it is meant to be applied.
    config = tiledbvcf.ReadConfig(
        tiledb_config={
            "vfs.s3.region": "us-east-1",
            "vfs.s3.multipart_part_size": "50MB"
        }
    )

    total = len(regions)
    for index, region in enumerate(regions, start=1):
        # Region strings contain ':' and '-', which are awkward in S3 keys.
        safe_name = region.replace(":", "_").replace("-", "_")
        s3_uri = f"s3://{s3_bucket}/{s3_prefix}/region_{safe_name}.bcf"

        print(f"Exporting region {index}/{total}: {region}")
        ds.export_bcf(uri=s3_uri, regions=[region], samples=samples)
        print(f"Exported to {s3_uri}")
|
|
||||||
|
|
||||||
# Usage
|
|
||||||
export_to_s3(
|
|
||||||
ds,
|
|
||||||
regions=["chr1:1000000-2000000", "chr2:500000-1500000"],
|
|
||||||
samples=ds.sample_names()[:50],
|
|
||||||
s3_bucket="my-genomics-bucket",
|
|
||||||
s3_prefix="exported_variants"
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Export Validation
|
|
||||||
|
|
||||||
### Data Integrity Checks
|
|
||||||
```python
|
|
||||||
def validate_export(original_ds, export_file, regions, samples):
    """Validate an exported file against the original dataset.

    Compares the record count of *export_file* with the row count the
    original dataset returns for the same regions and samples.

    Args:
        original_ds: TileDB-VCF dataset object exposing ``read``.
        export_file: Path to the exported file (.vcf.gz, .bcf, .tsv, .csv).
        regions: Region strings used for the export.
        samples: Sample names used for the export.

    Returns:
        bool: True when the counts match; False on a count mismatch, an
        unrecognized file extension, or any read error.
    """
    # Count variant rows in the original dataset
    original_df = original_ds.read(
        attrs=["sample_name", "pos_start"],
        regions=regions,
        samples=samples
    )
    original_count = len(original_df)

    # Count records in the exported file
    try:
        if export_file.endswith(('.vcf.gz', '.bcf')):
            # pysam is only needed for VCF/BCF validation; import it
            # lazily so TSV/CSV validation works without pysam installed.
            import pysam
            vcf = pysam.VariantFile(export_file)
            export_count = sum(1 for _ in vcf)
            vcf.close()
        elif export_file.endswith(('.tsv', '.csv')):
            export_df = pd.read_csv(export_file, sep='\t' if export_file.endswith('.tsv') else ',')
            export_count = len(export_df)
        else:
            print(f"Unknown file format: {export_file}")
            return False

        # Compare counts
        if original_count == export_count:
            print(f"✓ Export validation passed: {export_count} variants")
            return True
        else:
            print(f"✗ Export validation failed: {original_count} original vs {export_count} exported")
            return False

    except Exception as e:
        # Best-effort helper: report the problem and fail the validation
        # rather than raising.
        print(f"✗ Export validation error: {e}")
        return False
|
|
||||||
|
|
||||||
# Usage
|
|
||||||
success = validate_export(
|
|
||||||
ds,
|
|
||||||
"output.bcf",
|
|
||||||
regions=["chr1:1000000-2000000"],
|
|
||||||
samples=["SAMPLE_001", "SAMPLE_002"]
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Best Practices
||||||
|
|
||||||
### Efficient Export Strategies
|
|
||||||
```python
|
|
||||||
# 1. Optimize for intended use case
|
|
||||||
def choose_export_format(use_case, file_size_mb):
    """Pick an export format suited to *use_case* and data size.

    Args:
        use_case: One of "downstream_analysis", "data_sharing",
            "statistical_analysis", "database_import"; anything else
            falls back to the default.
        file_size_mb: Approximate data size in megabytes (only relevant
            for "downstream_analysis").

    Returns:
        str: "BCF", "VCF", "VCF.gz", "TSV" or "CSV".
    """
    # Size-independent use cases map directly to a format.
    fixed_formats = {
        "data_sharing": "VCF.gz",       # standard compressed interchange
        "statistical_analysis": "TSV",  # easy to process
        "database_import": "CSV",       # universal format
    }

    if use_case == "downstream_analysis":
        # Large exports benefit from the compressed binary representation.
        return "BCF" if file_size_mb > 1000 else "VCF"

    return fixed_formats.get(use_case, "VCF")  # "VCF" is the default
|
|
||||||
|
|
||||||
# 2. Batch processing for large exports
|
|
||||||
def batch_export_by_size(ds, regions, samples, max_variants_per_file=1000000):
    """Export regions in batches capped at roughly *max_variants_per_file*.

    The variant count per region is estimated from a probe read over at
    most 10 samples and scaled up to the full sample list. Regions are
    then greedily packed into batches, each written to ``batch_<n>.bcf``
    via :func:`export_batch`.

    Args:
        ds: TileDB-VCF dataset object exposing ``read`` and ``export_bcf``.
        regions: Region strings to export.
        samples: Sample names to include in every batch.
        max_variants_per_file: Soft cap on estimated variants per file.
    """
    current_batch = []
    current_count = 0
    batch_num = 1

    # Probe with a small sample subset for estimation.
    probe_samples = samples[:10]
    # Scale by the number of samples actually probed; the original code
    # always divided by 10, underestimating when fewer than 10 samples
    # exist (and the floor of 1 avoids division by zero).
    probe_count = max(1, len(probe_samples))

    for region in regions:
        # Estimate variant count (approximate)
        test_df = ds.read(
            attrs=["pos_start"],
            regions=[region],
            samples=probe_samples
        )
        estimated_variants = len(test_df) * len(samples) // probe_count

        if current_count + estimated_variants > max_variants_per_file and current_batch:
            # Cap reached: flush the current batch and start a new one.
            export_batch(ds, current_batch, samples, f"batch_{batch_num}.bcf")
            batch_num += 1
            current_batch = [region]
            current_count = estimated_variants
        else:
            current_batch.append(region)
            current_count += estimated_variants

    # Export whatever is left over.
    if current_batch:
        export_batch(ds, current_batch, samples, f"batch_{batch_num}.bcf")

def export_batch(ds, regions, samples, output_file):
    """Export a batch of regions to a single BCF file."""
    print(f"Exporting batch to {output_file}")
    ds.export_bcf(uri=output_file, regions=regions, samples=samples)
|
|
||||||
```
|
|
||||||
|
|
||||||
This comprehensive export guide covers all aspects of getting data out of TileDB-VCF in various formats optimized for different downstream analysis workflows.
|
||||||
Reference in New Issue
Block a user