Remove Streaming Export for Large Datasets section

- Remove chunked export functionality
- Remove parallel export functionality
- Simplify export guide to focus on basic export operations
This commit is contained in:
Jeremy Leipzig
2026-02-24 11:13:01 -07:00
parent 70a34bd652
commit 518261c4f2

View File

@@ -130,112 +130,6 @@ processed_df = export_processed_variants(
)
```
## Streaming Export for Large Datasets
### Chunked Export
```python
def streaming_export(ds, regions, samples, output_file, chunk_size=100000):
    """Export large datasets in chunks to manage memory.

    Queries each region from *ds* one at a time and streams the rows to a
    single CSV file, writing at most ``chunk_size`` rows per batch so the
    full result set is never held as CSV text in memory.

    Parameters
    ----------
    ds : dataset object exposing ``read(attrs=..., regions=..., samples=...)``
        and returning a pandas DataFrame (e.g. a tiledbvcf ``Dataset``).
    regions : iterable of region strings, each queried separately.
    samples : sample names passed through to ``ds.read``.
    output_file : path of the CSV file to create (overwritten if present).
    chunk_size : int, rows written per batch; also paces progress messages.

    Returns
    -------
    None.  Side effects: writes ``output_file`` and prints progress.
    """
    import csv
    total_variants = 0
    with open(output_file, 'w', newline='') as f:
        writer = None
        header_written = False
        for region in regions:
            print(f"Processing region: {region}")
            # Query one region at a time to bound peak memory usage.
            df = ds.read(
                attrs=["sample_name", "contig", "pos_start", "alleles", "fmt_GT"],
                regions=[region],
                samples=samples
            )
            if df.empty:
                continue
            # Process the region's rows in fixed-size chunks.
            for i in range(0, len(df), chunk_size):
                chunk = df.iloc[i:i + chunk_size]
                # Write the header once, from the first non-empty chunk.
                if not header_written:
                    writer = csv.writer(f)
                    writer.writerow(chunk.columns)
                    header_written = True
                # writerows over the underlying ndarray writes the batch in
                # one call — much faster than a per-row iterrows() loop and
                # produces identical CSV output.
                writer.writerows(chunk.values)
                total_variants += len(chunk)
                if i + chunk_size < len(df):
                    print(f" Processed {i + chunk_size:,} variants...")
    print(f"Exported {total_variants:,} variants to {output_file}")
# Usage
# NOTE(review): assumes `ds` is an already-open dataset from earlier in the
# guide that provides read() and sample_names() — confirm against prior section.
regions = [f"chr{i}" for i in range(1, 23)] # All autosomes
streaming_export(ds, regions, ds.sample_names(), "genome_wide_variants.csv")
```
### Parallel Export
```python
import multiprocessing as mp
import os
def export_region_chunk(args):
    """Export one genomic region to a TSV file (worker for parallel export).

    Parameters
    ----------
    args : tuple of (dataset_uri, region, samples, output_dir) — packed into
        a single tuple so the function is directly usable with ``Pool.map``.

    Returns
    -------
    tuple of (region, output_file) identifying what was written and where.
    """
    dataset_uri, region, samples, output_dir = args
    # Each worker process opens its own dataset handle; dataset objects are
    # not shared across process boundaries.
    ds = tiledbvcf.Dataset(uri=dataset_uri, mode="r")
    # Build a filesystem-safe filename from the region string
    # (e.g. "chr1:1-500" -> "variants_chr1_1_500.tsv").
    safe_region = region.replace(":", "_").replace("-", "_")
    target = os.path.join(output_dir, f"variants_{safe_region}.tsv")
    ds.export_tsv(
        uri=target,
        regions=[region],
        samples=samples,
        tsv_fields=["CHR", "POS", "REF", "ALT", "S:GT", "S:DP"]
    )
    return region, target
def parallel_export(dataset_uri, regions, samples, output_dir, n_processes=4):
    """Export multiple regions in parallel, one TSV file per region.

    Parameters
    ----------
    dataset_uri : URI of the dataset; each worker opens it independently.
    regions : iterable of region strings, one export task per region.
    samples : sample names forwarded to every region export.
    output_dir : directory for the per-region TSV files (created if missing).
    n_processes : int, size of the worker process pool.

    Returns
    -------
    list of output file paths, one per exported region.
    """
    os.makedirs(output_dir, exist_ok=True)
    # One packed argument tuple per region, matching export_region_chunk's
    # single-argument signature for Pool.map.
    work_items = [(dataset_uri, r, samples, output_dir) for r in regions]
    with mp.Pool(n_processes) as pool:
        results = pool.map(export_region_chunk, work_items)
    # Keep only the file paths from the (region, path) result pairs.
    output_files = [path for _, path in results]
    print(f"Exported {len(output_files)} region files to {output_dir}")
    return output_files
# Usage
# NOTE(review): assumes `ds` is an already-open dataset providing
# sample_names(); only the first 100 sample names are exported here.
regions = [f"chr{i}:1-50000000" for i in range(1, 23)] # First 50 Mb of each autosome
output_files = parallel_export(
    dataset_uri="my_dataset",
    regions=regions,
    samples=ds.sample_names()[:100],
    output_dir="parallel_export",
    n_processes=8
)
```
## Integration with Analysis Tools