Remove references/querying.md

- Delete detailed querying reference documentation
- Update main skill to remove references to querying.md
- Leave only population_genomics.md reference file
This commit is contained in:
Jeremy Leipzig
2026-02-24 11:29:01 -07:00
parent c576d2e66a
commit 55811bdbbe
2 changed files with 1 additions and 527 deletions

View File

@@ -339,17 +339,14 @@ config = tiledbvcf.ReadConfig(
Detailed documentation for each major capability:
- **querying.md** - Complete guide to efficient variant queries, including region specification, attribute selection, filtering strategies, and performance optimization
- **population_genomics.md** - Practical examples of population genomics workflows, including GWAS preparation, quality control, allele frequency analysis, and integration with analysis tools
## Getting Help
### Open Source TileDB-VCF Resources
For detailed information on specific operations, refer to the appropriate reference document:
For detailed information on population genomics workflows, refer to:
- Querying variant data efficiently → `querying.md`
- Population genomics workflows → `population_genomics.md`
**Open Source Documentation:**

View File

@@ -1,523 +0,0 @@
# TileDB-VCF Querying Guide
Complete guide to efficiently querying variant data from TileDB-VCF datasets with optimal performance and memory usage.
## Basic Querying
### Opening a Dataset
```python
import tiledbvcf
import pandas as pd
# Open dataset for reading
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r")
# With custom configuration
config = tiledbvcf.ReadConfig(
memory_budget=2048,
tiledb_config={"sm.tile_cache_size": "1000000000"}
)
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r", cfg=config)
```
### Simple Region Query
```python
# Query single region
df = ds.read(
attrs=["sample_name", "pos_start", "pos_end", "alleles"],
regions=["chr1:1000000-2000000"]
)
print(df.head())
```
### Multi-Region Query
```python
# Query multiple regions efficiently
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=[
"chr1:1000000-2000000",
"chr2:500000-1500000",
"chr22:20000000-21000000"
]
)
```
## Sample Filtering
### Query Specific Samples
```python
# Query subset of samples
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=["chr1:1000000-2000000"],
samples=["SAMPLE_001", "SAMPLE_002", "SAMPLE_003"]
)
```
### Sample Pattern Matching
```python
# Get all sample names first
sample_names = ds.sample_names()
print(f"Total samples: {len(sample_names)}")
# Filter samples by pattern
case_samples = [s for s in sample_names if s.startswith("CASE_")]
control_samples = [s for s in sample_names if s.startswith("CTRL_")]
# Query case samples
case_df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=["chr1:1000000-2000000"],
samples=case_samples
)
```
## Attribute Selection
### Available Attributes
```python
# List all available attributes
attrs = ds.attributes()
print("Available attributes:")
for attr in attrs:
print(f" {attr}")
```
### Core Variant Attributes
```python
# Essential variant information
df = ds.read(
attrs=[
"sample_name", # Sample identifier
"contig", # Chromosome
"pos_start", # Start position (1-based)
"pos_end", # End position (1-based)
"alleles", # REF and ALT alleles
"id", # Variant ID (rs numbers, etc.)
"qual", # Quality score
"filters" # FILTER field values
],
regions=["chr1:1000000-2000000"]
)
```
### INFO Field Access
```python
# Access INFO fields (prefix with 'info_')
df = ds.read(
attrs=[
"sample_name", "pos_start", "alleles",
"info_AF", # Allele frequency
"info_AC", # Allele count
"info_AN", # Allele number
"info_DP", # Total depth
"info_ExcessHet" # Excess heterozygosity
],
regions=["chr1:1000000-2000000"]
)
```
### FORMAT Field Access
```python
# Access FORMAT fields (prefix with 'fmt_')
df = ds.read(
attrs=[
"sample_name", "pos_start", "alleles",
"fmt_GT", # Genotype
"fmt_DP", # Read depth
"fmt_GQ", # Genotype quality
"fmt_AD", # Allelic depths
"fmt_PL" # Phred-scaled likelihoods
],
regions=["chr1:1000000-2000000"],
samples=["SAMPLE_001", "SAMPLE_002"]
)
```
## Advanced Filtering
### Quality-Based Filtering
```python
# Query with quality information
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "qual", "fmt_GQ", "fmt_DP"],
regions=["chr1:1000000-2000000"]
)
# Filter high-quality variants in pandas
high_qual_df = df[
(df["qual"] >= 30) &
(df["fmt_GQ"] >= 20) &
(df["fmt_DP"] >= 10)
]
```
### Genotype Filtering
```python
# Query genotype data
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=["chr1:1000000-2000000"]
)
# Filter for heterozygous variants
# fmt_GT format: [allele1, allele2] where -1 = missing
het_variants = df[
df["fmt_GT"].apply(lambda gt:
isinstance(gt, list) and len(gt) == 2 and
gt[0] != gt[1] and -1 not in gt
)
]
```
## Performance Optimization
### Memory-Efficient Streaming
```python
# For large result sets, use iterator pattern
def stream_variants(dataset, regions, chunk_size=100000):
    """Yield variant DataFrame chunks region by region to bound memory use.

    Args:
        dataset: An open TileDB-VCF dataset (read mode) exposing ``read()``.
        regions: Iterable of region strings, e.g. ``"chr1:1000-2000"``.
        chunk_size: Maximum number of rows per yielded chunk.

    Yields:
        pandas.DataFrame slices of at most ``chunk_size`` rows each.
    """
    total_variants = 0
    for region in regions:
        print(f"Processing region: {region}")
        # One full-region read; the chunking below only bounds what the
        # *consumer* holds at a time, not the read itself.
        df = dataset.read(
            attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
            regions=[region],
        )
        for i in range(0, len(df), chunk_size):
            chunk = df.iloc[i:i + chunk_size]
            yield chunk
            total_variants += len(chunk)
            # Progress message only while more chunks remain in this region.
            if i + chunk_size < len(df):
                print(f"  Processed {i + chunk_size:,} variants...")
    print(f"Total variants processed: {total_variants:,}")
# Usage
regions = ["chr1", "chr2", "chr3"]
for chunk in stream_variants(ds, regions):
# Process chunk
pass
```
### Parallel Region Processing
```python
import multiprocessing as mp
from functools import partial
def query_region(dataset_uri, region, samples=None):
    """Open the dataset at *dataset_uri* and read a single region.

    Returns a ``(region, DataFrame)`` pair so results from parallel
    workers can be matched back to the region they came from.
    """
    dataset = tiledbvcf.Dataset(uri=dataset_uri, mode="r")
    result = dataset.read(
        attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
        regions=[region],
        samples=samples,
    )
    return region, result
def parallel_query(dataset_uri, regions, samples=None, n_processes=4):
    """Fan region queries out across a process pool and merge the results.

    Each worker opens its own dataset handle (via ``query_region``), so the
    dataset URI — not an open handle — is what gets shipped to workers.
    """
    worker = partial(query_region, dataset_uri, samples=samples)
    with mp.Pool(n_processes) as pool:
        per_region = pool.map(worker, regions)
    frames = [frame for _, frame in per_region]
    return pd.concat(frames, ignore_index=True)
# Usage
regions = [f"chr{i}:1000000-2000000" for i in range(1, 23)]
df = parallel_query("my_dataset", regions, n_processes=8)
```
### Query Optimization Strategies
```python
# Optimize tile cache for repeated region access
config = tiledbvcf.ReadConfig(
memory_budget=4096,
tiledb_config={
"sm.tile_cache_size": "2000000000", # 2GB cache
"sm.mem.reader.sparse_global_order.ratio_tile_ranges": "0.5",
"sm.mem.reader.sparse_unordered.ratio_coords": "0.5"
}
)
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r", cfg=config)
# Combine nearby regions to reduce query overhead
def optimize_regions(regions, max_gap=1000000):
    """Merge nearby or overlapping regions to reduce the number of queries.

    Args:
        regions: List of ``"chrom:start-end"`` strings.
        max_gap: Maximum base-pair gap between two regions on the same
            chromosome for them to be merged into a single span.

    Returns:
        A new list of merged region strings, ordered by (chrom, start, end).
        Note: chromosomes compare lexically, so "chr10" sorts before "chr2".
    """
    if not regions:
        return regions
    # Parse "chrom:start-end" into sortable (chrom, start, end) tuples.
    parsed = []
    for region in regions:
        chrom, pos_range = region.split(":")
        start, end = map(int, pos_range.split("-"))
        parsed.append((chrom, start, end))
    parsed.sort()
    merged = []
    current_chrom, current_start, current_end = parsed[0]
    for chrom, start, end in parsed[1:]:
        if chrom == current_chrom and start - current_end <= max_gap:
            # Close enough on the same chromosome: extend the current span.
            # (Overlaps give a negative gap and merge too.)
            current_end = max(current_end, end)
        else:
            merged.append(f"{current_chrom}:{current_start}-{current_end}")
            current_chrom, current_start, current_end = chrom, start, end
    # Flush the final open span.
    merged.append(f"{current_chrom}:{current_start}-{current_end}")
    return merged
```
## Complex Queries
### Population-Specific Queries
```python
# Query specific populations
def query_population(ds, regions, population_file):
    """Query variants separately for each population listed in a CSV file.

    Args:
        ds: Open TileDB-VCF dataset (read mode).
        regions: List of region strings to query.
        population_file: CSV with ``sample_id`` and ``population`` columns.

    Returns:
        Dict mapping population name -> DataFrame of that population's
        variants, in order of first appearance in the CSV.
    """
    pop_df = pd.read_csv(population_file)  # sample_id, population
    # Group sample IDs by population; sort=False keeps first-appearance
    # order, matching how the CSV lists them.
    populations = (
        pop_df.groupby("population", sort=False)["sample_id"].apply(list).to_dict()
    )
    results = {}
    for pop_name, samples in populations.items():
        print(f"Querying {pop_name}: {len(samples)} samples")
        results[pop_name] = ds.read(
            attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
            regions=regions,
            samples=samples,
        )
    return results
# Usage
regions = ["chr1:1000000-2000000"]
pop_results = query_population(ds, regions, "population_assignments.csv")
```
### Allele Frequency Calculation
```python
def calculate_allele_frequencies(df):
    """Compute per-variant allele frequencies from genotype calls.

    Args:
        df: DataFrame with ``contig``, ``pos_start``, ``alleles`` (REF+ALT
            sequence per variant) and ``fmt_GT`` (list of allele indices,
            -1 = missing) columns.

    Returns:
        List of dicts, one per (contig, position) variant, with keys
        ``contig``, ``pos``, ``alleles``, ``frequencies`` (allele string ->
        frequency) and ``sample_count``. Genotypes containing any missing
        allele (-1) are excluded from the counts entirely.
    """
    af_results = []
    for (contig, pos), group in df.groupby(["contig", "pos_start"]):
        alleles = group["alleles"].iloc[0]  # REF and ALT alleles
        genotypes = group["fmt_GT"].values
        # Tally allele indices across all fully-called genotypes.
        allele_counts = {}
        total_alleles = 0
        for gt in genotypes:
            if isinstance(gt, list) and -1 not in gt:  # valid genotype only
                for allele_idx in gt:
                    allele_counts[allele_idx] = allele_counts.get(allele_idx, 0) + 1
                    total_alleles += 1
        # Convert counts to frequencies, guarding out-of-range indices.
        frequencies = {}
        for allele_idx, count in allele_counts.items():
            if allele_idx < len(alleles):
                freq = count / total_alleles if total_alleles > 0 else 0
                frequencies[alleles[allele_idx]] = freq
        af_results.append({
            'contig': contig,
            'pos': pos,
            'alleles': alleles,
            'frequencies': frequencies,
            'sample_count': len(group),
        })
    return af_results
# Usage
df = ds.read(
attrs=["contig", "pos_start", "alleles", "fmt_GT"],
regions=["chr1:1000000-1010000"],
samples=sample_list[:100] # Subset for faster calculation
)
af_results = calculate_allele_frequencies(df)
```
### Rare Variant Analysis
```python
def find_rare_variants(ds, regions, samples, max_af=0.01):
    """Locate variants below an allele-frequency threshold and their carriers.

    Returns a DataFrame with one row per rare variant: position, alleles,
    allele frequency, carrier count, and the carrier sample names (samples
    whose genotype contains a non-reference allele).
    """
    # NOTE(review): grouping on 'alleles' assumes hashable values (e.g.
    # tuples) — confirm against the dataset's actual return type.
    df = ds.read(
        attrs=["sample_name", "pos_start", "alleles", "info_AF", "fmt_GT"],
        regions=regions,
        samples=samples,
    )
    rare_df = df[df["info_AF"] < max_af]
    rare_summary = []
    for (pos, alleles), group in rare_df.groupby(['pos_start', 'alleles']):
        carriers = [
            row['sample_name']
            for _, row in group.iterrows()
            if isinstance(row['fmt_GT'], list)
            and any(a > 0 for a in row['fmt_GT'] if a != -1)
        ]
        rare_summary.append({
            'pos': pos,
            'alleles': alleles,
            'af': group['info_AF'].iloc[0],
            'carrier_count': len(carriers),
            'carriers': carriers,
        })
    return pd.DataFrame(rare_summary)
# Usage
rare_variants = find_rare_variants(
ds,
regions=["chr1:1000000-2000000"],
samples=sample_list,
max_af=0.005 # 0.5% frequency threshold
)
```
## Cloud Querying
### S3 Dataset Queries
```python
# Configure for S3 access
s3_config = tiledbvcf.ReadConfig(
memory_budget=2048,
tiledb_config={
"vfs.s3.region": "us-east-1",
"vfs.s3.use_virtual_addressing": "true",
"sm.tile_cache_size": "1000000000"
}
)
# Query S3-hosted dataset
ds = tiledbvcf.Dataset(
uri="s3://my-bucket/vcf-dataset",
mode="r",
cfg=s3_config
)
df = ds.read(
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
regions=["chr1:1000000-2000000"]
)
```
### Federated Queries
```python
def federated_query(dataset_uris, regions, samples):
    """Run one region/sample query across several datasets and merge results.

    Samples absent from a given dataset are skipped for that dataset, and a
    'dataset' column records each row's source URI. Returns an empty
    DataFrame when no dataset contributes any rows.
    """
    frames = []
    for uri in dataset_uris:
        print(f"Querying dataset: {uri}")
        ds = tiledbvcf.Dataset(uri=uri, mode="r")
        # Restrict the query to samples this dataset actually contains.
        available = set(ds.sample_names())
        wanted = [s for s in samples if s in available]
        if not wanted:
            continue
        frame = ds.read(
            attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
            regions=regions,
            samples=wanted,
        )
        frame['dataset'] = uri
        frames.append(frame)
    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame()
# Usage
datasets = [
"s3://data-bucket/study1-dataset",
"s3://data-bucket/study2-dataset",
"local_dataset"
]
results = federated_query(datasets, ["chr1:1000000-2000000"], ["SAMPLE_001"])
```
## Troubleshooting
### Common Query Issues
```python
# Handle empty results gracefully
def safe_query(ds, **kwargs):
    """Run ``ds.read`` without raising: both failures and empty result sets
    come back as an empty DataFrame, with a diagnostic message printed."""
    try:
        result = ds.read(**kwargs)
    except Exception as exc:  # broad by design: this is a debugging helper
        print(f"Query failed: {exc}")
        return pd.DataFrame()
    if result.empty:
        print("Query returned no results")
        return pd.DataFrame()
    return result
# Check dataset contents before querying
def inspect_dataset(ds):
    """Print a quick summary of a dataset and attempt a tiny smoke-test query."""
    print(f"Sample count: {len(ds.sample_names())}")
    print(f"Sample names (first 10): {ds.sample_names()[:10]}")
    print(f"Available attributes: {ds.attributes()}")
    # Small probe query to confirm the dataset is actually readable.
    try:
        probe = ds.read(
            attrs=["sample_name", "pos_start"],
            regions=["chr1:1-1000"],
        )
    except Exception as exc:
        print(f"Test query failed: {exc}")
    else:
        print(f"Test query returned {len(probe)} variants")
# Usage
inspect_dataset(ds)
```
This comprehensive querying guide covers all aspects of efficiently retrieving and filtering variant data from TileDB-VCF datasets, from basic queries to complex population genomics analyses.