mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-27 07:09:27 +08:00
Remove references/querying.md
- Delete detailed querying reference documentation - Update main skill to remove references to querying.md - Leave only population_genomics.md reference file
This commit is contained in:
@@ -339,17 +339,14 @@ config = tiledbvcf.ReadConfig(
|
||||
|
||||
Detailed documentation for each major capability:
|
||||
|
||||
- **querying.md** - Complete guide to efficient variant queries, including region specification, attribute selection, filtering strategies, and performance optimization
|
||||
|
||||
- **population_genomics.md** - Practical examples of population genomics workflows, including GWAS preparation, quality control, allele frequency analysis, and integration with analysis tools
|
||||
|
||||
## Getting Help
|
||||
|
||||
### Open Source TileDB-VCF Resources
|
||||
|
||||
For detailed information on specific operations, refer to the appropriate reference document:
|
||||
For detailed information on population genomics workflows, refer to:
|
||||
|
||||
- Querying variant data efficiently → `querying.md`
|
||||
- Population genomics workflows → `population_genomics.md`
|
||||
|
||||
**Open Source Documentation:**
|
||||
|
||||
@@ -1,523 +0,0 @@
|
||||
# TileDB-VCF Querying Guide
|
||||
|
||||
Complete guide to efficiently querying variant data from TileDB-VCF datasets with optimal performance and memory usage.
|
||||
|
||||
## Basic Querying
|
||||
|
||||
### Opening a Dataset
|
||||
```python
|
||||
import tiledbvcf
|
||||
import pandas as pd
|
||||
|
||||
# Open dataset for reading
|
||||
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r")
|
||||
|
||||
# With custom configuration
|
||||
config = tiledbvcf.ReadConfig(
|
||||
memory_budget=2048,
|
||||
tiledb_config={"sm.tile_cache_size": "1000000000"}
|
||||
)
|
||||
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r", cfg=config)
|
||||
```
|
||||
|
||||
### Simple Region Query
|
||||
```python
|
||||
# Query single region
|
||||
df = ds.read(
|
||||
attrs=["sample_name", "pos_start", "pos_end", "alleles"],
|
||||
regions=["chr1:1000000-2000000"]
|
||||
)
|
||||
print(df.head())
|
||||
```
|
||||
|
||||
### Multi-Region Query
|
||||
```python
|
||||
# Query multiple regions efficiently
|
||||
df = ds.read(
|
||||
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
||||
regions=[
|
||||
"chr1:1000000-2000000",
|
||||
"chr2:500000-1500000",
|
||||
"chr22:20000000-21000000"
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Sample Filtering
|
||||
|
||||
### Query Specific Samples
|
||||
```python
|
||||
# Query subset of samples
|
||||
df = ds.read(
|
||||
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
||||
regions=["chr1:1000000-2000000"],
|
||||
samples=["SAMPLE_001", "SAMPLE_002", "SAMPLE_003"]
|
||||
)
|
||||
```
|
||||
|
||||
### Sample Pattern Matching
|
||||
```python
|
||||
# Get all sample names first
|
||||
sample_names = ds.sample_names()
|
||||
print(f"Total samples: {len(sample_names)}")
|
||||
|
||||
# Filter samples by pattern
|
||||
case_samples = [s for s in sample_names if s.startswith("CASE_")]
|
||||
control_samples = [s for s in sample_names if s.startswith("CTRL_")]
|
||||
|
||||
# Query case samples
|
||||
case_df = ds.read(
|
||||
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
||||
regions=["chr1:1000000-2000000"],
|
||||
samples=case_samples
|
||||
)
|
||||
```
|
||||
|
||||
## Attribute Selection
|
||||
|
||||
### Available Attributes
|
||||
```python
|
||||
# List all available attributes
|
||||
attrs = ds.attributes()
|
||||
print("Available attributes:")
|
||||
for attr in attrs:
|
||||
print(f" {attr}")
|
||||
```
|
||||
|
||||
### Core Variant Attributes
|
||||
```python
|
||||
# Essential variant information
|
||||
df = ds.read(
|
||||
attrs=[
|
||||
"sample_name", # Sample identifier
|
||||
"contig", # Chromosome
|
||||
"pos_start", # Start position (1-based)
|
||||
"pos_end", # End position (1-based)
|
||||
"alleles", # REF and ALT alleles
|
||||
"id", # Variant ID (rs numbers, etc.)
|
||||
"qual", # Quality score
|
||||
"filters" # FILTER field values
|
||||
],
|
||||
regions=["chr1:1000000-2000000"]
|
||||
)
|
||||
```
|
||||
|
||||
### INFO Field Access
|
||||
```python
|
||||
# Access INFO fields (prefix with 'info_')
|
||||
df = ds.read(
|
||||
attrs=[
|
||||
"sample_name", "pos_start", "alleles",
|
||||
"info_AF", # Allele frequency
|
||||
"info_AC", # Allele count
|
||||
"info_AN", # Allele number
|
||||
"info_DP", # Total depth
|
||||
"info_ExcessHet" # Excess heterozygosity
|
||||
],
|
||||
regions=["chr1:1000000-2000000"]
|
||||
)
|
||||
```
|
||||
|
||||
### FORMAT Field Access
|
||||
```python
|
||||
# Access FORMAT fields (prefix with 'fmt_')
|
||||
df = ds.read(
|
||||
attrs=[
|
||||
"sample_name", "pos_start", "alleles",
|
||||
"fmt_GT", # Genotype
|
||||
"fmt_DP", # Read depth
|
||||
"fmt_GQ", # Genotype quality
|
||||
"fmt_AD", # Allelic depths
|
||||
"fmt_PL" # Phred-scaled likelihoods
|
||||
],
|
||||
regions=["chr1:1000000-2000000"],
|
||||
samples=["SAMPLE_001", "SAMPLE_002"]
|
||||
)
|
||||
```
|
||||
|
||||
## Advanced Filtering
|
||||
|
||||
### Quality-Based Filtering
|
||||
```python
|
||||
# Query with quality information
|
||||
df = ds.read(
|
||||
attrs=["sample_name", "pos_start", "alleles", "qual", "fmt_GQ", "fmt_DP"],
|
||||
regions=["chr1:1000000-2000000"]
|
||||
)
|
||||
|
||||
# Filter high-quality variants in pandas
|
||||
high_qual_df = df[
|
||||
(df["qual"] >= 30) &
|
||||
(df["fmt_GQ"] >= 20) &
|
||||
(df["fmt_DP"] >= 10)
|
||||
]
|
||||
```
|
||||
|
||||
### Genotype Filtering
|
||||
```python
|
||||
# Query genotype data
|
||||
df = ds.read(
|
||||
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
||||
regions=["chr1:1000000-2000000"]
|
||||
)
|
||||
|
||||
# Filter for heterozygous variants
|
||||
# fmt_GT format: [allele1, allele2] where -1 = missing
|
||||
het_variants = df[
|
||||
df["fmt_GT"].apply(lambda gt:
|
||||
isinstance(gt, list) and len(gt) == 2 and
|
||||
gt[0] != gt[1] and -1 not in gt
|
||||
)
|
||||
]
|
||||
```
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
### Memory-Efficient Streaming
|
||||
```python
|
||||
# For large result sets, use iterator pattern
|
||||
def stream_variants(dataset, regions, chunk_size=100000):
    """Yield variant DataFrame chunks, region by region.

    Each region is read in full, then sliced into pieces of at most
    ``chunk_size`` rows so downstream code never has to hold an entire
    region's result set at once.
    """
    processed = 0

    for region in regions:
        print(f"Processing region: {region}")

        # Pull the whole region once; chunking happens on the DataFrame.
        region_df = dataset.read(
            attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
            regions=[region]
        )

        n_rows = len(region_df)
        for offset in range(0, n_rows, chunk_size):
            piece = region_df.iloc[offset:offset + chunk_size]
            yield piece
            processed += len(piece)

            # Progress line for every chunk except the region's last one.
            if offset + chunk_size < n_rows:
                print(f"  Processed {offset + chunk_size:,} variants...")

    print(f"Total variants processed: {processed:,}")
|
||||
|
||||
# Usage
|
||||
regions = ["chr1", "chr2", "chr3"]
|
||||
for chunk in stream_variants(ds, regions):
|
||||
# Process chunk
|
||||
pass
|
||||
```
|
||||
|
||||
### Parallel Region Processing
|
||||
```python
|
||||
import multiprocessing as mp
|
||||
from functools import partial
|
||||
|
||||
def query_region(dataset_uri, region, samples=None):
    """Read one region from *dataset_uri*; safe to run in a worker process.

    Opens its own dataset handle (handles are not shareable across
    processes) and returns ``(region, DataFrame)`` so results can be
    matched back to their region after a parallel map.
    """
    ds = tiledbvcf.Dataset(uri=dataset_uri, mode="r")

    frame = ds.read(
        attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
        regions=[region],
        samples=samples
    )

    return region, frame
|
||||
|
||||
def parallel_query(dataset_uri, regions, samples=None, n_processes=4):
    """Fan region queries out across a process pool and combine the frames.

    Each worker calls :func:`query_region` with its own dataset handle;
    the per-region DataFrames are concatenated into one result.
    """
    worker = partial(query_region, dataset_uri, samples=samples)

    with mp.Pool(n_processes) as pool:
        per_region = pool.map(worker, regions)

    # Drop the region labels; only the DataFrames are stitched together.
    frames = [frame for _, frame in per_region]
    return pd.concat(frames, ignore_index=True)
|
||||
|
||||
# Usage
|
||||
regions = [f"chr{i}:1000000-2000000" for i in range(1, 23)]
|
||||
df = parallel_query("my_dataset", regions, n_processes=8)
|
||||
```
|
||||
|
||||
### Query Optimization Strategies
|
||||
```python
|
||||
# Optimize tile cache for repeated region access
|
||||
config = tiledbvcf.ReadConfig(
|
||||
memory_budget=4096,
|
||||
tiledb_config={
|
||||
"sm.tile_cache_size": "2000000000", # 2GB cache
|
||||
"sm.mem.reader.sparse_global_order.ratio_tile_ranges": "0.5",
|
||||
"sm.mem.reader.sparse_unordered.ratio_coords": "0.5"
|
||||
}
|
||||
)
|
||||
|
||||
ds = tiledbvcf.Dataset(uri="my_dataset", mode="r", cfg=config)
|
||||
|
||||
# Combine nearby regions to reduce query overhead
|
||||
def optimize_regions(regions, max_gap=1000000):
    """Merge nearby regions to reduce the number of queries issued.

    *regions* are ``"chr:start-end"`` strings; two regions on the same
    chromosome are merged when the gap between them is at most
    *max_gap*. The result is sorted by (chromosome, start) and uses the
    same string format. Note: chromosomes sort lexically here.
    """
    if not regions:
        return regions

    # Parse "chr:start-end" into sortable (chrom, start, end) tuples.
    intervals = []
    for spec in regions:
        chrom, span = spec.split(":")
        lo, hi = (int(v) for v in span.split("-"))
        intervals.append((chrom, lo, hi))
    intervals.sort()

    merged = []
    cur_chrom, cur_lo, cur_hi = intervals[0]

    for chrom, lo, hi in intervals[1:]:
        if chrom == cur_chrom and lo - cur_hi <= max_gap:
            # Close enough: extend the interval currently being built.
            cur_hi = max(cur_hi, hi)
        else:
            # Too far (or new chromosome): flush and start a new interval.
            merged.append(f"{cur_chrom}:{cur_lo}-{cur_hi}")
            cur_chrom, cur_lo, cur_hi = chrom, lo, hi

    # Flush the last open interval.
    merged.append(f"{cur_chrom}:{cur_lo}-{cur_hi}")

    return merged
|
||||
```
|
||||
|
||||
## Complex Queries
|
||||
|
||||
### Population-Specific Queries
|
||||
```python
|
||||
# Query specific populations
|
||||
def query_population(ds, regions, population_file):
    """Run one query per population listed in *population_file*.

    The CSV must contain ``sample_id`` and ``population`` columns.
    Returns a dict mapping population name -> DataFrame of its variants
    over *regions*.
    """
    assignments = pd.read_csv(population_file)  # sample_id, population

    # Bucket sample ids under their population label.
    populations = {}
    for _, record in assignments.iterrows():
        populations.setdefault(record['population'], []).append(record['sample_id'])

    results = {}
    for pop_name, samples in populations.items():
        print(f"Querying {pop_name}: {len(samples)} samples")

        results[pop_name] = ds.read(
            attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
            regions=regions,
            samples=samples
        )

    return results
|
||||
|
||||
# Usage
|
||||
regions = ["chr1:1000000-2000000"]
|
||||
pop_results = query_population(ds, regions, "population_assignments.csv")
|
||||
```
|
||||
|
||||
### Allele Frequency Calculation
|
||||
```python
|
||||
def calculate_allele_frequencies(df):
    """Calculate per-variant allele frequencies from genotype calls.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``contig``, ``pos_start``, ``alleles`` (sequence of
        REF+ALT allele strings, identical within one variant) and
        ``fmt_GT`` (list of allele indices; ``-1`` marks a missing call).

    Returns
    -------
    list[dict]
        One record per (contig, pos_start) with keys ``contig``, ``pos``,
        ``alleles``, ``frequencies`` (allele string -> frequency) and
        ``sample_count`` (rows in the group, including missing calls).
    """
    from collections import Counter

    af_results = []

    # Group by variant position; each group is one variant's samples.
    for (contig, pos), group in df.groupby(['contig', 'pos_start']):
        alleles = group['alleles'].iloc[0]  # REF and ALT alleles
        genotypes = group['fmt_GT'].values

        # Tally allele indices over all fully-called genotypes.
        # (Replaces a manual dict.get counting loop with Counter.)
        allele_counts = Counter(
            allele_idx
            for gt in genotypes
            if isinstance(gt, list) and -1 not in gt  # skip missing calls
            for allele_idx in gt
        )
        total_alleles = sum(allele_counts.values())

        # Map counted indices back to allele strings; indices beyond the
        # allele sequence (malformed genotypes) are ignored, matching the
        # original bounds check.
        frequencies = {
            alleles[idx]: (count / total_alleles if total_alleles > 0 else 0)
            for idx, count in allele_counts.items()
            if idx < len(alleles)
        }

        af_results.append({
            'contig': contig,
            'pos': pos,
            'alleles': alleles,
            'frequencies': frequencies,
            'sample_count': len(group)
        })

    return af_results
|
||||
|
||||
# Usage
|
||||
df = ds.read(
|
||||
attrs=["contig", "pos_start", "alleles", "fmt_GT"],
|
||||
regions=["chr1:1000000-1010000"],
|
||||
samples=sample_list[:100] # Subset for faster calculation
|
||||
)
|
||||
|
||||
af_results = calculate_allele_frequencies(df)
|
||||
```
|
||||
|
||||
### Rare Variant Analysis
|
||||
```python
|
||||
def find_rare_variants(ds, regions, samples, max_af=0.01):
    """Find rare variants (MAF < threshold)

    Queries *regions* for *samples*, keeps variants whose ``info_AF`` is
    below *max_af*, and returns a DataFrame with one row per rare
    variant listing the samples carrying an alternate allele.
    """
    cohort = ds.read(
        attrs=["sample_name", "pos_start", "alleles", "info_AF", "fmt_GT"],
        regions=regions,
        samples=samples
    )

    # Keep only variants under the frequency threshold.
    rare = cohort[cohort["info_AF"] < max_af]

    summary_rows = []
    for (pos, alleles), variant in rare.groupby(['pos_start', 'alleles']):
        # A carrier has at least one ALT allele (index > 0) among its
        # called alleles; -1 entries (missing) are ignored.
        carriers = [
            record['sample_name']
            for _, record in variant.iterrows()
            if isinstance(record['fmt_GT'], list)
            and any(a > 0 for a in record['fmt_GT'] if a != -1)
        ]

        summary_rows.append({
            'pos': pos,
            'alleles': alleles,
            'af': variant['info_AF'].iloc[0],
            'carrier_count': len(carriers),
            'carriers': carriers
        })

    return pd.DataFrame(summary_rows)
|
||||
|
||||
# Usage
|
||||
rare_variants = find_rare_variants(
|
||||
ds,
|
||||
regions=["chr1:1000000-2000000"],
|
||||
samples=sample_list,
|
||||
max_af=0.005 # 0.5% frequency threshold
|
||||
)
|
||||
```
|
||||
|
||||
## Cloud Querying
|
||||
|
||||
### S3 Dataset Queries
|
||||
```python
|
||||
# Configure for S3 access
|
||||
s3_config = tiledbvcf.ReadConfig(
|
||||
memory_budget=2048,
|
||||
tiledb_config={
|
||||
"vfs.s3.region": "us-east-1",
|
||||
"vfs.s3.use_virtual_addressing": "true",
|
||||
"sm.tile_cache_size": "1000000000"
|
||||
}
|
||||
)
|
||||
|
||||
# Query S3-hosted dataset
|
||||
ds = tiledbvcf.Dataset(
|
||||
uri="s3://my-bucket/vcf-dataset",
|
||||
mode="r",
|
||||
cfg=s3_config
|
||||
)
|
||||
|
||||
df = ds.read(
|
||||
attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
|
||||
regions=["chr1:1000000-2000000"]
|
||||
)
|
||||
```
|
||||
|
||||
### Federated Queries
|
||||
```python
|
||||
def federated_query(dataset_uris, regions, samples):
    """Run the same region/sample query against several datasets.

    Each dataset contributes only those requested samples it actually
    contains; the per-dataset frames are concatenated, tagged with a
    ``dataset`` column naming their source. Returns an empty DataFrame
    when no dataset held any requested sample.
    """
    partial_frames = []

    for dataset_uri in dataset_uris:
        print(f"Querying dataset: {dataset_uri}")

        ds = tiledbvcf.Dataset(uri=dataset_uri, mode="r")

        # Restrict the request to samples this dataset actually holds.
        present = set(ds.sample_names())
        wanted = [s for s in samples if s in present]

        if wanted:
            frame = ds.read(
                attrs=["sample_name", "pos_start", "alleles", "fmt_GT"],
                regions=regions,
                samples=wanted
            )
            frame['dataset'] = dataset_uri
            partial_frames.append(frame)

    if not partial_frames:
        return pd.DataFrame()
    return pd.concat(partial_frames, ignore_index=True)
|
||||
|
||||
# Usage
|
||||
datasets = [
|
||||
"s3://data-bucket/study1-dataset",
|
||||
"s3://data-bucket/study2-dataset",
|
||||
"local_dataset"
|
||||
]
|
||||
results = federated_query(datasets, ["chr1:1000000-2000000"], ["SAMPLE_001"])
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Query Issues
|
||||
```python
|
||||
# Handle empty results gracefully
|
||||
def safe_query(ds, **kwargs):
    """Run ``ds.read(**kwargs)`` but never raise.

    Returns the query result, or a fresh empty DataFrame when the query
    fails or yields no rows (a message is printed in either case).
    """
    try:
        result = ds.read(**kwargs)
    except Exception as e:
        # Deliberate best-effort helper: report and degrade to empty.
        print(f"Query failed: {e}")
        return pd.DataFrame()

    if result.empty:
        print("Query returned no results")
        return pd.DataFrame()
    return result
|
||||
|
||||
# Check dataset contents before querying
|
||||
def inspect_dataset(ds):
    """Print a quick summary of a dataset plus a tiny smoke-test query."""
    names = ds.sample_names()
    print(f"Sample count: {len(names)}")
    print(f"Sample names (first 10): {names[:10]}")
    print(f"Available attributes: {ds.attributes()}")

    # Minimal query to confirm the dataset is actually readable.
    try:
        probe = ds.read(
            attrs=["sample_name", "pos_start"],
            regions=["chr1:1-1000"]
        )
    except Exception as e:
        print(f"Test query failed: {e}")
    else:
        print(f"Test query returned {len(probe)} variants")
|
||||
|
||||
# Usage
|
||||
inspect_dataset(ds)
|
||||
```
|
||||
|
||||
This comprehensive querying guide covers all aspects of efficiently retrieving and filtering variant data from TileDB-VCF datasets, from basic queries to complex population genomics analyses.
|
||||
Reference in New Issue
Block a user