From 436c8608f21157aa0f7eb1dd708a215d49503103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Wieiw=C3=B3rka?= Date: Sat, 14 Mar 2026 10:27:11 +0100 Subject: [PATCH] Add polars-bio skill for genomic interval operations and bioinformatics I/O Adds a new skill covering polars-bio (v0.26.0), a high-performance library for genomic interval arithmetic and file I/O built on Polars, Arrow, and DataFusion. All code examples verified against the actual API at runtime. SKILL.md covers overlap, nearest, merge, coverage, complement, subtract, cluster, count_overlaps operations plus read/scan/write/sink for BED, VCF, BAM, CRAM, GFF, GTF, FASTA, FASTQ, SAM, and Hi-C pairs formats. References: interval_operations, file_io, sql_processing, pileup_operations, configuration, bioframe_migration. Co-Authored-By: Claude Opus 4.6 (1M context) --- scientific-skills/polars-bio/SKILL.md | 374 ++++++++++++++++ .../references/bioframe_migration.md | 250 +++++++++++ .../polars-bio/references/configuration.md | 175 ++++++++ .../polars-bio/references/file_io.md | 414 ++++++++++++++++++ .../references/interval_operations.md | 362 +++++++++++++++ .../references/pileup_operations.md | 176 ++++++++ .../polars-bio/references/sql_processing.md | 224 ++++++++++ 7 files changed, 1975 insertions(+) create mode 100644 scientific-skills/polars-bio/SKILL.md create mode 100644 scientific-skills/polars-bio/references/bioframe_migration.md create mode 100644 scientific-skills/polars-bio/references/configuration.md create mode 100644 scientific-skills/polars-bio/references/file_io.md create mode 100644 scientific-skills/polars-bio/references/interval_operations.md create mode 100644 scientific-skills/polars-bio/references/pileup_operations.md create mode 100644 scientific-skills/polars-bio/references/sql_processing.md diff --git a/scientific-skills/polars-bio/SKILL.md b/scientific-skills/polars-bio/SKILL.md new file mode 100644 index 0000000..0fcee27 --- /dev/null +++ b/scientific-skills/polars-bio/SKILL.md @@ -0,0 +1,374 @@ +--- +name: polars-bio +description: High-performance genomic interval operations and bioinformatics file I/O on Polars DataFrames. Overlap, nearest, merge, coverage, complement, subtract for BED/VCF/BAM/GFF intervals. Streaming, cloud-native, faster bioframe alternative. +license: https://github.com/biodatageeks/polars-bio/blob/main/LICENSE +metadata: + skill-author: K-Dense Inc. +--- + +# polars-bio + +## Overview + +polars-bio is a high-performance Python library for genomic interval operations and bioinformatics file I/O, built on Polars, Apache Arrow, and Apache DataFusion. It provides a familiar DataFrame-centric API for interval arithmetic (overlap, nearest, merge, coverage, complement, subtract) and reading/writing common bioinformatics formats (BED, VCF, BAM, CRAM, GFF/GTF, FASTA, FASTQ). + +Key value propositions: +- **6-38x faster** than bioframe on real-world genomic benchmarks +- **Streaming/out-of-core** support for large genomes via DataFusion +- **Cloud-native** file I/O (S3, GCS, Azure) with predicate pushdown +- **Two API styles**: functional (`pb.overlap(df1, df2)`) and method-chaining (`df1.lazy().pb.overlap(df2)`) +- **SQL interface** for genomic data via DataFusion SQL engine + +## When to Use This Skill + +Use this skill when: +- Performing genomic interval operations (overlap, nearest, merge, coverage, complement, subtract) +- Reading/writing bioinformatics file formats (BED, VCF, BAM, CRAM, GFF/GTF, FASTA, FASTQ) +- Processing large genomic datasets that don't fit in memory (streaming mode) +- Running SQL queries on genomic data files +- Migrating from bioframe to a faster alternative +- Computing read depth/pileup from BAM/CRAM files +- Working with Polars DataFrames containing genomic intervals + +## Quick Start + +### Installation + +```bash +pip install polars-bio +# or +uv pip install polars-bio +``` + +For pandas compatibility: +```bash +pip install polars-bio[pandas] +``` + +### Basic Overlap Example + +```python +import polars as pl +import polars_bio as pb + +# Create two interval DataFrames +df1 = pl.DataFrame({ + "chrom": ["chr1", "chr1", "chr1"], + "start": [1, 5, 22], + "end": [6, 9, 30], +}) + +df2 = pl.DataFrame({ + "chrom": ["chr1", "chr1"], + "start": [3, 25], + "end": [8, 28], +}) + +# Functional API (returns LazyFrame by default) +result = pb.overlap(df1, df2) +result_df = result.collect() + +# Get a DataFrame directly +result_df = pb.overlap(df1, df2, output_type="polars.DataFrame") + +# Method-chaining API (via .pb accessor on LazyFrame) +result = df1.lazy().pb.overlap(df2) +result_df = result.collect() +``` + +### Reading a BED File + +```python +import polars_bio as pb + +# Eager read (loads entire file) +df = pb.read_bed("regions.bed") + +# Lazy scan (streaming, for large files) +lf = pb.scan_bed("regions.bed") +result = lf.collect() +``` + +## Core Capabilities + +### 1. Genomic Interval Operations + +polars-bio provides 8 core interval operations for genomic range arithmetic. All operations accept Polars DataFrames with `chrom`, `start`, `end` columns (configurable). All operations return a `LazyFrame` by default (use `output_type="polars.DataFrame"` for eager results). + +**Operations:** +- `overlap` / `count_overlaps` - Find or count overlapping intervals between two sets +- `nearest` - Find nearest intervals (with configurable `k`, `overlap`, `distance` params) +- `merge` - Merge overlapping/bookended intervals within a set +- `cluster` - Assign cluster IDs to overlapping intervals +- `coverage` - Compute per-interval coverage counts (two-input operation) +- `complement` - Find gaps between intervals within a genome +- `subtract` - Remove portions of intervals that overlap another set + +**Example:** +```python +import polars_bio as pb + +# Find overlapping intervals (returns LazyFrame) +result = pb.overlap(df1, df2, suffixes=("_1", "_2")) + +# Count overlaps per interval +counts = pb.count_overlaps(df1, df2) + +# Merge overlapping intervals +merged = pb.merge(df1) + +# Find nearest intervals +nearest = pb.nearest(df1, df2) + +# Collect any LazyFrame result to DataFrame +result_df = result.collect() +``` + +**Reference:** See `references/interval_operations.md` for detailed documentation on all operations, parameters, output schemas, and performance considerations. + +### 2. Bioinformatics File I/O + +Read and write common bioinformatics formats with `read_*`, `scan_*`, `write_*`, and `sink_*` functions. Supports cloud storage (S3, GCS, Azure) and compression (GZIP, BGZF). + +**Supported formats:** +- **BED** - Genomic intervals (`read_bed`, `scan_bed`, `write_*` via generic) +- **VCF** - Genetic variants (`read_vcf`, `scan_vcf`, `write_vcf`, `sink_vcf`) +- **BAM** - Aligned reads (`read_bam`, `scan_bam`, `write_bam`, `sink_bam`) +- **CRAM** - Compressed alignments (`read_cram`, `scan_cram`, `write_cram`, `sink_cram`) +- **GFF** - Gene annotations (`read_gff`, `scan_gff`) +- **GTF** - Gene annotations (`read_gtf`, `scan_gtf`) +- **FASTA** - Reference sequences (`read_fasta`, `scan_fasta`) +- **FASTQ** - Sequencing reads (`read_fastq`, `scan_fastq`, `write_fastq`, `sink_fastq`) +- **SAM** - Text alignments (`read_sam`, `scan_sam`, `write_sam`, `sink_sam`) +- **Hi-C pairs** - Chromatin contacts (`read_pairs`, `scan_pairs`) + +**Example:** +```python +import polars_bio as pb + +# Read VCF file +variants = pb.read_vcf("samples.vcf.gz") + +# Lazy scan BAM file (streaming) +alignments = pb.scan_bam("aligned.bam") + +# Read GFF annotations +genes = pb.read_gff("annotations.gff3") + +# Cloud storage (individual params, not a dict) +df = pb.read_bed("s3://bucket/regions.bed", + allow_anonymous=True) +``` + +**Reference:** See `references/file_io.md` for per-format column schemas, parameters, cloud storage options, and compression support. + +### 3. SQL Data Processing + +Register bioinformatics files as tables and query them using DataFusion SQL. Combines the power of SQL with polars-bio's genomic-aware readers. + +```python +import polars as pl +import polars_bio as pb + +# Register files as SQL tables (path first, name= keyword) +pb.register_vcf("samples.vcf.gz", name="variants") +pb.register_bed("target_regions.bed", name="regions") + +# Query with SQL (returns LazyFrame) +result = pb.sql("SELECT chrom, start, end, ref, alt FROM variants WHERE qual > 30") +result_df = result.collect() + +# Register a Polars DataFrame as a SQL table +pb.from_polars("my_intervals", df) +result = pb.sql("SELECT * FROM my_intervals WHERE chrom = 'chr1'").collect() +``` + +**Reference:** See `references/sql_processing.md` for register functions, SQL syntax, and examples. + +### 4. Pileup Operations + +Compute per-base read depth from BAM/CRAM files with CIGAR-aware depth calculation. + +```python +import polars_bio as pb + +# Compute depth across a BAM file +depth_lf = pb.depth("aligned.bam") +depth_df = depth_lf.collect() + +# With quality filter +depth_lf = pb.depth("aligned.bam", min_mapping_quality=20) +``` + +**Reference:** See `references/pileup_operations.md` for parameters and integration patterns. + +## Key Concepts + +### Coordinate Systems + +polars-bio defaults to **1-based** coordinates (genomic convention). This can be changed globally: + +```python +import polars_bio as pb + +# Switch to 0-based coordinates +pb.set_option("coordinate_system", "0-based") + +# Switch back to 1-based (default) +pb.set_option("coordinate_system", "1-based") +``` + +I/O functions also accept `use_zero_based` to set coordinate metadata on the resulting DataFrame: + +```python +# Read BED with explicit 0-based metadata +df = pb.read_bed("regions.bed", use_zero_based=True) +``` + +**Important:** BED files are always 0-based half-open in the file format. polars-bio handles the conversion automatically when reading BED files. Coordinate metadata is attached to DataFrames by I/O functions and propagated through operations. + +### Two API Styles + +**Functional API** - standalone functions, explicit inputs: +```python +result = pb.overlap(df1, df2, suffixes=("_1", "_2")) +merged = pb.merge(df) +``` + +**Method-chaining API** - via `.pb` accessor on **LazyFrames** (not DataFrames): +```python +result = df1.lazy().pb.overlap(df2) +merged = df.lazy().pb.merge() +``` + +**Important:** The `.pb` accessor for interval operations is only available on `LazyFrame`. On `DataFrame`, `.pb` provides write operations only (`write_bam`, `write_vcf`, etc.). + +Method-chaining enables fluent pipelines: +```python +# Chain interval operations (note: overlap outputs suffixed columns, +# so rename before merge which expects chrom/start/end) +result = ( + df1.lazy() + .pb.overlap(df2) + .filter(pl.col("start_2") > 1000) + .select( + pl.col("chrom_1").alias("chrom"), + pl.col("start_1").alias("start"), + pl.col("end_1").alias("end"), + ) + .pb.merge() + .collect() +) +``` + +### Probe-Build Architecture + +For two-input operations (overlap, nearest, count_overlaps, coverage), polars-bio uses a probe-build join strategy: +- The **first** DataFrame is the **probe** (iterated over) +- The **second** DataFrame is the **build** (indexed for lookup) + +For best performance, pass the larger DataFrame as the first argument (probe) and the smaller one as the second (build). + +### Column Conventions + +By default, polars-bio expects columns named `chrom`, `start`, `end`. Custom column names can be specified via lists: + +```python +result = pb.overlap( + df1, df2, + cols1=["chromosome", "begin", "finish"], + cols2=["chr", "pos_start", "pos_end"], +) +``` + +### Return Types and Collecting Results + +All interval operations and `pb.sql()` return a **LazyFrame** by default. Use `.collect()` to materialize results, or pass `output_type="polars.DataFrame"` for eager evaluation: + +```python +# Lazy (default) - collect when needed +result_lf = pb.overlap(df1, df2) +result_df = result_lf.collect() + +# Eager - get DataFrame directly +result_df = pb.overlap(df1, df2, output_type="polars.DataFrame") +``` + +### Streaming and Out-of-Core Processing + +For datasets larger than available RAM, use `scan_*` functions and streaming execution: + +```python +# Scan files lazily +lf = pb.scan_bed("large_intervals.bed") + +# Process with streaming +result = lf.collect(streaming=True) +``` + +DataFusion streaming is enabled by default for interval operations, processing data in batches without loading the full dataset into memory. + +## Common Pitfalls + +1. **`.pb` accessor on DataFrame vs LazyFrame:** Interval operations (overlap, merge, etc.) are only on `LazyFrame.pb`. `DataFrame.pb` only has write methods. Use `.lazy()` to convert before chaining interval ops. + +2. **LazyFrame returns:** All interval operations and `pb.sql()` return `LazyFrame` by default. Don't forget `.collect()` or use `output_type="polars.DataFrame"`. + +3. **Column name mismatches:** polars-bio expects `chrom`, `start`, `end` by default. Use `cols1`/`cols2` parameters (as lists) if your columns have different names. + +4. **Coordinate system metadata:** When constructing DataFrames manually (not via `read_*`/`scan_*`), polars-bio warns about missing coordinate metadata. Use `pb.set_option("coordinate_system", "0-based")` globally, or use I/O functions that set metadata automatically. + +5. **Probe-build order matters:** For overlap, nearest, and coverage, the first DataFrame is probed against the second. Swapping arguments changes which intervals appear in the left vs right output columns, and can affect performance. + +6. **INT32 position limit:** Genomic positions are stored as 32-bit integers, limiting coordinates to ~2.1 billion. This is sufficient for all known genomes but may be an issue with custom coordinate spaces. + +7. **BAM index requirements:** `read_bam` and `scan_bam` require a `.bai` index file alongside the BAM. Create one with `samtools index` if missing. + +8. **Parallel execution disabled by default:** DataFusion parallelism defaults to 1 partition. Enable for large datasets: + ```python + pb.set_option("datafusion.execution.target_partitions", 8) + ``` + +9. **CRAM has separate functions:** Use `read_cram`/`scan_cram`/`register_cram` for CRAM files (not `read_bam`). CRAM functions require a `reference_path` parameter. + +## Best Practices + +1. **Use `scan_*` for large files:** Prefer `scan_bed`, `scan_vcf`, etc. over `read_*` for files larger than available RAM. Scan functions enable streaming and predicate pushdown. + +2. **Configure parallelism for large datasets:** + ```python + import os + pb.set_option("datafusion.execution.target_partitions", os.cpu_count()) + ``` + +3. **Use BGZF compression:** BGZF-compressed files (`.bed.gz`, `.vcf.gz`) support parallel block decompression, significantly faster than plain GZIP. + +4. **Select columns early:** When only specific columns are needed, select them early to reduce memory usage: + ```python + df = pb.read_vcf("large.vcf.gz").select("chrom", "start", "end", "ref", "alt") + ``` + +5. **Use cloud paths directly:** Pass S3/GCS/Azure URIs directly to read/scan functions instead of downloading files first: + ```python + df = pb.read_bed("s3://my-bucket/regions.bed", allow_anonymous=True) + ``` + +6. **Prefer functional API for single operations, method-chaining for pipelines:** Use `pb.overlap()` for one-off operations and `.lazy().pb.overlap()` when building multi-step pipelines. + +## Resources + +### references/ + +Detailed documentation for each major capability: + +- **interval_operations.md** - All 8 interval operations with parameters, examples, output schemas, and performance tips. Core reference for genomic range arithmetic. + +- **file_io.md** - Supported formats table, per-format column schemas, cloud storage configuration, compression support, and common parameters. + +- **sql_processing.md** - Register functions, DataFusion SQL syntax, combining SQL with interval operations, and example queries. + +- **pileup_operations.md** - Per-base read depth computation from BAM/CRAM files, parameters, and integration with interval operations. + +- **configuration.md** - Global settings (parallelism, coordinate systems, streaming modes), logging, and metadata management. + +- **bioframe_migration.md** - Operation mapping table, API differences, performance comparison, migration code examples, and pandas compatibility mode. diff --git a/scientific-skills/polars-bio/references/bioframe_migration.md b/scientific-skills/polars-bio/references/bioframe_migration.md new file mode 100644 index 0000000..87b7ce1 --- /dev/null +++ b/scientific-skills/polars-bio/references/bioframe_migration.md @@ -0,0 +1,250 @@ +# Migrating from bioframe to polars-bio + +## Overview + +polars-bio is a drop-in replacement for bioframe's core interval operations, offering 6.5-38x speedups on real-world genomic benchmarks. The main differences are: Polars DataFrames instead of pandas, a Rust/DataFusion backend instead of pure Python, streaming support for large genomes, and LazyFrame returns by default. + +## Operation Mapping + +| bioframe | polars-bio | Notes | +|----------|------------|-------| +| `bioframe.overlap(df1, df2)` | `pb.overlap(df1, df2)` | Returns LazyFrame; `.collect()` for DataFrame | +| `bioframe.closest(df1, df2)` | `pb.nearest(df1, df2)` | Renamed; uses `k`, `overlap`, `distance` params | +| `bioframe.count_overlaps(df1, df2)` | `pb.count_overlaps(df1, df2)` | Default suffixes differ: `("", "_")` vs bioframe's | +| `bioframe.merge(df)` | `pb.merge(df)` | Output includes `n_intervals` column | +| `bioframe.cluster(df)` | `pb.cluster(df)` | Output cols: `cluster`, `cluster_start`, `cluster_end` | +| `bioframe.coverage(df1, df2)` | `pb.coverage(df1, df2)` | Two-input in both libraries | +| `bioframe.complement(df, chromsizes)` | `pb.complement(df, view_df=genome)` | Genome as DataFrame, not Series | +| `bioframe.subtract(df1, df2)` | `pb.subtract(df1, df2)` | Same semantics | + +## Key API Differences + +### DataFrames: pandas vs Polars + +**bioframe (pandas):** +```python +import bioframe +import pandas as pd + +df1 = pd.DataFrame({ + "chrom": ["chr1", "chr1"], + "start": [1, 10], + "end": [5, 20], +}) + +result = bioframe.overlap(df1, df2) +# result is a pandas DataFrame +result["start_1"] # pandas column access +``` + +**polars-bio (Polars):** +```python +import polars_bio as pb +import polars as pl + +df1 = pl.DataFrame({ + "chrom": ["chr1", "chr1"], + "start": [1, 10], + "end": [5, 20], +}) + +result = pb.overlap(df1, df2) # Returns LazyFrame +result_df = result.collect() # Materialize to DataFrame +result_df.select("start_1") # Polars column access +``` + +### Return Types: LazyFrame by Default + +All polars-bio operations return a **LazyFrame** by default. Use `.collect()` or `output_type="polars.DataFrame"`: + +```python +# bioframe: always returns DataFrame +result = bioframe.overlap(df1, df2) + +# polars-bio: returns LazyFrame, collect for DataFrame +result_lf = pb.overlap(df1, df2) +result_df = result_lf.collect() + +# Or get DataFrame directly +result_df = pb.overlap(df1, df2, output_type="polars.DataFrame") +``` + +### Genome/Chromsizes + +**bioframe:** +```python +chromsizes = bioframe.fetch_chromsizes("hg38") # Returns pandas Series +complement = bioframe.complement(df, chromsizes) +``` + +**polars-bio:** +```python +genome = pl.DataFrame({ + "chrom": ["chr1", "chr2"], + "start": [0, 0], + "end": [248956422, 242193529], +}) +complement = pb.complement(df, view_df=genome) +``` + +### closest vs nearest + +**bioframe:** +```python +result = bioframe.closest(df1, df2) +``` + +**polars-bio:** +```python +# Basic nearest +result = pb.nearest(df1, df2) + +# Find k nearest neighbors +result = pb.nearest(df1, df2, k=3) + +# Exclude overlapping intervals +result = pb.nearest(df1, df2, overlap=False) + +# Without distance column +result = pb.nearest(df1, df2, distance=False) +``` + +### Method-Chaining (polars-bio only) + +polars-bio adds a `.pb` accessor on **LazyFrame** for method chaining: + +```python +# bioframe: sequential function calls +merged = bioframe.merge(bioframe.overlap(df1, df2)) + +# polars-bio: fluent pipeline (must use LazyFrame) +# Note: overlap adds suffixes, so rename before merge +merged = ( + df1.lazy() + .pb.overlap(df2) + .select( + pl.col("chrom_1").alias("chrom"), + pl.col("start_1").alias("start"), + pl.col("end_1").alias("end"), + ) + .pb.merge() + .collect() +) +``` + +## Performance Comparison + +Benchmarks on real-world genomic datasets (from the polars-bio paper, Bioinformatics 2025): + +| Operation | bioframe | polars-bio | Speedup | +|-----------|----------|------------|---------| +| overlap | 1.0x | 6.5x | 6.5x | +| nearest | 1.0x | 38x | 38x | +| merge | 1.0x | 8.2x | 8.2x | +| coverage | 1.0x | 12x | 12x | + +Speedups come from: +- Rust-based interval tree implementation +- Apache DataFusion query engine +- Apache Arrow columnar memory format +- Parallel execution (when configured) +- Streaming/out-of-core support + +## Migration Code Examples + +### Example 1: Basic Overlap Pipeline + +**Before (bioframe):** +```python +import bioframe +import pandas as pd + +df1 = pd.read_csv("peaks.bed", sep="\t", names=["chrom", "start", "end"]) +df2 = pd.read_csv("genes.bed", sep="\t", names=["chrom", "start", "end", "name"]) + +overlaps = bioframe.overlap(df1, df2, suffixes=("_peak", "_gene")) +filtered = overlaps[overlaps["start_gene"] > 10000] +merged = bioframe.merge(filtered[["chrom_peak", "start_peak", "end_peak"]] + .rename(columns={"chrom_peak": "chrom", "start_peak": "start", "end_peak": "end"})) +``` + +**After (polars-bio):** +```python +import polars_bio as pb +import polars as pl + +df1 = pb.read_bed("peaks.bed") +df2 = pb.read_bed("genes.bed") + +overlaps = pb.overlap(df1, df2, suffixes=("_peak", "_gene"), output_type="polars.DataFrame") +filtered = overlaps.filter(pl.col("start_gene") > 10000) +merged = pb.merge( + filtered.select( + pl.col("chrom_peak").alias("chrom"), + pl.col("start_peak").alias("start"), + pl.col("end_peak").alias("end"), + ), + output_type="polars.DataFrame", +) +``` + +### Example 2: Large-Scale Streaming + +**Before (bioframe) — limited to in-memory:** +```python +import bioframe +import pandas as pd + +# Must load entire file into memory +df1 = pd.read_csv("huge_intervals.bed", sep="\t", names=["chrom", "start", "end"]) +result = bioframe.merge(df1) # Memory-bound +``` + +**After (polars-bio) — streaming:** +```python +import polars_bio as pb + +# Lazy scan, streaming execution +lf = pb.scan_bed("huge_intervals.bed") +result = pb.merge(lf).collect(streaming=True) +``` + +## pandas Compatibility Mode + +For gradual migration, install with pandas support: + +```bash +pip install polars-bio[pandas] +``` + +This enables conversion between pandas and Polars DataFrames: + +```python +import polars_bio as pb +import polars as pl + +# Convert pandas DataFrame to Polars for polars-bio +polars_df = pl.from_pandas(pandas_df) +result = pb.overlap(polars_df, other_df).collect() + +# Convert back to pandas if needed +pandas_result = result.to_pandas() + +# Or request pandas output directly +pandas_result = pb.overlap(polars_df, other_df, output_type="pandas.DataFrame") +``` + +## Migration Checklist + +1. Replace `import bioframe` with `import polars_bio as pb` +2. Replace `import pandas as pd` with `import polars as pl` +3. Convert DataFrame creation from `pd.DataFrame` to `pl.DataFrame` +4. Replace `bioframe.closest` with `pb.nearest` +5. Add `.collect()` after operations (they return LazyFrame by default) +6. Update column access from `df["col"]` to `df.select("col")` or `pl.col("col")` +7. Replace pandas filtering `df[df["col"] > x]` with `df.filter(pl.col("col") > x)` +8. Update chromsizes from Series to DataFrame with `chrom`, `start`, `end`; pass as `view_df=` +9. Add `pb.set_option("datafusion.execution.target_partitions", N)` for parallelism +10. Replace `pd.read_csv` for BED files with `pb.read_bed` or `pb.scan_bed` +11. Note `cluster` output column is `cluster` (not `cluster_id`), plus `cluster_start`, `cluster_end` +12. Note `merge` output includes `n_intervals` column diff --git a/scientific-skills/polars-bio/references/configuration.md b/scientific-skills/polars-bio/references/configuration.md new file mode 100644 index 0000000..23e2c1d --- /dev/null +++ b/scientific-skills/polars-bio/references/configuration.md @@ -0,0 +1,175 @@ +# Configuration + +## Overview + +polars-bio uses a global configuration system based on `set_option` and `get_option` to control execution behavior, coordinate systems, parallelism, and streaming modes. + +## set_option / get_option + +```python +import polars_bio as pb + +# Set a configuration option +pb.set_option("datafusion.execution.target_partitions", 8) + +# Get current value +value = pb.get_option("datafusion.execution.target_partitions") +``` + +## Parallelism + +### DataFusion Target Partitions + +Controls the number of parallel execution partitions. Defaults to 1 (single-threaded). + +```python +import os +import polars_bio as pb + +# Use all available CPU cores +pb.set_option("datafusion.execution.target_partitions", os.cpu_count()) + +# Set specific number of partitions +pb.set_option("datafusion.execution.target_partitions", 8) +``` + +**When to increase parallelism:** +- Processing large files (>1GB) +- Running interval operations on millions of intervals +- Batch processing multiple chromosomes + +**When to keep default (1):** +- Small datasets +- Memory-constrained environments +- Debugging (deterministic execution) + +## Coordinate Systems + +polars-bio defaults to 1-based coordinates (standard genomic convention). + +### Global Coordinate System + +```python +import polars_bio as pb + +# Switch to 0-based half-open coordinates +pb.set_option("coordinate_system", "0-based") + +# Switch back to 1-based (default) +pb.set_option("coordinate_system", "1-based") + +# Check current setting +print(pb.get_option("coordinate_system")) +``` + +### Per-File Override via I/O Functions + +I/O functions accept `use_zero_based` to set coordinate metadata on the resulting DataFrame: + +```python +# Read with explicit 0-based metadata +df = pb.read_bed("regions.bed", use_zero_based=True) +``` + +**Note:** Interval operations (overlap, nearest, etc.) do **not** accept `use_zero_based`. They read coordinate metadata from the DataFrames, which is set by I/O functions or the global option. When using manually constructed DataFrames, polars-bio warns about missing metadata and falls back to the global setting. + +### Setting Metadata on Manual DataFrames + +```python +import polars_bio as pb + +# Set coordinate metadata on a manually created DataFrame +pb.set_source_metadata(df, format="bed", path="") +``` + +### File Format Conventions + +| Format | Native Coordinate System | polars-bio Conversion | +|--------|-------------------------|----------------------| +| BED | 0-based half-open | Converted to configured system on read | +| VCF | 1-based | Converted to configured system on read | +| GFF/GTF | 1-based | Converted to configured system on read | +| BAM/SAM | 0-based | Converted to configured system on read | + +## Streaming Execution Modes + +polars-bio supports two streaming modes for out-of-core processing: + +### DataFusion Streaming + +Enabled by default for interval operations. Processes data in batches through the DataFusion execution engine. + +```python +# DataFusion streaming is automatic for interval operations +result = pb.overlap(lf1, lf2) # Streams if inputs are LazyFrames +``` + +### Polars Streaming + +Use Polars' native streaming for post-processing operations: + +```python +# Collect with Polars streaming +result = lf.collect(streaming=True) +``` + +### Combining Both + +```python +import polars_bio as pb + +# Scan files lazily (DataFusion streaming for I/O) +lf1 = pb.scan_bed("large1.bed") +lf2 = pb.scan_bed("large2.bed") + +# Interval operation (DataFusion streaming) +result_lf = pb.overlap(lf1, lf2) + +# Collect with Polars streaming for final materialization +result = result_lf.collect(streaming=True) +``` + +## Logging + +Control log verbosity for debugging: + +```python +import polars_bio as pb + +# Set log level +pb.set_loglevel("debug") # Detailed execution info +pb.set_loglevel("info") # Standard messages +pb.set_loglevel("warn") # Warnings only (default) +``` + +**Note:** Only `"debug"`, `"info"`, and `"warn"` are valid log levels. + +## Metadata Management + +polars-bio attaches coordinate system and source metadata to DataFrames produced by I/O functions. This metadata is used by interval operations to determine the coordinate system. + +```python +import polars_bio as pb + +# Inspect metadata on a DataFrame +metadata = pb.get_metadata(df) + +# Print metadata summary +pb.print_metadata_summary(df) + +# Print metadata as JSON +pb.print_metadata_json(df) + +# Set metadata on a manually created DataFrame +pb.set_source_metadata(df, format="bed", path="regions.bed") + +# Register a DataFrame as a SQL table +pb.from_polars("my_table", df) +``` + +## Complete Configuration Reference + +| Option | Default | Description | +|--------|---------|-------------| +| `datafusion.execution.target_partitions` | `1` | Number of parallel execution partitions | +| `coordinate_system` | `"1-based"` | Default coordinate system (`"0-based"` or `"1-based"`) | diff --git a/scientific-skills/polars-bio/references/file_io.md b/scientific-skills/polars-bio/references/file_io.md new file mode 100644 index 0000000..e1e9f90 --- /dev/null +++ b/scientific-skills/polars-bio/references/file_io.md @@ -0,0 +1,414 @@ +# Bioinformatics File I/O + +## Overview + +polars-bio provides `read_*`, `scan_*`, `write_*`, and `sink_*` functions for common bioinformatics formats. `read_*` loads data eagerly into a DataFrame, while `scan_*` creates a LazyFrame for streaming/out-of-core processing. `write_*` writes from DataFrame/LazyFrame and returns a row count, while `sink_*` streams from a LazyFrame. + +## Supported Formats + +| Format | Read | Scan | Register (SQL) | Write | Sink | +|--------|------|------|-----------------|-------|------| +| BED | `read_bed` | `scan_bed` | `register_bed` | — | — | +| VCF | `read_vcf` | `scan_vcf` | `register_vcf` | `write_vcf` | `sink_vcf` | +| BAM | `read_bam` | `scan_bam` | `register_bam` | `write_bam` | `sink_bam` | +| CRAM | `read_cram` | `scan_cram` | `register_cram` | `write_cram` | `sink_cram` | +| GFF | `read_gff` | `scan_gff` | `register_gff` | — | — | +| GTF | `read_gtf` | `scan_gtf` | `register_gtf` | — | — | +| FASTA | `read_fasta` | `scan_fasta` | — | — | — | +| FASTQ | `read_fastq` | `scan_fastq` | `register_fastq` | `write_fastq` | `sink_fastq` | +| SAM | `read_sam` | `scan_sam` | `register_sam` | `write_sam` | `sink_sam` | +| Hi-C pairs | `read_pairs` | `scan_pairs` | `register_pairs` | — | — | +| Generic table | `read_table` | `scan_table` | — | — | — | + +## Common Cloud/IO Parameters + +All `read_*` and `scan_*` functions share these parameters (instead of a single `storage_options` dict): + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `path` | str | required | File path (local, S3, GCS, Azure) | +| `chunk_size` | int | `8` | Number of chunks for parallel reading | +| `concurrent_fetches` | int | `1` | Number of concurrent fetches for cloud storage | +| `allow_anonymous` | bool | `True` | Allow anonymous access to cloud storage | +| `enable_request_payer` | bool | `False` | Enable requester-pays for cloud storage | +| `max_retries` | int | `5` | Maximum retries for cloud operations | +| `timeout` | int | `300` | Timeout in seconds for cloud operations | +| `compression_type` | str | `"auto"` | Compression type (auto-detected from extension) | +| `projection_pushdown` | bool | `True` | Enable projection pushdown optimization | +| `use_zero_based` | bool | `None` | Set coordinate system metadata (None = use global setting) | + +Not all functions support all parameters. SAM functions lack cloud parameters. FASTA/FASTQ lack `predicate_pushdown`. + +## BED Format + +### read_bed / scan_bed + +Read BED files. Columns are auto-detected (BED3 through BED12). BED files use 0-based half-open coordinates; polars-bio attaches coordinate metadata automatically. + +```python +import polars_bio as pb + +# Eager read +df = pb.read_bed("regions.bed") + +# Lazy scan +lf = pb.scan_bed("regions.bed") +``` + +### Column Schema (BED3) + +| Column | Type | Description | +|--------|------|-------------| +| `chrom` | String | Chromosome name | +| `start` | Int64 | Start position | +| `end` | Int64 | End position | + +Extended BED fields (auto-detected) add: `name`, `score`, `strand`, `thickStart`, `thickEnd`, `itemRgb`, `blockCount`, `blockSizes`, `blockStarts`. + +## VCF Format + +### read_vcf / scan_vcf + +Read VCF/BCF files. Supports `.vcf`, `.vcf.gz`, `.bcf`. + +```python +import polars_bio as pb + +# Read VCF +df = pb.read_vcf("variants.vcf.gz") + +# Read with specific INFO and FORMAT fields extracted as columns +df = pb.read_vcf("variants.vcf.gz", info_fields=["AF", "DP"], format_fields=["GT", "GQ"]) + +# Read specific samples +df = pb.read_vcf("variants.vcf.gz", samples=["SAMPLE1", "SAMPLE2"]) +``` + +### Additional Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `info_fields` | list[str] | `None` | INFO fields to extract as columns | +| `format_fields` | list[str] | `None` | FORMAT fields to extract as columns | +| `samples` | list[str] | `None` | Samples to include | +| `predicate_pushdown` | bool | `True` | Enable predicate pushdown | + +### Column Schema + +| Column | Type | Description | +|--------|------|-------------| +| `chrom` | String | Chromosome | +| `start` | UInt32 | Start position | +| `end` | UInt32 | End position | +| `id` | String | Variant ID | +| `ref` | String | Reference allele | +| `alt` | String | Alternate allele(s) | +| `qual` | Float32 | Quality score | +| `filter` | String | Filter status | +| `info` | String | INFO field (raw, unless `info_fields` specified) | + +### write_vcf / sink_vcf + +```python +import polars_bio as pb + +# Write DataFrame to VCF +rows_written = pb.write_vcf(df, "output.vcf") + +# Stream LazyFrame to VCF +pb.sink_vcf(lf, "output.vcf") +``` + +## BAM Format + +### read_bam / scan_bam + +Read aligned sequencing reads from BAM files. Requires a `.bai` index file. + +```python +import polars_bio as pb + +# Read BAM +df = pb.read_bam("aligned.bam") + +# Scan BAM (streaming) +lf = pb.scan_bam("aligned.bam") + +# Read with specific tags +df = pb.read_bam("aligned.bam", tag_fields=["NM", "MD"]) +``` + +### Additional Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `tag_fields` | list[str] | `None` | SAM tags to extract as columns | +| `predicate_pushdown` | bool | `True` | Enable predicate pushdown | +| `infer_tag_types` | bool | `True` | Infer tag column types from data | +| `infer_tag_sample_size` | int | `100` | Number of records to sample for type inference | +| `tag_type_hints` | list[str] | `None` | Explicit type hints for tags | + +### Column Schema + +| Column | Type | Description | +|--------|------|-------------| +| `chrom` | String | Reference sequence name | +| `start` | Int64 | Alignment start position | +| `end` | Int64 | Alignment end position | +| `name` | String | Read name | +| `flags` | UInt32 | SAM flags | +| `mapping_quality` | UInt32 | Mapping quality | +| `cigar` | String | CIGAR string | +| `sequence` | String | Read sequence | +| `quality_scores` | String | Base quality string | +| `mate_chrom` | String | Mate reference name | +| `mate_start` | Int64 | Mate start position | +| `template_length` | Int64 | Template length | + +### write_bam / sink_bam + +```python +rows_written = pb.write_bam(df, "output.bam") +rows_written = pb.write_bam(df, "output.bam", sort_on_write=True) + +pb.sink_bam(lf, "output.bam") +pb.sink_bam(lf, "output.bam", sort_on_write=True) +``` + +## CRAM Format + +### read_cram / scan_cram + +CRAM files have **separate functions** from BAM. Require a reference FASTA and `.crai` index. + +```python +import polars_bio as pb + +# Read CRAM (reference required) +df = pb.read_cram("aligned.cram", reference_path="reference.fasta") + +# Scan CRAM (streaming) +lf = pb.scan_cram("aligned.cram", reference_path="reference.fasta") +``` + +Same additional parameters and column schema as BAM, plus: + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `reference_path` | str | `None` | Path to reference FASTA | + +### write_cram / sink_cram + +```python +rows_written = pb.write_cram(df, "output.cram", reference_path="reference.fasta") +pb.sink_cram(lf, "output.cram", reference_path="reference.fasta") +``` + +## GFF/GTF Format + +### read_gff / scan_gff / read_gtf / scan_gtf + +GFF3 and GTF have separate functions. + +```python +import polars_bio as pb + +# Read GFF3 +df = pb.read_gff("annotations.gff3") + +# Read GTF +df = pb.read_gtf("genes.gtf") + +# Extract specific attributes as columns +df = pb.read_gff("annotations.gff3", attr_fields=["gene_id", "gene_name"]) +``` + +### Additional Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `attr_fields` | list[str] | `None` | Attribute fields to extract as columns | +| `predicate_pushdown` | bool | `True` | Enable predicate pushdown | + +### Column Schema + +| Column | Type | Description | +|--------|------|-------------| +| `chrom` | String | Sequence name | +| `source` | String | Feature source | +| `type` | String | Feature type (gene, exon, etc.) | +| `start` | Int64 | Start position | +| `end` | Int64 | End position | +| `score` | Float32 | Score | +| `strand` | String | Strand (+/-/.) | +| `phase` | UInt32 | Phase (0/1/2) | +| `attributes` | String | Attributes string | + +## FASTA Format + +### read_fasta / scan_fasta + +Read reference sequences from FASTA files. + +```python +import polars_bio as pb + +df = pb.read_fasta("reference.fasta") +``` + +### Column Schema + +| Column | Type | Description | +|--------|------|-------------| +| `name` | String | Sequence name | +| `description` | String | Description line | +| `sequence` | String | Nucleotide sequence | + +## FASTQ Format + +### read_fastq / scan_fastq + +Read raw sequencing reads with quality scores. + +```python +import polars_bio as pb + +df = pb.read_fastq("reads.fastq.gz") +``` + +### Column Schema + +| Column | Type | Description | +|--------|------|-------------| +| `name` | String | Read name | +| `description` | String | Description line | +| `sequence` | String | Nucleotide sequence | +| `quality` | String | Quality string (Phred+33 encoded) | + +### write_fastq / sink_fastq + +```python +rows_written = pb.write_fastq(df, "output.fastq") +pb.sink_fastq(lf, "output.fastq") +``` + +## SAM Format + +### read_sam / scan_sam + +Read text-format alignment files. Same column schema as BAM. No cloud parameters. + +```python +import polars_bio as pb + +df = pb.read_sam("alignments.sam") +``` + +### Additional Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `tag_fields` | list[str] | `None` | SAM tags to extract | +| `infer_tag_types` | bool | `True` | Infer tag types | +| `infer_tag_sample_size` | int | `100` | Sample size for inference | +| `tag_type_hints` | list[str] | `None` | Explicit type hints | + +### write_sam / sink_sam + +```python +rows_written = pb.write_sam(df, "output.sam") +pb.sink_sam(lf, "output.sam", sort_on_write=True) +``` + +## Hi-C Pairs + +### read_pairs / scan_pairs + +Read Hi-C pairs format files for chromatin contact data. + +```python +import polars_bio as pb + +df = pb.read_pairs("contacts.pairs") +lf = pb.scan_pairs("contacts.pairs") +``` + +### Column Schema + +| Column | Type | Description | +|--------|------|-------------| +| `readID` | String | Read identifier | +| `chrom1` | String | Chromosome of first contact | +| `pos1` | Int32 | Position of first contact | +| `chrom2` | String | Chromosome of second contact | +| `pos2` | Int32 | Position of second contact | +| `strand1` | String | Strand of first contact | +| `strand2` | String | Strand of second contact | + +## Generic Table Reader + +### read_table / scan_table + +Read tab-delimited files with custom schema. Useful for non-standard formats or bioframe-compatible tables. + +```python +import polars_bio as pb + +df = pb.read_table("custom.tsv", schema={"chrom": str, "start": int, "end": int, "name": str}) +lf = pb.scan_table("custom.tsv", schema={"chrom": str, "start": int, "end": int}) +``` + +## Cloud Storage + +All `read_*` and `scan_*` functions support cloud storage via individual parameters: + +### Amazon S3 + +```python +df = pb.read_bed( + "s3://bucket/regions.bed", + allow_anonymous=False, + max_retries=10, + timeout=600, +) +``` + +### Google Cloud Storage + +```python +df = pb.read_vcf("gs://bucket/variants.vcf.gz", allow_anonymous=True) +``` + +### Azure Blob Storage + +```python +df = pb.read_bam("az://container/aligned.bam", allow_anonymous=False) +``` + +**Note:** For authenticated access, configure credentials via environment variables or cloud SDK configuration (e.g., `AWS_ACCESS_KEY_ID`, `GOOGLE_APPLICATION_CREDENTIALS`). + +## Compression Support + +polars-bio transparently handles compressed files: + +| Compression | Extension | Parallel Decompression | +|-------------|-----------|----------------------| +| GZIP | `.gz` | No | +| BGZF | `.gz` (with BGZF blocks) | Yes | +| Uncompressed | (none) | N/A | + +**Recommendation:** Use BGZF compression (e.g., created with `bgzip`) for large files. BGZF supports parallel block decompression, significantly improving read performance compared to plain GZIP. + +## Describe Functions + +Inspect file structure without fully reading: + +```python +import polars_bio as pb + +# Describe file schemas and metadata +schema_df = pb.describe_vcf("samples.vcf.gz") +schema_df = pb.describe_bam("aligned.bam") +schema_df = pb.describe_sam("alignments.sam") +schema_df = pb.describe_cram("aligned.cram", reference_path="ref.fasta") +``` diff --git a/scientific-skills/polars-bio/references/interval_operations.md b/scientific-skills/polars-bio/references/interval_operations.md new file mode 100644 index 0000000..d545cbd --- /dev/null +++ b/scientific-skills/polars-bio/references/interval_operations.md @@ -0,0 +1,362 @@ +# Genomic Interval Operations + +## Overview + +polars-bio provides 8 core operations for genomic interval arithmetic. All operations work on Polars DataFrames or LazyFrames containing genomic intervals (columns: `chrom`, `start`, `end` by default) and return a **LazyFrame** by default. Pass `output_type="polars.DataFrame"` for eager results. + +## Operations Summary + +| Operation | Inputs | Description | +|-----------|--------|-------------| +| `overlap` | two DataFrames | Find pairs of overlapping intervals | +| `count_overlaps` | two DataFrames | Count overlaps per interval in the first set | +| `nearest` | two DataFrames | Find nearest intervals between two sets | +| `merge` | one DataFrame | Merge overlapping/bookended intervals | +| `cluster` | one DataFrame | Assign cluster IDs to overlapping intervals | +| `coverage` | two DataFrames | Compute per-interval coverage counts | +| `complement` | one DataFrame + genome | Find gaps between intervals | +| `subtract` | two DataFrames | Remove overlapping portions | + +## overlap + +Find pairs of overlapping intervals between two DataFrames. + +### Functional API + +```python +import polars as pl +import polars_bio as pb + +df1 = pl.DataFrame({ + "chrom": ["chr1", "chr1", "chr1"], + "start": [1, 5, 22], + "end": [6, 9, 30], +}) + +df2 = pl.DataFrame({ + "chrom": ["chr1", "chr1"], + "start": [3, 25], + "end": [8, 28], +}) + +# Returns LazyFrame by default +result_lf = pb.overlap(df1, df2, suffixes=("_1", "_2")) +result_df = result_lf.collect() + +# Or get DataFrame directly +result_df = pb.overlap(df1, df2, suffixes=("_1", "_2"), output_type="polars.DataFrame") +``` + +### Method-Chaining API (LazyFrame only) + +```python +result = df1.lazy().pb.overlap(df2, suffixes=("_1", "_2")).collect() +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `df1` | DataFrame/LazyFrame/str | required | First (probe) interval set | +| `df2` | DataFrame/LazyFrame/str | required | Second (build) interval set | +| `suffixes` | tuple[str, str] | `("_1", "_2")` | Suffixes for overlapping column names | +| `on_cols` | list[str] | `None` | Additional columns to join on (beyond genomic coords) | +| `cols1` | list[str] | `["chrom", "start", "end"]` | Column names in df1 | +| `cols2` | list[str] | `["chrom", "start", "end"]` | Column names in df2 | +| `algorithm` | str | `"Coitrees"` | Interval algorithm | +| `low_memory` | bool | `False` | Low memory mode | +| `output_type` | str | `"polars.LazyFrame"` | Output format: `"polars.LazyFrame"`, `"polars.DataFrame"`, `"pandas.DataFrame"` | +| `projection_pushdown` | bool | `True` | Enable projection pushdown optimization | + +### Output Schema + +Returns columns from both inputs with suffixes applied: +- `chrom_1`, `start_1`, `end_1` (from df1) +- `chrom_2`, `start_2`, `end_2` (from df2) +- Any additional columns from df1 and df2 + +Column dtypes are `String` for chrom and `Int64` for start/end. + +## count_overlaps + +Count the number of overlapping intervals from df2 for each interval in df1. + +```python +# Functional +counts = pb.count_overlaps(df1, df2) + +# Method-chaining (LazyFrame) +counts = df1.lazy().pb.count_overlaps(df2) +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `df1` | DataFrame/LazyFrame/str | required | Query interval set | +| `df2` | DataFrame/LazyFrame/str | required | Target interval set | +| `suffixes` | tuple[str, str] | `("", "_")` | Suffixes for column names | +| `cols1` | list[str] | `["chrom", "start", "end"]` | Column names in df1 | +| `cols2` | list[str] | `["chrom", "start", "end"]` | Column names in df2 | +| `on_cols` | list[str] | `None` | Additional join columns | +| `output_type` | str | `"polars.LazyFrame"` | Output format | +| `naive_query` | bool | `True` | Use naive query strategy | +| `projection_pushdown` | bool | `True` | Enable projection pushdown | + +### Output Schema + +Returns df1 columns with an additional `count` column (Int64). + +## nearest + +Find the nearest interval in df2 for each interval in df1. + +```python +# Find nearest (default: k=1, any direction) +nearest = pb.nearest(df1, df2, output_type="polars.DataFrame") + +# Find k nearest +nearest = pb.nearest(df1, df2, k=3) + +# Exclude overlapping intervals from results +nearest = pb.nearest(df1, df2, overlap=False) + +# Without distance column +nearest = pb.nearest(df1, df2, distance=False) +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `df1` | DataFrame/LazyFrame/str | required | Query interval set | +| `df2` | DataFrame/LazyFrame/str | required | Target interval set | +| `suffixes` | tuple[str, str] | `("_1", "_2")` | Suffixes for column names | +| `on_cols` | list[str] | `None` | Additional join columns | +| `cols1` | list[str] | `["chrom", "start", "end"]` | Column names in df1 | +| `cols2` | list[str] | `["chrom", "start", "end"]` | Column names in df2 | +| `k` | int | `1` | Number of nearest neighbors to find | +| `overlap` | bool | `True` | Include overlapping intervals in results | +| `distance` | bool | `True` | Include distance column in output | +| `output_type` | str | `"polars.LazyFrame"` | Output format | +| `projection_pushdown` | bool | `True` | Enable projection pushdown | + +### Output Schema + +Returns columns from both DataFrames (with suffixes) plus a `distance` column (Int64) with the distance to the nearest interval (0 if overlapping). Distance column is omitted if `distance=False`. + +## merge + +Merge overlapping and bookended intervals within a single DataFrame. + +```python +import polars as pl +import polars_bio as pb + +df = pl.DataFrame({ + "chrom": ["chr1", "chr1", "chr1", "chr2"], + "start": [1, 4, 20, 1], + "end": [6, 9, 30, 10], +}) + +# Functional +merged = pb.merge(df, output_type="polars.DataFrame") + +# Method-chaining (LazyFrame) +merged = df.lazy().pb.merge().collect() + +# Merge intervals within a minimum distance +merged = pb.merge(df, min_dist=10) +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `df` | DataFrame/LazyFrame/str | required | Interval set to merge | +| `min_dist` | int | `0` | Minimum distance between intervals to merge (0 = must overlap or be bookended) | +| `cols` | list[str] | `["chrom", "start", "end"]` | Column names | +| `on_cols` | list[str] | `None` | Additional grouping columns | +| `output_type` | str | `"polars.LazyFrame"` | Output format | +| `projection_pushdown` | bool | `True` | Enable projection pushdown | + +### Output Schema + +| Column | Type | Description | +|--------|------|-------------| +| `chrom` | String | Chromosome | +| `start` | Int64 | Merged interval start | +| `end` | Int64 | Merged interval end | +| `n_intervals` | Int64 | Number of intervals merged | + +## cluster + +Assign cluster IDs to overlapping intervals. Intervals that overlap are assigned the same cluster ID. + +```python +# Functional +clustered = pb.cluster(df, output_type="polars.DataFrame") + +# Method-chaining (LazyFrame) +clustered = df.lazy().pb.cluster().collect() + +# With minimum distance +clustered = pb.cluster(df, min_dist=5) +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `df` | DataFrame/LazyFrame/str | required | Interval set | +| `min_dist` | int | `0` | Minimum distance for clustering | +| `cols` | list[str] | `["chrom", "start", "end"]` | Column names | +| `output_type` | str | `"polars.LazyFrame"` | Output format | +| `projection_pushdown` | bool | `True` | Enable projection pushdown | + +### Output Schema + +Returns the original columns plus: + +| Column | Type | Description | +|--------|------|-------------| +| `cluster` | Int64 | Cluster ID (intervals in the same cluster overlap) | +| `cluster_start` | Int64 | Start of the cluster extent | +| `cluster_end` | Int64 | End of the cluster extent | + +## coverage + +Compute per-interval coverage counts. This is a **two-input** operation: for each interval in df1, count the coverage from df2. + +```python +# Functional +cov = pb.coverage(df1, df2, output_type="polars.DataFrame") + +# Method-chaining (LazyFrame) +cov = df1.lazy().pb.coverage(df2).collect() +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `df1` | DataFrame/LazyFrame/str | required | Query intervals | +| `df2` | DataFrame/LazyFrame/str | required | Coverage source intervals | +| `suffixes` | tuple[str, str] | `("_1", "_2")` | Suffixes for column names | +| `on_cols` | list[str] | `None` | Additional join columns | +| `cols1` | list[str] | `["chrom", "start", "end"]` | Column names in df1 | +| `cols2` | list[str] | `["chrom", "start", "end"]` | Column names in df2 | +| `output_type` | str | `"polars.LazyFrame"` | Output format | +| `projection_pushdown` | bool | `True` | Enable projection pushdown | + +### Output Schema + +Returns columns from df1 plus a `coverage` column (Int64). + +## complement + +Find gaps between intervals within a genome. Requires a genome definition specifying chromosome sizes. + +```python +import polars as pl +import polars_bio as pb + +df = pl.DataFrame({ + "chrom": ["chr1", "chr1"], + "start": [100, 500], + "end": [200, 600], +}) + +genome = pl.DataFrame({ + "chrom": ["chr1"], + "start": [0], + "end": [1000], +}) + +# Functional +gaps = pb.complement(df, view_df=genome, output_type="polars.DataFrame") + +# Method-chaining (LazyFrame) +gaps = df.lazy().pb.complement(genome).collect() +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `df` | DataFrame/LazyFrame/str | required | Interval set | +| `view_df` | DataFrame/LazyFrame | `None` | Genome with chrom, start, end defining chromosome extents | +| `cols` | list[str] | `["chrom", "start", "end"]` | Column names in df | +| `view_cols` | list[str] | `None` | Column names in view_df | +| `output_type` | str | `"polars.LazyFrame"` | Output format | +| `projection_pushdown` | bool | `True` | Enable projection pushdown | + +### Output Schema + +Returns a DataFrame with `chrom` (String), `start` (Int64), `end` (Int64) columns representing gaps between intervals. + +## subtract + +Remove portions of intervals in df1 that overlap with intervals in df2. + +```python +# Functional +result = pb.subtract(df1, df2, output_type="polars.DataFrame") + +# Method-chaining (LazyFrame) +result = df1.lazy().pb.subtract(df2).collect() +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `df1` | DataFrame/LazyFrame/str | required | Intervals to subtract from | +| `df2` | DataFrame/LazyFrame/str | required | Intervals to subtract | +| `cols1` | list[str] | `["chrom", "start", "end"]` | Column names in df1 | +| `cols2` | list[str] | `["chrom", "start", "end"]` | Column names in df2 | +| `output_type` | str | `"polars.LazyFrame"` | Output format | +| `projection_pushdown` | bool | `True` | Enable projection pushdown | + +### Output Schema + +Returns `chrom` (String), `start` (Int64), `end` (Int64) representing the remaining portions of df1 intervals after subtraction. + +## Performance Considerations + +### Probe-Build Architecture + +Two-input operations (`overlap`, `nearest`, `count_overlaps`, `coverage`, `subtract`) use a probe-build join: +- **Probe** (first DataFrame): Iterated over, row by row +- **Build** (second DataFrame): Indexed into an interval tree for fast lookup + +For best performance, pass the **larger** DataFrame as the probe (first argument) and the **smaller** one as the build (second argument). + +### Parallelism + +By default, polars-bio uses a single execution partition. For large datasets, enable parallel execution: + +```python +import os +import polars_bio as pb + +pb.set_option("datafusion.execution.target_partitions", os.cpu_count()) +``` + +### Streaming Execution + +DataFusion streaming is enabled by default for interval operations. Data is processed in batches, enabling out-of-core computation for datasets larger than available RAM. + +### When to Use Lazy Evaluation + +Use `scan_*` functions and lazy DataFrames for: +- Files larger than available RAM +- When only a subset of results is needed +- Pipeline operations where intermediate results can be optimized away + +```python +# Lazy pipeline +lf1 = pb.scan_bed("large1.bed") +lf2 = pb.scan_bed("large2.bed") +result = pb.overlap(lf1, lf2).collect() +``` diff --git a/scientific-skills/polars-bio/references/pileup_operations.md b/scientific-skills/polars-bio/references/pileup_operations.md new file mode 100644 index 0000000..cf02cfe --- /dev/null +++ b/scientific-skills/polars-bio/references/pileup_operations.md @@ -0,0 +1,176 @@ +# Pileup Operations + +## Overview + +polars-bio provides the `pb.depth()` function for computing per-base or per-block read depth from BAM/CRAM files. It uses CIGAR-aware depth calculation to accurately account for insertions, deletions, and clipping. Returns a **LazyFrame** by default. + +## pb.depth() + +Compute read depth from alignment files. + +### Basic Usage + +```python +import polars_bio as pb + +# Compute depth across entire BAM file (returns LazyFrame) +depth_lf = pb.depth("aligned.bam") +depth_df = depth_lf.collect() + +# Get DataFrame directly +depth_df = pb.depth("aligned.bam", output_type="polars.DataFrame") +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `path` | str | required | Path to BAM or CRAM file | +| `filter_flag` | int | `1796` | SAM flag filter (default excludes unmapped, secondary, duplicate, QC-fail) | +| `min_mapping_quality` | int | `0` | Minimum mapping quality to include reads | +| `binary_cigar` | bool | `True` | Use binary CIGAR for faster processing | +| `dense_mode` | str | `"auto"` | Dense output mode | +| `use_zero_based` | bool | `None` | Coordinate system (None = use global setting) | +| `per_base` | bool | `False` | Per-base depth (True) vs block depth (False) | +| `output_type` | str | `"polars.LazyFrame"` | Output format: `"polars.LazyFrame"`, `"polars.DataFrame"`, `"pandas.DataFrame"` | + +### Output Schema (Block Mode, default) + +When `per_base=False` (default), adjacent positions with the same depth are grouped into blocks: + +| Column | Type | Description | +|--------|------|-------------| +| `contig` | String | Chromosome/contig name | +| `pos_start` | Int64 | Block start position | +| `pos_end` | Int64 | Block end position | +| `coverage` | Int16 | Read depth | + +### Output Schema (Per-Base Mode) + +When `per_base=True`, each position is reported individually: + +| Column | Type | Description | +|--------|------|-------------| +| `contig` | String | Chromosome/contig name | +| `pos` | Int64 | Position | +| `coverage` | Int16 | Read depth at position | + +### filter_flag + +The default `filter_flag=1796` excludes reads with these SAM flags: +- 4: unmapped +- 256: secondary alignment +- 512: failed QC +- 1024: PCR/optical duplicate + +### CIGAR-Aware Computation + +`pb.depth()` correctly handles CIGAR operations: +- **M/X/=** (match/mismatch): Counted as coverage +- **D** (deletion): Counted as coverage (reads span the deletion) +- **N** (skipped region): Not counted (e.g., spliced alignments) +- **I** (insertion): Not counted at reference positions +- **S/H** (soft/hard clipping): Not counted + +## Examples + +### Whole-Genome Depth + +```python +import polars_bio as pb +import polars as pl + +# Compute depth genome-wide (block mode) +depth = pb.depth("sample.bam", output_type="polars.DataFrame") + +# Summary statistics +depth.select( + pl.col("coverage").cast(pl.Int64).mean().alias("mean_depth"), + pl.col("coverage").cast(pl.Int64).median().alias("median_depth"), + pl.col("coverage").cast(pl.Int64).max().alias("max_depth"), +) +``` + +### Per-Base Depth + +```python +import polars_bio as pb + +# Per-base depth (one row per position) +depth = pb.depth("sample.bam", per_base=True, output_type="polars.DataFrame") +``` + +### Depth with Quality Filters + +```python +import polars_bio as pb + +# Only count well-mapped reads +depth = pb.depth( + "sample.bam", + min_mapping_quality=20, + output_type="polars.DataFrame", +) +``` + +### Custom Flag Filter + +```python +import polars_bio as pb + +# Only exclude unmapped (4) and duplicate (1024) reads +depth = pb.depth( + "sample.bam", + filter_flag=4 + 1024, + output_type="polars.DataFrame", +) +``` + +## Integration with Interval Operations + +Depth results can be used with polars-bio interval operations. Note that depth output uses `contig`/`pos_start`/`pos_end` column names, so use `cols` parameters to map them: + +```python +import polars_bio as pb +import polars as pl + +# Compute depth +depth = pb.depth("sample.bam", output_type="polars.DataFrame") + +# Rename columns to match interval operation conventions +depth_intervals = depth.rename({ + "contig": "chrom", + "pos_start": "start", + "pos_end": "end", +}) + +# Find regions with adequate coverage +adequate = depth_intervals.filter(pl.col("coverage") >= 30) + +# Merge adjacent adequate-coverage blocks +merged = pb.merge(adequate, output_type="polars.DataFrame") + +# Find gaps in coverage (complement) +genome = pl.DataFrame({ + "chrom": ["chr1"], + "start": [0], + "end": [248956422], +}) +gaps = pb.complement(adequate, view_df=genome, output_type="polars.DataFrame") +``` + +### Using cols Parameters Instead of Renaming + +```python +import polars_bio as pb + +depth = pb.depth("sample.bam", output_type="polars.DataFrame") +targets = pb.read_bed("targets.bed") + +# Use cols1 to specify depth column names +overlapping = pb.overlap( + depth, targets, + cols1=["contig", "pos_start", "pos_end"], + output_type="polars.DataFrame", +) +``` diff --git a/scientific-skills/polars-bio/references/sql_processing.md b/scientific-skills/polars-bio/references/sql_processing.md new file mode 100644 index 0000000..0053b91 --- /dev/null +++ b/scientific-skills/polars-bio/references/sql_processing.md @@ -0,0 +1,224 @@ +# SQL Data Processing + +## Overview + +polars-bio integrates Apache DataFusion's SQL engine, enabling SQL queries on bioinformatics files and Polars DataFrames. Register files as tables and query them using standard SQL syntax. All queries return a **LazyFrame** — call `.collect()` to materialize results. + +## Register Functions + +Register bioinformatics files as SQL tables. **Path is the first argument**, name is an optional keyword: + +```python +import polars_bio as pb + +# Register various file formats (path first, name= keyword) +pb.register_vcf("samples.vcf.gz", name="variants") +pb.register_bed("target_regions.bed", name="regions") +pb.register_bam("aligned.bam", name="alignments") +pb.register_cram("aligned.cram", name="cram_alignments") +pb.register_gff("genes.gff3", name="annotations") +pb.register_gtf("genes.gtf", name="gtf_annotations") +pb.register_fastq("sample.fastq.gz", name="reads") +pb.register_sam("alignments.sam", name="sam_alignments") +pb.register_pairs("contacts.pairs", name="hic_contacts") +``` + +### Parameters + +All `register_*` functions share these parameters: + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `path` | str | required (first positional) | Path to file (local or cloud) | +| `name` | str | `None` | Table name for SQL queries (auto-generated if omitted) | +| `chunk_size` | int | `64` | Chunk size for reading | +| `concurrent_fetches` | int | `8` | Concurrent cloud fetches | +| `allow_anonymous` | bool | `True` | Allow anonymous cloud access | +| `max_retries` | int | `5` | Cloud retry count | +| `timeout` | int | `300` | Cloud timeout in seconds | +| `enable_request_payer` | bool | `False` | Requester-pays cloud | +| `compression_type` | str | `"auto"` | Compression type | + +Some register functions have additional format-specific parameters (e.g., `info_fields` on `register_vcf`). + +**Note:** `register_fasta` does not exist. Use `scan_fasta` + `from_polars` as a workaround. + +## from_polars + +Register an existing Polars DataFrame as a SQL-queryable table: + +```python +import polars as pl +import polars_bio as pb + +df = pl.DataFrame({ + "chrom": ["chr1", "chr1", "chr2"], + "start": [100, 500, 200], + "end": [200, 600, 400], + "name": ["peak1", "peak2", "peak3"], +}) + +pb.from_polars("my_peaks", df) + +# Now query with SQL +result = pb.sql("SELECT * FROM my_peaks WHERE chrom = 'chr1'").collect() +``` + +**Important:** `register_view` takes a SQL query string, not a DataFrame. Use `from_polars` to register DataFrames. + +## register_view + +Create a SQL view from a query string: + +```python +import polars_bio as pb + +# Create a view from a SQL query +pb.register_view("chr1_variants", "SELECT * FROM variants WHERE chrom = 'chr1'") + +# Query the view +result = pb.sql("SELECT * FROM chr1_variants WHERE qual > 30").collect() +``` + +### Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `name` | str | View name | +| `query` | str | SQL query string defining the view | + +## pb.sql() + +Execute SQL queries using DataFusion SQL syntax. **Returns a LazyFrame** — call `.collect()` to get a DataFrame. + +```python +import polars_bio as pb + +# Simple query +result = pb.sql("SELECT chrom, start, end FROM regions WHERE chrom = 'chr1'").collect() + +# Aggregation +result = pb.sql(""" + SELECT chrom, COUNT(*) as variant_count, AVG(qual) as avg_qual + FROM variants + GROUP BY chrom + ORDER BY variant_count DESC +""").collect() + +# Join tables +result = pb.sql(""" + SELECT v.chrom, v.start, v.end, v.ref, v.alt, r.name + FROM variants v + JOIN regions r ON v.chrom = r.chrom + AND v.start >= r.start + AND v.end <= r.end +""").collect() +``` + +## DataFusion SQL Syntax + +polars-bio uses Apache DataFusion's SQL dialect. Key features: + +### Filtering + +```sql +SELECT * FROM variants WHERE qual > 30 AND filter = 'PASS' +``` + +### Aggregations + +```sql +SELECT chrom, COUNT(*) as n, MIN(start) as min_pos, MAX(end) as max_pos +FROM regions +GROUP BY chrom +HAVING COUNT(*) > 100 +``` + +### Window Functions + +```sql +SELECT chrom, start, end, + ROW_NUMBER() OVER (PARTITION BY chrom ORDER BY start) as row_num, + LAG(end) OVER (PARTITION BY chrom ORDER BY start) as prev_end +FROM regions +``` + +### Subqueries + +```sql +SELECT * FROM variants +WHERE chrom IN (SELECT DISTINCT chrom FROM regions) +``` + +### Common Table Expressions (CTEs) + +```sql +WITH filtered_variants AS ( + SELECT * FROM variants WHERE qual > 30 +), +chr1_regions AS ( + SELECT * FROM regions WHERE chrom = 'chr1' +) +SELECT f.chrom, f.start, f.ref, f.alt +FROM filtered_variants f +JOIN chr1_regions r ON f.start BETWEEN r.start AND r.end +``` + +## Combining SQL with Interval Operations + +SQL queries return LazyFrames that can be used directly with polars-bio interval operations: + +```python +import polars_bio as pb + +# Register files +pb.register_vcf("samples.vcf.gz", name="variants") +pb.register_bed("target_regions.bed", name="targets") + +# SQL to filter (returns LazyFrame) +high_qual = pb.sql("SELECT chrom, start, end FROM variants WHERE qual > 30").collect() +targets = pb.sql("SELECT chrom, start, end FROM targets WHERE chrom = 'chr1'").collect() + +# Interval operation on SQL results +overlapping = pb.overlap(high_qual, targets).collect() +``` + +## Example Workflows + +### Variant Density Analysis + +```python +import polars_bio as pb + +pb.register_vcf("cohort.vcf.gz", name="variants") +pb.register_bed("genome_windows_1mb.bed", name="windows") + +# Count variants per window using SQL +result = pb.sql(""" + SELECT w.chrom, w.start, w.end, COUNT(v.start) as variant_count + FROM windows w + LEFT JOIN variants v ON w.chrom = v.chrom + AND v.start >= w.start + AND v.start < w.end + GROUP BY w.chrom, w.start, w.end + ORDER BY variant_count DESC +""").collect() +``` + +### Gene Annotation Lookup + +```python +import polars_bio as pb + +pb.register_gff("gencode.gff3", name="genes") + +# Find all protein-coding genes on chromosome 1 +coding_genes = pb.sql(""" + SELECT chrom, start, end, attributes + FROM genes + WHERE type = 'gene' + AND chrom = 'chr1' + AND attributes LIKE '%protein_coding%' + ORDER BY start +""").collect() +```