Add more scientific skills

This commit is contained in:
Timothy Kassis
2025-10-19 14:12:02 -07:00
parent 78d5ac2b56
commit 660c8574d0
210 changed files with 88957 additions and 1 deletion

View File

@@ -0,0 +1,262 @@
"""
Helper script to quickly set up a PyTorch Lightning Trainer with common configurations.
This script provides preset configurations for different training scenarios
and makes it easy to create a Trainer with best practices.
"""
import lightning as L
from lightning.pytorch.callbacks import (
    ModelCheckpoint,
    EarlyStopping,
    LearningRateMonitor,
    RichProgressBar,
)
from lightning.pytorch.loggers import TensorBoardLogger, CSVLogger
def create_trainer(
preset: str = "default",
max_epochs: int = 100,
accelerator: str = "auto",
devices: int = 1,
log_dir: str = "./logs",
experiment_name: str = "lightning_experiment",
enable_checkpointing: bool = True,
enable_early_stopping: bool = True,
**kwargs
):
"""
Create a Lightning Trainer with preset configurations.
Args:
preset: Configuration preset - "default", "fast_dev", "production", "distributed"
max_epochs: Maximum number of training epochs
accelerator: Device to use ("auto", "gpu", "cpu", "tpu")
devices: Number of devices to use
log_dir: Directory for logs and checkpoints
experiment_name: Name for the experiment
enable_checkpointing: Whether to enable model checkpointing
enable_early_stopping: Whether to enable early stopping
**kwargs: Additional arguments to pass to Trainer
Returns:
Configured Lightning Trainer instance
"""
callbacks = []
logger_list = []
# Configure based on preset
if preset == "fast_dev":
# Fast development run - minimal epochs, quick debugging
config = {
"fast_dev_run": False,
"max_epochs": 3,
"limit_train_batches": 100,
"limit_val_batches": 50,
"log_every_n_steps": 10,
"enable_progress_bar": True,
"enable_model_summary": True,
}
elif preset == "production":
# Production-ready configuration with all bells and whistles
config = {
"max_epochs": max_epochs,
"precision": "16-mixed",
"gradient_clip_val": 1.0,
"log_every_n_steps": 50,
"enable_progress_bar": True,
"enable_model_summary": True,
"deterministic": True,
"benchmark": True,
}
# Add model checkpointing
if enable_checkpointing:
callbacks.append(
ModelCheckpoint(
dirpath=f"{log_dir}/{experiment_name}/checkpoints",
filename="{epoch}-{val_loss:.2f}",
monitor="val_loss",
mode="min",
save_top_k=3,
save_last=True,
verbose=True,
)
)
# Add early stopping
if enable_early_stopping:
callbacks.append(
EarlyStopping(
monitor="val_loss",
patience=10,
mode="min",
verbose=True,
)
)
# Add learning rate monitor
callbacks.append(LearningRateMonitor(logging_interval="epoch"))
# Add TensorBoard logger
logger_list.append(
TensorBoardLogger(
save_dir=log_dir,
name=experiment_name,
version=None,
)
)
elif preset == "distributed":
# Distributed training configuration
config = {
"max_epochs": max_epochs,
"strategy": "ddp",
"precision": "16-mixed",
"sync_batchnorm": True,
"use_distributed_sampler": True,
"log_every_n_steps": 50,
"enable_progress_bar": True,
}
# Add model checkpointing
if enable_checkpointing:
callbacks.append(
ModelCheckpoint(
dirpath=f"{log_dir}/{experiment_name}/checkpoints",
filename="{epoch}-{val_loss:.2f}",
monitor="val_loss",
mode="min",
save_top_k=3,
save_last=True,
)
)
    else:  # "default" (also the fallback for any unrecognized preset name)
        # Balanced configuration suitable for most use cases
config = {
"max_epochs": max_epochs,
"log_every_n_steps": 50,
"enable_progress_bar": True,
"enable_model_summary": True,
}
# Add basic checkpointing
if enable_checkpointing:
callbacks.append(
ModelCheckpoint(
dirpath=f"{log_dir}/{experiment_name}/checkpoints",
filename="{epoch}-{val_loss:.2f}",
monitor="val_loss",
save_last=True,
)
)
# Add CSV logger
logger_list.append(
CSVLogger(
save_dir=log_dir,
name=experiment_name,
)
)
    # Use the Rich progress bar (requires the `rich` package)
    if config.get("enable_progress_bar", True):
callbacks.append(RichProgressBar())
# Merge with provided kwargs
final_config = {
**config,
"accelerator": accelerator,
"devices": devices,
"callbacks": callbacks,
"logger": logger_list if logger_list else True,
**kwargs,
}
# Create and return trainer
return L.Trainer(**final_config)
def create_debugging_trainer():
"""Create a trainer optimized for debugging."""
return create_trainer(
preset="fast_dev",
max_epochs=1,
limit_train_batches=10,
limit_val_batches=5,
num_sanity_val_steps=2,
)
def create_gpu_trainer(num_gpus: int = 1, precision: str = "16-mixed"):
"""Create a trainer optimized for GPU training."""
return create_trainer(
preset="production",
accelerator="gpu",
devices=num_gpus,
precision=precision,
)
def create_distributed_trainer(num_gpus: int = 2, num_nodes: int = 1):
"""Create a trainer for distributed training across multiple GPUs."""
return create_trainer(
preset="distributed",
accelerator="gpu",
devices=num_gpus,
num_nodes=num_nodes,
strategy="ddp",
)
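# Hedged usage note (an addition, not from the original file): multi-process
# DDP jobs are usually launched with an external launcher such as torchrun,
# which Lightning detects from the environment it sets. Hypothetical example:
#   torchrun --nproc_per_node=2 train.py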
# Example usage
if __name__ == "__main__":
print("Creating different trainer configurations...\n")
# 1. Default trainer
print("1. Default trainer:")
trainer_default = create_trainer(preset="default", max_epochs=50)
print(f" Max epochs: {trainer_default.max_epochs}")
print(f" Accelerator: {trainer_default.accelerator}")
print(f" Callbacks: {len(trainer_default.callbacks)}")
print()
# 2. Fast development trainer
print("2. Fast development trainer:")
trainer_dev = create_trainer(preset="fast_dev")
print(f" Max epochs: {trainer_dev.max_epochs}")
print(f" Train batches limit: {trainer_dev.limit_train_batches}")
print()
# 3. Production trainer
print("3. Production trainer:")
trainer_prod = create_trainer(
preset="production",
max_epochs=100,
experiment_name="my_experiment"
)
print(f" Max epochs: {trainer_prod.max_epochs}")
print(f" Precision: {trainer_prod.precision}")
print(f" Callbacks: {len(trainer_prod.callbacks)}")
print()
# 4. Debugging trainer
print("4. Debugging trainer:")
trainer_debug = create_debugging_trainer()
print(f" Max epochs: {trainer_debug.max_epochs}")
print(f" Train batches: {trainer_debug.limit_train_batches}")
print()
# 5. GPU trainer
print("5. GPU trainer:")
trainer_gpu = create_gpu_trainer(num_gpus=1)
print(f" Accelerator: {trainer_gpu.accelerator}")
print(f" Precision: {trainer_gpu.precision}")
print()
print("All trainer configurations created successfully!")

View File

@@ -0,0 +1,221 @@
"""
Template for creating a PyTorch Lightning DataModule.
This template includes all common hooks and patterns for organizing
data processing workflows with best practices.
"""
from typing import Optional

import lightning as L
import torch
from torch.utils.data import DataLoader, Dataset, Subset, random_split
class TemplateDataset(Dataset):
"""Example dataset - replace with your actual dataset."""
def __init__(self, data, targets, transform=None):
self.data = data
self.targets = targets
self.transform = transform
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
x = self.data[idx]
y = self.targets[idx]
if self.transform:
x = self.transform(x)
return x, y
class TemplateDataModule(L.LightningDataModule):
"""Template DataModule with all common hooks and patterns."""
def __init__(
self,
data_dir: str = "./data",
batch_size: int = 32,
num_workers: int = 4,
train_val_split: tuple = (0.8, 0.2),
seed: int = 42,
pin_memory: bool = True,
persistent_workers: bool = True,
):
super().__init__()
# Save hyperparameters
self.save_hyperparameters()
# Initialize attributes
self.data_dir = data_dir
self.batch_size = batch_size
self.num_workers = num_workers
self.train_val_split = train_val_split
self.seed = seed
self.pin_memory = pin_memory
self.persistent_workers = persistent_workers
# Placeholders for datasets
self.train_dataset = None
self.val_dataset = None
self.test_dataset = None
self.predict_dataset = None
# Placeholder for transforms
self.train_transform = None
self.val_transform = None
self.test_transform = None
    def prepare_data(self):
        """
        Download and prepare data (called once, from a single process, even in
        distributed settings).
        Use this for downloading, tokenizing, etc. Do NOT set state here
        (no self.x = y); assignments made here are not visible to other processes.
        """
# Example: Download datasets
# datasets.MNIST(self.data_dir, train=True, download=True)
# datasets.MNIST(self.data_dir, train=False, download=True)
pass
    def setup(self, stage: Optional[str] = None):
        """
        Load data and create the train/val/test splits (called on every process
        in distributed settings).
        Use this for splitting and creating datasets; setting state is OK here
        (self.x = y).
        Args:
            stage: One of 'fit', 'validate', 'test', or 'predict'
        """
# Fit stage: setup training and validation datasets
if stage == "fit" or stage is None:
# Load full dataset
# Example: full_dataset = datasets.MNIST(self.data_dir, train=True, transform=self.train_transform)
# Create dummy data for template
full_data = torch.randn(1000, 784)
full_targets = torch.randint(0, 10, (1000,))
full_dataset = TemplateDataset(full_data, full_targets, transform=self.train_transform)
# Split into train and validation
train_size = int(len(full_dataset) * self.train_val_split[0])
val_size = len(full_dataset) - train_size
self.train_dataset, self.val_dataset = random_split(
full_dataset,
[train_size, val_size],
generator=torch.Generator().manual_seed(self.seed)
)
            # random_split returns Subsets that share one underlying dataset,
            # so mutating full_dataset.transform here would change both splits.
            # Instead, point the validation split at a second dataset instance
            # that shares the same tensors but applies the validation transform.
            if self.val_transform:
                val_view = TemplateDataset(full_data, full_targets, transform=self.val_transform)
                self.val_dataset = Subset(val_view, self.val_dataset.indices)
# Test stage: setup test dataset
if stage == "test" or stage is None:
# Example: self.test_dataset = datasets.MNIST(
# self.data_dir, train=False, transform=self.test_transform
# )
# Create dummy test data for template
test_data = torch.randn(200, 784)
test_targets = torch.randint(0, 10, (200,))
self.test_dataset = TemplateDataset(test_data, test_targets, transform=self.test_transform)
# Predict stage: setup prediction dataset
if stage == "predict" or stage is None:
# Example: self.predict_dataset = YourCustomDataset(...)
# Create dummy predict data for template
predict_data = torch.randn(100, 784)
predict_targets = torch.zeros(100, dtype=torch.long)
self.predict_dataset = TemplateDataset(predict_data, predict_targets)
def train_dataloader(self):
"""Return training dataloader."""
return DataLoader(
self.train_dataset,
batch_size=self.batch_size,
shuffle=True,
num_workers=self.num_workers,
pin_memory=self.pin_memory,
persistent_workers=self.persistent_workers if self.num_workers > 0 else False,
)
def val_dataloader(self):
"""Return validation dataloader."""
return DataLoader(
self.val_dataset,
batch_size=self.batch_size,
shuffle=False,
num_workers=self.num_workers,
pin_memory=self.pin_memory,
persistent_workers=self.persistent_workers if self.num_workers > 0 else False,
)
def test_dataloader(self):
"""Return test dataloader."""
return DataLoader(
self.test_dataset,
batch_size=self.batch_size,
shuffle=False,
num_workers=self.num_workers,
pin_memory=self.pin_memory,
persistent_workers=self.persistent_workers if self.num_workers > 0 else False,
)
def predict_dataloader(self):
"""Return prediction dataloader."""
return DataLoader(
self.predict_dataset,
batch_size=self.batch_size,
shuffle=False,
num_workers=self.num_workers,
pin_memory=self.pin_memory,
persistent_workers=self.persistent_workers if self.num_workers > 0 else False,
)
    def teardown(self, stage: Optional[str] = None):
"""Clean up after fit, validate, test, or predict."""
# Example: close database connections, clear caches, etc.
pass
def state_dict(self):
"""Save state for checkpointing."""
# Return anything you want to save in the checkpoint
return {}
def load_state_dict(self, state_dict):
"""Load state from checkpoint."""
# Restore state from checkpoint
pass
# Example usage
if __name__ == "__main__":
# Create datamodule
datamodule = TemplateDataModule(
data_dir="./data",
batch_size=32,
num_workers=4,
train_val_split=(0.8, 0.2),
)
# Prepare and setup data
datamodule.prepare_data()
datamodule.setup("fit")
# Get dataloaders
train_loader = datamodule.train_dataloader()
val_loader = datamodule.val_dataloader()
print("Template DataModule created successfully!")
print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")
print(f"Batch size: {datamodule.batch_size}")
# Test a batch
batch = next(iter(train_loader))
x, y = batch
print(f"Batch shape: {x.shape}, {y.shape}")

View File

@@ -0,0 +1,215 @@
"""
Template for creating a PyTorch Lightning LightningModule.
This template includes all common hooks and patterns for building
a Lightning model with best practices.
"""
from typing import Optional

import lightning as L
import torch
import torch.nn as nn
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
class TemplateLightningModule(L.LightningModule):
"""Template LightningModule with all common hooks and patterns."""
def __init__(
self,
# Model architecture parameters
input_dim: int = 784,
hidden_dim: int = 128,
output_dim: int = 10,
# Optimization parameters
learning_rate: float = 1e-3,
optimizer_type: str = "adam",
        scheduler_type: Optional[str] = None,
# Other hyperparameters
dropout: float = 0.1,
):
super().__init__()
# Save hyperparameters for checkpointing and logging
self.save_hyperparameters()
# Define model architecture
self.model = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(hidden_dim, output_dim)
)
# Define loss function
self.criterion = nn.CrossEntropyLoss()
# For tracking validation outputs (optional)
self.validation_step_outputs = []
def forward(self, x):
"""Forward pass for inference."""
return self.model(x)
def training_step(self, batch, batch_idx):
"""Training step - called for each training batch."""
x, y = batch
# Forward pass
logits = self(x)
loss = self.criterion(logits, y)
# Calculate accuracy
preds = torch.argmax(logits, dim=1)
acc = (preds == y).float().mean()
# Log metrics
self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
self.log("train_acc", acc, on_step=True, on_epoch=True, prog_bar=True)
return loss
def validation_step(self, batch, batch_idx):
"""Validation step - called for each validation batch."""
x, y = batch
# Forward pass (model automatically in eval mode)
logits = self(x)
loss = self.criterion(logits, y)
# Calculate accuracy
preds = torch.argmax(logits, dim=1)
acc = (preds == y).float().mean()
# Log metrics
self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
self.log("val_acc", acc, on_step=False, on_epoch=True, prog_bar=True)
# Optional: store outputs for epoch-level processing
self.validation_step_outputs.append({"loss": loss, "acc": acc})
return loss
def on_validation_epoch_end(self):
"""Called at the end of validation epoch."""
# Optional: process all validation outputs
if self.validation_step_outputs:
avg_loss = torch.stack([x["loss"] for x in self.validation_step_outputs]).mean()
avg_acc = torch.stack([x["acc"] for x in self.validation_step_outputs]).mean()
# Log epoch-level metrics if needed
# self.log("val_epoch_loss", avg_loss)
# self.log("val_epoch_acc", avg_acc)
# Clear outputs
self.validation_step_outputs.clear()
def test_step(self, batch, batch_idx):
"""Test step - called for each test batch."""
x, y = batch
# Forward pass
logits = self(x)
loss = self.criterion(logits, y)
# Calculate accuracy
preds = torch.argmax(logits, dim=1)
acc = (preds == y).float().mean()
# Log metrics
self.log("test_loss", loss, on_step=False, on_epoch=True)
self.log("test_acc", acc, on_step=False, on_epoch=True)
return loss
def predict_step(self, batch, batch_idx, dataloader_idx=0):
"""Prediction step - called for each prediction batch."""
x, y = batch
logits = self(x)
preds = torch.argmax(logits, dim=1)
return preds
def configure_optimizers(self):
"""Configure optimizer and learning rate scheduler."""
# Create optimizer
if self.hparams.optimizer_type.lower() == "adam":
optimizer = Adam(self.parameters(), lr=self.hparams.learning_rate)
elif self.hparams.optimizer_type.lower() == "sgd":
optimizer = SGD(self.parameters(), lr=self.hparams.learning_rate, momentum=0.9)
else:
raise ValueError(f"Unknown optimizer: {self.hparams.optimizer_type}")
# Configure with scheduler if specified
if self.hparams.scheduler_type:
if self.hparams.scheduler_type.lower() == "reduce_on_plateau":
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=5)
return {
"optimizer": optimizer,
"lr_scheduler": {
"scheduler": scheduler,
"monitor": "val_loss",
"interval": "epoch",
"frequency": 1,
}
}
elif self.hparams.scheduler_type.lower() == "step":
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
return {
"optimizer": optimizer,
"lr_scheduler": {
"scheduler": scheduler,
"interval": "epoch",
"frequency": 1,
}
}
return optimizer
# Optional: Additional hooks for custom behavior
def on_train_start(self):
"""Called at the beginning of training."""
pass
def on_train_epoch_start(self):
"""Called at the beginning of each training epoch."""
pass
def on_train_epoch_end(self):
"""Called at the end of each training epoch."""
pass
def on_train_end(self):
"""Called at the end of training."""
pass
# Example usage
if __name__ == "__main__":
# Create model
model = TemplateLightningModule(
input_dim=784,
hidden_dim=128,
output_dim=10,
learning_rate=1e-3,
optimizer_type="adam",
scheduler_type="reduce_on_plateau"
)
# Create trainer
trainer = L.Trainer(
max_epochs=10,
accelerator="auto",
devices=1,
log_every_n_steps=50,
)
# Note: You would need to provide dataloaders
# trainer.fit(model, train_dataloader, val_dataloader)
print("Template LightningModule created successfully!")
print(f"Model hyperparameters: {model.hparams}")