Add more scientific skills

This commit is contained in:
Timothy Kassis
2025-10-19 14:12:02 -07:00
parent 78d5ac2b56
commit 660c8574d0
210 changed files with 88957 additions and 1 deletions

View File

@@ -0,0 +1,699 @@
# Transformers API Reference
This document provides comprehensive API reference for the most commonly used classes and methods in the Transformers library.
## Core Model Classes
### PreTrainedModel
Base class for all models. Handles loading, saving, and common model operations.
**Key Methods:**
```python
from transformers import PreTrainedModel
# Load pretrained model
model = ModelClass.from_pretrained(
pretrained_model_name_or_path,
config=None, # Custom config
cache_dir=None, # Custom cache location
force_download=False, # Force re-download
resume_download=False, # Resume interrupted download
proxies=None, # HTTP proxies
local_files_only=False, # Only use cached files
token=None, # HF auth token
revision="main", # Git branch/tag
trust_remote_code=False, # Allow custom model code
device_map=None, # Device allocation ("auto", "cpu", "cuda:0", etc.)
torch_dtype=None, # Model dtype (torch.float16, "auto", etc.)
low_cpu_mem_usage=False, # Reduce CPU memory during loading
**model_kwargs
)
# Save model
model.save_pretrained(
save_directory,
save_config=True, # Save config.json
state_dict=None, # Custom state dict
save_function=torch.save, # Custom save function
push_to_hub=False, # Upload to Hub
max_shard_size="5GB", # Max checkpoint size
safe_serialization=True, # Use SafeTensors format
variant=None, # Model variant name
)
# Generate text (for generative models)
outputs = model.generate(
inputs=None, # Input token IDs
max_length=20, # Max total length
max_new_tokens=None, # Max new tokens to generate
min_length=0, # Minimum length
do_sample=False, # Enable sampling
early_stopping=False, # Stop when num_beams finish
num_beams=1, # Beam search width
temperature=1.0, # Sampling temperature
top_k=50, # Top-k sampling
top_p=1.0, # Nucleus sampling
repetition_penalty=1.0, # Penalize repetition
length_penalty=1.0, # Beam search length penalty
no_repeat_ngram_size=0, # Block repeated n-grams
num_return_sequences=1, # Number of sequences to return
**model_kwargs
)
# Resize token embeddings (after adding tokens)
new_embeddings = model.resize_token_embeddings(
new_num_tokens,
pad_to_multiple_of=None
)
# Utility methods
num_params = model.num_parameters(only_trainable=False)
model.gradient_checkpointing_enable() # Enable gradient checkpointing
model.enable_input_require_grads() # For PEFT with frozen models
```
### AutoModel Classes
Automatically instantiate the correct model architecture.
**Available Classes:**
- `AutoModel`: Base model (returns hidden states)
- `AutoModelForCausalLM`: Causal language modeling (GPT-style)
- `AutoModelForMaskedLM`: Masked language modeling (BERT-style)
- `AutoModelForSeq2SeqLM`: Sequence-to-sequence (T5, BART)
- `AutoModelForSequenceClassification`: Text classification
- `AutoModelForTokenClassification`: Token classification (NER)
- `AutoModelForQuestionAnswering`: Extractive QA
- `AutoModelForImageClassification`: Image classification
- `AutoModelForObjectDetection`: Object detection
- `AutoModelForSemanticSegmentation`: Semantic segmentation
- `AutoModelForAudioClassification`: Audio classification
- `AutoModelForSpeechSeq2Seq`: Speech-to-text
- `AutoModelForVision2Seq`: Image captioning, VQA
**Usage:**
```python
from transformers import AutoModel, AutoConfig
# Load with default configuration
model = AutoModel.from_pretrained("bert-base-uncased")
# Load with custom configuration
config = AutoConfig.from_pretrained("bert-base-uncased")
config.hidden_dropout_prob = 0.2
model = AutoModel.from_pretrained("bert-base-uncased", config=config)
# Register custom models
from transformers import AutoConfig, AutoModel
AutoConfig.register("my-model", MyModelConfig)
AutoModel.register(MyModelConfig, MyModel)
```
## Tokenizer Classes
### PreTrainedTokenizer / PreTrainedTokenizerFast
Convert text to token IDs and vice versa.
**Key Methods:**
```python
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path,
use_fast=True, # Use fast (Rust) tokenizer if available
revision="main",
**kwargs
)
# Encoding (text → token IDs)
encoded = tokenizer(
text, # String or List[str]
text_pair=None, # Second sequence for pairs
add_special_tokens=True, # Add [CLS], [SEP], etc.
padding=False, # True, False, "longest", "max_length"
truncation=False, # True, False, "longest_first", "only_first", "only_second"
max_length=None, # Max sequence length
stride=0, # Overlap for split sequences
return_tensors=None, # "pt" (PyTorch), "tf" (TensorFlow), "np" (NumPy)
return_token_type_ids=None, # Return token type IDs
return_attention_mask=None, # Return attention mask
return_overflowing_tokens=False, # Return overflowing tokens
return_special_tokens_mask=False, # Return special token mask
return_offsets_mapping=False, # Return char-level offsets (fast only)
return_length=False, # Return sequence lengths
**kwargs
)
# Decoding (token IDs → text)
text = tokenizer.decode(
token_ids,
skip_special_tokens=False, # Remove special tokens
clean_up_tokenization_spaces=True, # Clean up spacing
)
# Batch decoding
texts = tokenizer.batch_decode(
sequences,
skip_special_tokens=False,
clean_up_tokenization_spaces=True,
)
# Tokenization (text → tokens)
tokens = tokenizer.tokenize(text, **kwargs)
# Convert tokens to IDs
ids = tokenizer.convert_tokens_to_ids(tokens)
# Convert IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(ids)
# Add new tokens
num_added = tokenizer.add_tokens(["[NEW_TOKEN1]", "[NEW_TOKEN2]"])
# Add special tokens
tokenizer.add_special_tokens({
"bos_token": "[BOS]",
"eos_token": "[EOS]",
"unk_token": "[UNK]",
"sep_token": "[SEP]",
"pad_token": "[PAD]",
"cls_token": "[CLS]",
"mask_token": "[MASK]",
"additional_special_tokens": ["[SPECIAL1]", "[SPECIAL2]"],
})
# Chat template formatting
formatted = tokenizer.apply_chat_template(
conversation, # List[Dict[str, str]] with "role" and "content"
chat_template=None, # Custom template
add_generation_prompt=False, # Add prompt for model to continue
tokenize=True, # Return token IDs
padding=False,
truncation=False,
max_length=None,
return_tensors=None,
return_dict=True,
)
# Save tokenizer
tokenizer.save_pretrained(save_directory)
# Get full vocab size (base vocab + added tokens; may differ from tokenizer.vocab_size)
vocab_size = len(tokenizer)
# Get special tokens
pad_token = tokenizer.pad_token
pad_token_id = tokenizer.pad_token_id
# Similar for: bos, eos, unk, sep, cls, mask
```
**Special Token Attributes:**
```python
tokenizer.bos_token # Beginning of sequence
tokenizer.eos_token # End of sequence
tokenizer.unk_token # Unknown token
tokenizer.sep_token # Separator token
tokenizer.pad_token # Padding token
tokenizer.cls_token # Classification token
tokenizer.mask_token # Mask token
# Corresponding IDs
tokenizer.bos_token_id
tokenizer.eos_token_id
# ... etc
```
## Image Processors
### AutoImageProcessor
Preprocess images for vision models.
**Key Methods:**
```python
from transformers import AutoImageProcessor
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
# Process images
inputs = processor(
images, # PIL Image, np.array, torch.Tensor, or List
return_tensors="pt", # "pt", "tf", "np", None
do_resize=True, # Resize to model size
size=None, # Target size dict
resample=None, # Resampling method
do_rescale=True, # Rescale pixel values
do_normalize=True, # Normalize with mean/std
image_mean=None, # Custom mean
image_std=None, # Custom std
do_center_crop=False, # Center crop
crop_size=None, # Crop size
**kwargs
)
# Returns: BatchFeature with 'pixel_values' key
```
## Training Components
### TrainingArguments
Configuration for the Trainer class.
**Essential Arguments:**
```python
from transformers import TrainingArguments
args = TrainingArguments(
# ===== Output & Logging =====
output_dir="./results", # REQUIRED: Output directory
overwrite_output_dir=False, # Overwrite output directory
# ===== Training Parameters =====
num_train_epochs=3.0, # Number of epochs
max_steps=-1, # Max training steps (overrides epochs)
per_device_train_batch_size=8, # Train batch size per device
per_device_eval_batch_size=8, # Eval batch size per device
gradient_accumulation_steps=1, # Accumulation steps
# ===== Learning Rate & Optimization =====
learning_rate=5e-5, # Initial learning rate
weight_decay=0.0, # Weight decay
adam_beta1=0.9, # Adam beta1
adam_beta2=0.999, # Adam beta2
adam_epsilon=1e-8, # Adam epsilon
max_grad_norm=1.0, # Gradient clipping
optim="adamw_torch", # Optimizer ("adamw_torch", "adafactor", "adamw_8bit")
# ===== Learning Rate Scheduler =====
lr_scheduler_type="linear", # Scheduler type
warmup_steps=0, # Warmup steps
warmup_ratio=0.0, # Warmup ratio (alternative to steps)
# ===== Evaluation =====
eval_strategy="no", # "no", "steps", "epoch"
eval_steps=None, # Eval every N steps
eval_delay=0, # Delay first eval
eval_accumulation_steps=None, # Accumulate eval outputs
# ===== Checkpointing =====
save_strategy="steps", # "no", "steps", "epoch"
save_steps=500, # Save every N steps
save_total_limit=None, # Max checkpoints to keep
save_safetensors=True, # Save as SafeTensors
save_on_each_node=False, # Save on each node (distributed)
# ===== Best Model Selection =====
load_best_model_at_end=False, # Load best checkpoint at end
metric_for_best_model=None, # Metric to use
greater_is_better=None, # True if higher is better
# ===== Logging =====
logging_dir=None, # TensorBoard log directory
logging_strategy="steps", # "no", "steps", "epoch"
logging_steps=500, # Log every N steps
logging_first_step=False, # Log first step
logging_nan_inf_filter=True, # Filter NaN/Inf
# ===== Mixed Precision =====
fp16=False, # Use fp16 training
fp16_opt_level="O1", # Apex AMP optimization level
fp16_backend="auto", # "auto", "apex", "cpu_amp"
bf16=False, # Use bfloat16 training
bf16_full_eval=False, # Use bf16 for evaluation
tf32=None, # Use TF32 (Ampere+ GPUs)
# ===== Memory Optimization =====
gradient_checkpointing=False, # Enable gradient checkpointing
gradient_checkpointing_kwargs=None, # Kwargs for gradient checkpointing
torch_empty_cache_steps=None, # Clear cache every N steps
# ===== Distributed Training =====
local_rank=-1, # Local rank for distributed
ddp_backend=None, # "nccl", "gloo", "mpi", "ccl"
ddp_find_unused_parameters=None, # Find unused parameters
ddp_bucket_cap_mb=None, # DDP bucket size
fsdp="", # FSDP configuration
fsdp_config=None, # FSDP config dict
deepspeed=None, # DeepSpeed config
# ===== Hub Integration =====
push_to_hub=False, # Push to Hugging Face Hub
hub_model_id=None, # Hub model ID
hub_strategy="every_save", # "every_save", "checkpoint", "end"
hub_token=None, # Hub authentication token
hub_private_repo=False, # Make repo private
# ===== Data Handling =====
dataloader_num_workers=0, # DataLoader workers
dataloader_pin_memory=True, # Pin memory
dataloader_drop_last=False, # Drop last incomplete batch
dataloader_prefetch_factor=None, # Prefetch factor
remove_unused_columns=True, # Remove unused dataset columns
label_names=None, # Label column names
# ===== Other =====
seed=42, # Random seed
data_seed=None, # Data sampling seed
jit_mode_eval=False, # Use PyTorch JIT for eval
use_ipex=False, # Use Intel Extension for PyTorch
torch_compile=False, # Use torch.compile()
torch_compile_backend=None, # Compile backend
torch_compile_mode=None, # Compile mode
include_inputs_for_metrics=False, # Pass inputs to compute_metrics
skip_memory_metrics=True, # Skip memory profiling
)
```
### Trainer
Main training class with full training loop.
**Key Methods:**
```python
from transformers import Trainer
trainer = Trainer(
model=None, # Model to train
args=None, # TrainingArguments
data_collator=None, # Data collator
train_dataset=None, # Training dataset
eval_dataset=None, # Evaluation dataset
tokenizer=None, # Tokenizer
model_init=None, # Function to instantiate model
compute_metrics=None, # Function to compute metrics
callbacks=None, # List of callbacks
optimizers=(None, None), # (optimizer, scheduler) tuple
preprocess_logits_for_metrics=None, # Preprocess logits before metrics
)
# Train model
train_result = trainer.train(
resume_from_checkpoint=None, # Resume from checkpoint
trial=None, # Optuna/Ray trial
ignore_keys_for_eval=None, # Keys to ignore in eval
)
# Evaluate model
eval_result = trainer.evaluate(
eval_dataset=None, # Eval dataset (default: self.eval_dataset)
ignore_keys=None, # Keys to ignore
metric_key_prefix="eval", # Prefix for metric names
)
# Make predictions
predictions = trainer.predict(
test_dataset, # Test dataset
ignore_keys=None, # Keys to ignore
metric_key_prefix="test", # Metric prefix
)
# Returns: PredictionOutput(predictions, label_ids, metrics)
# Save model
trainer.save_model(output_dir=None)
# Push to Hub
trainer.push_to_hub(
commit_message="End of training",
blocking=True,
**kwargs
)
# Hyperparameter search
best_trial = trainer.hyperparameter_search(
hp_space=None, # Hyperparameter search space
compute_objective=None, # Objective function
n_trials=20, # Number of trials
direction="minimize", # "minimize" or "maximize"
backend=None, # "optuna", "ray", "sigopt"
**kwargs
)
# Create optimizer
optimizer = trainer.create_optimizer()
# Create scheduler
scheduler = trainer.create_scheduler(
num_training_steps,
optimizer=None
)
# Log metrics
trainer.log_metrics(split, metrics)
trainer.save_metrics(split, metrics)
# Save checkpoint
trainer.save_state()
# Access current step/epoch
current_step = trainer.state.global_step
current_epoch = trainer.state.epoch
# Access training logs
logs = trainer.state.log_history
```
### Seq2SeqTrainer
Specialized trainer for sequence-to-sequence models.
```python
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# Use Seq2SeqTrainingArguments with additional parameters
training_args = Seq2SeqTrainingArguments(
output_dir="./results",
predict_with_generate=True, # Use generate() for evaluation
generation_max_length=None, # Max length for generation
generation_num_beams=None, # Num beams for generation
**other_training_arguments
)
# Trainer usage is identical to Trainer
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)
```
## Pipeline Classes
### pipeline()
Unified inference API for all tasks.
```python
from transformers import pipeline
pipe = pipeline(
task=None, # Task name (required)
model=None, # Model name/path or model object
config=None, # Model config
tokenizer=None, # Tokenizer
feature_extractor=None, # Feature extractor
image_processor=None, # Image processor
framework=None, # "pt" or "tf"
revision=None, # Model revision
use_fast=True, # Use fast tokenizer
token=None, # HF token
device=None, # Device (-1 for CPU, 0+ for GPU)
device_map=None, # Device map for multi-GPU
torch_dtype=None, # Model dtype
trust_remote_code=False, # Allow custom code
model_kwargs=None, # Additional model kwargs
pipeline_class=None, # Custom pipeline class
**kwargs
)
# Use pipeline
results = pipe(
inputs, # Input data
**task_specific_parameters
)
```
## Data Collators
Batch and pad data for training.
```python
from transformers import (
DataCollatorWithPadding, # Dynamic padding for classification
DataCollatorForTokenClassification, # Padding for token classification
DataCollatorForSeq2Seq, # Padding for seq2seq
DataCollatorForLanguageModeling, # MLM/CLM data collation
default_data_collator, # Simple collator (no padding)
)
# Text classification
data_collator = DataCollatorWithPadding(
tokenizer=tokenizer,
padding=True,
max_length=None,
pad_to_multiple_of=None,
)
# Token classification
data_collator = DataCollatorForTokenClassification(
tokenizer=tokenizer,
padding=True,
max_length=None,
pad_to_multiple_of=None,
label_pad_token_id=-100,
)
# Seq2Seq
data_collator = DataCollatorForSeq2Seq(
tokenizer=tokenizer,
model=None,
padding=True,
max_length=None,
pad_to_multiple_of=None,
label_pad_token_id=-100,
)
# Language modeling
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=True, # Masked LM (False for causal LM)
mlm_probability=0.15, # Mask probability
pad_to_multiple_of=None,
)
```
## Optimization & Scheduling
```python
from transformers import (
    AdamW, # AdamW optimizer (deprecated in recent versions — prefer torch.optim.AdamW)
Adafactor, # Adafactor optimizer
get_scheduler, # Get LR scheduler
get_linear_schedule_with_warmup,
get_cosine_schedule_with_warmup,
get_polynomial_decay_schedule_with_warmup,
)
# Create optimizer
optimizer = AdamW(
model.parameters(),
lr=5e-5,
betas=(0.9, 0.999),
eps=1e-8,
weight_decay=0.01,
)
# Create scheduler
scheduler = get_scheduler(
name="linear", # "linear", "cosine", "polynomial", "constant"
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=total_steps,
)
# Or use specific schedulers
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=warmup_steps,
num_training_steps=total_steps,
)
scheduler = get_cosine_schedule_with_warmup(
optimizer,
num_warmup_steps=warmup_steps,
num_training_steps=total_steps,
num_cycles=0.5,
)
```
## Configuration Classes
```python
from transformers import AutoConfig
# Load configuration
config = AutoConfig.from_pretrained(
pretrained_model_name_or_path,
**kwargs
)
# Common configuration attributes
config.vocab_size # Vocabulary size
config.hidden_size # Hidden layer size
config.num_hidden_layers # Number of layers
config.num_attention_heads # Attention heads
config.intermediate_size # FFN intermediate size
config.hidden_dropout_prob # Dropout probability
config.attention_probs_dropout_prob # Attention dropout
config.max_position_embeddings # Max sequence length
# Save configuration
config.save_pretrained(save_directory)
# Create model from config
from transformers import AutoModel
model = AutoModel.from_config(config)
```
## Utility Functions
```python
from transformers import (
set_seed, # Set random seed
logging, # Logging utilities
)
# Set seed for reproducibility
set_seed(42)
# Configure logging
logging.set_verbosity_info()
logging.set_verbosity_warning()
logging.set_verbosity_error()
logging.set_verbosity_debug()
# Get logger
logger = logging.get_logger(__name__)
```
## Model Outputs
All models return model-specific output classes (subclasses of `ModelOutput`):
```python
# Common output attributes
outputs.loss # Loss (if labels provided)
outputs.logits # Model logits
outputs.hidden_states # All hidden states (if output_hidden_states=True)
outputs.attentions # Attention weights (if output_attentions=True)
# Seq2Seq specific
outputs.encoder_last_hidden_state
outputs.encoder_hidden_states
outputs.encoder_attentions
outputs.decoder_hidden_states
outputs.decoder_attentions
outputs.cross_attentions
# Access as dict or tuple
logits = outputs.logits
logits = outputs["logits"]
loss, logits = outputs.to_tuple()[:2]
```
This reference covers the most commonly used API components. For complete documentation, refer to https://huggingface.co/docs/transformers.

View File

@@ -0,0 +1,530 @@
# Text Generation Strategies
Comprehensive guide to text generation methods in Transformers for controlling output quality, creativity, and diversity.
## Overview
Text generation is the process of predicting tokens sequentially using a language model. The choice of generation strategy significantly impacts output quality, diversity, and computational cost.
**When to use each strategy:**
- **Greedy**: Fast, deterministic, good for short outputs or when consistency is critical
- **Beam Search**: Better quality for tasks with clear "correct" answers (translation, summarization)
- **Sampling**: Creative, diverse outputs for open-ended generation (stories, dialogue)
- **Top-k/Top-p**: Balanced creativity and coherence
## Basic Generation Methods
### Greedy Decoding
Selects the highest probability token at each step. Fast but prone to repetition and suboptimal sequences.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
inputs = tokenizer("The future of AI", return_tensors="pt")
# Greedy decoding (default)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0]))
```
**Characteristics:**
- Deterministic (always same output for same input)
- Fast (single forward pass per token)
- Prone to repetition in longer sequences
- Best for: Short generations, deterministic applications
**Parameters:**
```python
outputs = model.generate(
**inputs,
max_new_tokens=50, # Number of tokens to generate
min_length=10, # Minimum total length
    pad_token_id=tokenizer.pad_token_id, # NOTE: GPT-2 has no pad token — commonly set to tokenizer.eos_token_id
)
```
### Beam Search
Maintains multiple hypotheses (beams) and selects the sequence with highest overall probability.
```python
outputs = model.generate(
**inputs,
max_new_tokens=50,
num_beams=5, # Number of beams
early_stopping=True, # Stop when all beams finish
no_repeat_ngram_size=2, # Prevent 2-gram repetition
)
```
**Characteristics:**
- Higher quality than greedy for tasks with "correct" answers
- Slower than greedy (num_beams forward passes per step)
- Still can suffer from repetition
- Best for: Translation, summarization, QA generation
**Advanced Parameters:**
```python
outputs = model.generate(
**inputs,
num_beams=5,
num_beam_groups=1, # Diverse beam search groups
diversity_penalty=0.0, # Penalty for similar beams
length_penalty=1.0, # >1: longer sequences, <1: shorter
early_stopping=True, # Stop when num_beams sequences finish
no_repeat_ngram_size=2, # Block repeating n-grams
num_return_sequences=1, # Return top-k sequences (≤ num_beams)
)
```
**Length Penalty:**
- `length_penalty > 1.0`: Favor longer sequences
- `length_penalty = 1.0`: No penalty
- `length_penalty < 1.0`: Favor shorter sequences
### Sampling (Multinomial)
Randomly sample tokens according to the probability distribution.
```python
outputs = model.generate(
**inputs,
max_new_tokens=50,
do_sample=True, # Enable sampling
temperature=1.0, # Sampling temperature
    num_beams=1, # 1 = pure multinomial sampling; >1 combines beam search with sampling
)
```
**Characteristics:**
- Non-deterministic (different output each time)
- More diverse and creative than greedy/beam search
- Can produce incoherent output if not controlled
- Best for: Creative writing, dialogue, open-ended generation
**Temperature Parameter:**
```python
# Low temperature (0.1-0.7): More focused, less random
outputs = model.generate(**inputs, do_sample=True, temperature=0.5)
# Medium temperature (0.7-1.0): Balanced
outputs = model.generate(**inputs, do_sample=True, temperature=0.8)
# High temperature (1.0-2.0): More random, more creative
outputs = model.generate(**inputs, do_sample=True, temperature=1.5)
```
- `temperature → 0`: Approaches greedy decoding
- `temperature = 1.0`: Sample from original distribution
- `temperature > 1.0`: Flatter distribution, more random
- `temperature < 1.0`: Sharper distribution, more confident
## Advanced Sampling Methods
### Top-k Sampling
Sample from only the k most likely tokens.
```python
outputs = model.generate(
**inputs,
do_sample=True,
max_new_tokens=50,
top_k=50, # Consider top 50 tokens
temperature=0.8,
)
```
**How it works:**
1. Filter to top-k most probable tokens
2. Renormalize probabilities
3. Sample from filtered distribution
**Choosing k:**
- `k=1`: Equivalent to greedy decoding
- `k=10-50`: More focused, coherent output
- `k=100-500`: More diverse output
- Too high k: Includes low-probability tokens (noise)
- Too low k: Less diverse, may miss good alternatives
### Top-p (Nucleus) Sampling
Sample from the smallest set of tokens whose cumulative probability ≥ p.
```python
outputs = model.generate(
**inputs,
do_sample=True,
max_new_tokens=50,
top_p=0.95, # Nucleus probability
temperature=0.8,
)
```
**How it works:**
1. Sort tokens by probability
2. Find smallest set with cumulative probability ≥ p
3. Sample from this set
**Choosing p:**
- `p=0.9-0.95`: Good balance (recommended)
- `p=1.0`: Sample from full distribution
- Higher p: More diverse, might include unlikely tokens
- Lower p: More focused, like top-k with adaptive k
**Top-p vs Top-k:**
- Top-p adapts to probability distribution shape
- Top-k is fixed regardless of distribution
- Top-p generally better for variable-quality contexts
- Can combine: `top_k=50, top_p=0.95` (apply both filters)
### Combining Strategies
```python
# Recommended for high-quality open-ended generation
outputs = model.generate(
**inputs,
do_sample=True,
max_new_tokens=100,
temperature=0.8, # Moderate temperature
top_k=50, # Limit to top 50 tokens
top_p=0.95, # Nucleus sampling
repetition_penalty=1.2, # Discourage repetition
no_repeat_ngram_size=3, # Block 3-gram repetition
)
```
## Controlling Generation Quality
### Repetition Control
Prevent models from repeating themselves:
```python
outputs = model.generate(
**inputs,
max_new_tokens=100,
# Method 1: Repetition penalty
repetition_penalty=1.2, # Penalize repeated tokens (>1.0)
# Method 2: Block n-gram repetition
no_repeat_ngram_size=3, # Never repeat 3-grams
# Method 3: Encoder repetition penalty (for seq2seq)
    encoder_repetition_penalty=1.0, # Penalize tokens NOT in the input (discourages drifting from the source)
)
```
**Repetition Penalty Values:**
- `1.0`: No penalty
- `1.0-1.5`: Mild penalty (recommended: 1.1-1.3)
- `>1.5`: Strong penalty (may harm coherence)
### Length Control
```python
outputs = model.generate(
**inputs,
# Hard constraints
min_length=20, # Minimum total length
max_length=100, # Maximum total length
max_new_tokens=50, # Maximum new tokens (excluding input)
# Soft constraints (with beam search)
length_penalty=1.0, # Encourage longer/shorter outputs
# Early stopping
early_stopping=True, # Stop when condition met
)
```
### Bad Words and Forced Tokens
```python
# Prevent specific tokens
bad_words_ids = [
tokenizer.encode("badword1", add_special_tokens=False),
tokenizer.encode("badword2", add_special_tokens=False),
]
outputs = model.generate(
**inputs,
bad_words_ids=bad_words_ids,
)
# Force specific tokens
force_words_ids = [
tokenizer.encode("important", add_special_tokens=False),
]
outputs = model.generate(
**inputs,
force_words_ids=force_words_ids,
)
```
## Streaming Generation
Generate and process tokens as they're produced:
```python
from transformers import TextStreamer, TextIteratorStreamer
from threading import Thread
# Simple streaming (prints to stdout)
streamer = TextStreamer(tokenizer, skip_prompt=True)
outputs = model.generate(**inputs, streamer=streamer, max_new_tokens=100)
# Iterator streaming (for custom processing)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=100)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for text in streamer:
print(text, end="", flush=True)
thread.join()
```
## Advanced Techniques
### Contrastive Search
Balance coherence and diversity using contrastive objective:
```python
outputs = model.generate(
**inputs,
max_new_tokens=50,
penalty_alpha=0.6, # Contrastive penalty
top_k=4, # Consider top-4 tokens
)
```
**When to use:**
- Open-ended text generation
- Reduces repetition without sacrificing coherence
- Good alternative to sampling
### Diverse Beam Search
Generate multiple diverse outputs:
```python
outputs = model.generate(
**inputs,
max_new_tokens=50,
num_beams=10,
num_beam_groups=5, # 5 groups of 2 beams each
diversity_penalty=1.0, # Penalty for similar beams
num_return_sequences=5, # Return 5 diverse outputs
)
```
### Constrained Beam Search
Force output to include specific phrases:
```python
from transformers import PhrasalConstraint
constraints = [
PhrasalConstraint(
tokenizer("machine learning", add_special_tokens=False).input_ids
),
]
outputs = model.generate(
**inputs,
constraints=constraints,
num_beams=10, # Requires beam search
)
```
## Speculative Decoding
Accelerate generation using a smaller draft model:
```python
from transformers import AutoModelForCausalLM
# Load main and assistant models
model = AutoModelForCausalLM.from_pretrained("large-model")
assistant_model = AutoModelForCausalLM.from_pretrained("small-model")
# Generate with speculative decoding
outputs = model.generate(
**inputs,
assistant_model=assistant_model,
do_sample=True,
temperature=0.8,
)
```
**Benefits:**
- 2-3x faster generation
- Identical output distribution to regular generation
- Works with sampling and greedy decoding
## Recipe: Recommended Settings by Task
### Creative Writing / Dialogue
```python
outputs = model.generate(
**inputs,
do_sample=True,
max_new_tokens=200,
temperature=0.9,
top_p=0.95,
top_k=50,
repetition_penalty=1.2,
no_repeat_ngram_size=3,
)
```
### Translation / Summarization
```python
outputs = model.generate(
**inputs,
num_beams=5,
max_new_tokens=150,
early_stopping=True,
length_penalty=1.0,
no_repeat_ngram_size=2,
)
```
### Code Generation
```python
outputs = model.generate(
**inputs,
max_new_tokens=300,
temperature=0.2, # Low temperature for correctness
top_p=0.95,
do_sample=True,
)
```
### Chatbot / Instruction Following
```python
outputs = model.generate(
**inputs,
do_sample=True,
max_new_tokens=256,
temperature=0.7,
top_p=0.9,
repetition_penalty=1.15,
)
```
### Factual QA / Information Extraction
```python
outputs = model.generate(
**inputs,
max_new_tokens=50,
num_beams=3,
early_stopping=True,
# Or greedy for very short answers:
# (no special parameters needed)
)
```
## Debugging Generation
### Check Token Probabilities
```python
outputs = model.generate(
**inputs,
max_new_tokens=20,
output_scores=True, # Return generation scores
return_dict_in_generate=True, # Return as dict
)
# Access generation scores
scores = outputs.scores # Tuple (one entry per generated step) of tensors of shape (batch_size, vocab_size)
# Get token probabilities
import torch
probs = torch.softmax(scores[0], dim=-1)
```
### Monitor Generation Process
```python
from transformers import LogitsProcessor, LogitsProcessorList
class DebugLogitsProcessor(LogitsProcessor):
def __call__(self, input_ids, scores):
# Print top 5 tokens at each step
top_tokens = scores[0].topk(5)
print(f"Top 5 tokens: {top_tokens}")
return scores
outputs = model.generate(
**inputs,
max_new_tokens=10,
logits_processor=LogitsProcessorList([DebugLogitsProcessor()]),
)
```
## Common Issues and Solutions
**Issue: Repetitive output**
- Solution: Increase `repetition_penalty` (1.2-1.5), set `no_repeat_ngram_size=3`
- For sampling: Increase `temperature`, enable `top_p`
**Issue: Incoherent output**
- Solution: Lower `temperature` (0.5-0.8), use beam search
- Set `top_k=50` or `top_p=0.9` to filter unlikely tokens
**Issue: Too short output**
- Solution: Increase `min_length`, set `length_penalty > 1.0` (beam search)
- Check if EOS token is being generated early
**Issue: Too slow generation**
- Solution: Use greedy instead of beam search
- Reduce `num_beams`
- Try speculative decoding with assistant model
- Use smaller model variant
**Issue: Output doesn't follow format**
- Solution: Use constrained beam search
- Add format examples to prompt
- Use `bad_words_ids` to prevent format-breaking tokens
## Performance Optimization
```python
# Use half precision
model = AutoModelForCausalLM.from_pretrained(
"model-name",
torch_dtype=torch.float16,
device_map="auto"
)
# Use KV cache optimization (default, but can be disabled)
outputs = model.generate(**inputs, use_cache=True)
# Batch generation — decoder-only models need a pad token and left padding,
# e.g. tokenizer.pad_token = tokenizer.eos_token; tokenizer.padding_side = "left"
inputs = tokenizer(["Prompt 1", "Prompt 2"], return_tensors="pt", padding=True)
outputs = model.generate(**inputs, max_new_tokens=50)
# Static cache for longer sequences (if supported)
outputs = model.generate(**inputs, cache_implementation="static")
```
This guide covers the main generation strategies. For task-specific examples, see `task_patterns.md`.

---
# Model Quantization Guide
Comprehensive guide to reducing model memory footprint through quantization while maintaining accuracy.
## Overview
Quantization reduces memory requirements by storing model weights in lower precision formats (int8, int4) instead of full precision (float32). This enables:
- Running larger models on limited hardware
- Faster inference (reduced memory bandwidth)
- Lower deployment costs
- Enabling fine-tuning of models that wouldn't fit in memory
**Tradeoffs:**
- Slight accuracy loss (typically < 1-2%)
- Initial quantization overhead
- Some methods require calibration data
## Quick Comparison
| Method | Precision | Speed | Accuracy | Fine-tuning | Hardware | Setup |
|--------|-----------|-------|----------|-------------|----------|-------|
| **Bitsandbytes** | 4/8-bit | Fast | High | Yes (PEFT) | CUDA, CPU | Easy |
| **GPTQ** | 2-8-bit | Very Fast | High | Limited | CUDA, ROCm, Metal | Medium |
| **AWQ** | 4-bit | Very Fast | High | Yes (PEFT) | CUDA, ROCm | Medium |
| **GGUF** | 1-8-bit | Medium | Variable | No | CPU-optimized | Easy |
| **HQQ** | 1-8-bit | Fast | High | Yes | Multi-platform | Medium |
## Bitsandbytes (BnB)
On-the-fly quantization with excellent PEFT fine-tuning support.
### 8-bit Quantization
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
load_in_8bit=True, # Enable 8-bit quantization
device_map="auto", # Automatic device placement
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Use normally
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=50)
```
**Memory Savings:**
- 7B model: ~14GB → ~7GB (50% reduction)
- 13B model: ~26GB → ~13GB
- 70B model: ~140GB → ~70GB
**Characteristics:**
- Fast inference
- Minimal accuracy loss
- Works with PEFT (LoRA, QLoRA)
- Supports CPU and CUDA GPUs
### 4-bit Quantization (QLoRA)
```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
load_in_4bit=True, # Enable 4-bit quantization
bnb_4bit_quant_type="nf4", # Quantization type ("nf4" or "fp4")
bnb_4bit_compute_dtype=torch.float16, # Computation dtype
bnb_4bit_use_double_quant=True, # Nested quantization for more savings
)
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
quantization_config=bnb_config,
device_map="auto",
)
```
**Memory Savings:**
- 7B model: ~14GB → ~4GB (70% reduction)
- 13B model: ~26GB → ~7GB
- 70B model: ~140GB → ~35GB
**Quantization Types:**
- `nf4`: Normal Float 4 (recommended, better quality)
- `fp4`: Floating Point 4 (slightly more memory efficient)
**Compute Dtype:**
```python
# For better quality
bnb_4bit_compute_dtype=torch.float16
# For best performance on Ampere+ GPUs
bnb_4bit_compute_dtype=torch.bfloat16
```
**Double Quantization:**
```python
# Enable for additional ~0.4 bits/param savings
bnb_4bit_use_double_quant=True # Quantize the quantization constants
```
### Fine-tuning with QLoRA
```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
# Load quantized model
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
quantization_config=bnb_config,
device_map="auto",
)
# Prepare for training
model = prepare_model_for_kbit_training(model)
# Configure LoRA
lora_config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
# Train normally
trainer = Trainer(model=model, args=training_args, ...)
trainer.train()
```
## GPTQ
Post-training quantization requiring calibration, optimized for inference speed.
### Loading GPTQ Models
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
# Load pre-quantized GPTQ model
model = AutoModelForCausalLM.from_pretrained(
"TheBloke/Llama-2-7B-GPTQ", # Pre-quantized model
device_map="auto",
revision="gptq-4bit-32g-actorder_True", # Specific quantization config
)
# Or quantize yourself
gptq_config = GPTQConfig(
bits=4, # 2, 3, 4, 8 bits
dataset="c4", # Calibration dataset
tokenizer=tokenizer,
)
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
device_map="auto",
quantization_config=gptq_config,
)
# Save quantized model
model.save_pretrained("llama-2-7b-gptq")
```
**Configuration Options:**
```python
gptq_config = GPTQConfig(
bits=4, # Quantization bits
group_size=128, # Group size for quantization (128, 32, -1)
dataset="c4", # Calibration dataset
desc_act=False, # Activation order (can improve accuracy)
sym=True, # Symmetric quantization
damp_percent=0.1, # Dampening factor
)
```
**Characteristics:**
- Fastest inference among quantization methods
- Requires one-time calibration (slow)
- Best when using pre-quantized models from Hub
- Limited fine-tuning support
- Excellent for production deployment
## AWQ (Activation-aware Weight Quantization)
Activation-aware method that identifies the most salient weights (using activation statistics from calibration data) and protects them during quantization, preserving model quality.
### Loading AWQ Models
```python
from transformers import AutoModelForCausalLM, AwqConfig
# Load pre-quantized AWQ model
model = AutoModelForCausalLM.from_pretrained(
"TheBloke/Llama-2-7B-AWQ",
device_map="auto",
)
# Or quantize yourself
awq_config = AwqConfig(
bits=4, # 4-bit quantization
group_size=128, # Quantization group size
zero_point=True, # Use zero-point quantization
version="GEMM", # Quantization version
)
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
quantization_config=awq_config,
device_map="auto",
)
```
**Characteristics:**
- Better accuracy than GPTQ at same bit width
- Excellent inference speed
- Supports PEFT fine-tuning
- Requires calibration data
### Fine-tuning AWQ Models
```python
from peft import LoraConfig, get_peft_model
# AWQ models support LoRA fine-tuning
lora_config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
trainer = Trainer(model=model, ...)
trainer.train()
```
## GGUF (GGML Format)
CPU-optimized quantization format, popular in the llama.cpp ecosystem.
### Using GGUF Models
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load GGUF model
model = AutoModelForCausalLM.from_pretrained(
"TheBloke/Llama-2-7B-GGUF",
gguf_file="llama-2-7b.Q4_K_M.gguf", # Specific quantization file
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7B-GGUF")
```
**GGUF Quantization Types:**
- `Q4_0`: 4-bit, smallest, lowest quality
- `Q4_K_M`: 4-bit, medium quality (recommended)
- `Q5_K_M`: 5-bit, good quality
- `Q6_K`: 6-bit, high quality
- `Q8_0`: 8-bit, very high quality
**Characteristics:**
- Optimized for CPU inference
- Wide range of bit depths (1-8)
- Good for Apple Silicon (M1/M2)
- No fine-tuning support
- Excellent for local/edge deployment
## HQQ (Half-Quadratic Quantization)
Flexible quantization with good accuracy retention.
### Using HQQ
```python
from transformers import AutoModelForCausalLM, HqqConfig
hqq_config = HqqConfig(
nbits=4, # Quantization bits
group_size=64, # Group size
quant_zero=False, # Quantize zero point
quant_scale=False, # Quantize scale
axis=0, # Quantization axis
)
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
quantization_config=hqq_config,
device_map="auto",
)
```
**Characteristics:**
- Very fast quantization
- No calibration data needed
- Support for 1-8 bits
- Can serialize/deserialize
- Good accuracy vs size tradeoff
## Choosing a Quantization Method
### Decision Tree
**For inference only:**
1. Need fastest inference? → **GPTQ or AWQ** (use pre-quantized models)
2. CPU-only deployment? → **GGUF**
3. Want easiest setup? → **Bitsandbytes 8-bit**
4. Need extreme compression? → **GGUF Q4_0 or HQQ 2-bit**
**For fine-tuning:**
1. Limited VRAM? → **QLoRA (BnB 4-bit + LoRA)**
2. Want best accuracy? → **Bitsandbytes 8-bit + LoRA**
3. Need very large models? → **QLoRA with double quantization**
**For production:**
1. Latency-critical? → **GPTQ or AWQ**
2. Cost-optimized? → **Bitsandbytes 8-bit**
3. CPU deployment? → **GGUF**
## Memory Requirements
Approximate memory for Llama-2 7B model:
| Method | Memory | vs FP16 |
|--------|--------|---------|
| FP32 | 28GB | 2x |
| FP16 / BF16 | 14GB | 1x |
| 8-bit (BnB) | 7GB | 0.5x |
| 4-bit (QLoRA) | 3.5GB | 0.25x |
| 4-bit Double Quant | 3GB | 0.21x |
| GPTQ 4-bit | 4GB | 0.29x |
| AWQ 4-bit | 4GB | 0.29x |
**Note:** Add ~1-2GB for inference activations, KV cache, and framework overhead.
## Best Practices
### For Training
```python
# QLoRA recommended configuration
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16, # BF16 if available
bnb_4bit_use_double_quant=True,
)
# LoRA configuration
lora_config = LoraConfig(
r=16, # Rank (8, 16, 32, 64)
lora_alpha=32, # Scaling (typically 2*r)
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)
```
### For Inference
```python
# High-speed inference
model = AutoModelForCausalLM.from_pretrained(
"TheBloke/Llama-2-7B-GPTQ",
device_map="auto",
torch_dtype=torch.float16, # Use FP16 for activations
)
# Balanced quality/speed
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
load_in_8bit=True,
device_map="auto",
)
# Maximum compression
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
),
device_map="auto",
)
```
### Multi-GPU Setups
```python
# Automatically distribute across GPUs
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-70b-hf",
load_in_4bit=True,
device_map="auto", # Automatic distribution
max_memory={0: "20GB", 1: "20GB"}, # Optional: limit per GPU
)
# Manual device map
device_map = {
"model.embed_tokens": 0,
"model.layers.0": 0,
"model.layers.1": 0,
# ... distribute layers ...
"model.norm": 1,
"lm_head": 1,
}
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-70b-hf",
load_in_4bit=True,
device_map=device_map,
)
```
## Troubleshooting
**Issue: OOM during quantization**
```python
# Solution: Use low_cpu_mem_usage
model = AutoModelForCausalLM.from_pretrained(
"model-name",
quantization_config=config,
device_map="auto",
low_cpu_mem_usage=True, # Reduce CPU memory during loading
)
```
**Issue: Slow quantization**
```python
# GPTQ/AWQ take time to calibrate
# Solution: Use pre-quantized models from Hub
model = AutoModelForCausalLM.from_pretrained("TheBloke/Model-GPTQ")
# Or use BnB for instant quantization
model = AutoModelForCausalLM.from_pretrained("model-name", load_in_4bit=True)
```
**Issue: Poor quality after quantization**
```python
# Try different quantization types
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4", # Try "nf4" instead of "fp4"
bnb_4bit_compute_dtype=torch.bfloat16, # Use BF16 if available
)
# Or use 8-bit instead of 4-bit
model = AutoModelForCausalLM.from_pretrained("model-name", load_in_8bit=True)
```
**Issue: Can't fine-tune quantized model**
```python
# Ensure using compatible quantization method
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)
# Only BnB and AWQ support PEFT fine-tuning
# GPTQ has limited support, GGUF doesn't support fine-tuning
```
## Performance Benchmarks
Approximate generation speed (tokens/sec) for Llama-2 7B on A100 40GB:
| Method | Speed | Memory |
|--------|-------|--------|
| FP16 | 100 tok/s | 14GB |
| 8-bit | 90 tok/s | 7GB |
| 4-bit QLoRA | 70 tok/s | 4GB |
| GPTQ 4-bit | 95 tok/s | 4GB |
| AWQ 4-bit | 95 tok/s | 4GB |
**Note:** Actual performance varies by hardware, sequence length, and batch size.
## Resources
- **Pre-quantized models:** Search "GPTQ" or "AWQ" on Hugging Face Hub
- **BnB documentation:** https://github.com/TimDettmers/bitsandbytes
- **PEFT library:** https://github.com/huggingface/peft
- **QLoRA paper:** https://arxiv.org/abs/2305.14314
For task-specific quantization examples, see `training_guide.md`.

---
# Task-Specific Patterns
Quick reference for implementing common tasks with Transformers. Each pattern includes the complete workflow from data loading to inference.
## Text Classification
Classify text into predefined categories (sentiment, topic, intent, etc.).
```python
from transformers import (
AutoTokenizer, AutoModelForSequenceClassification,
TrainingArguments, Trainer, DataCollatorWithPadding
)
from datasets import load_dataset
# 1. Load data
dataset = load_dataset("imdb")
# 2. Preprocess
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def preprocess(examples):
return tokenizer(examples["text"], truncation=True, max_length=512)
tokenized = dataset.map(preprocess, batched=True)
# 3. Model
model = AutoModelForSequenceClassification.from_pretrained(
"bert-base-uncased",
num_labels=2,
id2label={0: "negative", 1: "positive"},
label2id={"negative": 0, "positive": 1}
)
# 4. Train
training_args = TrainingArguments(
output_dir="./results",
learning_rate=2e-5,
per_device_train_batch_size=16,
num_train_epochs=3,
eval_strategy="epoch",
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized["train"],
eval_dataset=tokenized["test"],
tokenizer=tokenizer,
data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
trainer.train()
# 5. Inference
text = "This movie was fantastic!"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
predictions = outputs.logits.argmax(-1)
print(model.config.id2label[predictions.item()]) # "positive"
```
## Token Classification (NER)
Label each token in text (named entities, POS tags, etc.).
```python
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset
# Load data (tokens and NER tags)
dataset = load_dataset("conll2003")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize_and_align_labels(examples):
tokenized_inputs = tokenizer(
examples["tokens"],
truncation=True,
is_split_into_words=True
)
labels = []
for i, label in enumerate(examples["ner_tags"]):
word_ids = tokenized_inputs.word_ids(batch_index=i)
label_ids = []
previous_word_idx = None
for word_idx in word_ids:
if word_idx is None:
label_ids.append(-100) # Special tokens
elif word_idx != previous_word_idx:
label_ids.append(label[word_idx])
else:
label_ids.append(-100) # Subword tokens
previous_word_idx = word_idx
labels.append(label_ids)
tokenized_inputs["labels"] = labels
return tokenized_inputs
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
# Model
label_list = dataset["train"].features["ner_tags"].feature.names
model = AutoModelForTokenClassification.from_pretrained(
"bert-base-cased",
num_labels=len(label_list),
id2label={i: label for i, label in enumerate(label_list)},
label2id={label: i for i, label in enumerate(label_list)}
)
# Training similar to classification
# ... (use Trainer with DataCollatorForTokenClassification)
```
## Question Answering (Extractive)
Extract answer spans from context.
```python
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
question = "What is the capital of France?"
context = "Paris is the capital and most populous city of France."
inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)
# Get answer span
answer_start = outputs.start_logits.argmax()
answer_end = outputs.end_logits.argmax() + 1
answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])
print(answer) # "Paris"
```
## Text Generation
Generate text continuations.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
prompt = "In the future, artificial intelligence will"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
**inputs,
max_new_tokens=100,
do_sample=True,
temperature=0.8,
top_p=0.95,
repetition_penalty=1.2,
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
```
## Summarization
Condense long text into summaries.
```python
from transformers import (
AutoTokenizer, AutoModelForSeq2SeqLM,
Seq2SeqTrainingArguments, Seq2SeqTrainer,
DataCollatorForSeq2Seq
)
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
def preprocess(examples):
inputs = ["summarize: " + doc for doc in examples["document"]]
model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
labels = tokenizer(
examples["summary"],
max_length=128,
truncation=True
)
model_inputs["labels"] = labels["input_ids"]
return model_inputs
tokenized_dataset = dataset.map(preprocess, batched=True)
# Training
training_args = Seq2SeqTrainingArguments(
output_dir="./results",
predict_with_generate=True, # Important for seq2seq
eval_strategy="epoch",
learning_rate=2e-5,
per_device_train_batch_size=8,
num_train_epochs=3,
)
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset["train"],
eval_dataset=tokenized_dataset["validation"],
tokenizer=tokenizer,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)
trainer.train()
# Inference
text = "Long article text here..."
inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
```
## Translation
Translate text between languages.
```python
from transformers import pipeline
translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
result = translator("Hello, how are you?")
print(result[0]["translation_text"]) # "Bonjour, comment allez-vous?"
# For fine-tuning, similar to summarization with Seq2SeqTrainer
```
## Image Classification
Classify images into categories.
```python
from transformers import (
AutoImageProcessor, AutoModelForImageClassification,
TrainingArguments, Trainer
)
from datasets import load_dataset
from PIL import Image
# Load data
dataset = load_dataset("food101", split="train[:1000]")
# Preprocess
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
def transform(examples):
examples["pixel_values"] = [
processor(img.convert("RGB"), return_tensors="pt")["pixel_values"][0]
for img in examples["image"]
]
return examples
dataset = dataset.with_transform(transform)
# Model
model = AutoModelForImageClassification.from_pretrained(
"google/vit-base-patch16-224",
num_labels=101,
ignore_mismatched_sizes=True
)
# Training
training_args = TrainingArguments(
output_dir="./results",
remove_unused_columns=False, # Keep image data
eval_strategy="epoch",
learning_rate=5e-5,
per_device_train_batch_size=32,
num_train_epochs=3,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=dataset,
tokenizer=processor,
)
trainer.train()
# Inference
image = Image.open("food.jpg")
inputs = processor(image, return_tensors="pt")
outputs = model(**inputs)
predicted_class = outputs.logits.argmax(-1).item()
print(model.config.id2label[predicted_class])
```
## Object Detection
Detect and localize objects in images.
```python
from transformers import pipeline
from PIL import Image
detector = pipeline("object-detection", model="facebook/detr-resnet-50")
image = Image.open("street.jpg")
results = detector(image)
for result in results:
print(f"{result['label']}: {result['score']:.2f} at {result['box']}")
# car: 0.98 at {'xmin': 123, 'ymin': 456, 'xmax': 789, 'ymax': 1011}
```
## Image Segmentation
Segment images into regions.
```python
from transformers import pipeline
segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")
image = "path/to/image.jpg"
segments = segmenter(image)
for segment in segments:
print(f"{segment['label']}: {segment['score']:.2f}")
# Access mask: segment['mask']
```
## Image Captioning
Generate textual descriptions of images.
```python
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
image = Image.open("photo.jpg")
inputs = processor(images=image, return_tensors="pt")
outputs = model.generate(**inputs, max_length=50)
caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]
print(caption) # "a dog sitting on grass"
```
## Speech Recognition (ASR)
Transcribe speech to text.
```python
from transformers import pipeline
transcriber = pipeline(
"automatic-speech-recognition",
model="openai/whisper-base"
)
result = transcriber("audio.mp3")
print(result["text"]) # "Hello, this is a test."
# With timestamps
result = transcriber("audio.mp3", return_timestamps=True)
for chunk in result["chunks"]:
print(f"[{chunk['timestamp'][0]:.1f}s - {chunk['timestamp'][1]:.1f}s]: {chunk['text']}")
```
## Text-to-Speech
Generate speech from text.
```python
from transformers import pipeline
synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")
result = synthesizer("Hello, how are you today?")
# result["audio"] contains the waveform
# result["sampling_rate"] contains the sample rate
# Save audio
import scipy
scipy.io.wavfile.write("output.wav", result["sampling_rate"], result["audio"][0])
```
## Visual Question Answering
Answer questions about images.
```python
from transformers import pipeline
from PIL import Image
vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
image = Image.open("photo.jpg")
question = "What color is the car?"
result = vqa(image=image, question=question)
print(result[0]["answer"]) # "red"
```
## Document Question Answering
Extract information from documents (PDFs, images with text).
```python
from transformers import pipeline
doc_qa = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
result = doc_qa(
image="invoice.png",
question="What is the total amount?"
)
print(result["answer"]) # "$1,234.56"
```
## Zero-Shot Classification
Classify without training data.
```python
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
text = "This is a delicious Italian restaurant with great pasta."
candidate_labels = ["food", "travel", "technology", "sports"]
result = classifier(text, candidate_labels)
print(result["labels"][0]) # "food"
print(result["scores"][0]) # 0.95
```
## Few-Shot Learning with LLMs
Use large language models for few-shot tasks.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Few-shot prompt
prompt = """
Classify the sentiment: positive, negative, or neutral.
Text: "I love this product!"
Sentiment: positive
Text: "This is terrible."
Sentiment: negative
Text: "It's okay, nothing special."
Sentiment: neutral
Text: "Best purchase ever!"
Sentiment:"""
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=5, temperature=0.1)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response.split("Sentiment:")[-1].strip()) # "positive"
```
## Instruction-Following / Chat
Use instruction-tuned models.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is machine learning?"},
]
formatted = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
inputs = tokenizer(formatted, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=200, temperature=0.7)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract assistant response
assistant_response = response.split("[/INST]")[-1].strip()
print(assistant_response)
```
## Embeddings / Semantic Search
Generate embeddings for semantic similarity.
```python
from transformers import AutoTokenizer, AutoModel
import torch
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
def get_embedding(text):
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
outputs = model(**inputs)
# Mean pooling
embeddings = outputs.last_hidden_state.mean(dim=1)
return embeddings
# Get embeddings
text1 = "Machine learning is a subset of AI"
text2 = "AI includes machine learning"
emb1 = get_embedding(text1)
emb2 = get_embedding(text2)
# Compute similarity
similarity = torch.nn.functional.cosine_similarity(emb1, emb2)
print(f"Similarity: {similarity.item():.4f}") # ~0.85
```
## Multimodal Understanding (CLIP)
Connect vision and language.
```python
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
image = Image.open("photo.jpg")
texts = ["a dog", "a cat", "a car", "a house"]
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
# Get similarity scores
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
for text, prob in zip(texts, probs[0]):
print(f"{text}: {prob.item():.4f}")
```
## Common Evaluation Metrics
```python
from datasets import load_metric
# Accuracy (classification)
metric = load_metric("accuracy")
predictions = [0, 1, 1, 0]
references = [0, 1, 0, 0]
result = metric.compute(predictions=predictions, references=references)
# F1 Score (classification, NER)
metric = load_metric("f1")
result = metric.compute(predictions=predictions, references=references)
# BLEU (translation)
metric = load_metric("bleu")
predictions = ["hello there general kenobi"]
references = [["hello there general kenobi", "hello there!"]]
result = metric.compute(predictions=predictions, references=references)
# ROUGE (summarization)
metric = load_metric("rouge")
predictions = ["summary text"]
references = ["reference summary"]
result = metric.compute(predictions=predictions, references=references)
```
## Common Data Collators
```python
from transformers import (
DataCollatorWithPadding,
DataCollatorForTokenClassification,
DataCollatorForSeq2Seq,
DataCollatorForLanguageModeling,
)
# Classification: dynamic padding
DataCollatorWithPadding(tokenizer=tokenizer)
# NER: pad labels too
DataCollatorForTokenClassification(tokenizer=tokenizer)
# Seq2Seq: pad inputs and labels
DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# Language modeling: create MLM masks
DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
```
This covers the most common task patterns. For detailed parameter tuning, see `api_reference.md` and `generation_strategies.md`.