mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-27 07:09:27 +08:00
Add more scientific skills
This commit is contained in:
860
scientific-packages/transformers/SKILL.md
Normal file
860
scientific-packages/transformers/SKILL.md
Normal file
@@ -0,0 +1,860 @@
|
||||
---
|
||||
name: transformers
|
||||
description: Comprehensive toolkit for working with Hugging Face Transformers library for state-of-the-art machine learning across NLP, computer vision, audio, and multimodal tasks. Use this skill when working with pretrained models, fine-tuning transformers, implementing text generation, image classification, speech recognition, or any task involving transformer architectures like BERT, GPT, T5, Vision Transformers, CLIP, or Whisper.
|
||||
---
|
||||
|
||||
# Transformers
|
||||
|
||||
## Overview
|
||||
|
||||
Transformers is Hugging Face's flagship library providing unified access to over 1 million pretrained models for machine learning across text, vision, audio, and multimodal domains. The library serves as a standardized model-definition framework compatible with PyTorch, TensorFlow, and JAX, emphasizing ease of use through three core components:
|
||||
|
||||
- **Pipeline**: Simple, optimized inference API for common tasks
|
||||
- **AutoClasses**: Automatic model/tokenizer selection from pretrained checkpoints
|
||||
- **Trainer**: Full-featured training loop with distributed training, mixed precision, and optimization
|
||||
|
||||
The library prioritizes accessibility with pretrained models that reduce computational costs and carbon footprint while providing compatibility across major training frameworks (PyTorch-Lightning, DeepSpeed, vLLM, etc.).
|
||||
|
||||
## Quick Start with Pipelines
|
||||
|
||||
Use pipelines for simple, efficient inference without managing models, tokenizers, or preprocessing manually. Pipelines abstract complexity into a single function call.
|
||||
|
||||
### Basic Pipeline Usage
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
# Text classification
|
||||
classifier = pipeline("text-classification")
|
||||
result = classifier("This restaurant is awesome")
|
||||
# [{'label': 'POSITIVE', 'score': 0.9998}]
|
||||
|
||||
# Text generation
|
||||
generator = pipeline("text-generation", model="meta-llama/Llama-2-7b-hf")
|
||||
generator("The secret to baking a good cake is", max_length=50)
|
||||
|
||||
# Question answering
|
||||
qa = pipeline("question-answering")
|
||||
qa(question="What is extractive QA?", context="Extractive QA is...")
|
||||
|
||||
# Image classification
|
||||
img_classifier = pipeline("image-classification")
|
||||
img_classifier("path/to/image.jpg")
|
||||
|
||||
# Automatic speech recognition
|
||||
transcriber = pipeline("automatic-speech-recognition")
|
||||
transcriber("audio_file.mp3")
|
||||
```
|
||||
|
||||
### Available Pipeline Tasks
|
||||
|
||||
**NLP Tasks:**
|
||||
- `text-classification`, `token-classification`, `question-answering`
|
||||
- `fill-mask`, `summarization`, `translation`
|
||||
- `text-generation` (the legacy `conversational` pipeline is deprecated — use `text-generation` with chat templates instead)
|
||||
- `zero-shot-classification`, `sentiment-analysis`
|
||||
|
||||
**Vision Tasks:**
|
||||
- `image-classification`, `image-segmentation`, `object-detection`
|
||||
- `depth-estimation`, `image-to-image`, `zero-shot-image-classification`
|
||||
|
||||
**Audio Tasks:**
|
||||
- `automatic-speech-recognition`, `audio-classification`
|
||||
- `text-to-audio`, `zero-shot-audio-classification`
|
||||
|
||||
**Multimodal Tasks:**
|
||||
- `visual-question-answering`, `document-question-answering`
|
||||
- `image-to-text`, `zero-shot-object-detection`
|
||||
|
||||
### Pipeline Best Practices
|
||||
|
||||
**Device Management:**
|
||||
```python
|
||||
from transformers import pipeline, infer_device
|
||||
|
||||
device = infer_device() # Auto-detect best device
|
||||
pipe = pipeline("text-generation", model="...", device=device)
|
||||
```
|
||||
|
||||
**Batch Processing:**
|
||||
```python
|
||||
# Process multiple inputs efficiently
|
||||
results = classifier(["Text 1", "Text 2", "Text 3"])
|
||||
|
||||
# Use KeyDataset for large datasets
|
||||
from transformers.pipelines.pt_utils import KeyDataset
|
||||
from datasets import load_dataset
|
||||
|
||||
dataset = load_dataset("imdb", split="test")
|
||||
for result in pipe(KeyDataset(dataset, "text")):
|
||||
print(result)
|
||||
```
|
||||
|
||||
**Memory Optimization:**
|
||||
```python
|
||||
import torch

# Use half-precision for faster inference
|
||||
pipe = pipeline("text-generation", model="...",
|
||||
torch_dtype=torch.float16, device="cuda")
|
||||
```
|
||||
|
||||
## Core Components
|
||||
|
||||
### AutoClasses for Model Loading
|
||||
|
||||
AutoClasses automatically select the correct architecture based on pretrained checkpoints.
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoModel, AutoTokenizer, AutoConfig,
|
||||
AutoModelForCausalLM, AutoModelForSequenceClassification
|
||||
)
|
||||
|
||||
# Load any model by checkpoint name
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
model = AutoModel.from_pretrained("bert-base-uncased")
|
||||
|
||||
# Task-specific model classes
|
||||
causal_lm = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
classifier = AutoModelForSequenceClassification.from_pretrained(
|
||||
"bert-base-uncased",
|
||||
num_labels=3
|
||||
)
|
||||
|
||||
# Load with device and dtype optimization
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
device_map="auto", # Automatically distribute across devices
|
||||
torch_dtype="auto" # Use optimal dtype
|
||||
)
|
||||
```
|
||||
|
||||
**Key Parameters:**
|
||||
- `device_map="auto"`: Optimal device allocation (CPU/GPU/multi-GPU)
|
||||
- `torch_dtype`: Control precision (torch.float16, torch.bfloat16, "auto")
|
||||
- `trust_remote_code`: Enable custom model code (use cautiously)
|
||||
- `use_fast`: Enable Rust-backed fast tokenizers (default True)
|
||||
|
||||
### Tokenization
|
||||
|
||||
Tokenizers convert text to model-compatible tensor inputs.
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
# Basic tokenization
|
||||
tokens = tokenizer.tokenize("Hello, how are you?")
|
||||
# ['hello', ',', 'how', 'are', 'you', '?']
|
||||
|
||||
# Encoding (text → token IDs)
|
||||
encoded = tokenizer("Hello, how are you?", return_tensors="pt")
|
||||
# {'input_ids': tensor([[...]]), 'attention_mask': tensor([[...]])}
|
||||
|
||||
# Batch encoding with padding and truncation
|
||||
batch = tokenizer(
|
||||
["Short text", "This is a much longer text..."],
|
||||
padding=True, # Pad to longest in batch
|
||||
truncation=True, # Truncate to model's max length
|
||||
max_length=512,
|
||||
return_tensors="pt"
|
||||
)
|
||||
|
||||
# Decoding (token IDs → text)
|
||||
text = tokenizer.decode(encoded['input_ids'][0])
|
||||
```
|
||||
|
||||
**Special Tokens:**
|
||||
```python
|
||||
# Access special tokens
|
||||
tokenizer.pad_token # Padding token
|
||||
tokenizer.cls_token # Classification token
|
||||
tokenizer.sep_token # Separator token
|
||||
tokenizer.mask_token # Mask token (for MLM)
|
||||
|
||||
# Add custom tokens
|
||||
tokenizer.add_tokens(["[CUSTOM]"])
|
||||
tokenizer.add_special_tokens({'additional_special_tokens': ['[NEW]']})
|
||||
|
||||
# Resize model embeddings to match new vocabulary
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
```
|
||||
|
||||
### Image Processors
|
||||
|
||||
For vision tasks, use image processors instead of tokenizers.
|
||||
|
||||
```python
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
|
||||
|
||||
# Process single image
|
||||
from PIL import Image
|
||||
image = Image.open("path/to/image.jpg")
|
||||
inputs = processor(image, return_tensors="pt")
|
||||
# Returns: {'pixel_values': tensor([[...]])}
|
||||
|
||||
# Batch processing
|
||||
images = [Image.open(f"img{i}.jpg") for i in range(3)]
|
||||
inputs = processor(images, return_tensors="pt")
|
||||
```
|
||||
|
||||
### Processors for Multimodal Models
|
||||
|
||||
Multimodal models use processors that combine image and text processing.
|
||||
|
||||
```python
|
||||
from transformers import AutoProcessor
|
||||
|
||||
processor = AutoProcessor.from_pretrained("microsoft/git-base")
|
||||
|
||||
# Process image + text caption
|
||||
inputs = processor(
|
||||
images=image,
|
||||
text="A description of the image",
|
||||
return_tensors="pt",
|
||||
padding=True
|
||||
)
|
||||
```
|
||||
|
||||
## Model Inference
|
||||
|
||||
### Basic Inference Pattern
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# Load model and tokenizer
|
||||
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
|
||||
# Tokenize input
|
||||
inputs = tokenizer("The future of AI is", return_tensors="pt")
|
||||
|
||||
# Generate (for causal LM)
|
||||
outputs = model.generate(**inputs, max_length=50)
|
||||
text = tokenizer.decode(outputs[0])
|
||||
|
||||
# Or get model outputs directly
|
||||
outputs = model(**inputs)
|
||||
logits = outputs.logits # Shape: (batch_size, seq_len, vocab_size)
|
||||
```
|
||||
|
||||
### Text Generation Strategies
|
||||
|
||||
For generative models, control generation behavior with parameters:
|
||||
|
||||
```python
|
||||
# Greedy decoding (default)
|
||||
output = model.generate(inputs, max_length=50)
|
||||
|
||||
# Beam search (multiple hypotheses)
|
||||
output = model.generate(
|
||||
inputs,
|
||||
max_length=50,
|
||||
num_beams=5, # Keep top 5 beams
|
||||
early_stopping=True
|
||||
)
|
||||
|
||||
# Sampling with temperature
|
||||
output = model.generate(
|
||||
inputs,
|
||||
max_length=50,
|
||||
do_sample=True,
|
||||
temperature=0.7, # Lower = more focused, higher = more random
|
||||
top_k=50, # Sample from top 50 tokens
|
||||
top_p=0.95 # Nucleus sampling
|
||||
)
|
||||
|
||||
# Streaming generation
|
||||
from transformers import TextStreamer
|
||||
|
||||
streamer = TextStreamer(tokenizer)
|
||||
model.generate(**inputs, streamer=streamer, max_length=100)
|
||||
```
|
||||
|
||||
**Generation Parameters:**
|
||||
- `max_length` / `max_new_tokens`: Control output length
|
||||
- `num_beams`: Beam search width (1 = greedy)
|
||||
- `temperature`: Randomness (0.7-1.0 typical)
|
||||
- `top_k`: Sample from top k tokens
|
||||
- `top_p`: Nucleus sampling threshold
|
||||
- `repetition_penalty`: Discourage repetition (>1.0)
|
||||
|
||||
Refer to `references/generation_strategies.md` for detailed information on choosing appropriate strategies.
|
||||
|
||||
## Training and Fine-Tuning
|
||||
|
||||
### Training Workflow Overview
|
||||
|
||||
1. **Load dataset** → 2. **Preprocess** → 3. **Configure training** → 4. **Train** → 5. **Evaluate** → 6. **Save/Share**
|
||||
|
||||
### Text Classification Example
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoTokenizer, AutoModelForSequenceClassification,
|
||||
TrainingArguments, Trainer, DataCollatorWithPadding
|
||||
)
|
||||
from datasets import load_dataset
|
||||
|
||||
# 1. Load dataset
|
||||
dataset = load_dataset("imdb")
|
||||
|
||||
# 2. Preprocess
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
def preprocess(examples):
|
||||
return tokenizer(examples["text"], truncation=True)
|
||||
|
||||
tokenized = dataset.map(preprocess, batched=True)
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# 3. Load model
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
"bert-base-uncased",
|
||||
num_labels=2,
|
||||
id2label={0: "negative", 1: "positive"},
|
||||
label2id={"negative": 0, "positive": 1}
|
||||
)
|
||||
|
||||
# 4. Configure training
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=16,
|
||||
per_device_eval_batch_size=16,
|
||||
num_train_epochs=3,
|
||||
weight_decay=0.01,
|
||||
eval_strategy="epoch",
|
||||
save_strategy="epoch",
|
||||
load_best_model_at_end=True,
|
||||
push_to_hub=False,
|
||||
)
|
||||
|
||||
# 5. Train
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized["train"],
|
||||
eval_dataset=tokenized["test"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# 6. Evaluate and save
|
||||
metrics = trainer.evaluate()
|
||||
trainer.save_model("./my-finetuned-model")
|
||||
trainer.push_to_hub() # Share to Hugging Face Hub
|
||||
```
|
||||
|
||||
### Vision Task Fine-Tuning
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoImageProcessor, AutoModelForImageClassification,
|
||||
TrainingArguments, Trainer
|
||||
)
|
||||
from datasets import load_dataset
|
||||
|
||||
# Load dataset
|
||||
dataset = load_dataset("food101", split="train[:5000]")
|
||||
|
||||
# Image preprocessing
|
||||
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
|
||||
|
||||
def transform(examples):
|
||||
examples["pixel_values"] = [
|
||||
processor(img.convert("RGB"), return_tensors="pt")["pixel_values"][0]
|
||||
for img in examples["image"]
|
||||
]
|
||||
return examples
|
||||
|
||||
dataset = dataset.with_transform(transform)
|
||||
|
||||
# Load model
|
||||
model = AutoModelForImageClassification.from_pretrained(
|
||||
"google/vit-base-patch16-224",
|
||||
num_labels=101, # 101 food categories
|
||||
ignore_mismatched_sizes=True
|
||||
)
|
||||
|
||||
# Training (similar pattern to text)
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./vit-food101",
|
||||
remove_unused_columns=False, # Keep image data
|
||||
eval_strategy="epoch",
|
||||
save_strategy="epoch",
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=32,
|
||||
num_train_epochs=3,
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=dataset,
|
||||
tokenizer=processor,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
### Sequence-to-Sequence Tasks
|
||||
|
||||
For tasks like summarization and translation, use Seq2SeqTrainer:
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoTokenizer, AutoModelForSeq2SeqLM,
|
||||
Seq2SeqTrainingArguments, Seq2SeqTrainer,
|
||||
DataCollatorForSeq2Seq
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("t5-small")
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
|
||||
|
||||
def preprocess(examples):
|
||||
# Prefix input for T5
|
||||
inputs = ["summarize: " + doc for doc in examples["text"]]
|
||||
model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
|
||||
|
||||
# Tokenize targets
|
||||
labels = tokenizer(
|
||||
examples["summary"],
|
||||
max_length=128,
|
||||
truncation=True
|
||||
)
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
return model_inputs
|
||||
|
||||
tokenized_dataset = dataset.map(preprocess, batched=True)
|
||||
|
||||
training_args = Seq2SeqTrainingArguments(
|
||||
output_dir="./t5-summarization",
|
||||
eval_strategy="epoch",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=8,
|
||||
num_train_epochs=3,
|
||||
predict_with_generate=True, # Important for seq2seq
|
||||
)
|
||||
|
||||
trainer = Seq2SeqTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized_dataset["train"],
|
||||
eval_dataset=tokenized_dataset["test"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
### Important TrainingArguments
|
||||
|
||||
```python
|
||||
TrainingArguments(
|
||||
# Essential
|
||||
output_dir="./results",
|
||||
num_train_epochs=3,
|
||||
per_device_train_batch_size=8,
|
||||
learning_rate=2e-5,
|
||||
|
||||
# Evaluation
|
||||
eval_strategy="epoch", # or "steps"
|
||||
eval_steps=500, # if eval_strategy="steps"
|
||||
|
||||
# Checkpointing
|
||||
save_strategy="epoch",
|
||||
save_steps=500,
|
||||
save_total_limit=2, # Keep only 2 best checkpoints
|
||||
load_best_model_at_end=True,
|
||||
metric_for_best_model="accuracy",
|
||||
|
||||
# Optimization
|
||||
gradient_accumulation_steps=4,
|
||||
warmup_steps=500,
|
||||
weight_decay=0.01,
|
||||
max_grad_norm=1.0,
|
||||
|
||||
# Mixed Precision
|
||||
fp16=True, # For older Nvidia GPUs — enable only ONE of fp16/bf16
# bf16=True, # For Ampere+ GPUs (preferred when supported); use instead of fp16
|
||||
|
||||
# Logging
|
||||
logging_steps=100,
|
||||
report_to="tensorboard", # or "wandb", "mlflow"
|
||||
|
||||
# Memory Optimization
|
||||
gradient_checkpointing=True,
|
||||
optim="adamw_torch", # or "adafactor" for memory
|
||||
|
||||
# Distributed Training
|
||||
ddp_find_unused_parameters=False,
|
||||
)
|
||||
```
|
||||
|
||||
Refer to `references/training_guide.md` for comprehensive training patterns and optimization strategies.
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
### Model Quantization
|
||||
|
||||
Reduce memory footprint while maintaining accuracy:
|
||||
|
||||
```python
|
||||
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
|
||||
|
||||
# 8-bit quantization
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
load_in_8bit=True,
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
# 4-bit quantization (even smaller)
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.float16,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=bnb_config,
|
||||
device_map="auto"
|
||||
)
|
||||
```
|
||||
|
||||
**Quantization Methods:**
|
||||
- **Bitsandbytes**: 4/8-bit on-the-fly quantization, supports PEFT fine-tuning
|
||||
- **GPTQ**: 2/3/4/8-bit, requires calibration, very fast inference
|
||||
- **AWQ**: 4-bit activation-aware, balanced speed/accuracy
|
||||
|
||||
Refer to `references/quantization.md` for detailed comparison and usage patterns.
|
||||
|
||||
### Training Optimization
|
||||
|
||||
```python
|
||||
# Gradient accumulation (simulate larger batch)
|
||||
training_args = TrainingArguments(
|
||||
per_device_train_batch_size=4,
|
||||
gradient_accumulation_steps=8, # Effective batch = 4 * 8 = 32
|
||||
)
|
||||
|
||||
# Gradient checkpointing (reduce memory, slower)
|
||||
training_args = TrainingArguments(
|
||||
gradient_checkpointing=True,
|
||||
)
|
||||
|
||||
# Mixed precision training
|
||||
training_args = TrainingArguments(
|
||||
bf16=True, # or fp16=True
|
||||
)
|
||||
|
||||
# Efficient optimizer
|
||||
training_args = TrainingArguments(
|
||||
optim="adafactor", # Lower memory than AdamW
|
||||
)
|
||||
```
|
||||
|
||||
**Key Strategies:**
|
||||
- **Batch sizes**: Use powers of 2 (8, 16, 32, 64, 128)
|
||||
- **Gradient accumulation**: Enables larger effective batch sizes
|
||||
- **Gradient checkpointing**: Reduces memory ~60%, increases time ~20%
|
||||
- **Mixed precision**: bf16 for Ampere+ GPUs, fp16 for older
|
||||
- **torch.compile**: Optimize model graph (PyTorch 2.0+)
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Custom Training Loop
|
||||
|
||||
For maximum control, bypass Trainer:
|
||||
|
||||
```python
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.optim import AdamW  # transformers.AdamW is deprecated/removed; use PyTorch's
from transformers import get_scheduler
|
||||
|
||||
# Prepare data
|
||||
train_dataloader = DataLoader(tokenized_dataset, batch_size=8, shuffle=True)
|
||||
|
||||
# Setup optimizer and scheduler
|
||||
optimizer = AdamW(model.parameters(), lr=5e-5)
|
||||
scheduler = get_scheduler(
|
||||
"linear",
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=0,
|
||||
num_training_steps=len(train_dataloader) * num_epochs
|
||||
)
|
||||
|
||||
# Training loop
|
||||
model.train()
|
||||
for epoch in range(num_epochs):
|
||||
for batch in train_dataloader:
|
||||
batch = {k: v.to(device) for k, v in batch.items()}
|
||||
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
loss.backward()
|
||||
|
||||
optimizer.step()
|
||||
scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
```
|
||||
|
||||
### Parameter-Efficient Fine-Tuning (PEFT)
|
||||
|
||||
Use PEFT library with transformers for efficient fine-tuning:
|
||||
|
||||
```python
|
||||
from peft import LoraConfig, get_peft_model
|
||||
|
||||
# Configure LoRA
|
||||
lora_config = LoraConfig(
|
||||
r=16, # Low-rank dimension
|
||||
lora_alpha=32,
|
||||
target_modules=["q_proj", "v_proj"], # Which layers to adapt
|
||||
lora_dropout=0.05,
|
||||
bias="none",
|
||||
task_type="CAUSAL_LM"
|
||||
)
|
||||
|
||||
# Apply to model
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
model = get_peft_model(model, lora_config)
|
||||
|
||||
# Now train as usual - only LoRA parameters train
|
||||
trainer = Trainer(model=model, ...)
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
### Chat Templates
|
||||
|
||||
Apply chat templates for instruction-tuned models:
|
||||
|
||||
```python
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "What is machine learning?"},
|
||||
]
|
||||
|
||||
# Format according to model's chat template
|
||||
formatted = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True
|
||||
)
|
||||
|
||||
# Tokenize and generate
|
||||
inputs = tokenizer(formatted, return_tensors="pt")
|
||||
outputs = model.generate(**inputs, max_length=200)
|
||||
response = tokenizer.decode(outputs[0])
|
||||
```
|
||||
|
||||
### Multi-GPU Training
|
||||
|
||||
```python
|
||||
# Automatic with Trainer - no code changes needed
|
||||
# Just run with: accelerate launch train.py
|
||||
|
||||
# Or use PyTorch DDP explicitly
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
ddp_find_unused_parameters=False,
|
||||
# ... other args
|
||||
)
|
||||
|
||||
# For larger models, use FSDP
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
fsdp="full_shard auto_wrap",
|
||||
fsdp_config={
|
||||
"fsdp_transformer_layer_cls_to_wrap": ["BertLayer"],
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
## Task-Specific Patterns
|
||||
|
||||
### Question Answering (Extractive)
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
|
||||
|
||||
result = qa(
|
||||
question="What is extractive QA?",
|
||||
context="Extractive QA extracts the answer from the given context..."
|
||||
)
|
||||
# {'answer': 'extracts the answer from the given context', 'score': 0.97, ...}
|
||||
```
|
||||
|
||||
### Named Entity Recognition
|
||||
|
||||
```python
|
||||
ner = pipeline("token-classification", model="dslim/bert-base-NER")
|
||||
|
||||
result = ner("My name is John and I live in New York")
|
||||
# [{'entity': 'B-PER', 'word': 'John', ...}, {'entity': 'B-LOC', 'word': 'New York', ...}]
|
||||
```
|
||||
|
||||
### Image Captioning
|
||||
|
||||
```python
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
|
||||
processor = AutoProcessor.from_pretrained("microsoft/git-base")
|
||||
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
|
||||
|
||||
from PIL import Image
|
||||
image = Image.open("image.jpg")
|
||||
|
||||
inputs = processor(images=image, return_tensors="pt")
|
||||
outputs = model.generate(**inputs, max_length=50)
|
||||
caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]
|
||||
```
|
||||
|
||||
### Speech Recognition
|
||||
|
||||
```python
|
||||
transcriber = pipeline(
|
||||
"automatic-speech-recognition",
|
||||
model="openai/whisper-base"
|
||||
)
|
||||
|
||||
result = transcriber("audio.mp3")
|
||||
# {'text': 'This is the transcribed text...'}
|
||||
|
||||
# With timestamps
|
||||
result = transcriber("audio.mp3", return_timestamps=True)
|
||||
```
|
||||
|
||||
## Common Patterns and Best Practices
|
||||
|
||||
### Saving and Loading Models
|
||||
|
||||
```python
|
||||
# Save entire model
|
||||
model.save_pretrained("./my-model")
|
||||
tokenizer.save_pretrained("./my-model")
|
||||
|
||||
# Load later
|
||||
model = AutoModel.from_pretrained("./my-model")
|
||||
tokenizer = AutoTokenizer.from_pretrained("./my-model")
|
||||
|
||||
# Push to Hugging Face Hub
|
||||
model.push_to_hub("username/my-model")
|
||||
tokenizer.push_to_hub("username/my-model")
|
||||
|
||||
# Load from Hub
|
||||
model = AutoModel.from_pretrained("username/my-model")
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
from transformers import AutoModel
|
||||
import torch
|
||||
|
||||
try:
|
||||
model = AutoModel.from_pretrained("model-name")
|
||||
except OSError:
|
||||
print("Model not found - check internet connection or model name")
|
||||
except torch.cuda.OutOfMemoryError:
|
||||
print("GPU memory exceeded - try quantization or smaller batch size")
|
||||
```
|
||||
|
||||
### Device Management
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
# Check device availability
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
# Move model to device
|
||||
model = model.to(device)
|
||||
|
||||
# Or use device_map for automatic distribution
|
||||
model = AutoModel.from_pretrained("model-name", device_map="auto")
|
||||
|
||||
# For inputs
|
||||
inputs = tokenizer(text, return_tensors="pt").to(device)
|
||||
```
|
||||
|
||||
### Memory Management
|
||||
|
||||
```python
|
||||
import torch
|
||||
|
||||
# Clear CUDA cache
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# Use context manager for inference
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs)
|
||||
|
||||
# Delete unused models
|
||||
del model
|
||||
torch.cuda.empty_cache()
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
This skill includes comprehensive reference documentation and example scripts:
|
||||
|
||||
### scripts/
|
||||
|
||||
- `quick_inference.py`: Ready-to-use script for running inference with pipelines
|
||||
- `fine_tune_classifier.py`: Complete example for fine-tuning a text classifier
|
||||
- `generate_text.py`: Text generation with various strategies
|
||||
|
||||
Execute scripts directly or read them as implementation templates.
|
||||
|
||||
### references/
|
||||
|
||||
- `api_reference.md`: Comprehensive API documentation for key classes
|
||||
- `training_guide.md`: Detailed training patterns, optimization, and troubleshooting
|
||||
- `generation_strategies.md`: In-depth guide to text generation methods
|
||||
- `quantization.md`: Model quantization techniques comparison and usage
|
||||
- `task_patterns.md`: Quick reference for common task implementations
|
||||
|
||||
Load reference files when you need detailed information on specific topics. References contain extensive examples, parameter explanations, and best practices.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Import errors:**
|
||||
```bash
|
||||
pip install transformers
|
||||
pip install accelerate # For device_map="auto"
|
||||
pip install bitsandbytes # For quantization
|
||||
```
|
||||
|
||||
**CUDA out of memory:**
|
||||
- Reduce batch size
|
||||
- Enable gradient checkpointing
|
||||
- Use gradient accumulation
|
||||
- Try quantization (8-bit or 4-bit)
|
||||
- Use smaller model variant
|
||||
|
||||
**Slow training:**
|
||||
- Enable mixed precision (fp16/bf16)
|
||||
- Increase batch size (if memory allows)
|
||||
- Use torch.compile (PyTorch 2.0+)
|
||||
- Check data loading isn't bottleneck
|
||||
|
||||
**Poor generation quality:**
|
||||
- Adjust temperature (lower = more focused)
|
||||
- Try different decoding strategies (beam search vs sampling)
|
||||
- Increase max_length if outputs cut off
|
||||
- Use repetition_penalty to reduce repetition
|
||||
|
||||
For task-specific guidance, consult the appropriate reference file in the `references/` directory.
|
||||
699
scientific-packages/transformers/references/api_reference.md
Normal file
699
scientific-packages/transformers/references/api_reference.md
Normal file
@@ -0,0 +1,699 @@
|
||||
# Transformers API Reference
|
||||
|
||||
This document provides comprehensive API reference for the most commonly used classes and methods in the Transformers library.
|
||||
|
||||
## Core Model Classes
|
||||
|
||||
### PreTrainedModel
|
||||
|
||||
Base class for all models. Handles loading, saving, and common model operations.
|
||||
|
||||
**Key Methods:**
|
||||
|
||||
```python
|
||||
from transformers import PreTrainedModel
|
||||
|
||||
# Load pretrained model
|
||||
model = ModelClass.from_pretrained(
|
||||
pretrained_model_name_or_path,
|
||||
config=None, # Custom config
|
||||
cache_dir=None, # Custom cache location
|
||||
force_download=False, # Force re-download
|
||||
resume_download=False, # Resume interrupted download
|
||||
proxies=None, # HTTP proxies
|
||||
local_files_only=False, # Only use cached files
|
||||
token=None, # HF auth token
|
||||
revision="main", # Git branch/tag
|
||||
trust_remote_code=False, # Allow custom model code
|
||||
device_map=None, # Device allocation ("auto", "cpu", "cuda:0", etc.)
|
||||
torch_dtype=None, # Model dtype (torch.float16, "auto", etc.)
|
||||
low_cpu_mem_usage=False, # Reduce CPU memory during loading
|
||||
**model_kwargs
|
||||
)
|
||||
|
||||
# Save model
|
||||
model.save_pretrained(
|
||||
save_directory,
|
||||
save_config=True, # Save config.json
|
||||
state_dict=None, # Custom state dict
|
||||
save_function=torch.save, # Custom save function
|
||||
push_to_hub=False, # Upload to Hub
|
||||
max_shard_size="5GB", # Max checkpoint size
|
||||
safe_serialization=True, # Use SafeTensors format
|
||||
variant=None, # Model variant name
|
||||
)
|
||||
|
||||
# Generate text (for generative models)
|
||||
outputs = model.generate(
|
||||
inputs=None, # Input token IDs
|
||||
max_length=20, # Max total length
|
||||
max_new_tokens=None, # Max new tokens to generate
|
||||
min_length=0, # Minimum length
|
||||
do_sample=False, # Enable sampling
|
||||
early_stopping=False, # Stop once all num_beams candidate sequences are finished
|
||||
num_beams=1, # Beam search width
|
||||
temperature=1.0, # Sampling temperature
|
||||
top_k=50, # Top-k sampling
|
||||
top_p=1.0, # Nucleus sampling
|
||||
repetition_penalty=1.0, # Penalize repetition
|
||||
length_penalty=1.0, # Beam search length penalty
|
||||
no_repeat_ngram_size=0, # Block repeated n-grams
|
||||
num_return_sequences=1, # Number of sequences to return
|
||||
**model_kwargs
|
||||
)
|
||||
|
||||
# Resize token embeddings (after adding tokens)
|
||||
new_embeddings = model.resize_token_embeddings(
|
||||
new_num_tokens,
|
||||
pad_to_multiple_of=None
|
||||
)
|
||||
|
||||
# Utility methods
|
||||
num_params = model.num_parameters(only_trainable=False)
|
||||
model.gradient_checkpointing_enable() # Enable gradient checkpointing
|
||||
model.enable_input_require_grads() # For PEFT with frozen models
|
||||
```
|
||||
|
||||
### AutoModel Classes
|
||||
|
||||
Automatically instantiate the correct model architecture.
|
||||
|
||||
**Available Classes:**
|
||||
|
||||
- `AutoModel`: Base model (returns hidden states)
|
||||
- `AutoModelForCausalLM`: Causal language modeling (GPT-style)
|
||||
- `AutoModelForMaskedLM`: Masked language modeling (BERT-style)
|
||||
- `AutoModelForSeq2SeqLM`: Sequence-to-sequence (T5, BART)
|
||||
- `AutoModelForSequenceClassification`: Text classification
|
||||
- `AutoModelForTokenClassification`: Token classification (NER)
|
||||
- `AutoModelForQuestionAnswering`: Extractive QA
|
||||
- `AutoModelForImageClassification`: Image classification
|
||||
- `AutoModelForObjectDetection`: Object detection
|
||||
- `AutoModelForSemanticSegmentation`: Semantic segmentation
|
||||
- `AutoModelForAudioClassification`: Audio classification
|
||||
- `AutoModelForSpeechSeq2Seq`: Speech-to-text
|
||||
- `AutoModelForVision2Seq`: Image captioning, VQA
|
||||
|
||||
**Usage:**
|
||||
|
||||
```python
|
||||
from transformers import AutoModel, AutoConfig
|
||||
|
||||
# Load with default configuration
|
||||
model = AutoModel.from_pretrained("bert-base-uncased")
|
||||
|
||||
# Load with custom configuration
|
||||
config = AutoConfig.from_pretrained("bert-base-uncased")
|
||||
config.hidden_dropout_prob = 0.2
|
||||
model = AutoModel.from_pretrained("bert-base-uncased", config=config)
|
||||
|
||||
# Register custom models
|
||||
from transformers import AutoConfig, AutoModel
|
||||
|
||||
AutoConfig.register("my-model", MyModelConfig)
|
||||
AutoModel.register(MyModelConfig, MyModel)
|
||||
```
|
||||
|
||||
## Tokenizer Classes
|
||||
|
||||
### PreTrainedTokenizer / PreTrainedTokenizerFast
|
||||
|
||||
Convert text to token IDs and vice versa.
|
||||
|
||||
**Key Methods:**
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
pretrained_model_name_or_path,
|
||||
use_fast=True, # Use fast (Rust) tokenizer if available
|
||||
revision="main",
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Encoding (text → token IDs)
|
||||
encoded = tokenizer(
|
||||
text, # String or List[str]
|
||||
text_pair=None, # Second sequence for pairs
|
||||
add_special_tokens=True, # Add [CLS], [SEP], etc.
|
||||
padding=False, # True, False, "longest", "max_length"
|
||||
truncation=False, # True, False, "longest_first", "only_first", "only_second"
|
||||
max_length=None, # Max sequence length
|
||||
stride=0, # Overlap for split sequences
|
||||
return_tensors=None, # "pt" (PyTorch), "tf" (TensorFlow), "np" (NumPy)
|
||||
return_token_type_ids=None, # Return token type IDs
|
||||
return_attention_mask=None, # Return attention mask
|
||||
return_overflowing_tokens=False, # Return overflowing tokens
|
||||
return_special_tokens_mask=False, # Return special token mask
|
||||
return_offsets_mapping=False, # Return char-level offsets (fast only)
|
||||
return_length=False, # Return sequence lengths
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Decoding (token IDs → text)
|
||||
text = tokenizer.decode(
|
||||
token_ids,
|
||||
skip_special_tokens=False, # Remove special tokens
|
||||
clean_up_tokenization_spaces=True, # Clean up spacing
|
||||
)
|
||||
|
||||
# Batch decoding
|
||||
texts = tokenizer.batch_decode(
|
||||
sequences,
|
||||
skip_special_tokens=False,
|
||||
clean_up_tokenization_spaces=True,
|
||||
)
|
||||
|
||||
# Tokenization (text → tokens)
|
||||
tokens = tokenizer.tokenize(text, **kwargs)
|
||||
|
||||
# Convert tokens to IDs
|
||||
ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
|
||||
# Convert IDs to tokens
|
||||
tokens = tokenizer.convert_ids_to_tokens(ids)
|
||||
|
||||
# Add new tokens
|
||||
num_added = tokenizer.add_tokens(["[NEW_TOKEN1]", "[NEW_TOKEN2]"])
|
||||
|
||||
# Add special tokens
|
||||
tokenizer.add_special_tokens({
|
||||
"bos_token": "[BOS]",
|
||||
"eos_token": "[EOS]",
|
||||
"unk_token": "[UNK]",
|
||||
"sep_token": "[SEP]",
|
||||
"pad_token": "[PAD]",
|
||||
"cls_token": "[CLS]",
|
||||
"mask_token": "[MASK]",
|
||||
"additional_special_tokens": ["[SPECIAL1]", "[SPECIAL2]"],
|
||||
})
|
||||
|
||||
# Chat template formatting
|
||||
formatted = tokenizer.apply_chat_template(
|
||||
conversation, # List[Dict[str, str]] with "role" and "content"
|
||||
chat_template=None, # Custom template
|
||||
add_generation_prompt=False, # Add prompt for model to continue
|
||||
tokenize=True, # Return token IDs
|
||||
padding=False,
|
||||
truncation=False,
|
||||
max_length=None,
|
||||
return_tensors=None,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
# Save tokenizer
|
||||
tokenizer.save_pretrained(save_directory)
|
||||
|
||||
# Get vocab size
|
||||
vocab_size = len(tokenizer)
|
||||
|
||||
# Get special tokens
|
||||
pad_token = tokenizer.pad_token
|
||||
pad_token_id = tokenizer.pad_token_id
|
||||
# Similar for: bos, eos, unk, sep, cls, mask
|
||||
```
|
||||
|
||||
**Special Token Attributes:**
|
||||
|
||||
```python
|
||||
tokenizer.bos_token # Beginning of sequence
|
||||
tokenizer.eos_token # End of sequence
|
||||
tokenizer.unk_token # Unknown token
|
||||
tokenizer.sep_token # Separator token
|
||||
tokenizer.pad_token # Padding token
|
||||
tokenizer.cls_token # Classification token
|
||||
tokenizer.mask_token # Mask token
|
||||
|
||||
# Corresponding IDs
|
||||
tokenizer.bos_token_id
|
||||
tokenizer.eos_token_id
|
||||
# ... etc
|
||||
```
|
||||
|
||||
## Image Processors
|
||||
|
||||
### AutoImageProcessor
|
||||
|
||||
Preprocess images for vision models.
|
||||
|
||||
**Key Methods:**
|
||||
|
||||
```python
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
|
||||
|
||||
# Process images
|
||||
inputs = processor(
|
||||
images, # PIL Image, np.array, torch.Tensor, or List
|
||||
return_tensors="pt", # "pt", "tf", "np", None
|
||||
do_resize=True, # Resize to model size
|
||||
size=None, # Target size dict
|
||||
resample=None, # Resampling method
|
||||
do_rescale=True, # Rescale pixel values
|
||||
do_normalize=True, # Normalize with mean/std
|
||||
image_mean=None, # Custom mean
|
||||
image_std=None, # Custom std
|
||||
do_center_crop=False, # Center crop
|
||||
crop_size=None, # Crop size
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Returns: BatchFeature with 'pixel_values' key
|
||||
```
|
||||
|
||||
## Training Components
|
||||
|
||||
### TrainingArguments
|
||||
|
||||
Configuration for the Trainer class.
|
||||
|
||||
**Essential Arguments:**
|
||||
|
||||
```python
|
||||
from transformers import TrainingArguments
|
||||
|
||||
args = TrainingArguments(
|
||||
# ===== Output & Logging =====
|
||||
output_dir="./results", # REQUIRED: Output directory
|
||||
overwrite_output_dir=False, # Overwrite output directory
|
||||
|
||||
# ===== Training Parameters =====
|
||||
num_train_epochs=3.0, # Number of epochs
|
||||
max_steps=-1, # Max training steps (overrides epochs)
|
||||
per_device_train_batch_size=8, # Train batch size per device
|
||||
per_device_eval_batch_size=8, # Eval batch size per device
|
||||
gradient_accumulation_steps=1, # Accumulation steps
|
||||
|
||||
# ===== Learning Rate & Optimization =====
|
||||
learning_rate=5e-5, # Initial learning rate
|
||||
weight_decay=0.0, # Weight decay
|
||||
adam_beta1=0.9, # Adam beta1
|
||||
adam_beta2=0.999, # Adam beta2
|
||||
adam_epsilon=1e-8, # Adam epsilon
|
||||
max_grad_norm=1.0, # Gradient clipping
|
||||
optim="adamw_torch", # Optimizer ("adamw_torch", "adafactor", "adamw_8bit")
|
||||
|
||||
# ===== Learning Rate Scheduler =====
|
||||
lr_scheduler_type="linear", # Scheduler type
|
||||
warmup_steps=0, # Warmup steps
|
||||
warmup_ratio=0.0, # Warmup ratio (alternative to steps)
|
||||
|
||||
# ===== Evaluation =====
|
||||
eval_strategy="no", # "no", "steps", "epoch"
|
||||
eval_steps=None, # Eval every N steps
|
||||
eval_delay=0, # Delay first eval
|
||||
eval_accumulation_steps=None, # Accumulate eval outputs
|
||||
|
||||
# ===== Checkpointing =====
|
||||
save_strategy="steps", # "no", "steps", "epoch"
|
||||
save_steps=500, # Save every N steps
|
||||
save_total_limit=None, # Max checkpoints to keep
|
||||
save_safetensors=True, # Save as SafeTensors
|
||||
save_on_each_node=False, # Save on each node (distributed)
|
||||
|
||||
# ===== Best Model Selection =====
|
||||
load_best_model_at_end=False, # Load best checkpoint at end
|
||||
metric_for_best_model=None, # Metric to use
|
||||
greater_is_better=None, # True if higher is better
|
||||
|
||||
# ===== Logging =====
|
||||
logging_dir=None, # TensorBoard log directory
|
||||
logging_strategy="steps", # "no", "steps", "epoch"
|
||||
logging_steps=500, # Log every N steps
|
||||
logging_first_step=False, # Log first step
|
||||
logging_nan_inf_filter=True, # Filter NaN/Inf
|
||||
|
||||
# ===== Mixed Precision =====
|
||||
fp16=False, # Use fp16 training
|
||||
fp16_opt_level="O1", # Apex AMP optimization level
|
||||
fp16_backend="auto", # "auto", "apex", "cpu_amp"
|
||||
bf16=False, # Use bfloat16 training
|
||||
bf16_full_eval=False, # Use bf16 for evaluation
|
||||
tf32=None, # Use TF32 (Ampere+ GPUs)
|
||||
|
||||
# ===== Memory Optimization =====
|
||||
gradient_checkpointing=False, # Enable gradient checkpointing
|
||||
gradient_checkpointing_kwargs=None, # Kwargs for gradient checkpointing
|
||||
torch_empty_cache_steps=None, # Clear cache every N steps
|
||||
|
||||
# ===== Distributed Training =====
|
||||
local_rank=-1, # Local rank for distributed
|
||||
ddp_backend=None, # "nccl", "gloo", "mpi", "ccl"
|
||||
ddp_find_unused_parameters=None, # Find unused parameters
|
||||
ddp_bucket_cap_mb=None, # DDP bucket size
|
||||
fsdp="", # FSDP configuration
|
||||
fsdp_config=None, # FSDP config dict
|
||||
deepspeed=None, # DeepSpeed config
|
||||
|
||||
# ===== Hub Integration =====
|
||||
push_to_hub=False, # Push to Hugging Face Hub
|
||||
hub_model_id=None, # Hub model ID
|
||||
hub_strategy="every_save", # "every_save", "checkpoint", "end"
|
||||
hub_token=None, # Hub authentication token
|
||||
hub_private_repo=False, # Make repo private
|
||||
|
||||
# ===== Data Handling =====
|
||||
dataloader_num_workers=0, # DataLoader workers
|
||||
dataloader_pin_memory=True, # Pin memory
|
||||
dataloader_drop_last=False, # Drop last incomplete batch
|
||||
dataloader_prefetch_factor=None, # Prefetch factor
|
||||
remove_unused_columns=True, # Remove unused dataset columns
|
||||
label_names=None, # Label column names
|
||||
|
||||
# ===== Other =====
|
||||
seed=42, # Random seed
|
||||
data_seed=None, # Data sampling seed
|
||||
jit_mode_eval=False, # Use PyTorch JIT for eval
|
||||
use_ipex=False, # Use Intel Extension for PyTorch
|
||||
torch_compile=False, # Use torch.compile()
|
||||
torch_compile_backend=None, # Compile backend
|
||||
torch_compile_mode=None, # Compile mode
|
||||
include_inputs_for_metrics=False, # Pass inputs to compute_metrics
|
||||
skip_memory_metrics=True, # Skip memory profiling
|
||||
)
|
||||
```
|
||||
|
||||
### Trainer
|
||||
|
||||
Main training class with full training loop.
|
||||
|
||||
**Key Methods:**
|
||||
|
||||
```python
|
||||
from transformers import Trainer
|
||||
|
||||
trainer = Trainer(
|
||||
model=None, # Model to train
|
||||
args=None, # TrainingArguments
|
||||
data_collator=None, # Data collator
|
||||
train_dataset=None, # Training dataset
|
||||
eval_dataset=None, # Evaluation dataset
|
||||
tokenizer=None, # Tokenizer
|
||||
model_init=None, # Function to instantiate model
|
||||
compute_metrics=None, # Function to compute metrics
|
||||
callbacks=None, # List of callbacks
|
||||
optimizers=(None, None), # (optimizer, scheduler) tuple
|
||||
preprocess_logits_for_metrics=None, # Preprocess logits before metrics
|
||||
)
|
||||
|
||||
# Train model
|
||||
train_result = trainer.train(
|
||||
resume_from_checkpoint=None, # Resume from checkpoint
|
||||
trial=None, # Optuna/Ray trial
|
||||
ignore_keys_for_eval=None, # Keys to ignore in eval
|
||||
)
|
||||
|
||||
# Evaluate model
|
||||
eval_result = trainer.evaluate(
|
||||
eval_dataset=None, # Eval dataset (default: self.eval_dataset)
|
||||
ignore_keys=None, # Keys to ignore
|
||||
metric_key_prefix="eval", # Prefix for metric names
|
||||
)
|
||||
|
||||
# Make predictions
|
||||
predictions = trainer.predict(
|
||||
test_dataset, # Test dataset
|
||||
ignore_keys=None, # Keys to ignore
|
||||
metric_key_prefix="test", # Metric prefix
|
||||
)
|
||||
# Returns: PredictionOutput(predictions, label_ids, metrics)
|
||||
|
||||
# Save model
|
||||
trainer.save_model(output_dir=None)
|
||||
|
||||
# Push to Hub
|
||||
trainer.push_to_hub(
|
||||
commit_message="End of training",
|
||||
blocking=True,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Hyperparameter search
|
||||
best_trial = trainer.hyperparameter_search(
|
||||
hp_space=None, # Hyperparameter search space
|
||||
compute_objective=None, # Objective function
|
||||
n_trials=20, # Number of trials
|
||||
direction="minimize", # "minimize" or "maximize"
|
||||
backend=None, # "optuna", "ray", "sigopt"
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Create optimizer
|
||||
optimizer = trainer.create_optimizer()
|
||||
|
||||
# Create scheduler
|
||||
scheduler = trainer.create_scheduler(
|
||||
num_training_steps,
|
||||
optimizer=None
|
||||
)
|
||||
|
||||
# Log metrics
|
||||
trainer.log_metrics(split, metrics)
|
||||
trainer.save_metrics(split, metrics)
|
||||
|
||||
# Save checkpoint
|
||||
trainer.save_state()
|
||||
|
||||
# Access current step/epoch
|
||||
current_step = trainer.state.global_step
|
||||
current_epoch = trainer.state.epoch
|
||||
|
||||
# Access training logs
|
||||
logs = trainer.state.log_history
|
||||
```
|
||||
|
||||
### Seq2SeqTrainer
|
||||
|
||||
Specialized trainer for sequence-to-sequence models.
|
||||
|
||||
```python
|
||||
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
|
||||
|
||||
# Use Seq2SeqTrainingArguments with additional parameters
|
||||
training_args = Seq2SeqTrainingArguments(
|
||||
output_dir="./results",
|
||||
predict_with_generate=True, # Use generate() for evaluation
|
||||
generation_max_length=None, # Max length for generation
|
||||
generation_num_beams=None, # Num beams for generation
|
||||
**other_training_arguments
|
||||
)
|
||||
|
||||
# Trainer usage is identical to Trainer
|
||||
trainer = Seq2SeqTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
)
|
||||
```
|
||||
|
||||
## Pipeline Classes
|
||||
|
||||
### pipeline()
|
||||
|
||||
Unified inference API for all tasks.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
pipe = pipeline(
|
||||
task=None, # Task name (required)
|
||||
model=None, # Model name/path or model object
|
||||
config=None, # Model config
|
||||
tokenizer=None, # Tokenizer
|
||||
feature_extractor=None, # Feature extractor
|
||||
image_processor=None, # Image processor
|
||||
framework=None, # "pt" or "tf"
|
||||
revision=None, # Model revision
|
||||
use_fast=True, # Use fast tokenizer
|
||||
token=None, # HF token
|
||||
device=None, # Device (-1 for CPU, 0+ for GPU)
|
||||
device_map=None, # Device map for multi-GPU
|
||||
torch_dtype=None, # Model dtype
|
||||
trust_remote_code=False, # Allow custom code
|
||||
model_kwargs=None, # Additional model kwargs
|
||||
pipeline_class=None, # Custom pipeline class
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Use pipeline
|
||||
results = pipe(
|
||||
inputs, # Input data
|
||||
**task_specific_parameters
|
||||
)
|
||||
```
|
||||
|
||||
## Data Collators
|
||||
|
||||
Batch and pad data for training.
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
DataCollatorWithPadding, # Dynamic padding for classification
|
||||
DataCollatorForTokenClassification, # Padding for token classification
|
||||
DataCollatorForSeq2Seq, # Padding for seq2seq
|
||||
DataCollatorForLanguageModeling, # MLM/CLM data collation
|
||||
default_data_collator, # Simple collator (no padding)
|
||||
)
|
||||
|
||||
# Text classification
|
||||
data_collator = DataCollatorWithPadding(
|
||||
tokenizer=tokenizer,
|
||||
padding=True,
|
||||
max_length=None,
|
||||
pad_to_multiple_of=None,
|
||||
)
|
||||
|
||||
# Token classification
|
||||
data_collator = DataCollatorForTokenClassification(
|
||||
tokenizer=tokenizer,
|
||||
padding=True,
|
||||
max_length=None,
|
||||
pad_to_multiple_of=None,
|
||||
label_pad_token_id=-100,
|
||||
)
|
||||
|
||||
# Seq2Seq
|
||||
data_collator = DataCollatorForSeq2Seq(
|
||||
tokenizer=tokenizer,
|
||||
model=None,
|
||||
padding=True,
|
||||
max_length=None,
|
||||
pad_to_multiple_of=None,
|
||||
label_pad_token_id=-100,
|
||||
)
|
||||
|
||||
# Language modeling
|
||||
data_collator = DataCollatorForLanguageModeling(
|
||||
tokenizer=tokenizer,
|
||||
mlm=True, # Masked LM (False for causal LM)
|
||||
mlm_probability=0.15, # Mask probability
|
||||
pad_to_multiple_of=None,
|
||||
)
|
||||
```
|
||||
|
||||
## Optimization & Scheduling
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
    AdamW,                   # AdamW optimizer (deprecated in transformers v4.x; prefer torch.optim.AdamW)
|
||||
Adafactor, # Adafactor optimizer
|
||||
get_scheduler, # Get LR scheduler
|
||||
get_linear_schedule_with_warmup,
|
||||
get_cosine_schedule_with_warmup,
|
||||
get_polynomial_decay_schedule_with_warmup,
|
||||
)
|
||||
|
||||
# Create optimizer
|
||||
optimizer = AdamW(
|
||||
model.parameters(),
|
||||
lr=5e-5,
|
||||
betas=(0.9, 0.999),
|
||||
eps=1e-8,
|
||||
weight_decay=0.01,
|
||||
)
|
||||
|
||||
# Create scheduler
|
||||
scheduler = get_scheduler(
|
||||
name="linear", # "linear", "cosine", "polynomial", "constant"
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=0,
|
||||
num_training_steps=total_steps,
|
||||
)
|
||||
|
||||
# Or use specific schedulers
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer,
|
||||
num_warmup_steps=warmup_steps,
|
||||
num_training_steps=total_steps,
|
||||
)
|
||||
|
||||
scheduler = get_cosine_schedule_with_warmup(
|
||||
optimizer,
|
||||
num_warmup_steps=warmup_steps,
|
||||
num_training_steps=total_steps,
|
||||
num_cycles=0.5,
|
||||
)
|
||||
```
|
||||
|
||||
## Configuration Classes
|
||||
|
||||
```python
|
||||
from transformers import AutoConfig
|
||||
|
||||
# Load configuration
|
||||
config = AutoConfig.from_pretrained(
|
||||
pretrained_model_name_or_path,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Common configuration attributes
|
||||
config.vocab_size # Vocabulary size
|
||||
config.hidden_size # Hidden layer size
|
||||
config.num_hidden_layers # Number of layers
|
||||
config.num_attention_heads # Attention heads
|
||||
config.intermediate_size # FFN intermediate size
|
||||
config.hidden_dropout_prob # Dropout probability
|
||||
config.attention_probs_dropout_prob # Attention dropout
|
||||
config.max_position_embeddings # Max sequence length
|
||||
|
||||
# Save configuration
|
||||
config.save_pretrained(save_directory)
|
||||
|
||||
# Create model from config
|
||||
from transformers import AutoModel
|
||||
model = AutoModel.from_config(config)
|
||||
```
|
||||
|
||||
## Utility Functions
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
set_seed, # Set random seed
|
||||
logging, # Logging utilities
|
||||
)
|
||||
|
||||
# Set seed for reproducibility
|
||||
set_seed(42)
|
||||
|
||||
# Configure logging
|
||||
logging.set_verbosity_info()
|
||||
logging.set_verbosity_warning()
|
||||
logging.set_verbosity_error()
|
||||
logging.set_verbosity_debug()
|
||||
|
||||
# Get logger
|
||||
logger = logging.get_logger(__name__)
|
||||
```
|
||||
|
||||
## Model Outputs
|
||||
|
||||
All models return model-specific output classes (subclasses of `ModelOutput`):
|
||||
|
||||
```python
|
||||
# Common output attributes
|
||||
outputs.loss # Loss (if labels provided)
|
||||
outputs.logits # Model logits
|
||||
outputs.hidden_states # All hidden states (if output_hidden_states=True)
|
||||
outputs.attentions # Attention weights (if output_attentions=True)
|
||||
|
||||
# Seq2Seq specific
|
||||
outputs.encoder_last_hidden_state
|
||||
outputs.encoder_hidden_states
|
||||
outputs.encoder_attentions
|
||||
outputs.decoder_hidden_states
|
||||
outputs.decoder_attentions
|
||||
outputs.cross_attentions
|
||||
|
||||
# Access as dict or tuple
|
||||
logits = outputs.logits
|
||||
logits = outputs["logits"]
|
||||
loss, logits = outputs.to_tuple()[:2]
|
||||
```
|
||||
|
||||
This reference covers the most commonly used API components. For complete documentation, refer to https://huggingface.co/docs/transformers.
|
||||
---
|
||||
# Text Generation Strategies
|
||||
|
||||
Comprehensive guide to text generation methods in Transformers for controlling output quality, creativity, and diversity.
|
||||
|
||||
## Overview
|
||||
|
||||
Text generation is the process of predicting tokens sequentially using a language model. The choice of generation strategy significantly impacts output quality, diversity, and computational cost.
|
||||
|
||||
**When to use each strategy:**
|
||||
- **Greedy**: Fast, deterministic, good for short outputs or when consistency is critical
|
||||
- **Beam Search**: Better quality for tasks with clear "correct" answers (translation, summarization)
|
||||
- **Sampling**: Creative, diverse outputs for open-ended generation (stories, dialogue)
|
||||
- **Top-k/Top-p**: Balanced creativity and coherence
|
||||
|
||||
## Basic Generation Methods
|
||||
|
||||
### Greedy Decoding
|
||||
|
||||
Selects the highest probability token at each step. Fast but prone to repetition and suboptimal sequences.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
|
||||
inputs = tokenizer("The future of AI", return_tensors="pt")
|
||||
|
||||
# Greedy decoding (default)
|
||||
outputs = model.generate(**inputs, max_new_tokens=50)
|
||||
print(tokenizer.decode(outputs[0]))
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Deterministic (always same output for same input)
|
||||
- Fast (single forward pass per token)
|
||||
- Prone to repetition in longer sequences
|
||||
- Best for: Short generations, deterministic applications
|
||||
|
||||
**Parameters:**
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50, # Number of tokens to generate
|
||||
min_length=10, # Minimum total length
|
||||
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS
|
||||
)
|
||||
```
|
||||
|
||||
### Beam Search
|
||||
|
||||
Maintains multiple hypotheses (beams) and selects the sequence with highest overall probability.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
num_beams=5, # Number of beams
|
||||
early_stopping=True, # Stop when all beams finish
|
||||
no_repeat_ngram_size=2, # Prevent 2-gram repetition
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Higher quality than greedy for tasks with "correct" answers
|
||||
- Slower than greedy (num_beams forward passes per step)
|
||||
- Still can suffer from repetition
|
||||
- Best for: Translation, summarization, QA generation
|
||||
|
||||
**Advanced Parameters:**
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
num_beams=5,
|
||||
num_beam_groups=1, # Diverse beam search groups
|
||||
diversity_penalty=0.0, # Penalty for similar beams
|
||||
length_penalty=1.0, # >1: longer sequences, <1: shorter
|
||||
early_stopping=True, # Stop when num_beams sequences finish
|
||||
no_repeat_ngram_size=2, # Block repeating n-grams
|
||||
num_return_sequences=1, # Return top-k sequences (≤ num_beams)
|
||||
)
|
||||
```
|
||||
|
||||
**Length Penalty:**
|
||||
- `length_penalty > 1.0`: Favor longer sequences
|
||||
- `length_penalty = 1.0`: No penalty
|
||||
- `length_penalty < 1.0`: Favor shorter sequences
|
||||
|
||||
### Sampling (Multinomial)
|
||||
|
||||
Randomly sample tokens according to the probability distribution.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
do_sample=True, # Enable sampling
|
||||
temperature=1.0, # Sampling temperature
|
||||
    num_beams=1,             # 1 = pure multinomial sampling (num_beams > 1 combines beams with sampling)
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Non-deterministic (different output each time)
|
||||
- More diverse and creative than greedy/beam search
|
||||
- Can produce incoherent output if not controlled
|
||||
- Best for: Creative writing, dialogue, open-ended generation
|
||||
|
||||
**Temperature Parameter:**
|
||||
```python
|
||||
# Low temperature (0.1-0.7): More focused, less random
|
||||
outputs = model.generate(**inputs, do_sample=True, temperature=0.5)
|
||||
|
||||
# Medium temperature (0.7-1.0): Balanced
|
||||
outputs = model.generate(**inputs, do_sample=True, temperature=0.8)
|
||||
|
||||
# High temperature (1.0-2.0): More random, more creative
|
||||
outputs = model.generate(**inputs, do_sample=True, temperature=1.5)
|
||||
```
|
||||
|
||||
- `temperature → 0`: Approaches greedy decoding
|
||||
- `temperature = 1.0`: Sample from original distribution
|
||||
- `temperature > 1.0`: Flatter distribution, more random
|
||||
- `temperature < 1.0`: Sharper distribution, more confident
|
||||
|
||||
## Advanced Sampling Methods
|
||||
|
||||
### Top-k Sampling
|
||||
|
||||
Sample from only the k most likely tokens.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=50,
|
||||
top_k=50, # Consider top 50 tokens
|
||||
temperature=0.8,
|
||||
)
|
||||
```
|
||||
|
||||
**How it works:**
|
||||
1. Filter to top-k most probable tokens
|
||||
2. Renormalize probabilities
|
||||
3. Sample from filtered distribution
|
||||
|
||||
**Choosing k:**
|
||||
- `k=1`: Equivalent to greedy decoding
|
||||
- `k=10-50`: More focused, coherent output
|
||||
- `k=100-500`: More diverse output
|
||||
- Too high k: Includes low-probability tokens (noise)
|
||||
- Too low k: Less diverse, may miss good alternatives
|
||||
|
||||
### Top-p (Nucleus) Sampling
|
||||
|
||||
Sample from the smallest set of tokens whose cumulative probability ≥ p.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=50,
|
||||
top_p=0.95, # Nucleus probability
|
||||
temperature=0.8,
|
||||
)
|
||||
```
|
||||
|
||||
**How it works:**
|
||||
1. Sort tokens by probability
|
||||
2. Find smallest set with cumulative probability ≥ p
|
||||
3. Sample from this set
|
||||
|
||||
**Choosing p:**
|
||||
- `p=0.9-0.95`: Good balance (recommended)
|
||||
- `p=1.0`: Sample from full distribution
|
||||
- Higher p: More diverse, might include unlikely tokens
|
||||
- Lower p: More focused, like top-k with adaptive k
|
||||
|
||||
**Top-p vs Top-k:**
|
||||
- Top-p adapts to probability distribution shape
|
||||
- Top-k is fixed regardless of distribution
|
||||
- Top-p generally better for variable-quality contexts
|
||||
- Can combine: `top_k=50, top_p=0.95` (apply both filters)
|
||||
|
||||
### Combining Strategies
|
||||
|
||||
```python
|
||||
# Recommended for high-quality open-ended generation
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=100,
|
||||
temperature=0.8, # Moderate temperature
|
||||
top_k=50, # Limit to top 50 tokens
|
||||
top_p=0.95, # Nucleus sampling
|
||||
repetition_penalty=1.2, # Discourage repetition
|
||||
no_repeat_ngram_size=3, # Block 3-gram repetition
|
||||
)
|
||||
```
|
||||
|
||||
## Controlling Generation Quality
|
||||
|
||||
### Repetition Control
|
||||
|
||||
Prevent models from repeating themselves:
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=100,
|
||||
|
||||
# Method 1: Repetition penalty
|
||||
repetition_penalty=1.2, # Penalize repeated tokens (>1.0)
|
||||
|
||||
# Method 2: Block n-gram repetition
|
||||
no_repeat_ngram_size=3, # Never repeat 3-grams
|
||||
|
||||
# Method 3: Encoder repetition penalty (for seq2seq)
|
||||
    encoder_repetition_penalty=1.0, # Penalize tokens NOT in the input (1.0 = no penalty)
|
||||
)
|
||||
```
|
||||
|
||||
**Repetition Penalty Values:**
|
||||
- `1.0`: No penalty
|
||||
- `1.0-1.5`: Mild penalty (recommended: 1.1-1.3)
|
||||
- `>1.5`: Strong penalty (may harm coherence)
|
||||
|
||||
### Length Control
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
|
||||
# Hard constraints
|
||||
min_length=20, # Minimum total length
|
||||
max_length=100, # Maximum total length
|
||||
max_new_tokens=50, # Maximum new tokens (excluding input)
|
||||
|
||||
# Soft constraints (with beam search)
|
||||
length_penalty=1.0, # Encourage longer/shorter outputs
|
||||
|
||||
# Early stopping
|
||||
early_stopping=True, # Stop when condition met
|
||||
)
|
||||
```
|
||||
|
||||
### Bad Words and Forced Tokens
|
||||
|
||||
```python
|
||||
# Prevent specific tokens
|
||||
bad_words_ids = [
|
||||
tokenizer.encode("badword1", add_special_tokens=False),
|
||||
tokenizer.encode("badword2", add_special_tokens=False),
|
||||
]
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
bad_words_ids=bad_words_ids,
|
||||
)
|
||||
|
||||
# Force specific tokens
|
||||
force_words_ids = [
|
||||
tokenizer.encode("important", add_special_tokens=False),
|
||||
]
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
force_words_ids=force_words_ids,
|
||||
)
|
||||
```
|
||||
|
||||
## Streaming Generation
|
||||
|
||||
Generate and process tokens as they're produced:
|
||||
|
||||
```python
|
||||
from transformers import TextStreamer, TextIteratorStreamer
|
||||
from threading import Thread
|
||||
|
||||
# Simple streaming (prints to stdout)
|
||||
streamer = TextStreamer(tokenizer, skip_prompt=True)
|
||||
outputs = model.generate(**inputs, streamer=streamer, max_new_tokens=100)
|
||||
|
||||
# Iterator streaming (for custom processing)
|
||||
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
|
||||
|
||||
generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=100)
|
||||
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
||||
thread.start()
|
||||
|
||||
for text in streamer:
|
||||
print(text, end="", flush=True)
|
||||
|
||||
thread.join()
|
||||
```
|
||||
|
||||
## Advanced Techniques
|
||||
|
||||
### Contrastive Search
|
||||
|
||||
Balance coherence and diversity using contrastive objective:
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
penalty_alpha=0.6, # Contrastive penalty
|
||||
top_k=4, # Consider top-4 tokens
|
||||
)
|
||||
```
|
||||
|
||||
**When to use:**
|
||||
- Open-ended text generation
|
||||
- Reduces repetition without sacrificing coherence
|
||||
- Good alternative to sampling
|
||||
|
||||
### Diverse Beam Search
|
||||
|
||||
Generate multiple diverse outputs:
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
num_beams=10,
|
||||
num_beam_groups=5, # 5 groups of 2 beams each
|
||||
diversity_penalty=1.0, # Penalty for similar beams
|
||||
num_return_sequences=5, # Return 5 diverse outputs
|
||||
)
|
||||
```
|
||||
|
||||
### Constrained Beam Search
|
||||
|
||||
Force output to include specific phrases:
|
||||
|
||||
```python
|
||||
from transformers import PhrasalConstraint
|
||||
|
||||
constraints = [
|
||||
PhrasalConstraint(
|
||||
tokenizer("machine learning", add_special_tokens=False).input_ids
|
||||
),
|
||||
]
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
constraints=constraints,
|
||||
num_beams=10, # Requires beam search
|
||||
)
|
||||
```
|
||||
|
||||
## Speculative Decoding
|
||||
|
||||
Accelerate generation using a smaller draft model:
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
# Load main and assistant models
|
||||
model = AutoModelForCausalLM.from_pretrained("large-model")
|
||||
assistant_model = AutoModelForCausalLM.from_pretrained("small-model")
|
||||
|
||||
# Generate with speculative decoding
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
assistant_model=assistant_model,
|
||||
do_sample=True,
|
||||
temperature=0.8,
|
||||
)
|
||||
```
|
||||
|
||||
**Benefits:**
|
||||
- 2-3x faster generation
|
||||
- Identical output distribution to regular generation
|
||||
- Works with sampling and greedy decoding
|
||||
|
||||
## Recipe: Recommended Settings by Task
|
||||
|
||||
### Creative Writing / Dialogue
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=200,
|
||||
temperature=0.9,
|
||||
top_p=0.95,
|
||||
top_k=50,
|
||||
repetition_penalty=1.2,
|
||||
no_repeat_ngram_size=3,
|
||||
)
|
||||
```
|
||||
|
||||
### Translation / Summarization
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
num_beams=5,
|
||||
max_new_tokens=150,
|
||||
early_stopping=True,
|
||||
length_penalty=1.0,
|
||||
no_repeat_ngram_size=2,
|
||||
)
|
||||
```
|
||||
|
||||
### Code Generation
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=300,
|
||||
temperature=0.2, # Low temperature for correctness
|
||||
top_p=0.95,
|
||||
do_sample=True,
|
||||
)
|
||||
```
|
||||
|
||||
### Chatbot / Instruction Following
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=256,
|
||||
temperature=0.7,
|
||||
top_p=0.9,
|
||||
repetition_penalty=1.15,
|
||||
)
|
||||
```
|
||||
|
||||
### Factual QA / Information Extraction
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
num_beams=3,
|
||||
early_stopping=True,
|
||||
# Or greedy for very short answers:
|
||||
# (no special parameters needed)
|
||||
)
|
||||
```
|
||||
|
||||
## Debugging Generation
|
||||
|
||||
### Check Token Probabilities
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=20,
|
||||
output_scores=True, # Return generation scores
|
||||
return_dict_in_generate=True, # Return as dict
|
||||
)
|
||||
|
||||
# Access generation scores
|
||||
scores = outputs.scores  # Tuple of length seq_len; each element is a tensor of shape (batch_size, vocab_size)
|
||||
|
||||
# Get token probabilities
|
||||
import torch
|
||||
probs = torch.softmax(scores[0], dim=-1)
|
||||
```
|
||||
|
||||
### Monitor Generation Process
|
||||
|
||||
```python
|
||||
from transformers import LogitsProcessor, LogitsProcessorList
|
||||
|
||||
class DebugLogitsProcessor(LogitsProcessor):
|
||||
def __call__(self, input_ids, scores):
|
||||
# Print top 5 tokens at each step
|
||||
top_tokens = scores[0].topk(5)
|
||||
print(f"Top 5 tokens: {top_tokens}")
|
||||
return scores
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=10,
|
||||
logits_processor=LogitsProcessorList([DebugLogitsProcessor()]),
|
||||
)
|
||||
```
|
||||
|
||||
## Common Issues and Solutions
|
||||
|
||||
**Issue: Repetitive output**
|
||||
- Solution: Increase `repetition_penalty` (1.2-1.5), set `no_repeat_ngram_size=3`
|
||||
- For sampling: Increase `temperature`, enable `top_p`
|
||||
|
||||
**Issue: Incoherent output**
|
||||
- Solution: Lower `temperature` (0.5-0.8), use beam search
|
||||
- Set `top_k=50` or `top_p=0.9` to filter unlikely tokens
|
||||
|
||||
**Issue: Too short output**
|
||||
- Solution: Increase `min_length`, set `length_penalty > 1.0` (beam search)
|
||||
- Check if EOS token is being generated early
|
||||
|
||||
**Issue: Too slow generation**
|
||||
- Solution: Use greedy instead of beam search
|
||||
- Reduce `num_beams`
|
||||
- Try speculative decoding with assistant model
|
||||
- Use smaller model variant
|
||||
|
||||
**Issue: Output doesn't follow format**
|
||||
- Solution: Use constrained beam search
|
||||
- Add format examples to prompt
|
||||
- Use `bad_words_ids` to prevent format-breaking tokens
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
```python
|
||||
# Use half precision
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"model-name",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
# Use KV cache optimization (default, but can be disabled)
|
||||
outputs = model.generate(**inputs, use_cache=True)
|
||||
|
||||
# Batch generation
|
||||
inputs = tokenizer(["Prompt 1", "Prompt 2"], return_tensors="pt", padding=True)
|
||||
outputs = model.generate(**inputs, max_new_tokens=50)
|
||||
|
||||
# Static cache for longer sequences (if supported)
|
||||
outputs = model.generate(**inputs, cache_implementation="static")
|
||||
```
|
||||
|
||||
This guide covers the main generation strategies. For task-specific examples, see `task_patterns.md`.
|
||||
504
scientific-packages/transformers/references/quantization.md
Normal file
504
scientific-packages/transformers/references/quantization.md
Normal file
@@ -0,0 +1,504 @@
|
||||
# Model Quantization Guide
|
||||
|
||||
Comprehensive guide to reducing model memory footprint through quantization while maintaining accuracy.
|
||||
|
||||
## Overview
|
||||
|
||||
Quantization reduces memory requirements by storing model weights in lower precision formats (int8, int4) instead of full precision (float32). This enables:
|
||||
- Running larger models on limited hardware
|
||||
- Faster inference (reduced memory bandwidth)
|
||||
- Lower deployment costs
|
||||
- Enabling fine-tuning of models that wouldn't fit in memory
|
||||
|
||||
**Tradeoffs:**
|
||||
- Slight accuracy loss (typically < 1-2%)
|
||||
- Initial quantization overhead
|
||||
- Some methods require calibration data
|
||||
|
||||
## Quick Comparison
|
||||
|
||||
| Method | Precision | Speed | Accuracy | Fine-tuning | Hardware | Setup |
|
||||
|--------|-----------|-------|----------|-------------|----------|-------|
|
||||
| **Bitsandbytes** | 4/8-bit | Fast | High | Yes (PEFT) | CUDA, CPU | Easy |
|
||||
| **GPTQ** | 2-8-bit | Very Fast | High | Limited | CUDA, ROCm, Metal | Medium |
|
||||
| **AWQ** | 4-bit | Very Fast | High | Yes (PEFT) | CUDA, ROCm | Medium |
|
||||
| **GGUF** | 1-8-bit | Medium | Variable | No | CPU-optimized | Easy |
|
||||
| **HQQ** | 1-8-bit | Fast | High | Yes | Multi-platform | Medium |
|
||||
|
||||
## Bitsandbytes (BnB)
|
||||
|
||||
On-the-fly quantization with excellent PEFT fine-tuning support.
|
||||
|
||||
### 8-bit Quantization
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
load_in_8bit=True, # Enable 8-bit quantization
|
||||
device_map="auto", # Automatic device placement
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
|
||||
# Use normally
|
||||
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
|
||||
outputs = model.generate(**inputs, max_new_tokens=50)
|
||||
```
|
||||
|
||||
**Memory Savings:**
|
||||
- 7B model: ~14GB → ~7GB (50% reduction)
|
||||
- 13B model: ~26GB → ~13GB
|
||||
- 70B model: ~140GB → ~70GB
|
||||
|
||||
**Characteristics:**
|
||||
- Fast inference
|
||||
- Minimal accuracy loss
|
||||
- Works with PEFT (LoRA, QLoRA)
|
||||
- Supports CPU and CUDA GPUs
|
||||
|
||||
### 4-bit Quantization (QLoRA)
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
|
||||
import torch
|
||||
|
||||
# Configure 4-bit quantization
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True, # Enable 4-bit quantization
|
||||
bnb_4bit_quant_type="nf4", # Quantization type ("nf4" or "fp4")
|
||||
bnb_4bit_compute_dtype=torch.float16, # Computation dtype
|
||||
bnb_4bit_use_double_quant=True, # Nested quantization for more savings
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=bnb_config,
|
||||
device_map="auto",
|
||||
)
|
||||
```
|
||||
|
||||
**Memory Savings:**
|
||||
- 7B model: ~14GB → ~4GB (70% reduction)
|
||||
- 13B model: ~26GB → ~7GB
|
||||
- 70B model: ~140GB → ~35GB
|
||||
|
||||
**Quantization Types:**
|
||||
- `nf4`: Normal Float 4 (recommended, better quality)
|
||||
- `fp4`: Floating Point 4 (slightly more memory efficient)
|
||||
|
||||
**Compute Dtype:**
|
||||
```python
|
||||
# For better quality
|
||||
bnb_4bit_compute_dtype=torch.float16
|
||||
|
||||
# For best performance on Ampere+ GPUs
|
||||
bnb_4bit_compute_dtype=torch.bfloat16
|
||||
```
|
||||
|
||||
**Double Quantization:**
|
||||
```python
|
||||
# Enable for additional ~0.4 bits/param savings
|
||||
bnb_4bit_use_double_quant=True # Quantize the quantization constants
|
||||
```
|
||||
|
||||
### Fine-tuning with QLoRA
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
|
||||
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
||||
import torch
|
||||
|
||||
# Load quantized model
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.float16,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=bnb_config,
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
# Prepare for training
|
||||
model = prepare_model_for_kbit_training(model)
|
||||
|
||||
# Configure LoRA
|
||||
lora_config = LoraConfig(
|
||||
r=16,
|
||||
lora_alpha=32,
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
|
||||
lora_dropout=0.05,
|
||||
bias="none",
|
||||
task_type="CAUSAL_LM"
|
||||
)
|
||||
|
||||
model = get_peft_model(model, lora_config)
|
||||
|
||||
# Train normally
|
||||
trainer = Trainer(model=model, args=training_args, ...)
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
## GPTQ
|
||||
|
||||
Post-training quantization requiring calibration, optimized for inference speed.
|
||||
|
||||
### Loading GPTQ Models
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
|
||||
|
||||
# Load pre-quantized GPTQ model
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"TheBloke/Llama-2-7B-GPTQ", # Pre-quantized model
|
||||
device_map="auto",
|
||||
revision="gptq-4bit-32g-actorder_True", # Specific quantization config
|
||||
)
|
||||
|
||||
# Or quantize yourself
|
||||
gptq_config = GPTQConfig(
|
||||
bits=4, # 2, 3, 4, 8 bits
|
||||
dataset="c4", # Calibration dataset
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
device_map="auto",
|
||||
quantization_config=gptq_config,
|
||||
)
|
||||
|
||||
# Save quantized model
|
||||
model.save_pretrained("llama-2-7b-gptq")
|
||||
```
|
||||
|
||||
**Configuration Options:**
|
||||
```python
|
||||
gptq_config = GPTQConfig(
|
||||
bits=4, # Quantization bits
|
||||
group_size=128, # Group size for quantization (128, 32, -1)
|
||||
dataset="c4", # Calibration dataset
|
||||
desc_act=False, # Activation order (can improve accuracy)
|
||||
sym=True, # Symmetric quantization
|
||||
damp_percent=0.1, # Dampening factor
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Fastest inference among quantization methods
|
||||
- Requires one-time calibration (slow)
|
||||
- Best when using pre-quantized models from Hub
|
||||
- Limited fine-tuning support
|
||||
- Excellent for production deployment
|
||||
|
||||
## AWQ (Activation-aware Weight Quantization)
|
||||
|
||||
Protects important weights for better quality.
|
||||
|
||||
### Loading AWQ Models
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AwqConfig
|
||||
|
||||
# Load pre-quantized AWQ model
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"TheBloke/Llama-2-7B-AWQ",
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
# Or quantize yourself
|
||||
awq_config = AwqConfig(
|
||||
bits=4, # 4-bit quantization
|
||||
group_size=128, # Quantization group size
|
||||
zero_point=True, # Use zero-point quantization
|
||||
version="GEMM", # Quantization version
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=awq_config,
|
||||
device_map="auto",
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Better accuracy than GPTQ at same bit width
|
||||
- Excellent inference speed
|
||||
- Supports PEFT fine-tuning
|
||||
- Requires calibration data
|
||||
|
||||
### Fine-tuning AWQ Models
|
||||
|
||||
```python
|
||||
from peft import LoraConfig, get_peft_model
|
||||
|
||||
# AWQ models support LoRA fine-tuning
|
||||
lora_config = LoraConfig(
|
||||
r=16,
|
||||
lora_alpha=32,
|
||||
target_modules=["q_proj", "v_proj"],
|
||||
lora_dropout=0.05,
|
||||
task_type="CAUSAL_LM"
|
||||
)
|
||||
|
||||
model = get_peft_model(model, lora_config)
|
||||
trainer = Trainer(model=model, ...)
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
## GGUF (GGML Format)
|
||||
|
||||
CPU-optimized quantization format, popular in llama.cpp ecosystem.
|
||||
|
||||
### Using GGUF Models
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# Load GGUF model
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"TheBloke/Llama-2-7B-GGUF",
|
||||
gguf_file="llama-2-7b.Q4_K_M.gguf", # Specific quantization file
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(
    "TheBloke/Llama-2-7B-GGUF",
    gguf_file="llama-2-7b.Q4_K_M.gguf",  # Tokenizer is also loaded from the GGUF file
)
|
||||
```
|
||||
|
||||
**GGUF Quantization Types:**
|
||||
- `Q4_0`: 4-bit, smallest, lowest quality
|
||||
- `Q4_K_M`: 4-bit, medium quality (recommended)
|
||||
- `Q5_K_M`: 5-bit, good quality
|
||||
- `Q6_K`: 6-bit, high quality
|
||||
- `Q8_0`: 8-bit, very high quality
|
||||
|
||||
**Characteristics:**
|
||||
- Optimized for CPU inference
|
||||
- Wide range of bit depths (1-8)
|
||||
- Good for Apple Silicon (M1/M2)
|
||||
- No fine-tuning support
|
||||
- Excellent for local/edge deployment
|
||||
|
||||
## HQQ (Half-Quadratic Quantization)
|
||||
|
||||
Flexible quantization with good accuracy retention.
|
||||
|
||||
### Using HQQ
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, HqqConfig
|
||||
|
||||
hqq_config = HqqConfig(
|
||||
nbits=4, # Quantization bits
|
||||
group_size=64, # Group size
|
||||
quant_zero=False, # Quantize zero point
|
||||
quant_scale=False, # Quantize scale
|
||||
axis=0, # Quantization axis
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=hqq_config,
|
||||
device_map="auto",
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Very fast quantization
|
||||
- No calibration data needed
|
||||
- Support for 1-8 bits
|
||||
- Can serialize/deserialize
|
||||
- Good accuracy vs size tradeoff
|
||||
|
||||
## Choosing a Quantization Method
|
||||
|
||||
### Decision Tree
|
||||
|
||||
**For inference only:**
|
||||
1. Need fastest inference? → **GPTQ or AWQ** (use pre-quantized models)
|
||||
2. CPU-only deployment? → **GGUF**
|
||||
3. Want easiest setup? → **Bitsandbytes 8-bit**
|
||||
4. Need extreme compression? → **GGUF Q4_0 or HQQ 2-bit**
|
||||
|
||||
**For fine-tuning:**
|
||||
1. Limited VRAM? → **QLoRA (BnB 4-bit + LoRA)**
|
||||
2. Want best accuracy? → **Bitsandbytes 8-bit + LoRA**
|
||||
3. Need very large models? → **QLoRA with double quantization**
|
||||
|
||||
**For production:**
|
||||
1. Latency-critical? → **GPTQ or AWQ**
|
||||
2. Cost-optimized? → **Bitsandbytes 8-bit**
|
||||
3. CPU deployment? → **GGUF**
|
||||
|
||||
## Memory Requirements
|
||||
|
||||
Approximate memory for Llama-2 7B model:
|
||||
|
||||
| Method | Memory | vs FP16 |
|
||||
|--------|--------|---------|
|
||||
| FP32 | 28GB | 2x |
|
||||
| FP16 / BF16 | 14GB | 1x |
|
||||
| 8-bit (BnB) | 7GB | 0.5x |
|
||||
| 4-bit (QLoRA) | 3.5GB | 0.25x |
|
||||
| 4-bit Double Quant | 3GB | 0.21x |
|
||||
| GPTQ 4-bit | 4GB | 0.29x |
|
||||
| AWQ 4-bit | 4GB | 0.29x |
|
||||
|
||||
**Note:** Add ~1-2GB for inference activations, KV cache, and framework overhead.
|
||||
|
||||
## Best Practices
|
||||
|
||||
### For Training
|
||||
|
||||
```python
|
||||
# QLoRA recommended configuration
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.bfloat16, # BF16 if available
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
# LoRA configuration
|
||||
lora_config = LoraConfig(
|
||||
r=16, # Rank (8, 16, 32, 64)
|
||||
lora_alpha=32, # Scaling (typically 2*r)
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
|
||||
lora_dropout=0.05,
|
||||
bias="none",
|
||||
task_type="CAUSAL_LM"
|
||||
)
|
||||
```
|
||||
|
||||
### For Inference
|
||||
|
||||
```python
|
||||
# High-speed inference
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"TheBloke/Llama-2-7B-GPTQ",
|
||||
device_map="auto",
|
||||
torch_dtype=torch.float16, # Use FP16 for activations
|
||||
)
|
||||
|
||||
# Balanced quality/speed
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
load_in_8bit=True,
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
# Maximum compression
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.float16,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
),
|
||||
device_map="auto",
|
||||
)
|
||||
```
|
||||
|
||||
### Multi-GPU Setups
|
||||
|
||||
```python
|
||||
# Automatically distribute across GPUs
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-70b-hf",
|
||||
load_in_4bit=True,
|
||||
device_map="auto", # Automatic distribution
|
||||
max_memory={0: "20GB", 1: "20GB"}, # Optional: limit per GPU
|
||||
)
|
||||
|
||||
# Manual device map
|
||||
device_map = {
|
||||
"model.embed_tokens": 0,
|
||||
"model.layers.0": 0,
|
||||
"model.layers.1": 0,
|
||||
# ... distribute layers ...
|
||||
"model.norm": 1,
|
||||
"lm_head": 1,
|
||||
}
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-70b-hf",
|
||||
load_in_4bit=True,
|
||||
device_map=device_map,
|
||||
)
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Issue: OOM during quantization**
|
||||
```python
|
||||
# Solution: Use low_cpu_mem_usage
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"model-name",
|
||||
quantization_config=config,
|
||||
device_map="auto",
|
||||
low_cpu_mem_usage=True, # Reduce CPU memory during loading
|
||||
)
|
||||
```
|
||||
|
||||
**Issue: Slow quantization**
|
||||
```python
|
||||
# GPTQ/AWQ take time to calibrate
|
||||
# Solution: Use pre-quantized models from Hub
|
||||
model = AutoModelForCausalLM.from_pretrained("TheBloke/Model-GPTQ")
|
||||
|
||||
# Or use BnB for instant quantization
|
||||
model = AutoModelForCausalLM.from_pretrained("model-name", load_in_4bit=True)
|
||||
```
|
||||
|
||||
**Issue: Poor quality after quantization**
|
||||
```python
|
||||
# Try different quantization types
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4", # Try "nf4" instead of "fp4"
|
||||
bnb_4bit_compute_dtype=torch.bfloat16, # Use BF16 if available
|
||||
)
|
||||
|
||||
# Or use 8-bit instead of 4-bit
|
||||
model = AutoModelForCausalLM.from_pretrained("model-name", load_in_8bit=True)
|
||||
```
|
||||
|
||||
**Issue: Can't fine-tune quantized model**
|
||||
```python
|
||||
# Ensure using compatible quantization method
|
||||
from peft import prepare_model_for_kbit_training
|
||||
|
||||
model = prepare_model_for_kbit_training(model)
|
||||
|
||||
# Only BnB and AWQ support PEFT fine-tuning
|
||||
# GPTQ has limited support, GGUF doesn't support fine-tuning
|
||||
```
|
||||
|
||||
## Performance Benchmarks
|
||||
|
||||
Approximate generation speed (tokens/sec) for Llama-2 7B on A100 40GB:
|
||||
|
||||
| Method | Speed | Memory |
|
||||
|--------|-------|--------|
|
||||
| FP16 | 100 tok/s | 14GB |
|
||||
| 8-bit | 90 tok/s | 7GB |
|
||||
| 4-bit QLoRA | 70 tok/s | 4GB |
|
||||
| GPTQ 4-bit | 95 tok/s | 4GB |
|
||||
| AWQ 4-bit | 95 tok/s | 4GB |
|
||||
|
||||
**Note:** Actual performance varies by hardware, sequence length, and batch size.
|
||||
|
||||
## Resources
|
||||
|
||||
- **Pre-quantized models:** Search "GPTQ" or "AWQ" on Hugging Face Hub
|
||||
- **BnB documentation:** https://github.com/TimDettmers/bitsandbytes
|
||||
- **PEFT library:** https://github.com/huggingface/peft
|
||||
- **QLoRA paper:** https://arxiv.org/abs/2305.14314
|
||||
|
||||
For task-specific quantization examples, see `training_guide.md`.
|
||||
610
scientific-packages/transformers/references/task_patterns.md
Normal file
610
scientific-packages/transformers/references/task_patterns.md
Normal file
@@ -0,0 +1,610 @@
|
||||
# Task-Specific Patterns
|
||||
|
||||
Quick reference for implementing common tasks with Transformers. Each pattern includes the complete workflow from data loading to inference.
|
||||
|
||||
## Text Classification
|
||||
|
||||
Classify text into predefined categories (sentiment, topic, intent, etc.).
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoTokenizer, AutoModelForSequenceClassification,
|
||||
TrainingArguments, Trainer, DataCollatorWithPadding
|
||||
)
|
||||
from datasets import load_dataset
|
||||
|
||||
# 1. Load data
|
||||
dataset = load_dataset("imdb")
|
||||
|
||||
# 2. Preprocess
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
def preprocess(examples):
|
||||
return tokenizer(examples["text"], truncation=True, max_length=512)
|
||||
|
||||
tokenized = dataset.map(preprocess, batched=True)
|
||||
|
||||
# 3. Model
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
"bert-base-uncased",
|
||||
num_labels=2,
|
||||
id2label={0: "negative", 1: "positive"},
|
||||
label2id={"negative": 0, "positive": 1}
|
||||
)
|
||||
|
||||
# 4. Train
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=16,
|
||||
num_train_epochs=3,
|
||||
eval_strategy="epoch",
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized["train"],
|
||||
eval_dataset=tokenized["test"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# 5. Inference
|
||||
text = "This movie was fantastic!"
|
||||
inputs = tokenizer(text, return_tensors="pt")
|
||||
outputs = model(**inputs)
|
||||
predictions = outputs.logits.argmax(-1)
|
||||
print(model.config.id2label[predictions.item()]) # "positive"
|
||||
```
|
||||
|
||||
## Token Classification (NER)
|
||||
|
||||
Label each token in text (named entities, POS tags, etc.).
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
||||
from datasets import load_dataset
|
||||
|
||||
# Load data (tokens and NER tags)
|
||||
dataset = load_dataset("conll2003")
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
|
||||
def tokenize_and_align_labels(examples):
|
||||
tokenized_inputs = tokenizer(
|
||||
examples["tokens"],
|
||||
truncation=True,
|
||||
is_split_into_words=True
|
||||
)
|
||||
|
||||
labels = []
|
||||
for i, label in enumerate(examples["ner_tags"]):
|
||||
word_ids = tokenized_inputs.word_ids(batch_index=i)
|
||||
label_ids = []
|
||||
previous_word_idx = None
|
||||
for word_idx in word_ids:
|
||||
if word_idx is None:
|
||||
label_ids.append(-100) # Special tokens
|
||||
elif word_idx != previous_word_idx:
|
||||
label_ids.append(label[word_idx])
|
||||
else:
|
||||
label_ids.append(-100) # Subword tokens
|
||||
previous_word_idx = word_idx
|
||||
labels.append(label_ids)
|
||||
|
||||
tokenized_inputs["labels"] = labels
|
||||
return tokenized_inputs
|
||||
|
||||
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
|
||||
|
||||
# Model
|
||||
label_list = dataset["train"].features["ner_tags"].feature.names
|
||||
model = AutoModelForTokenClassification.from_pretrained(
|
||||
"bert-base-cased",
|
||||
num_labels=len(label_list),
|
||||
id2label={i: label for i, label in enumerate(label_list)},
|
||||
label2id={label: i for i, label in enumerate(label_list)}
|
||||
)
|
||||
|
||||
# Training similar to classification
|
||||
# ... (use Trainer with DataCollatorForTokenClassification)
|
||||
```
|
||||
|
||||
## Question Answering (Extractive)
|
||||
|
||||
Extract answer spans from context.
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
|
||||
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
|
||||
|
||||
question = "What is the capital of France?"
|
||||
context = "Paris is the capital and most populous city of France."
|
||||
|
||||
inputs = tokenizer(question, context, return_tensors="pt")
|
||||
outputs = model(**inputs)
|
||||
|
||||
# Get answer span
|
||||
answer_start = outputs.start_logits.argmax()
|
||||
answer_end = outputs.end_logits.argmax() + 1
|
||||
answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])
|
||||
print(answer) # "Paris"
|
||||
```
|
||||
|
||||
## Text Generation
|
||||
|
||||
Generate text continuations.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
|
||||
prompt = "In the future, artificial intelligence will"
|
||||
inputs = tokenizer(prompt, return_tensors="pt")
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=100,
|
||||
do_sample=True,
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
repetition_penalty=1.2,
|
||||
)
|
||||
|
||||
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
print(generated_text)
|
||||
```
|
||||
|
||||
## Summarization
|
||||
|
||||
Condense long text into summaries.
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoTokenizer, AutoModelForSeq2SeqLM,
|
||||
Seq2SeqTrainingArguments, Seq2SeqTrainer,
|
||||
DataCollatorForSeq2Seq
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("t5-small")
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
|
||||
|
||||
def preprocess(examples):
|
||||
inputs = ["summarize: " + doc for doc in examples["document"]]
|
||||
model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
|
||||
|
||||
labels = tokenizer(
|
||||
examples["summary"],
|
||||
max_length=128,
|
||||
truncation=True
|
||||
)
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
return model_inputs
|
||||
|
||||
tokenized_dataset = dataset.map(preprocess, batched=True)
|
||||
|
||||
# Training
|
||||
training_args = Seq2SeqTrainingArguments(
|
||||
output_dir="./results",
|
||||
predict_with_generate=True, # Important for seq2seq
|
||||
eval_strategy="epoch",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=8,
|
||||
num_train_epochs=3,
|
||||
)
|
||||
|
||||
trainer = Seq2SeqTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized_dataset["train"],
|
||||
eval_dataset=tokenized_dataset["validation"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# Inference
|
||||
text = "Long article text here..."
|
||||
inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
|
||||
outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
|
||||
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
```
|
||||
|
||||
## Translation
|
||||
|
||||
Translate text between languages.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
|
||||
result = translator("Hello, how are you?")
|
||||
print(result[0]["translation_text"]) # "Bonjour, comment allez-vous?"
|
||||
|
||||
# For fine-tuning, similar to summarization with Seq2SeqTrainer
|
||||
```
|
||||
|
||||
## Image Classification
|
||||
|
||||
Classify images into categories.
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoImageProcessor, AutoModelForImageClassification,
|
||||
TrainingArguments, Trainer
|
||||
)
|
||||
from datasets import load_dataset
|
||||
from PIL import Image
|
||||
|
||||
# Load data
|
||||
dataset = load_dataset("food101", split="train[:1000]")
|
||||
|
||||
# Preprocess
|
||||
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
|
||||
|
||||
def transform(examples):
|
||||
examples["pixel_values"] = [
|
||||
processor(img.convert("RGB"), return_tensors="pt")["pixel_values"][0]
|
||||
for img in examples["image"]
|
||||
]
|
||||
return examples
|
||||
|
||||
dataset = dataset.with_transform(transform)
|
||||
|
||||
# Model
|
||||
model = AutoModelForImageClassification.from_pretrained(
|
||||
"google/vit-base-patch16-224",
|
||||
num_labels=101,
|
||||
ignore_mismatched_sizes=True
|
||||
)
|
||||
|
||||
# Training
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
remove_unused_columns=False, # Keep image data
|
||||
eval_strategy="epoch",
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=32,
|
||||
num_train_epochs=3,
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=dataset,
|
||||
tokenizer=processor,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# Inference
|
||||
image = Image.open("food.jpg")
|
||||
inputs = processor(image, return_tensors="pt")
|
||||
outputs = model(**inputs)
|
||||
predicted_class = outputs.logits.argmax(-1).item()
|
||||
print(model.config.id2label[predicted_class])
|
||||
```
|
||||
|
||||
## Object Detection
|
||||
|
||||
Detect and localize objects in images.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
from PIL import Image
|
||||
|
||||
detector = pipeline("object-detection", model="facebook/detr-resnet-50")
|
||||
|
||||
image = Image.open("street.jpg")
|
||||
results = detector(image)
|
||||
|
||||
for result in results:
|
||||
print(f"{result['label']}: {result['score']:.2f} at {result['box']}")
|
||||
# car: 0.98 at {'xmin': 123, 'ymin': 456, 'xmax': 789, 'ymax': 1011}
|
||||
```
|
||||
|
||||
## Image Segmentation
|
||||
|
||||
Segment images into regions.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")
|
||||
|
||||
image = "path/to/image.jpg"
|
||||
segments = segmenter(image)
|
||||
|
||||
for segment in segments:
|
||||
print(f"{segment['label']}: {segment['score']:.2f}")
|
||||
# Access mask: segment['mask']
|
||||
```
|
||||
|
||||
## Image Captioning
|
||||
|
||||
Generate textual descriptions of images.
|
||||
|
||||
```python
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
from PIL import Image
|
||||
|
||||
processor = AutoProcessor.from_pretrained("microsoft/git-base")
|
||||
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
|
||||
|
||||
image = Image.open("photo.jpg")
|
||||
inputs = processor(images=image, return_tensors="pt")
|
||||
|
||||
outputs = model.generate(**inputs, max_length=50)
|
||||
caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]
|
||||
print(caption) # "a dog sitting on grass"
|
||||
```
|
||||
|
||||
## Speech Recognition (ASR)
|
||||
|
||||
Transcribe speech to text.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
transcriber = pipeline(
|
||||
"automatic-speech-recognition",
|
||||
model="openai/whisper-base"
|
||||
)
|
||||
|
||||
result = transcriber("audio.mp3")
|
||||
print(result["text"]) # "Hello, this is a test."
|
||||
|
||||
# With timestamps
|
||||
result = transcriber("audio.mp3", return_timestamps=True)
|
||||
for chunk in result["chunks"]:
|
||||
print(f"[{chunk['timestamp'][0]:.1f}s - {chunk['timestamp'][1]:.1f}s]: {chunk['text']}")
|
||||
```
|
||||
|
||||
## Text-to-Speech
|
||||
|
||||
Generate speech from text.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")
|
||||
|
||||
result = synthesizer("Hello, how are you today?")
|
||||
# result["audio"] contains the waveform
|
||||
# result["sampling_rate"] contains the sample rate
|
||||
|
||||
# Save audio
|
||||
import scipy.io.wavfile  # "import scipy" alone does not expose scipy.io.wavfile
scipy.io.wavfile.write("output.wav", result["sampling_rate"], result["audio"][0])
|
||||
```
|
||||
|
||||
## Visual Question Answering
|
||||
|
||||
Answer questions about images.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
from PIL import Image
|
||||
|
||||
vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
|
||||
|
||||
image = Image.open("photo.jpg")
|
||||
question = "What color is the car?"
|
||||
|
||||
result = vqa(image=image, question=question)
|
||||
print(result[0]["answer"]) # "red"
|
||||
```
|
||||
|
||||
## Document Question Answering
|
||||
|
||||
Extract information from documents (PDFs, images with text).
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
doc_qa = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
|
||||
|
||||
result = doc_qa(
|
||||
image="invoice.png",
|
||||
question="What is the total amount?"
|
||||
)
|
||||
|
||||
print(result["answer"]) # "$1,234.56"
|
||||
```
|
||||
|
||||
## Zero-Shot Classification
|
||||
|
||||
Classify without training data.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
|
||||
|
||||
text = "This is a delicious Italian restaurant with great pasta."
|
||||
candidate_labels = ["food", "travel", "technology", "sports"]
|
||||
|
||||
result = classifier(text, candidate_labels)
|
||||
print(result["labels"][0]) # "food"
|
||||
print(result["scores"][0]) # 0.95
|
||||
```
|
||||
|
||||
## Few-Shot Learning with LLMs
|
||||
|
||||
Use large language models for few-shot tasks.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
|
||||
# Few-shot prompt
|
||||
prompt = """
|
||||
Classify the sentiment: positive, negative, or neutral.
|
||||
|
||||
Text: "I love this product!"
|
||||
Sentiment: positive
|
||||
|
||||
Text: "This is terrible."
|
||||
Sentiment: negative
|
||||
|
||||
Text: "It's okay, nothing special."
|
||||
Sentiment: neutral
|
||||
|
||||
Text: "Best purchase ever!"
|
||||
Sentiment:"""
|
||||
|
||||
inputs = tokenizer(prompt, return_tensors="pt")
|
||||
outputs = model.generate(**inputs, max_new_tokens=5, do_sample=True, temperature=0.1)
|
||||
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
print(response.split("Sentiment:")[-1].strip()) # "positive"
|
||||
```
|
||||
|
||||
## Instruction-Following / Chat
|
||||
|
||||
Use instruction-tuned models.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "What is machine learning?"},
|
||||
]
|
||||
|
||||
formatted = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True
|
||||
)
|
||||
|
||||
inputs = tokenizer(formatted, return_tensors="pt")
|
||||
outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
|
||||
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
|
||||
# Extract assistant response
|
||||
assistant_response = response.split("[/INST]")[-1].strip()
|
||||
print(assistant_response)
|
||||
```
|
||||
|
||||
## Embeddings / Semantic Search
|
||||
|
||||
Generate embeddings for semantic similarity.
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, AutoModel
|
||||
import torch
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
||||
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
||||
|
||||
def get_embedding(text):
|
||||
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs)
|
||||
|
||||
# Mean pooling
|
||||
embeddings = outputs.last_hidden_state.mean(dim=1)
|
||||
return embeddings
|
||||
|
||||
# Get embeddings
|
||||
text1 = "Machine learning is a subset of AI"
|
||||
text2 = "AI includes machine learning"
|
||||
|
||||
emb1 = get_embedding(text1)
|
||||
emb2 = get_embedding(text2)
|
||||
|
||||
# Compute similarity
|
||||
similarity = torch.nn.functional.cosine_similarity(emb1, emb2)
|
||||
print(f"Similarity: {similarity.item():.4f}") # ~0.85
|
||||
```
|
||||
|
||||
## Multimodal Understanding (CLIP)
|
||||
|
||||
Connect vision and language.
|
||||
|
||||
```python
|
||||
from transformers import CLIPProcessor, CLIPModel
|
||||
from PIL import Image
|
||||
|
||||
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
||||
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
||||
|
||||
image = Image.open("photo.jpg")
|
||||
texts = ["a dog", "a cat", "a car", "a house"]
|
||||
|
||||
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
|
||||
outputs = model(**inputs)
|
||||
|
||||
# Get similarity scores
|
||||
logits_per_image = outputs.logits_per_image
|
||||
probs = logits_per_image.softmax(dim=1)
|
||||
|
||||
for text, prob in zip(texts, probs[0]):
|
||||
print(f"{text}: {prob.item():.4f}")
|
||||
```
|
||||
|
||||
## Common Evaluation Metrics
|
||||
|
||||
```python
|
||||
import evaluate  # datasets.load_metric is deprecated/removed; use the evaluate library

# Accuracy (classification)
metric = evaluate.load("accuracy")
predictions = [0, 1, 1, 0]
references = [0, 1, 0, 0]
result = metric.compute(predictions=predictions, references=references)

# F1 Score (classification, NER)
metric = evaluate.load("f1")
result = metric.compute(predictions=predictions, references=references)

# BLEU (translation)
metric = evaluate.load("bleu")
predictions = ["hello there general kenobi"]
references = [["hello there general kenobi", "hello there!"]]
result = metric.compute(predictions=predictions, references=references)

# ROUGE (summarization)
metric = evaluate.load("rouge")
predictions = ["summary text"]
references = ["reference summary"]
result = metric.compute(predictions=predictions, references=references)
|
||||
```
|
||||
|
||||
## Common Data Collators
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
DataCollatorWithPadding,
|
||||
DataCollatorForTokenClassification,
|
||||
DataCollatorForSeq2Seq,
|
||||
DataCollatorForLanguageModeling,
|
||||
)
|
||||
|
||||
# Classification: dynamic padding
|
||||
DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
# NER: pad labels too
|
||||
DataCollatorForTokenClassification(tokenizer=tokenizer)
|
||||
|
||||
# Seq2Seq: pad inputs and labels
|
||||
DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
|
||||
|
||||
# Language modeling: create MLM masks
|
||||
DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
|
||||
```
|
||||
|
||||
This covers the most common task patterns. For detailed parameter tuning, see `api_reference.md` and `generation_strategies.md`.
|
||||
212
scientific-packages/transformers/scripts/fine_tune_classifier.py
Executable file
212
scientific-packages/transformers/scripts/fine_tune_classifier.py
Executable file
@@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Complete example for fine-tuning a text classification model.
|
||||
|
||||
This script demonstrates the full workflow:
|
||||
1. Load dataset
|
||||
2. Preprocess with tokenizer
|
||||
3. Configure model
|
||||
4. Train with Trainer
|
||||
5. Evaluate and save
|
||||
|
||||
Usage:
|
||||
python fine_tune_classifier.py --model bert-base-uncased --dataset imdb --epochs 3
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from datasets import load_dataset
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
TrainingArguments,
|
||||
Trainer,
|
||||
DataCollatorWithPadding,
|
||||
)
|
||||
import evaluate
|
||||
import numpy as np
|
||||
|
||||
|
||||
def compute_metrics(eval_pred):
    """Compute accuracy and F1 for a Hugging Face ``Trainer`` evaluation step.

    Args:
        eval_pred: ``(logits, labels)`` pair as supplied by ``Trainer`` —
            ``logits`` with class scores along the last axis, ``labels`` a
            1-D array of integer class ids.

    Returns:
        dict: ``{"accuracy": float, "f1": float}``.
    """
    metric_accuracy = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")

    logits, labels = eval_pred
    # Predicted class = argmax over the class-score axis.
    predictions = np.argmax(logits, axis=-1)

    accuracy = metric_accuracy.compute(predictions=predictions, references=labels)
    # BUG FIX: the "f1" metric defaults to average="binary", which raises for
    # datasets with more than two classes. This script loads arbitrary datasets
    # and infers num_labels dynamically, so pick the averaging mode from the
    # observed label count (binary behavior is unchanged for 2-class data).
    num_classes = len(np.unique(labels))
    f1 = metric_f1.compute(
        predictions=predictions,
        references=labels,
        average="binary" if num_classes <= 2 else "weighted",
    )

    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}
|
||||
|
||||
|
||||
def main():
    """CLI entry point: fine-tune a sequence-classification model end to end.

    Workflow: parse args -> load dataset -> tokenize -> load model ->
    configure Trainer -> train -> evaluate -> save (optionally push to Hub)
    -> run one demo inference with the saved checkpoint.

    NOTE(review): assumes the dataset has "text" and "label" columns and a
    "test" split (true for imdb, the default) — confirm before using other
    datasets.
    """
    parser = argparse.ArgumentParser(description="Fine-tune a text classification model")
    parser.add_argument(
        "--model",
        type=str,
        default="bert-base-uncased",
        help="Pretrained model name or path",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="imdb",
        help="Dataset name from Hugging Face Hub",
    )
    parser.add_argument(
        "--max-samples",
        type=int,
        default=None,
        help="Maximum samples to use (for quick testing)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for checkpoints",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=3,
        help="Number of training epochs",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=16,
        help="Batch size per device",
    )
    parser.add_argument(
        "--learning-rate",
        type=float,
        default=2e-5,
        help="Learning rate",
    )
    parser.add_argument(
        "--push-to-hub",
        action="store_true",
        help="Push model to Hugging Face Hub after training",
    )

    args = parser.parse_args()

    print("=" * 60)
    print("Text Classification Fine-Tuning")
    print("=" * 60)
    print(f"Model: {args.model}")
    print(f"Dataset: {args.dataset}")
    print(f"Epochs: {args.epochs}")
    print(f"Batch size: {args.batch_size}")
    print(f"Learning rate: {args.learning_rate}")
    print("=" * 60)

    # 1. Load dataset
    print("\n[1/5] Loading dataset...")
    dataset = load_dataset(args.dataset)

    # Optional subsampling for fast smoke tests; eval set is kept 5x smaller.
    if args.max_samples:
        dataset["train"] = dataset["train"].select(range(args.max_samples))
        dataset["test"] = dataset["test"].select(range(args.max_samples // 5))

    print(f"Train samples: {len(dataset['train'])}")
    print(f"Test samples: {len(dataset['test'])}")

    # 2. Preprocess
    print("\n[2/5] Preprocessing data...")
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    def preprocess_function(examples):
        # Truncate only; padding is deferred to the collator (dynamic padding).
        return tokenizer(examples["text"], truncation=True, max_length=512)

    tokenized_dataset = dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # 3. Load model
    print("\n[3/5] Loading model...")

    # Determine number of labels from the training split (classification head size).
    num_labels = len(set(dataset["train"]["label"]))

    model = AutoModelForSequenceClassification.from_pretrained(
        args.model,
        num_labels=num_labels,
    )

    print(f"Number of labels: {num_labels}")
    print(f"Model parameters: {model.num_parameters():,}")

    # 4. Configure training
    print("\n[4/5] Configuring training...")
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        learning_rate=args.learning_rate,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        # Restores the checkpoint with the best eval metric after training.
        load_best_model_at_end=True,
        push_to_hub=args.push_to_hub,
        logging_steps=100,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # 5. Train
    print("\n[5/5] Training...")
    print("-" * 60)
    trainer.train()

    # Evaluate on the held-out split and report the compute_metrics results.
    print("\n" + "=" * 60)
    print("Final Evaluation")
    print("=" * 60)
    metrics = trainer.evaluate()

    print(f"Accuracy: {metrics['eval_accuracy']:.4f}")
    print(f"F1 Score: {metrics['eval_f1']:.4f}")
    print(f"Loss: {metrics['eval_loss']:.4f}")

    # Save model and tokenizer together so the directory is pipeline-loadable.
    print("\n" + "=" * 60)
    print(f"Saving model to {args.output_dir}")
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    if args.push_to_hub:
        print("Pushing to Hugging Face Hub...")
        trainer.push_to_hub()

    print("=" * 60)
    print("Training complete!")
    print("=" * 60)

    # Quick inference example: reload the just-saved checkpoint via pipeline.
    print("\nQuick inference example:")
    from transformers import pipeline

    classifier = pipeline(
        "text-classification",
        model=args.output_dir,
        tokenizer=args.output_dir,
    )

    example_text = "This is a great example of how to use transformers!"
    result = classifier(example_text)
    print(f"Text: {example_text}")
    print(f"Prediction: {result[0]['label']} (score: {result[0]['score']:.4f})")
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
|
||||
232
scientific-packages/transformers/scripts/generate_text.py
Executable file
232
scientific-packages/transformers/scripts/generate_text.py
Executable file
@@ -0,0 +1,232 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Text generation with various strategies.
|
||||
|
||||
This script demonstrates different generation strategies:
|
||||
- Greedy decoding
|
||||
- Beam search
|
||||
- Sampling with temperature
|
||||
- Top-k and top-p sampling
|
||||
|
||||
Usage:
|
||||
python generate_text.py --model gpt2 --prompt "The future of AI" --strategy sampling
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
|
||||
def generate_with_greedy(model, tokenizer, prompt, max_length):
    """Greedy decoding: deterministically pick the top token at every step."""
    banner = "=" * 60
    print("\n" + banner)
    print("GREEDY DECODING")
    print(banner)

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    generated = model.generate(
        **encoded,
        max_new_tokens=max_length,
        # Explicit pad token avoids the missing-pad warning on GPT-style models.
        pad_token_id=tokenizer.eos_token_id,
    )

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(f"\nPrompt: {prompt}")
    print(f"\nGenerated:\n{decoded}")
|
||||
|
||||
|
||||
def generate_with_beam_search(model, tokenizer, prompt, max_length, num_beams=5):
    """Beam search: explore several hypotheses in parallel for higher quality."""
    banner = "=" * 60
    print("\n" + banner)
    print(f"BEAM SEARCH (num_beams={num_beams})")
    print(banner)

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    gen_kwargs = {
        "max_new_tokens": max_length,
        "num_beams": num_beams,
        "early_stopping": True,
        # Block repeated bigrams, a common beam-search degeneracy.
        "no_repeat_ngram_size": 2,
        "pad_token_id": tokenizer.eos_token_id,
    }
    generated = model.generate(**encoded, **gen_kwargs)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(f"\nPrompt: {prompt}")
    print(f"\nGenerated:\n{decoded}")
|
||||
|
||||
|
||||
def generate_with_sampling(model, tokenizer, prompt, max_length, temperature=0.8):
    """Stochastic sampling: draw each token from the temperature-scaled distribution."""
    banner = "=" * 60
    print("\n" + banner)
    print(f"SAMPLING (temperature={temperature})")
    print(banner)

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    generated = model.generate(
        **encoded,
        do_sample=True,  # temperature only takes effect when sampling is on
        temperature=temperature,
        max_new_tokens=max_length,
        pad_token_id=tokenizer.eos_token_id,
    )

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(f"\nPrompt: {prompt}")
    print(f"\nGenerated:\n{decoded}")
|
||||
|
||||
|
||||
def generate_with_top_k_top_p(model, tokenizer, prompt, max_length, top_k=50, top_p=0.95, temperature=0.8):
    """Nucleus sampling: restrict the candidate pool via top-k and top-p before drawing."""
    banner = "=" * 60
    print("\n" + banner)
    print(f"TOP-K TOP-P SAMPLING (k={top_k}, p={top_p}, temp={temperature})")
    print(banner)

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    sampling_config = {
        "do_sample": True,
        "top_k": top_k,
        "top_p": top_p,
        "temperature": temperature,
        # Two complementary anti-repetition knobs: soft penalty + hard n-gram block.
        "repetition_penalty": 1.2,
        "no_repeat_ngram_size": 3,
        "pad_token_id": tokenizer.eos_token_id,
    }
    generated = model.generate(**encoded, max_new_tokens=max_length, **sampling_config)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(f"\nPrompt: {prompt}")
    print(f"\nGenerated:\n{decoded}")
|
||||
|
||||
|
||||
def generate_multiple(model, tokenizer, prompt, max_length, num_sequences=3):
    """Sample several independent continuations of the same prompt."""
    banner = "=" * 60
    print("\n" + banner)
    print(f"MULTIPLE SEQUENCES (n={num_sequences})")
    print(banner)

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    batch = model.generate(
        **encoded,
        do_sample=True,  # sampling is required for the sequences to differ
        num_return_sequences=num_sequences,
        max_new_tokens=max_length,
        temperature=0.9,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )

    print(f"\nPrompt: {prompt}\n")
    for index, sequence in enumerate(batch, 1):
        decoded = tokenizer.decode(sequence, skip_special_tokens=True)
        print(f"\n--- Sequence {index} ---\n{decoded}\n")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: load a causal LM and demo one or all generation strategies.

    Parses arguments, loads the model (optionally with device_map="auto" or
    8-bit quantization), then dispatches to the generate_* helpers via a
    strategy table.
    """
    parser = argparse.ArgumentParser(description="Text generation with various strategies")
    parser.add_argument(
        "--model",
        type=str,
        default="gpt2",
        help="Model name or path",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        required=True,
        help="Input prompt for generation",
    )
    parser.add_argument(
        "--strategy",
        type=str,
        default="all",
        choices=["greedy", "beam", "sampling", "top_k_top_p", "multiple", "all"],
        help="Generation strategy to use",
    )
    parser.add_argument(
        "--max-length",
        type=int,
        default=100,
        help="Maximum number of new tokens to generate",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="auto",
        help="Device (cuda, cpu, or auto)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.8,
        help="Sampling temperature",
    )
    parser.add_argument(
        "--quantize",
        action="store_true",
        help="Use 8-bit quantization",
    )

    args = parser.parse_args()

    print("=" * 60)
    print("Text Generation Demo")
    print("=" * 60)
    print(f"Model: {args.model}")
    print(f"Strategy: {args.strategy}")
    print(f"Max length: {args.max_length}")
    print(f"Device: {args.device}")
    print("=" * 60)

    # Load model and tokenizer
    print("\nLoading model...")

    # "auto" delegates placement to accelerate's device_map; an explicit device
    # falls through to a plain .to(device) after loading.
    if args.device == "auto":
        device_map = "auto"
        device = None
    else:
        device_map = None
        device = args.device

    model_kwargs = {"device_map": device_map} if device_map else {}

    if args.quantize:
        print("Using 8-bit quantization...")
        # NOTE(review): load_in_8bit as a from_pretrained kwarg is deprecated in
        # recent transformers in favor of BitsAndBytesConfig — confirm against
        # the installed version.
        model_kwargs["load_in_8bit"] = True

    model = AutoModelForCausalLM.from_pretrained(args.model, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    if device and not device_map:
        model = model.to(device)

    print(f"Model loaded on: {model.device if hasattr(model, 'device') else 'multiple devices'}")

    # Generate based on strategy: dispatch table of zero-arg thunks so "all"
    # can simply iterate every strategy in order.
    strategies = {
        "greedy": lambda: generate_with_greedy(model, tokenizer, args.prompt, args.max_length),
        "beam": lambda: generate_with_beam_search(model, tokenizer, args.prompt, args.max_length),
        "sampling": lambda: generate_with_sampling(model, tokenizer, args.prompt, args.max_length, args.temperature),
        "top_k_top_p": lambda: generate_with_top_k_top_p(model, tokenizer, args.prompt, args.max_length),
        "multiple": lambda: generate_multiple(model, tokenizer, args.prompt, args.max_length),
    }

    if args.strategy == "all":
        for strategy_fn in strategies.values():
            strategy_fn()
    else:
        strategies[args.strategy]()

    print("\n" + "=" * 60)
    print("Generation complete!")
    print("=" * 60)
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
|
||||
106
scientific-packages/transformers/scripts/quick_inference.py
Executable file
106
scientific-packages/transformers/scripts/quick_inference.py
Executable file
@@ -0,0 +1,106 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick inference script using Transformers pipelines.
|
||||
|
||||
This script demonstrates how to use various pipeline tasks for quick inference
|
||||
without manually managing models, tokenizers, or preprocessing.
|
||||
|
||||
Usage:
|
||||
python quick_inference.py --task text-generation --model gpt2 --input "Hello world"
|
||||
python quick_inference.py --task sentiment-analysis --input "I love this!"
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from transformers import pipeline, infer_device
|
||||
|
||||
|
||||
def main():
    """CLI entry point: run a single pipeline inference and print the result.

    Builds a transformers pipeline for the requested task, then formats the
    output per task family (QA, text generation, classification, or a
    generic fallback that prints the raw result).
    """
    parser = argparse.ArgumentParser(description="Quick inference with Transformers pipelines")
    parser.add_argument(
        "--task",
        type=str,
        required=True,
        help="Pipeline task (text-generation, sentiment-analysis, question-answering, etc.)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=None,
        help="Model name or path (default: use task default)",
    )
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="Input text for inference",
    )
    parser.add_argument(
        "--context",
        type=str,
        default=None,
        help="Context for question-answering tasks",
    )
    parser.add_argument(
        "--max-length",
        type=int,
        default=50,
        help="Maximum generation length",
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device (cuda, cpu, or auto-detect)",
    )

    args = parser.parse_args()

    # Auto-detect device if not specified (transformers.infer_device).
    if args.device is None:
        device = infer_device()
    else:
        device = args.device

    print(f"Using device: {device}")
    print(f"Task: {args.task}")
    print(f"Model: {args.model or 'default'}")
    print("-" * 50)

    # Create pipeline; model=None lets the pipeline pick the task default.
    pipe = pipeline(
        args.task,
        model=args.model,
        device=device,
    )

    # Run inference based on task
    if args.task == "question-answering":
        # QA needs both a question and a context passage.
        if args.context is None:
            print("Error: --context required for question-answering")
            return
        result = pipe(question=args.input, context=args.context)
        print(f"Question: {args.input}")
        print(f"Context: {args.context}")
        print(f"\nAnswer: {result['answer']}")
        print(f"Score: {result['score']:.4f}")

    elif args.task == "text-generation":
        result = pipe(args.input, max_length=args.max_length)
        print(f"Prompt: {args.input}")
        print(f"\nGenerated: {result[0]['generated_text']}")

    elif args.task in ["sentiment-analysis", "text-classification"]:
        result = pipe(args.input)
        print(f"Text: {args.input}")
        print(f"\nLabel: {result[0]['label']}")
        print(f"Score: {result[0]['score']:.4f}")

    else:
        # Generic handling for other tasks: print whatever structure comes back.
        result = pipe(args.input)
        print(f"Input: {args.input}")
        print(f"\nResult: {result}")
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user