Improve the Hugging Face transformers skill
# Transformers API Reference

This reference covers the core classes and APIs in the Transformers library.

## Core Auto Classes

Auto classes provide a convenient way to automatically select the appropriate architecture based on model name or checkpoint.

### AutoTokenizer

```python
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize single text
encoded = tokenizer("Hello, how are you?")
# Returns: {'input_ids': [...], 'attention_mask': [...]}

# Tokenize with options
encoded = tokenizer(
    "Hello, how are you?",
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt"  # "pt" for PyTorch, "tf" for TensorFlow
)

# Tokenize pairs (for classification, QA, etc.)
encoded = tokenizer(
    "Question or sentence A",
    "Context or sentence B",
    padding=True,
    truncation=True
)

# Batch tokenization
texts = ["Text 1", "Text 2", "Text 3"]
encoded = tokenizer(texts, padding=True, truncation=True)

# Decode token IDs back to text
text = tokenizer.decode(encoded["input_ids"][0], skip_special_tokens=True)

# Batch decode
texts = tokenizer.batch_decode(encoded["input_ids"], skip_special_tokens=True)
```

**Key Parameters:**
- `padding`: "max_length", "longest", or True (pad to max in batch)
- `truncation`: True or strategy ("longest_first", "only_first", "only_second")
- `max_length`: Maximum sequence length
- `return_tensors`: "pt" (PyTorch), "tf" (TensorFlow), "np" (NumPy)
- `return_attention_mask`: Return attention masks (default True)
- `return_token_type_ids`: Return token type IDs for pairs (default True)
- `add_special_tokens`: Add special tokens like [CLS], [SEP] (default True)

**Special Properties** (see the sketch below):
- `tokenizer.vocab_size`: Size of vocabulary
- `tokenizer.pad_token_id`: ID of padding token
- `tokenizer.eos_token_id`: ID of end-of-sequence token
- `tokenizer.bos_token_id`: ID of beginning-of-sequence token
- `tokenizer.unk_token_id`: ID of unknown token
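These properties are useful for quick sanity checks. Note that some tokenizers (for example GPT-2's) ship without a padding token, so a common pattern is to reuse the EOS token before batching. A minimal sketch (the model name is just an example):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(tokenizer.vocab_size, tokenizer.eos_token_id, tokenizer.pad_token_id)  # pad_token_id is None

# Reuse EOS as the padding token so batched, padded tokenization works
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

batch = tokenizer(["short", "a slightly longer example"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)
```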
### AutoModel

Base model class that outputs hidden states.

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")

# Forward pass (`inputs` comes from a tokenizer call with return_tensors="pt")
outputs = model(**inputs)

# Access hidden states
last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_length, hidden_size]
pooler_output = outputs.pooler_output  # [batch_size, hidden_size]

# Get all hidden states
model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
outputs = model(**inputs)
all_hidden_states = outputs.hidden_states  # Tuple of tensors
```

### Task-Specific Auto Classes

```python
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoModelForQuestionAnswering,
    AutoModelForCausalLM,
    AutoModelForMaskedLM,
    AutoModelForSeq2SeqLM,
    AutoModelForImageClassification,
    AutoModelForObjectDetection,
    AutoModelForVision2Seq,
)

# Sequence classification (sentiment, topic, etc.)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    label2id={"negative": 0, "neutral": 1, "positive": 2}
)

# Token classification (NER, POS tagging)
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=9  # Number of entity types
)

# Question answering
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Causal language modeling (GPT-style)
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Masked language modeling (BERT-style)
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

# Sequence-to-sequence (T5, BART)
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Image classification
model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224")

# Object detection
model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50")

# Vision-to-text (image captioning, VQA)
model = AutoModelForVision2Seq.from_pretrained("microsoft/git-base")
```

### AutoProcessor

For multimodal models that need both text and image processing.

```python
from transformers import AutoProcessor

# For vision-language models
processor = AutoProcessor.from_pretrained("microsoft/git-base")

# Process image and text
from PIL import Image
image = Image.open("image.jpg")
inputs = processor(images=image, text="caption", return_tensors="pt")

# For audio models (`audio` is a raw waveform array, e.g. loaded with librosa or soundfile)
processor = AutoProcessor.from_pretrained("openai/whisper-base")
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
```

### AutoImageProcessor

For vision-only models.

```python
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

# Process single image
from PIL import Image
image = Image.open("image.jpg")
inputs = processor(image, return_tensors="pt")

# Batch processing
images = [Image.open(f"image{i}.jpg") for i in range(10)]
inputs = processor(images, return_tensors="pt")
```

## Model Loading Options

### from_pretrained Parameters

```python
import torch

model = AutoModel.from_pretrained(
    "model-name",
    # Device and precision
    device_map="auto",  # Automatic device placement
    torch_dtype=torch.float16,  # Use fp16
    low_cpu_mem_usage=True,  # Reduce CPU memory during loading

    # Quantization (choose one or the other, not both)
    load_in_8bit=True,  # 8-bit quantization
    load_in_4bit=True,  # 4-bit quantization

    # Model configuration
    num_labels=3,  # For classification
    id2label={...},  # Label mapping
    label2id={...},

    # Outputs
    output_hidden_states=True,
    output_attentions=True,

    # Trust remote code
    trust_remote_code=True,  # For custom models

    # Caching
    cache_dir="./cache",
    force_download=False,
    resume_download=True,
)
```

### Quantization with BitsAndBytes

```python
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM

# 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto"
)
```

## Training Components

### TrainingArguments

See `training.md` for comprehensive coverage. Key parameters:

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    logging_steps=100,
    save_total_limit=2,
)
```

### Trainer

```python
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[callback1, callback2],
)

# Train
trainer.train()

# Resume from checkpoint
trainer.train(resume_from_checkpoint=True)

# Evaluate
metrics = trainer.evaluate()

# Predict
predictions = trainer.predict(test_dataset)

# Hyperparameter search
# (requires the Trainer to be constructed with model_init=... and the chosen backend installed)
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    n_trials=10,
)

# Save model
trainer.save_model("./final_model")

# Push to Hub
trainer.push_to_hub(commit_message="Training complete")
```
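The `compute_metrics` callable above is user-supplied: it receives the evaluation predictions and returns a dict of metric values. A minimal sketch for classification, assuming the `evaluate` package is installed:

```python
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # eval_pred bundles the model's logits and the reference labels
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)
```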
### Data Collators

```python
from transformers import (
    DataCollatorWithPadding,
    DataCollatorForTokenClassification,
    DataCollatorForSeq2Seq,
    DataCollatorForLanguageModeling,
    DefaultDataCollator,
)

# For classification/regression with dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# For token classification (NER)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# For seq2seq tasks
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# For language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,  # True for masked LM, False for causal LM
    mlm_probability=0.15
)

# Default (no special handling)
data_collator = DefaultDataCollator()
```
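Collators are ordinary callables that turn a list of feature dicts into a padded batch, so they can also be used directly with a PyTorch `DataLoader` outside of `Trainer`. A small self-contained sketch:

```python
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize without padding; the collator pads each batch dynamically
texts = ["a short example", "a slightly longer example sentence"]
tokenized_dataset = [tokenizer(t) for t in texts]

loader = DataLoader(tokenized_dataset, batch_size=2, collate_fn=data_collator)
for batch in loader:
    print(batch["input_ids"].shape)  # padded to the longest sequence in the batch
```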
## Generation Components

### GenerationConfig

See `generation_strategies.md` for comprehensive coverage.

```python
from transformers import GenerationConfig

config = GenerationConfig(
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    num_beams=5,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
)

# Use with model
outputs = model.generate(**inputs, generation_config=config)
```

### generate() Method

```python
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    num_return_sequences=3,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
```

## Pipeline API

See `pipelines.md` for comprehensive coverage.

```python
from transformers import pipeline

# Basic usage
pipe = pipeline("task-name", model="model-name", device=0)
results = pipe(inputs)

# With custom model
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("model-name")
tokenizer = AutoTokenizer.from_pretrained("model-name")
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
```

## Configuration Classes

### Model Configuration

```python
from transformers import AutoConfig

# Load configuration
config = AutoConfig.from_pretrained("bert-base-uncased")

# Access configuration
print(config.hidden_size)
print(config.num_attention_heads)
print(config.num_hidden_layers)

# Modify configuration
config.num_labels = 5
config.output_hidden_states = True

# Create model with config
model = AutoModel.from_config(config)

# Save configuration
config.save_pretrained("./config")
```

## Utilities

### Hub Utilities

```python
from huggingface_hub import login, snapshot_download

# Login
login(token="hf_...")

# Download model
snapshot_download(repo_id="model-name", cache_dir="./cache")

# Push to Hub
model.push_to_hub("username/model-name", commit_message="Initial commit")
tokenizer.push_to_hub("username/model-name")
```

### Evaluation Metrics

```python
import evaluate

# Load metric
metric = evaluate.load("accuracy")

# Compute metric
results = metric.compute(predictions=predictions, references=labels)

# Common metrics
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
```

## Model Outputs

All models return dataclass objects with named attributes:

```python
# Sequence classification output
outputs = model(**inputs)
logits = outputs.logits  # [batch_size, num_labels]
loss = outputs.loss  # If labels provided

# Causal LM output
outputs = model(**inputs)
logits = outputs.logits  # [batch_size, seq_length, vocab_size]
past_key_values = outputs.past_key_values  # KV cache

# Seq2Seq output
outputs = model(**inputs, labels=labels)
loss = outputs.loss
logits = outputs.logits
encoder_last_hidden_state = outputs.encoder_last_hidden_state

# Convert to a plain tuple or dict
outputs_tuple = outputs.to_tuple()
outputs_dict = dict(outputs)
```

## Best Practices

1. **Use Auto classes**: AutoModel, AutoTokenizer for flexibility
2. **Device management**: Use `device_map="auto"` for multi-GPU
3. **Memory optimization**: Use `torch_dtype=torch.float16` and quantization
4. **Caching**: Set `cache_dir` to avoid re-downloading
5. **Batch processing**: Process multiple inputs at once for efficiency
6. **Trust remote code**: Only set `trust_remote_code=True` for trusted sources
scientific-packages/transformers/references/generation.md
# Text Generation

## Overview

Generate text with language models using the `generate()` method. Control output quality and style through generation strategies and parameters.

## Basic Generation

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Tokenize input
inputs = tokenizer("Once upon a time", return_tensors="pt")

# Generate
outputs = model.generate(**inputs, max_new_tokens=50)

# Decode
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text)
```

## Generation Strategies

### Greedy Decoding

Select the highest-probability token at each step (deterministic):

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=False  # Greedy decoding (default)
)
```

**Use for**: Factual text, translation, and other tasks where determinism is needed.

### Sampling

Randomly sample from the probability distribution:

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95
)
```

**Use for**: Creative writing, diverse outputs, open-ended generation.

### Beam Search

Explore multiple hypotheses in parallel:

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    num_beams=5,
    early_stopping=True
)
```

**Use for**: Translation, summarization, and other tasks where quality is critical.

### Contrastive Search

Balance quality and diversity:

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    penalty_alpha=0.6,
    top_k=4
)
```

**Use for**: Long-form generation, reducing repetition.

## Key Parameters

### Length Control

**max_new_tokens**: Maximum tokens to generate
```python
max_new_tokens=100  # Generate up to 100 new tokens
```

**max_length**: Maximum total length (input + output)
```python
max_length=512  # Total sequence length
```

**min_new_tokens**: Minimum tokens to generate
```python
min_new_tokens=50  # Force at least 50 tokens
```

**min_length**: Minimum total length
```python
min_length=100
```

### Temperature

Controls randomness (only applies when sampling is enabled):

```python
temperature=1.0  # Default, balanced
temperature=0.7  # More focused, less random
temperature=1.5  # More creative, more random
```

Lower temperature → more deterministic; higher temperature → more random.

### Top-K Sampling

Consider only the K most likely tokens:

```python
do_sample=True
top_k=50  # Sample from top 50 tokens
```

**Common values**: 40-100 for balanced output, 10-20 for focused output.

### Top-P (Nucleus) Sampling

Consider tokens with cumulative probability ≥ P:

```python
do_sample=True
top_p=0.95  # Sample from smallest set with 95% cumulative probability
```

**Common values**: 0.9-0.95 for balanced, 0.7-0.85 for focused.

### Repetition Penalty

Discourage repetition:

```python
repetition_penalty=1.2  # Penalize repeated tokens
```

**Values**: 1.0 = no penalty, 1.2-1.5 = moderate, 2.0+ = strong penalty.

### Beam Search Parameters

**num_beams**: Number of beams
```python
num_beams=5  # Keep 5 hypotheses
```

**early_stopping**: Stop when num_beams sentences are finished
```python
early_stopping=True
```

**no_repeat_ngram_size**: Prevent n-gram repetition
```python
no_repeat_ngram_size=3  # Don't repeat any 3-gram
```

### Output Control

**num_return_sequences**: Generate multiple outputs
```python
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    num_beams=5,
    num_return_sequences=3  # Return 3 different sequences
)
```

**pad_token_id**: Specify padding token
```python
pad_token_id=tokenizer.eos_token_id
```

**eos_token_id**: Stop generation at a specific token
```python
eos_token_id=tokenizer.eos_token_id
```

## Advanced Features

### Batch Generation

Generate for multiple prompts:

```python
prompts = ["Hello, my name is", "Once upon a time"]
tokenizer.pad_token = tokenizer.eos_token  # GPT-2's tokenizer has no pad token by default
inputs = tokenizer(prompts, return_tensors="pt", padding=True)

outputs = model.generate(**inputs, max_new_tokens=50)

for i, output in enumerate(outputs):
    text = tokenizer.decode(output, skip_special_tokens=True)
    print(f"Prompt {i}: {text}\n")
```

### Streaming Generation

Stream tokens as they are generated:

```python
from transformers import TextIteratorStreamer
from threading import Thread

streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

generation_kwargs = dict(
    inputs,
    streamer=streamer,
    max_new_tokens=100
)

thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

for text in streamer:
    print(text, end="", flush=True)

thread.join()
```

### Constrained Generation

Force specific token sequences:

```python
# Force specific words to appear somewhere in the output
force_words = ["Paris", "France"]
force_words_ids = [tokenizer.encode(word, add_special_tokens=False) for word in force_words]

outputs = model.generate(
    **inputs,
    force_words_ids=force_words_ids,
    num_beams=5
)
```

### Guidance and Control

**Prevent bad words:**
```python
bad_words = ["offensive", "inappropriate"]
bad_words_ids = [tokenizer.encode(word, add_special_tokens=False) for word in bad_words]

outputs = model.generate(
    **inputs,
    bad_words_ids=bad_words_ids
)
```

### Generation Config

Save and reuse generation parameters:

```python
from transformers import GenerationConfig

# Create config
generation_config = GenerationConfig(
    max_new_tokens=100,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    do_sample=True
)

# Save
generation_config.save_pretrained("./my_generation_config")

# Load and use
generation_config = GenerationConfig.from_pretrained("./my_generation_config")
outputs = model.generate(**inputs, generation_config=generation_config)
```

## Model-Specific Generation

### Chat Models

Use chat templates:

```python
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"}
]

# add_generation_prompt appends the assistant prefix so the model continues as the assistant
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(input_text, return_tensors="pt")

outputs = model.generate(**inputs, max_new_tokens=100)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
```

### Encoder-Decoder Models

For T5, BART, etc.:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# T5 uses task prefixes
input_text = "translate English to French: Hello, how are you?"
inputs = tokenizer(input_text, return_tensors="pt")

outputs = model.generate(**inputs, max_new_tokens=50)
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
```

## Optimization

### Caching

Enable the KV cache for faster generation:

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    use_cache=True  # Default, faster generation
)
```

### Static Cache

For fixed sequence lengths:

```python
from transformers import StaticCache

cache = StaticCache(model.config, max_batch_size=1, max_cache_len=1024, device="cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    past_key_values=cache
)
```

### Attention Implementation

Use Flash Attention for speed:

```python
model = AutoModelForCausalLM.from_pretrained(
    "model-id",
    attn_implementation="flash_attention_2"
)
```

## Generation Recipes

### Creative Writing

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2
)
```

### Factual Generation

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=False,  # Greedy
    repetition_penalty=1.1
)
```

### Diverse Outputs

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    num_beams=5,
    num_return_sequences=5,
    temperature=1.5,
    do_sample=True
)
```

### Long-Form Generation

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=1000,
    penalty_alpha=0.6,  # Contrastive search
    top_k=4,
    repetition_penalty=1.2
)
```

### Translation/Summarization

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    num_beams=5,
    early_stopping=True,
    no_repeat_ngram_size=3
)
```

## Common Issues

**Repetitive output:**
- Increase repetition_penalty (1.2-1.5)
- Use no_repeat_ngram_size (2-3)
- Try contrastive search
- Lower temperature

**Poor quality:**
- Use beam search (num_beams=5)
- Lower temperature
- Adjust top_k/top_p

**Too deterministic:**
- Enable sampling (do_sample=True)
- Increase temperature (0.7-1.0)
- Adjust top_k/top_p

**Slow generation:**
- Reduce batch size
- Enable use_cache=True
- Use Flash Attention
- Reduce max_new_tokens
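As an illustration, the anti-repetition fixes listed under "Repetitive output" above can be combined in a single call; the values here are reasonable starting points rather than tuned recommendations:

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,          # lower temperature to reduce rambling
    repetition_penalty=1.3,   # moderate penalty on repeated tokens
    no_repeat_ngram_size=3,   # forbid repeating any 3-gram
)
```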
## Best Practices

1. **Start with defaults**: Then tune based on output
2. **Use appropriate strategy**: Greedy for factual, sampling for creative
3. **Set max_new_tokens**: Avoid unnecessarily long generation
4. **Enable caching**: For faster sequential generation
5. **Tune temperature**: Most impactful parameter for sampling
6. **Use beam search carefully**: Slower but higher quality
7. **Test different seeds**: For reproducibility with sampling
8. **Monitor memory**: Large beams use significant memory
# Text Generation Strategies

Transformers provides flexible text generation capabilities through the `generate()` method, supporting multiple decoding strategies and configuration options.

## Basic Generation

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

inputs = tokenizer("Once upon a time", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
generated_text = tokenizer.decode(outputs[0])
```

## Decoding Strategies

### 1. Greedy Decoding

Selects the token with the highest probability at each step. Deterministic but can be repetitive.

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=False,
    num_beams=1  # Greedy is default when num_beams=1 and do_sample=False
)
```

### 2. Beam Search

Explores multiple hypotheses simultaneously, keeping the top-k candidates at each step.

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    num_beams=5,  # Number of beams
    early_stopping=True,  # Stop when all beams reach EOS
    no_repeat_ngram_size=2,  # Prevent repeating n-grams
)
```

**Key parameters:**
- `num_beams`: Number of beams (higher = more thorough but slower)
- `early_stopping`: Stop when all beams finish (True/False)
- `length_penalty`: Exponential penalty for length (>1.0 favors longer sequences)
- `no_repeat_ngram_size`: Prevent repeating n-grams

### 3. Sampling (Multinomial)

Samples from the probability distribution, introducing randomness and diversity.

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,  # Controls randomness (lower = more focused)
    top_k=50,  # Consider only top-k tokens
    top_p=0.9,  # Nucleus sampling (cumulative probability threshold)
)
```

**Key parameters:**
- `temperature`: Scales logits before softmax (0.1-2.0 typical range)
  - Lower (0.1-0.7): More focused, deterministic
  - Higher (0.8-1.5): More creative, random
- `top_k`: Sample from top-k tokens only
- `top_p`: Nucleus sampling - sample from the smallest set with cumulative probability > p

### 4. Beam Search with Sampling

Combines beam search with sampling for diverse but coherent outputs.

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    num_beams=5,
    do_sample=True,
    temperature=0.8,
    top_k=50,
)
```

### 5. Contrastive Search

Balances coherence and diversity using a contrastive objective.

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    penalty_alpha=0.6,  # Contrastive penalty
    top_k=4,  # Consider top-k candidates
)
```

### 6. Assisted Decoding

Uses a smaller "assistant" model to speed up generation with a larger model.

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2-large")
assistant_model = AutoModelForCausalLM.from_pretrained("gpt2")

outputs = model.generate(
    **inputs,
    assistant_model=assistant_model,
    max_new_tokens=50,
)
```

## GenerationConfig

Configure generation parameters with `GenerationConfig` for reusability.

```python
from transformers import GenerationConfig

generation_config = GenerationConfig(
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
)

# Use with model
outputs = model.generate(**inputs, generation_config=generation_config)

# Save and load
generation_config.save_pretrained("./config")
loaded_config = GenerationConfig.from_pretrained("./config")
```

## Key Parameters Reference

### Output Length Control

- `max_length`: Maximum total tokens (input + output)
- `max_new_tokens`: Maximum new tokens to generate (recommended over max_length)
- `min_length`: Minimum total tokens
- `min_new_tokens`: Minimum new tokens to generate

### Sampling Parameters

- `temperature`: Sampling temperature (0.1-2.0, default 1.0)
- `top_k`: Top-k sampling (1-100, typically 50)
- `top_p`: Nucleus sampling (0.0-1.0, typically 0.9)
- `do_sample`: Enable sampling (True/False)

### Beam Search Parameters

- `num_beams`: Number of beams (1-20, typically 5)
- `early_stopping`: Stop when beams finish (True/False)
- `length_penalty`: Length penalty (>1.0 favors longer, <1.0 favors shorter)
- `num_beam_groups`: Diverse beam search groups
- `diversity_penalty`: Penalty for similar beams

### Repetition Control

- `repetition_penalty`: Penalty for repeating tokens (1.0-2.0, default 1.0)
- `no_repeat_ngram_size`: Prevent repeating n-grams (2-5 typical)
- `encoder_repetition_penalty`: Penalty for repeating encoder tokens

### Special Tokens

- `bos_token_id`: Beginning of sequence token
- `eos_token_id`: End of sequence token (or list of tokens)
- `pad_token_id`: Padding token
- `forced_bos_token_id`: Force specific token at beginning
- `forced_eos_token_id`: Force specific token at end

### Multiple Sequences

- `num_return_sequences`: Number of sequences to return
- `num_beam_groups`: Number of diverse beam groups
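To show how the beam-group and multiple-sequence parameters above fit together, here is a small sketch of diverse beam search. Note that `num_beams` must be divisible by `num_beam_groups`, `diversity_penalty` only applies with more than one group, and diverse beam search does not use sampling:

```python
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    num_beams=6,
    num_beam_groups=3,       # 3 groups of 2 beams each
    diversity_penalty=1.0,   # push groups toward different continuations
    num_return_sequences=3,  # return the 3 best sequences (must be <= num_beams)
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
```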
## Advanced Generation Techniques

### Constrained Generation

Force generation to include specific tokens or follow patterns.

```python
from transformers import PhrasalConstraint

constraints = [
    PhrasalConstraint(tokenizer("New York", add_special_tokens=False).input_ids)
]

outputs = model.generate(
    **inputs,
    constraints=constraints,
    num_beams=5,
)
```

### Streaming Generation

Generate tokens one at a time for real-time display.

```python
from transformers import TextIteratorStreamer
from threading import Thread

streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

generation_kwargs = dict(
    **inputs,
    max_new_tokens=100,
    streamer=streamer,
)

thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

for new_text in streamer:
    print(new_text, end="", flush=True)

thread.join()
```

### Logit Processors

Customize token selection with custom logit processors.

```python
from transformers import LogitsProcessor, LogitsProcessorList

class CustomLogitsProcessor(LogitsProcessor):
    def __call__(self, input_ids, scores):
        # Modify scores here
        return scores

logits_processor = LogitsProcessorList([CustomLogitsProcessor()])

outputs = model.generate(
    **inputs,
    logits_processor=logits_processor,
)
```

### Stopping Criteria

Define custom stopping conditions.

```python
from transformers import StoppingCriteria, StoppingCriteriaList

class CustomStoppingCriteria(StoppingCriteria):
    def __call__(self, input_ids, scores, **kwargs):
        # Return True to stop generation
        return False

stopping_criteria = StoppingCriteriaList([CustomStoppingCriteria()])

outputs = model.generate(
    **inputs,
    stopping_criteria=stopping_criteria,
)
```

## Best Practices

### For Creative Tasks (Stories, Dialogue)
```python
outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.8,
    top_p=0.95,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
)
```

### For Factual Tasks (Summaries, QA)
```python
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    num_beams=4,
    early_stopping=True,
    no_repeat_ngram_size=2,
    length_penalty=1.0,
)
```

### For Chat/Instruction Following
```python
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
)
```

## Vision-Language Model Generation

For models like LLaVA, BLIP-2, etc.:

```python
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image

model = AutoModelForVision2Seq.from_pretrained("llava-hf/llava-1.5-7b-hf")
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

image = Image.open("image.jpg")
inputs = processor(text="Describe this image", images=image, return_tensors="pt")

outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
)

generated_text = processor.decode(outputs[0], skip_special_tokens=True)
```

## Performance Optimization

### Use KV Cache
```python
# KV cache is enabled by default
outputs = model.generate(**inputs, use_cache=True)
```

### Mixed Precision
```python
import torch

with torch.cuda.amp.autocast():
    outputs = model.generate(**inputs, max_new_tokens=100)
```

### Batch Generation
```python
texts = ["Prompt 1", "Prompt 2", "Prompt 3"]
inputs = tokenizer(texts, return_tensors="pt", padding=True)
outputs = model.generate(**inputs, max_new_tokens=50)
```

### Quantization
```python
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto"
)
```
scientific-packages/transformers/references/models.md
# Model Loading and Management

## Overview

The transformers library provides flexible model loading with automatic architecture detection, device management, and configuration control.

## Loading Models

### AutoModel Classes

Use AutoModel classes for automatic architecture selection:

```python
from transformers import AutoModel, AutoModelForSequenceClassification, AutoModelForCausalLM

# Base model (no task head)
model = AutoModel.from_pretrained("bert-base-uncased")

# Sequence classification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Causal language modeling (GPT-style)
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Masked language modeling (BERT-style)
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

# Sequence-to-sequence (T5-style)
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
```

### Common AutoModel Classes

**NLP Tasks:**
- `AutoModelForSequenceClassification`: Text classification, sentiment analysis
- `AutoModelForTokenClassification`: NER, POS tagging
- `AutoModelForQuestionAnswering`: Extractive QA
- `AutoModelForCausalLM`: Text generation (GPT, Llama)
- `AutoModelForMaskedLM`: Masked language modeling (BERT)
- `AutoModelForSeq2SeqLM`: Translation, summarization (T5, BART)

**Vision Tasks:**
- `AutoModelForImageClassification`: Image classification
- `AutoModelForObjectDetection`: Object detection
- `AutoModelForImageSegmentation`: Image segmentation

**Audio Tasks:**
- `AutoModelForAudioClassification`: Audio classification
- `AutoModelForSpeechSeq2Seq`: Speech recognition

**Multimodal:**
- `AutoModelForVision2Seq`: Image captioning, VQA

## Loading Parameters

### Basic Parameters

**pretrained_model_name_or_path**: Model identifier or local path
```python
model = AutoModel.from_pretrained("bert-base-uncased")  # From Hub
model = AutoModel.from_pretrained("./local/model/path")  # From disk
```

**num_labels**: Number of output labels for classification
```python
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3
)
```

**cache_dir**: Custom cache location
```python
model = AutoModel.from_pretrained("model-id", cache_dir="./my_cache")
```

### Device Management

**device_map**: Automatic device allocation for large models
```python
# Automatically distribute across GPUs and CPU
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto"
)

# Sequential placement
model = AutoModelForCausalLM.from_pretrained(
    "model-id",
    device_map="sequential"
)

# Custom device map
device_map = {
    "transformer.layers.0": 0,  # GPU 0
    "transformer.layers.1": 1,  # GPU 1
    "transformer.layers.2": "cpu",  # CPU
}
model = AutoModel.from_pretrained("model-id", device_map=device_map)
```

Manual device placement:
```python
import torch
model = AutoModel.from_pretrained("model-id")
model.to("cuda:0")  # Move to GPU 0
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
```

### Precision Control

**torch_dtype**: Set model precision
```python
import torch

# Float16 (half precision)
model = AutoModel.from_pretrained("model-id", torch_dtype=torch.float16)

# BFloat16 (better range than float16)
model = AutoModel.from_pretrained("model-id", torch_dtype=torch.bfloat16)

# Auto (use original dtype)
model = AutoModel.from_pretrained("model-id", torch_dtype="auto")
```

### Attention Implementation

**attn_implementation**: Choose attention mechanism
```python
# Scaled Dot Product Attention (PyTorch 2.0+)
model = AutoModel.from_pretrained("model-id", attn_implementation="sdpa")

# Flash Attention 2 (requires flash-attn package)
model = AutoModel.from_pretrained("model-id", attn_implementation="flash_attention_2")

# Eager (most compatible)
model = AutoModel.from_pretrained("model-id", attn_implementation="eager")
```

### Memory Optimization

**low_cpu_mem_usage**: Reduce CPU memory during loading
```python
model = AutoModelForCausalLM.from_pretrained(
    "large-model-id",
    low_cpu_mem_usage=True,
    device_map="auto"
)
```

**load_in_8bit**: 8-bit quantization (requires bitsandbytes)
```python
model = AutoModelForCausalLM.from_pretrained(
    "model-id",
    load_in_8bit=True,
    device_map="auto"
)
```

**load_in_4bit**: 4-bit quantization
```python
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "model-id",
    quantization_config=quantization_config,
    device_map="auto"
)
```

## Model Configuration

### Loading with Custom Config

```python
from transformers import AutoConfig, AutoModel

# Load and modify config
config = AutoConfig.from_pretrained("bert-base-uncased")
config.hidden_dropout_prob = 0.2
config.attention_probs_dropout_prob = 0.2

# Initialize model with custom config
model = AutoModel.from_pretrained("bert-base-uncased", config=config)
```

### Initializing from Config Only

```python
config = AutoConfig.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_config(config)  # Random weights
```

## Model Modes

### Training vs Evaluation Mode

Models load in evaluation mode by default:

```python
model = AutoModel.from_pretrained("model-id")
print(model.training)  # False

# Switch to training mode
model.train()

# Switch back to evaluation mode
model.eval()
```

Evaluation mode disables dropout and uses the running statistics of batch normalization layers.

## Saving Models

### Save Locally

```python
model.save_pretrained("./my_model")
```

This creates:
- `config.json`: Model configuration
- `pytorch_model.bin` or `model.safetensors`: Model weights

### Save to Hugging Face Hub

```python
model.push_to_hub("username/model-name")

# With custom commit message
model.push_to_hub("username/model-name", commit_message="Update model")

# Private repository
model.push_to_hub("username/model-name", private=True)
```

## Model Inspection

### Parameter Count

```python
# Total parameters
total_params = model.num_parameters()

# Trainable parameters only
trainable_params = model.num_parameters(only_trainable=True)

print(f"Total: {total_params:,}")
print(f"Trainable: {trainable_params:,}")
```

### Memory Footprint

```python
memory_bytes = model.get_memory_footprint()
memory_mb = memory_bytes / 1024**2
print(f"Memory: {memory_mb:.2f} MB")
```

### Model Architecture

```python
print(model)  # Print full architecture

# Access specific components
print(model.config)
print(model.base_model)
```

## Forward Pass

Basic inference:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("model-id")
model = AutoModelForSequenceClassification.from_pretrained("model-id")

inputs = tokenizer("Sample text", return_tensors="pt")
outputs = model(**inputs)

logits = outputs.logits
predictions = logits.argmax(dim=-1)
```

## Model Formats

### SafeTensors vs PyTorch

SafeTensors is faster to load and safer than pickle-based `.bin` checkpoints:

```python
# Save as safetensors (recommended)
model.save_pretrained("./model", safe_serialization=True)

# Load either format automatically
model = AutoModel.from_pretrained("./model")
```

### ONNX Export

Export for optimized inference:

```python
from pathlib import Path
from transformers.onnx import export

# Export to ONNX
# `config` here is an ONNX export config for the model; depending on the transformers
# version an `opset` argument is also required, and the `optimum` library is now the
# recommended export route.
export(
    tokenizer=tokenizer,
    model=model,
    config=config,
    output=Path("model.onnx")
)
```

## Best Practices

1. **Use AutoModel classes**: Automatic architecture detection
2. **Specify dtype explicitly**: Control precision and memory
3. **Use device_map="auto"**: For large models
4. **Enable low_cpu_mem_usage**: When loading large models
5. **Use safetensors format**: Faster and safer serialization
6. **Check model.training**: Ensure correct mode for task
7. **Consider quantization**: For deployment on resource-constrained devices
8. **Cache models locally**: Set the TRANSFORMERS_CACHE environment variable (see the sketch below)
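For item 8, the cache location can be pointed at a shared directory through the environment, either in the shell or before transformers is imported. A small sketch (the paths are just examples):

```python
import os

# Example paths; set these before importing transformers so they take effect
os.environ["TRANSFORMERS_CACHE"] = "/data/hf_cache"
# HF_HOME also works and covers other Hugging Face caches (hub, datasets)
os.environ.setdefault("HF_HOME", "/data/hf_home")

from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")  # downloads into the configured cache
```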
## Common Issues

**CUDA out of memory:**
```python
# Use smaller precision
model = AutoModel.from_pretrained("model-id", torch_dtype=torch.float16)

# Or use quantization
model = AutoModel.from_pretrained("model-id", load_in_8bit=True)

# Or use CPU
model = AutoModel.from_pretrained("model-id", device_map="cpu")
```

**Slow loading:**
```python
# Enable low CPU memory mode
model = AutoModel.from_pretrained("model-id", low_cpu_mem_usage=True)
```

**Model not found:**
```python
# Verify the model ID on huggingface.co
# Check authentication for private models
from huggingface_hub import login
login()
```
@@ -1,234 +1,335 @@
|
||||
# Transformers Pipelines
|
||||
# Pipeline API Reference
|
||||
|
||||
Pipelines provide a simple and optimized interface for inference across many machine learning tasks. They abstract away the complexity of tokenization, model invocation, and post-processing.
|
||||
## Overview
|
||||
|
||||
## Usage Pattern
|
||||
Pipelines provide the simplest way to use pre-trained models for inference. They abstract away tokenization, model loading, and post-processing, offering a unified interface for dozens of tasks.
|
||||
|
||||
## Basic Usage
|
||||
|
||||
Create a pipeline by specifying a task:
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
# Basic usage
|
||||
classifier = pipeline("text-classification")
|
||||
result = classifier("This movie was amazing!")
|
||||
|
||||
# With specific model
|
||||
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
|
||||
result = classifier("This movie was amazing!")
|
||||
# Auto-select default model for task
|
||||
pipe = pipeline("text-classification")
|
||||
result = pipe("This is great!")
|
||||
```
|
||||
|
||||
## Natural Language Processing Pipelines
|
||||
Or specify a model:
|
||||
|
||||
### Text Classification
|
||||
```python
|
||||
pipe = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
|
||||
```
|
||||
|
||||
## Supported Tasks
|
||||
|
||||
### Natural Language Processing
|
||||
|
||||
**text-generation**: Generate text continuations
|
||||
```python
|
||||
generator = pipeline("text-generation", model="gpt2")
|
||||
output = generator("Once upon a time", max_length=50, num_return_sequences=2)
|
||||
```
|
||||
|
||||
**text-classification**: Classify text into categories
|
||||
```python
|
||||
classifier = pipeline("text-classification")
|
||||
classifier("I love this product!")
|
||||
# [{'label': 'POSITIVE', 'score': 0.9998}]
|
||||
result = classifier("I love this product!") # Returns label and score
|
||||
```
|
||||
|
||||
### Zero-Shot Classification
|
||||
**token-classification**: Label individual tokens (NER, POS tagging)
|
||||
```python
|
||||
classifier = pipeline("zero-shot-classification")
|
||||
classifier("This is about climate change", candidate_labels=["politics", "science", "sports"])
|
||||
ner = pipeline("token-classification", model="dslim/bert-base-NER")
|
||||
entities = ner("Hugging Face is based in New York City")
|
||||
```
|
||||
|
||||
### Token Classification (NER)
|
||||
```python
|
||||
ner = pipeline("token-classification")
|
||||
ner("My name is Sarah and I work at Microsoft in Seattle")
|
||||
```
|
||||
|
||||
### Question Answering
|
||||
**question-answering**: Extract answers from context
|
||||
```python
|
||||
qa = pipeline("question-answering")
|
||||
qa(question="What is the capital?", context="The capital of France is Paris.")
|
||||
result = qa(question="What is the capital?", context="Paris is the capital of France.")
|
||||
```
|
||||
|
||||
### Text Generation
|
||||
**fill-mask**: Predict masked tokens
|
||||
```python
|
||||
generator = pipeline("text-generation")
|
||||
generator("Once upon a time", max_length=50)
|
||||
unmasker = pipeline("fill-mask", model="bert-base-uncased")
|
||||
result = unmasker("Paris is the [MASK] of France")
|
||||
```
|
||||
|
||||
### Text2Text Generation
|
||||
**summarization**: Summarize long texts
|
||||
```python
|
||||
generator = pipeline("text2text-generation", model="t5-base")
|
||||
generator("translate English to French: Hello")
|
||||
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
summary = summarizer("Long article text...", max_length=130, min_length=30)
```

**translation**: Translate between languages

```python
translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
result = translator("Hello, how are you?")
```

**zero-shot-classification**: Classify without training data

```python
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = classifier(
    "This is a course about Python programming",
    candidate_labels=["education", "politics", "business"]
)
```

**sentiment-analysis**: Alias for text-classification focused on sentiment

```python
sentiment = pipeline("sentiment-analysis")
result = sentiment("This product exceeded my expectations!")
```

**fill-mask**: Fill in masked tokens

```python
unmasker = pipeline("fill-mask")
result = unmasker("Paris is the [MASK] of France.")
```

**feature-extraction**: Extract embeddings from text

```python
extractor = pipeline("feature-extraction")
embeddings = extractor("This is a sentence")
```

**table-question-answering**: Answer questions about tabular data

```python
table_qa = pipeline("table-question-answering")
result = table_qa(table=data, query="How many employees?")
```

### Computer Vision

**image-classification**: Classify images

```python
classifier = pipeline("image-classification", model="google/vit-base-patch16-224")
result = classifier("path/to/image.jpg")

# Or use a PIL Image or URL
from PIL import Image
result = classifier(Image.open("image.jpg"))
```

**object-detection**: Detect objects in images

```python
detector = pipeline("object-detection", model="facebook/detr-resnet-50")
results = detector("image.jpg")  # Returns bounding boxes and labels
```

**zero-shot-object-detection**: Detect objects described by arbitrary labels

```python
detector = pipeline("zero-shot-object-detection")
results = detector("image.jpg", candidate_labels=["car", "person", "tree"])
```

**image-segmentation**: Segment images

```python
segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")
segments = segmenter("image.jpg")
```

**depth-estimation**: Estimate depth from images

```python
depth = pipeline("depth-estimation", model="Intel/dpt-large")
result = depth("image.jpg")
```

**zero-shot-image-classification**: Classify images without training

```python
classifier = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")
result = classifier("image.jpg", candidate_labels=["cat", "dog", "bird"])
```

**image-to-image**: Transform images (with specific models)

```python
img2img = pipeline("image-to-image", model="lllyasviel/sd-controlnet-canny")
result = img2img("input.jpg")
```

**video-classification**: Classify videos

```python
classifier = pipeline("video-classification")
result = classifier("video.mp4")
```

**keypoint-matching**: Match keypoints between two images

```python
matcher = pipeline("keypoint-matching")
matches = matcher(image1="img1.jpg", image2="img2.jpg")
```

### Audio

**automatic-speech-recognition**: Transcribe speech

```python
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
text = asr("audio.mp3")
```

**audio-classification**: Classify audio

```python
classifier = pipeline("audio-classification", model="MIT/ast-finetuned-audioset-10-10-0.4593")
result = classifier("audio.wav")
```

**zero-shot-audio-classification**: Classify audio without training

```python
classifier = pipeline("zero-shot-audio-classification")
result = classifier("audio.wav", candidate_labels=["speech", "music", "noise"])
```

**text-to-speech** (also `text-to-audio`): Generate speech from text (with specific models)

```python
tts = pipeline("text-to-speech", model="microsoft/speecht5_tts")
audio = tts("Hello, this is a test")
```

### Multimodal

**visual-question-answering**: Answer questions about images

```python
vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
result = vqa(image="image.jpg", question="What color is the car?")
```

**document-question-answering**: Answer questions about documents

```python
doc_qa = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
result = doc_qa(image="document.png", question="What is the invoice number?")
```

**image-to-text**: Generate captions for images

```python
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
caption = captioner("image.jpg")
```

**image-text-to-text**: Prompt vision-language models (VLMs) with an image and text

```python
vlm = pipeline("image-text-to-text")
result = vlm(images="image.jpg", text="Describe this image in detail")
```
## Pipeline Parameters

### Common Parameters

**model**: Model identifier or path

```python
pipe = pipeline("task", model="model-id")
```

**device**: GPU device index (-1 for CPU, 0+ for GPU, or "cuda:0")

```python
pipe = pipeline("task", device=0)  # Use first GPU
```

**device_map**: Automatic device allocation for large models

```python
pipe = pipeline("task", model="large-model", device_map="auto")
```

**torch_dtype**: Model precision (reduces memory)

```python
import torch
pipe = pipeline("task", torch_dtype=torch.float16)
```

**batch_size**: Process multiple inputs at once

```python
pipe = pipeline("task", batch_size=8)
results = pipe(["text1", "text2", "text3"])
```

**framework**: Choose PyTorch or TensorFlow

```python
pipe = pipeline("task", framework="pt")  # or "tf"
```

Options can be combined:

```python
# GPU with half precision
pipe = pipeline("text-generation", model="gpt2", device=0, torch_dtype=torch.float16)

# Batch processing
pipe(["text 1", "text 2", "text 3"], batch_size=8)
```

### Task-Specific Parameters

Each pipeline accepts task-specific parameters in the call:

```python
# Text generation
generator("prompt", max_length=100, temperature=0.7, top_p=0.9, num_return_sequences=3)

# Summarization
summarizer("text", max_length=130, min_length=30, do_sample=False)

# Translation
translator("text", max_length=512, num_beams=4)
```

## Batch Processing

Process multiple inputs efficiently:

```python
classifier = pipeline("text-classification")
texts = ["Great product!", "Terrible experience", "Just okay"]
results = classifier(texts)
```

For large datasets, use generators or KeyDataset:

```python
from transformers.pipelines.pt_utils import KeyDataset
import datasets

dataset = datasets.load_dataset("dataset-name", split="test")
pipe = pipeline("task", device=0)

for output in pipe(KeyDataset(dataset, "text")):
    print(output)
```
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
### GPU Acceleration
|
||||
|
||||
Always specify device for GPU usage:
|
||||
```python
|
||||
pipe = pipeline("task", device=0)
|
||||
```
|
||||
|
||||
### Mixed Precision
|
||||
|
||||
Use float16 for 2x speedup on supported GPUs:
|
||||
```python
|
||||
import torch
|
||||
pipe = pipeline("task", torch_dtype=torch.float16, device=0)
|
||||
```
|
||||
|
||||
### Batching Guidelines
|
||||
|
||||
- **CPU**: Usually skip batching
|
||||
- **GPU with variable lengths**: May reduce efficiency
|
||||
- **GPU with similar lengths**: Significant speedup
|
||||
- **Real-time applications**: Skip batching (increases latency)
|
||||
|
||||
```python
|
||||
# Good for throughput
|
||||
pipe = pipeline("task", batch_size=32, device=0)
|
||||
results = pipe(list_of_texts)
|
||||
```
|
||||
|
||||
### Streaming Output
|
||||
|
||||
For text generation, stream tokens as they're generated:
|
||||
|
||||
```python
from transformers import TextStreamer

generator = pipeline("text-generation", model="gpt2")
streamer = TextStreamer(generator.tokenizer)

# Tokens are printed to stdout as they are generated
generator("The future of AI", max_length=100, streamer=streamer)
```
|
||||
|
||||
## Custom Pipeline Configuration
|
||||
|
||||
Specify tokenizer and model separately:
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("model-id")
|
||||
model = AutoModelForSequenceClassification.from_pretrained("model-id")
|
||||
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
|
||||
```
|
||||
|
||||
Use custom pipeline classes:
|
||||
|
||||
```python
|
||||
from transformers import TextClassificationPipeline
|
||||
|
||||
class CustomPipeline(TextClassificationPipeline):
|
||||
def postprocess(self, model_outputs, **kwargs):
|
||||
# Custom post-processing
|
||||
return super().postprocess(model_outputs, **kwargs)
|
||||
|
||||
pipe = pipeline("text-classification", model="model-id", pipeline_class=CustomPipeline)
|
||||
```
|
||||
|
||||
## Input Formats
|
||||
|
||||
Pipelines accept various input types:
|
||||
|
||||
**Text tasks**: Strings or lists of strings
|
||||
```python
|
||||
pipe("single text")
|
||||
pipe(["text1", "text2"])
|
||||
```
|
||||
|
||||
**Image tasks**: URLs, file paths, PIL Images, or numpy arrays
|
||||
```python
|
||||
pipe("https://example.com/image.jpg")
|
||||
pipe("local/path/image.png")
|
||||
pipe(PIL.Image.open("image.jpg"))
|
||||
pipe(numpy_array)
|
||||
```
|
||||
|
||||
**Audio tasks**: File paths, numpy arrays, or raw waveforms
|
||||
```python
|
||||
pipe("audio.mp3")
|
||||
pipe(audio_array)
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
Handle common issues:
|
||||
|
||||
```python
|
||||
try:
|
||||
result = pipe(input_data)
|
||||
except Exception as e:
|
||||
if "CUDA out of memory" in str(e):
|
||||
# Reduce batch size or use CPU
|
||||
pipe = pipeline("task", device=-1)
|
||||
elif "does not appear to have a file named" in str(e):
|
||||
# Model not found
|
||||
print("Check model identifier")
|
||||
else:
|
||||
raise
|
||||
```
|
||||
|
||||
## Best Practices

1. **Use pipelines for prototyping**: Fast iteration without boilerplate
2. **Reuse pipelines**: Create once, use multiple times for efficiency
3. **Specify models explicitly**: Default models may change; choose task-specific models for best results
4. **Enable GPU when available**: Set `device=0` for a significant speedup
5. **Use batching for throughput**: When processing many inputs
6. **Manage memory**: Use `torch_dtype=torch.float16` or smaller models for large batches
7. **Cache models locally**: Avoid repeated downloads
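A minimal sketch that puts several of these practices together; the model name, device, and batch size are illustrative choices, not requirements:

```python
import torch
from transformers import pipeline

# Create once with an explicit model, GPU placement, and half precision
classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=0,
    torch_dtype=torch.float16,
)

# Reuse the same pipeline object for batched inference
texts = ["Great product!", "Terrible experience", "Just okay"]
results = classifier(texts, batch_size=8)
```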
|
||||
|
||||
|
||||
# Common Task Patterns
|
||||
|
||||
This document provides common patterns and workflows for typical tasks using Transformers.
|
||||
|
||||
## Text Classification
|
||||
|
||||
### Binary or Multi-class Classification
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSequenceClassification,
|
||||
TrainingArguments,
|
||||
Trainer
|
||||
)
|
||||
from datasets import load_dataset
|
||||
import evaluate
|
||||
import numpy as np
|
||||
|
||||
# Load dataset
|
||||
dataset = load_dataset("imdb")
|
||||
|
||||
# Tokenize
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
def tokenize_function(examples):
|
||||
return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
|
||||
|
||||
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
||||
|
||||
# Load model
|
||||
id2label = {0: "negative", 1: "positive"}
|
||||
label2id = {"negative": 0, "positive": 1}
|
||||
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
"bert-base-uncased",
|
||||
num_labels=2,
|
||||
id2label=id2label,
|
||||
label2id=label2id
|
||||
)
|
||||
|
||||
# Metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
def compute_metrics(eval_pred):
|
||||
logits, labels = eval_pred
|
||||
predictions = np.argmax(logits, axis=-1)
|
||||
return metric.compute(predictions=predictions, references=labels)
|
||||
|
||||
# Train
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
eval_strategy="epoch",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=16,
|
||||
per_device_eval_batch_size=64,
|
||||
num_train_epochs=3,
|
||||
weight_decay=0.01,
|
||||
load_best_model_at_end=True,
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
eval_dataset=tokenized_datasets["test"],
|
||||
compute_metrics=compute_metrics,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# Inference
|
||||
text = "This movie was fantastic!"
|
||||
inputs = tokenizer(text, return_tensors="pt")
|
||||
outputs = model(**inputs)
|
||||
predictions = outputs.logits.argmax(-1)
|
||||
print(id2label[predictions.item()])
|
||||
```
|
||||
|
||||
## Named Entity Recognition (Token Classification)
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForTokenClassification,
|
||||
TrainingArguments,
|
||||
Trainer,
|
||||
DataCollatorForTokenClassification
|
||||
)
|
||||
from datasets import load_dataset
|
||||
import evaluate
|
||||
import numpy as np
|
||||
|
||||
# Load dataset
|
||||
dataset = load_dataset("conll2003")
|
||||
|
||||
# Tokenize (align labels with tokenized words)
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
def tokenize_and_align_labels(examples):
|
||||
tokenized_inputs = tokenizer(
|
||||
examples["tokens"],
|
||||
truncation=True,
|
||||
is_split_into_words=True
|
||||
)
|
||||
|
||||
labels = []
|
||||
for i, label in enumerate(examples["ner_tags"]):
|
||||
word_ids = tokenized_inputs.word_ids(batch_index=i)
|
||||
label_ids = []
|
||||
previous_word_idx = None
|
||||
for word_idx in word_ids:
|
||||
if word_idx is None:
|
||||
label_ids.append(-100)
|
||||
elif word_idx != previous_word_idx:
|
||||
label_ids.append(label[word_idx])
|
||||
else:
|
||||
label_ids.append(-100)
|
||||
previous_word_idx = word_idx
|
||||
labels.append(label_ids)
|
||||
|
||||
tokenized_inputs["labels"] = labels
|
||||
return tokenized_inputs
|
||||
|
||||
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
|
||||
|
||||
# Model
|
||||
label_list = dataset["train"].features["ner_tags"].feature.names
|
||||
model = AutoModelForTokenClassification.from_pretrained(
|
||||
"bert-base-uncased",
|
||||
num_labels=len(label_list)
|
||||
)
|
||||
|
||||
# Data collator
|
||||
data_collator = DataCollatorForTokenClassification(tokenizer)
|
||||
|
||||
# Metrics
|
||||
metric = evaluate.load("seqeval")
|
||||
|
||||
def compute_metrics(eval_pred):
|
||||
predictions, labels = eval_pred
|
||||
predictions = np.argmax(predictions, axis=2)
|
||||
|
||||
true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
|
||||
true_predictions = [
|
||||
[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
|
||||
for prediction, label in zip(predictions, labels)
|
||||
]
|
||||
|
||||
return metric.compute(predictions=true_predictions, references=true_labels)
|
||||
|
||||
# Train
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
eval_strategy="epoch",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=16,
|
||||
num_train_epochs=3,
|
||||
weight_decay=0.01,
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
eval_dataset=tokenized_datasets["validation"],
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
```
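The other patterns in this document end with an inference snippet; a comparable sketch for token classification, reusing the `model`, `tokenizer`, and `label_list` defined above, might look like this:

```python
import torch

# Inference (minimal sketch)
text = "Hugging Face is based in New York City"
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predictions = logits.argmax(-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
for token, pred in zip(tokens, predictions):
    print(token, label_list[pred])
```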
|
||||
|
||||
## Question Answering
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForQuestionAnswering,
|
||||
TrainingArguments,
|
||||
Trainer,
|
||||
DefaultDataCollator
|
||||
)
|
||||
from datasets import load_dataset
|
||||
|
||||
# Load dataset
|
||||
dataset = load_dataset("squad")
|
||||
|
||||
# Tokenize
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
def preprocess_function(examples):
|
||||
questions = [q.strip() for q in examples["question"]]
|
||||
inputs = tokenizer(
|
||||
questions,
|
||||
examples["context"],
|
||||
max_length=384,
|
||||
truncation="only_second",
|
||||
return_offsets_mapping=True,
|
||||
padding="max_length",
|
||||
)
|
||||
|
||||
offset_mapping = inputs.pop("offset_mapping")
|
||||
answers = examples["answers"]
|
||||
start_positions = []
|
||||
end_positions = []
|
||||
|
||||
for i, offset in enumerate(offset_mapping):
|
||||
answer = answers[i]
|
||||
start_char = answer["answer_start"][0]
|
||||
end_char = start_char + len(answer["text"][0])
|
||||
|
||||
# Find start and end token positions
|
||||
sequence_ids = inputs.sequence_ids(i)
|
||||
context_start = sequence_ids.index(1)
|
||||
context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)
|
||||
|
||||
if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
|
||||
start_positions.append(0)
|
||||
end_positions.append(0)
|
||||
else:
|
||||
idx = context_start
|
||||
while idx <= context_end and offset[idx][0] <= start_char:
|
||||
idx += 1
|
||||
start_positions.append(idx - 1)
|
||||
|
||||
idx = context_end
|
||||
while idx >= context_start and offset[idx][1] >= end_char:
|
||||
idx -= 1
|
||||
end_positions.append(idx + 1)
|
||||
|
||||
inputs["start_positions"] = start_positions
|
||||
inputs["end_positions"] = end_positions
|
||||
return inputs
|
||||
|
||||
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
|
||||
|
||||
# Model
|
||||
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")
|
||||
|
||||
# Train
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
eval_strategy="epoch",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=16,
|
||||
num_train_epochs=3,
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
eval_dataset=tokenized_datasets["validation"],
|
||||
data_collator=DefaultDataCollator(),
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# Inference
|
||||
question = "What is the capital of France?"
|
||||
context = "Paris is the capital and most populous city of France."
|
||||
inputs = tokenizer(question, context, return_tensors="pt")
|
||||
outputs = model(**inputs)
|
||||
|
||||
start_pos = outputs.start_logits.argmax()
|
||||
end_pos = outputs.end_logits.argmax()
|
||||
answer_tokens = inputs.input_ids[0][start_pos:end_pos+1]
|
||||
answer = tokenizer.decode(answer_tokens)
|
||||
```
|
||||
|
||||
## Text Summarization
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSeq2SeqLM,
|
||||
    Seq2SeqTrainingArguments,
|
||||
    Seq2SeqTrainer,
|
||||
DataCollatorForSeq2Seq
|
||||
)
|
||||
from datasets import load_dataset
|
||||
import evaluate
import numpy as np
|
||||
|
||||
# Load dataset
|
||||
dataset = load_dataset("cnn_dailymail", "3.0.0")
|
||||
|
||||
# Tokenize
|
||||
tokenizer = AutoTokenizer.from_pretrained("t5-small")
|
||||
|
||||
def preprocess_function(examples):
|
||||
inputs = ["summarize: " + doc for doc in examples["article"]]
|
||||
model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
|
||||
|
||||
labels = tokenizer(
|
||||
text_target=examples["highlights"],
|
||||
max_length=128,
|
||||
truncation=True
|
||||
)
|
||||
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
return model_inputs
|
||||
|
||||
tokenized_datasets = dataset.map(preprocess_function, batched=True)
|
||||
|
||||
# Model
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
|
||||
|
||||
# Data collator
|
||||
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
|
||||
|
||||
# Metrics
|
||||
rouge = evaluate.load("rouge")
|
||||
|
||||
def compute_metrics(eval_pred):
|
||||
predictions, labels = eval_pred
|
||||
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
|
||||
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
||||
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
|
||||
|
||||
result = rouge.compute(
|
||||
predictions=decoded_preds,
|
||||
references=decoded_labels,
|
||||
use_stemmer=True
|
||||
)
|
||||
|
||||
return {k: round(v, 4) for k, v in result.items()}
|
||||
|
||||
# Train
|
||||
training_args = Seq2SeqTrainingArguments(
|
||||
output_dir="./results",
|
||||
eval_strategy="epoch",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=8,
|
||||
per_device_eval_batch_size=8,
|
||||
num_train_epochs=3,
|
||||
predict_with_generate=True,
|
||||
)
|
||||
|
||||
trainer = Seq2SeqTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
eval_dataset=tokenized_datasets["validation"],
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# Inference
|
||||
text = "Long article text..."
|
||||
inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
|
||||
outputs = model.generate(**inputs, max_length=128, num_beams=4)
|
||||
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
```
|
||||
|
||||
## Translation
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForSeq2SeqLM,
|
||||
TrainingArguments,
|
||||
Trainer,
|
||||
DataCollatorForSeq2Seq
|
||||
)
|
||||
from datasets import load_dataset
|
||||
|
||||
# Load dataset
|
||||
dataset = load_dataset("wmt16", "de-en")
|
||||
|
||||
# Tokenize
|
||||
tokenizer = AutoTokenizer.from_pretrained("t5-small")
|
||||
|
||||
def preprocess_function(examples):
|
||||
inputs = [f"translate German to English: {de}" for de in examples["de"]]
|
||||
model_inputs = tokenizer(inputs, max_length=128, truncation=True)
|
||||
|
||||
labels = tokenizer(
|
||||
        text_target=[pair["en"] for pair in examples["translation"]],
|
||||
max_length=128,
|
||||
truncation=True
|
||||
)
|
||||
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
return model_inputs
|
||||
|
||||
tokenized_datasets = dataset.map(preprocess_function, batched=True)
|
||||
|
||||
# Model and training (similar to summarization)
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
|
||||
|
||||
# Inference
|
||||
text = "Guten Tag, wie geht es Ihnen?"
|
||||
inputs = tokenizer(f"translate German to English: {text}", return_tensors="pt")
|
||||
outputs = model.generate(**inputs, max_length=128)
|
||||
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
```
|
||||
|
||||
## Causal Language Modeling (Training from Scratch or Fine-tuning)
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForCausalLM,
|
||||
TrainingArguments,
|
||||
Trainer,
|
||||
DataCollatorForLanguageModeling
|
||||
)
|
||||
from datasets import load_dataset
|
||||
|
||||
# Load dataset
|
||||
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
|
||||
|
||||
# Tokenize
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
|
||||
def tokenize_function(examples):
|
||||
return tokenizer(examples["text"], truncation=True, max_length=512)
|
||||
|
||||
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
|
||||
|
||||
# Group texts into chunks
|
||||
block_size = 128
|
||||
|
||||
def group_texts(examples):
|
||||
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
|
||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||
total_length = (total_length // block_size) * block_size
|
||||
result = {
|
||||
k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
|
||||
for k, t in concatenated_examples.items()
|
||||
}
|
||||
result["labels"] = result["input_ids"].copy()
|
||||
return result
|
||||
|
||||
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
|
||||
|
||||
# Model
|
||||
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
|
||||
# Data collator
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
||||
|
||||
# Train
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
eval_strategy="epoch",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=8,
|
||||
num_train_epochs=3,
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=lm_datasets["train"],
|
||||
eval_dataset=lm_datasets["validation"],
|
||||
data_collator=data_collator,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
```
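A short inference sketch for the fine-tuned causal LM, reusing the `model` and `tokenizer` from above (the prompt is arbitrary):

```python
# Generate a continuation from the fine-tuned model
prompt = "The history of artificial intelligence"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```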
|
||||
|
||||
## Image Classification
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoImageProcessor,
|
||||
AutoModelForImageClassification,
|
||||
TrainingArguments,
|
||||
Trainer
|
||||
)
|
||||
from datasets import load_dataset
|
||||
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
|
||||
import numpy as np
|
||||
import evaluate
|
||||
|
||||
# Load dataset
|
||||
dataset = load_dataset("food101", split="train[:5000]")
|
||||
|
||||
# Prepare image transforms
|
||||
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
|
||||
|
||||
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
|
||||
size = image_processor.size["height"]
|
||||
|
||||
transforms = Compose([
|
||||
Resize((size, size)),
|
||||
ToTensor(),
|
||||
normalize,
|
||||
])
|
||||
|
||||
def preprocess_function(examples):
|
||||
examples["pixel_values"] = [transforms(img.convert("RGB")) for img in examples["image"]]
|
||||
return examples
|
||||
|
||||
dataset = dataset.with_transform(preprocess_function)
|
||||
|
||||
# Model
|
||||
model = AutoModelForImageClassification.from_pretrained(
|
||||
"google/vit-base-patch16-224",
|
||||
num_labels=len(dataset["train"].features["label"].names),
|
||||
ignore_mismatched_sizes=True
|
||||
)
|
||||
|
||||
# Metrics
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
def compute_metrics(eval_pred):
|
||||
predictions = np.argmax(eval_pred.predictions, axis=1)
|
||||
return metric.compute(predictions=predictions, references=eval_pred.label_ids)
|
||||
|
||||
# Train
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
eval_strategy="epoch",
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=16,
|
||||
num_train_epochs=3,
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=dataset["train"],
|
||||
    eval_dataset=dataset["test"],
|
||||
compute_metrics=compute_metrics,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
```
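An inference sketch for the fine-tuned image classifier, reusing the `model` and `image_processor` from above; note that `id2label` only contains meaningful names if it was set when the model was created:

```python
import torch
from PIL import Image

image = Image.open("food.jpg").convert("RGB")
inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class = logits.argmax(-1).item()
print(model.config.id2label[predicted_class])
```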
|
||||
|
||||
## Vision-Language Tasks (Image Captioning)
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
AutoModelForVision2Seq,
|
||||
TrainingArguments,
|
||||
Trainer
|
||||
)
|
||||
from datasets import load_dataset
|
||||
from PIL import Image
|
||||
|
||||
# Load dataset
|
||||
dataset = load_dataset("ybelkada/football-dataset")
|
||||
|
||||
# Processor
|
||||
processor = AutoProcessor.from_pretrained("microsoft/git-base")
|
||||
|
||||
def preprocess_function(examples):
|
||||
images = [Image.open(img).convert("RGB") for img in examples["image"]]
|
||||
texts = examples["caption"]
|
||||
|
||||
inputs = processor(images=images, text=texts, padding="max_length", truncation=True)
|
||||
inputs["labels"] = inputs["input_ids"]
|
||||
return inputs
|
||||
|
||||
dataset = dataset.map(preprocess_function, batched=True)
|
||||
|
||||
# Model
|
||||
model = AutoModelForVision2Seq.from_pretrained("microsoft/git-base")
|
||||
|
||||
# Train
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
eval_strategy="epoch",
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=8,
|
||||
num_train_epochs=3,
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=dataset["train"],
|
||||
eval_dataset=dataset["test"],
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# Inference
|
||||
image = Image.open("image.jpg")
|
||||
inputs = processor(images=image, return_tensors="pt")
|
||||
outputs = model.generate(**inputs)
|
||||
caption = processor.decode(outputs[0], skip_special_tokens=True)
|
||||
```
|
||||
|
||||
## Best Practices Summary
|
||||
|
||||
1. **Use appropriate Auto* classes**: AutoTokenizer, AutoModel, etc. for model loading
|
||||
2. **Proper preprocessing**: Tokenize, align labels, handle special cases
|
||||
3. **Data collators**: Use appropriate collators for dynamic padding
|
||||
4. **Metrics**: Load and compute relevant metrics for evaluation
|
||||
5. **Training arguments**: Configure properly for task and hardware
|
||||
6. **Inference**: Use pipeline() for quick inference, or manual tokenization for custom needs
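As a quick illustration of point 6, here is a minimal sketch contrasting the two inference styles, reusing a fine-tuned `model` and `tokenizer` such as the ones from the text classification pattern above:

```python
import torch
from transformers import pipeline

# Quick inference: wrap the fine-tuned model and tokenizer in a pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("This movie was fantastic!"))

# Manual tokenization: full control over inputs and outputs
inputs = tokenizer("This movie was fantastic!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.logits.argmax(-1).item())
```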
|
||||
|
||||
# Tokenizers
|
||||
|
||||
## Overview
|
||||
|
||||
Tokenizers convert text into numerical representations (tokens) that models can process. They handle special tokens, padding, truncation, and attention masks.
|
||||
|
||||
## Loading Tokenizers
|
||||
|
||||
### AutoTokenizer
|
||||
|
||||
Automatically load the correct tokenizer for a model:
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
```
|
||||
|
||||
Load from local path:
|
||||
```python
|
||||
tokenizer = AutoTokenizer.from_pretrained("./local/tokenizer/path")
|
||||
```
|
||||
|
||||
## Basic Tokenization
|
||||
|
||||
### Encode Text
|
||||
|
||||
```python
|
||||
# Simple encoding
|
||||
text = "Hello, how are you?"
|
||||
tokens = tokenizer.encode(text)
|
||||
print(tokens) # [101, 7592, 1010, 2129, 2024, 2017, 1029, 102]
|
||||
|
||||
# With text tokenization
|
||||
tokens = tokenizer.tokenize(text)
|
||||
print(tokens) # ['hello', ',', 'how', 'are', 'you', '?']
|
||||
```
|
||||
|
||||
### Decode Tokens
|
||||
|
||||
```python
|
||||
token_ids = [101, 7592, 1010, 2129, 2024, 2017, 1029, 102]
|
||||
text = tokenizer.decode(token_ids)
|
||||
print(text)  # "[CLS] hello, how are you? [SEP]"
|
||||
|
||||
# Skip special tokens
|
||||
text = tokenizer.decode(token_ids, skip_special_tokens=True)
|
||||
print(text) # "hello, how are you?"
|
||||
```
|
||||
|
||||
## The `__call__` Method
|
||||
|
||||
Primary tokenization interface:
|
||||
|
||||
```python
|
||||
# Single text
|
||||
inputs = tokenizer("Hello, how are you?")
|
||||
|
||||
# Returns dictionary with input_ids, attention_mask
|
||||
print(inputs)
|
||||
# {
|
||||
# 'input_ids': [101, 7592, 1010, 2129, 2024, 2017, 1029, 102],
|
||||
# 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]
|
||||
# }
|
||||
```
|
||||
|
||||
Multiple texts:
|
||||
```python
|
||||
texts = ["Hello", "How are you?"]
|
||||
inputs = tokenizer(texts, padding=True, truncation=True)
|
||||
```
|
||||
|
||||
## Key Parameters
|
||||
|
||||
### Return Tensors
|
||||
|
||||
**return_tensors**: Output format ("pt", "tf", "np")
|
||||
```python
|
||||
# PyTorch tensors
|
||||
inputs = tokenizer("text", return_tensors="pt")
|
||||
|
||||
# TensorFlow tensors
|
||||
inputs = tokenizer("text", return_tensors="tf")
|
||||
|
||||
# NumPy arrays
|
||||
inputs = tokenizer("text", return_tensors="np")
|
||||
```
|
||||
|
||||
### Padding
|
||||
|
||||
**padding**: Pad sequences to same length
|
||||
```python
|
||||
# Pad to longest sequence in batch
|
||||
inputs = tokenizer(texts, padding=True)
|
||||
|
||||
# Pad to specific length
|
||||
inputs = tokenizer(texts, padding="max_length", max_length=128)
|
||||
|
||||
# No padding
|
||||
inputs = tokenizer(texts, padding=False)
|
||||
```
|
||||
|
||||
**pad_to_multiple_of**: Pad to multiple of specified value
|
||||
```python
|
||||
inputs = tokenizer(texts, padding=True, pad_to_multiple_of=8)
|
||||
```
|
||||
|
||||
### Truncation
|
||||
|
||||
**truncation**: Limit sequence length
|
||||
```python
|
||||
# Truncate to max_length
|
||||
inputs = tokenizer(text, truncation=True, max_length=512)
|
||||
|
||||
# Truncate first sequence in pairs
|
||||
inputs = tokenizer(text1, text2, truncation="only_first")
|
||||
|
||||
# Truncate second sequence
|
||||
inputs = tokenizer(text1, text2, truncation="only_second")
|
||||
|
||||
# Truncate longest first (default for pairs)
|
||||
inputs = tokenizer(text1, text2, truncation="longest_first", max_length=512)
|
||||
```
|
||||
|
||||
### Max Length
|
||||
|
||||
**max_length**: Maximum sequence length
|
||||
```python
|
||||
inputs = tokenizer(text, max_length=512, truncation=True)
|
||||
```
|
||||
|
||||
### Additional Outputs
|
||||
|
||||
**return_attention_mask**: Include attention mask (default True)
|
||||
```python
|
||||
inputs = tokenizer(text, return_attention_mask=True)
|
||||
```
|
||||
|
||||
**return_token_type_ids**: Segment IDs for sentence pairs
|
||||
```python
|
||||
inputs = tokenizer(text1, text2, return_token_type_ids=True)
|
||||
```
|
||||
|
||||
**return_offsets_mapping**: Character position mapping (Fast tokenizers only)
|
||||
```python
|
||||
inputs = tokenizer(text, return_offsets_mapping=True)
|
||||
```
|
||||
|
||||
**return_length**: Include sequence lengths
|
||||
```python
|
||||
inputs = tokenizer(texts, padding=True, return_length=True)
|
||||
```
|
||||
|
||||
## Special Tokens
|
||||
|
||||
### Predefined Special Tokens
|
||||
|
||||
Access special tokens:
|
||||
```python
|
||||
print(tokenizer.cls_token) # [CLS] or <s>
|
||||
print(tokenizer.sep_token) # [SEP] or </s>
|
||||
print(tokenizer.pad_token) # [PAD]
|
||||
print(tokenizer.unk_token) # [UNK]
|
||||
print(tokenizer.mask_token) # [MASK]
|
||||
print(tokenizer.eos_token) # End of sequence
|
||||
print(tokenizer.bos_token) # Beginning of sequence
|
||||
|
||||
# Get IDs
|
||||
print(tokenizer.cls_token_id)
|
||||
print(tokenizer.sep_token_id)
|
||||
```
|
||||
|
||||
### Add Special Tokens
|
||||
|
||||
Manual control:
|
||||
```python
|
||||
# Automatically add special tokens (default True)
|
||||
inputs = tokenizer(text, add_special_tokens=True)
|
||||
|
||||
# Skip special tokens
|
||||
inputs = tokenizer(text, add_special_tokens=False)
|
||||
```
|
||||
|
||||
### Custom Special Tokens
|
||||
|
||||
```python
|
||||
special_tokens_dict = {
|
||||
"additional_special_tokens": ["<CUSTOM>", "<SPECIAL>"]
|
||||
}
|
||||
|
||||
num_added = tokenizer.add_special_tokens(special_tokens_dict)
|
||||
print(f"Added {num_added} tokens")
|
||||
|
||||
# Resize model embeddings after adding tokens
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
```
|
||||
|
||||
## Sentence Pairs
|
||||
|
||||
Tokenize text pairs:
|
||||
|
||||
```python
|
||||
text1 = "What is the capital of France?"
|
||||
text2 = "Paris is the capital of France."
|
||||
|
||||
# Automatically handles separation
|
||||
inputs = tokenizer(text1, text2, padding=True, truncation=True)
|
||||
|
||||
# Results in: [CLS] text1 [SEP] text2 [SEP]
|
||||
```
|
||||
|
||||
## Batch Encoding
|
||||
|
||||
Process multiple texts:
|
||||
|
||||
```python
|
||||
texts = ["First text", "Second text", "Third text"]
|
||||
|
||||
# Basic batch encoding
|
||||
batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
|
||||
|
||||
# Access individual encodings
|
||||
for i in range(len(texts)):
|
||||
input_ids = batch["input_ids"][i]
|
||||
attention_mask = batch["attention_mask"][i]
|
||||
```
|
||||
|
||||
## Fast Tokenizers
|
||||
|
||||
Use Rust-based tokenizers for speed:
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# Automatically loads Fast version if available
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
# Check if Fast
|
||||
print(tokenizer.is_fast) # True
|
||||
|
||||
# Force Fast tokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
|
||||
|
||||
# Force slow (Python) tokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
|
||||
```
|
||||
|
||||
### Fast Tokenizer Features
|
||||
|
||||
**Offset mapping** (character positions):
|
||||
```python
|
||||
inputs = tokenizer("Hello world", return_offsets_mapping=True)
|
||||
print(inputs["offset_mapping"])
|
||||
# [(0, 0), (0, 5), (6, 11), (0, 0)] # [CLS], "Hello", "world", [SEP]
|
||||
```
|
||||
|
||||
**Token to word mapping**:
|
||||
```python
|
||||
encoding = tokenizer("Hello world")
|
||||
word_ids = encoding.word_ids()
|
||||
print(word_ids) # [None, 0, 1, None] # [CLS]=None, "Hello"=0, "world"=1, [SEP]=None
|
||||
```
|
||||
|
||||
## Saving Tokenizers
|
||||
|
||||
Save locally:
|
||||
```python
|
||||
tokenizer.save_pretrained("./my_tokenizer")
|
||||
```
|
||||
|
||||
Push to Hub:
|
||||
```python
|
||||
tokenizer.push_to_hub("username/my-tokenizer")
|
||||
```
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Vocabulary
|
||||
|
||||
Access vocabulary:
|
||||
```python
|
||||
vocab = tokenizer.get_vocab()
|
||||
vocab_size = len(vocab)
|
||||
|
||||
# Get token for ID
|
||||
token = tokenizer.convert_ids_to_tokens(100)
|
||||
|
||||
# Get ID for token
|
||||
token_id = tokenizer.convert_tokens_to_ids("hello")
|
||||
```
|
||||
|
||||
### Encoding Details
|
||||
|
||||
Get detailed encoding information:
|
||||
|
||||
```python
|
||||
encoding = tokenizer("Hello world", return_tensors="pt")
|
||||
|
||||
# Original methods still available
|
||||
tokens = encoding.tokens()
|
||||
word_ids = encoding.word_ids()
|
||||
sequence_ids = encoding.sequence_ids()
|
||||
```
|
||||
|
||||
### Custom Preprocessing
|
||||
|
||||
Subclass for custom behavior:
|
||||
|
||||
```python
|
||||
from transformers import BertTokenizerFast

class CustomTokenizer(BertTokenizerFast):
    def __call__(self, text, **kwargs):
        # Custom preprocessing before tokenization
        text = text.lower().strip()
        return super().__call__(text, **kwargs)
|
||||
```
|
||||
|
||||
## Chat Templates
|
||||
|
||||
For conversational models:
|
||||
|
||||
```python
|
||||
messages = [
|
||||
{"role": "system", "content": "You are helpful."},
|
||||
{"role": "user", "content": "Hello!"},
|
||||
{"role": "assistant", "content": "Hi there!"},
|
||||
{"role": "user", "content": "How are you?"}
|
||||
]
|
||||
|
||||
# Apply chat template
|
||||
text = tokenizer.apply_chat_template(messages, tokenize=False)
|
||||
print(text)
|
||||
|
||||
# Tokenize directly
|
||||
inputs = tokenizer.apply_chat_template(messages, tokenize=True, return_tensors="pt")
|
||||
```
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Pattern 1: Simple Text Classification
|
||||
|
||||
```python
|
||||
texts = ["I love this!", "I hate this!"]
|
||||
labels = [1, 0]
|
||||
|
||||
inputs = tokenizer(
|
||||
texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
max_length=512,
|
||||
return_tensors="pt"
|
||||
)
|
||||
|
||||
# Use with model
|
||||
outputs = model(**inputs, labels=torch.tensor(labels))
|
||||
```
|
||||
|
||||
### Pattern 2: Question Answering
|
||||
|
||||
```python
|
||||
question = "What is the capital?"
|
||||
context = "Paris is the capital of France."
|
||||
|
||||
inputs = tokenizer(
|
||||
question,
|
||||
context,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
max_length=384,
|
||||
return_tensors="pt"
|
||||
)
|
||||
```
|
||||
|
||||
### Pattern 3: Text Generation
|
||||
|
||||
```python
|
||||
prompt = "Once upon a time"
|
||||
|
||||
inputs = tokenizer(prompt, return_tensors="pt")
|
||||
|
||||
# Generate
|
||||
outputs = model.generate(
|
||||
inputs["input_ids"],
|
||||
max_new_tokens=50,
|
||||
pad_token_id=tokenizer.eos_token_id
|
||||
)
|
||||
|
||||
# Decode
|
||||
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
```
|
||||
|
||||
### Pattern 4: Dataset Tokenization
|
||||
|
||||
```python
|
||||
def tokenize_function(examples):
|
||||
return tokenizer(
|
||||
examples["text"],
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=512
|
||||
)
|
||||
|
||||
# Apply to dataset
|
||||
tokenized_dataset = dataset.map(tokenize_function, batched=True)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Always specify return_tensors**: For model input
|
||||
2. **Use padding and truncation**: For batch processing
|
||||
3. **Set max_length explicitly**: Prevent memory issues
|
||||
4. **Use Fast tokenizers**: When available for speed
|
||||
5. **Handle pad_token**: Set to eos_token if None for generation
|
||||
6. **Add special tokens**: Leave enabled (default) unless specific reason
|
||||
7. **Resize embeddings**: After adding custom tokens
|
||||
8. **Decode with skip_special_tokens**: For cleaner output
|
||||
9. **Use batched processing**: For efficiency with datasets
|
||||
10. **Save tokenizer with model**: Ensure compatibility
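A short sketch that combines several of these practices (pad-token fallback, explicit truncation, tensor output, and clean decoding); GPT-2 is just an example checkpoint:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# GPT-2 has no pad token; fall back to EOS for batched inputs
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

batch = tokenizer(
    ["Once upon a time", "In a galaxy far away"],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
print(tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True))
```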
|
||||
|
||||
## Common Issues
|
||||
|
||||
**Padding token not set:**
|
||||
```python
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
```
|
||||
|
||||
**Sequence too long:**
|
||||
```python
|
||||
# Enable truncation
|
||||
inputs = tokenizer(text, truncation=True, max_length=512)
|
||||
```
|
||||
|
||||
**Mismatched vocabulary:**
|
||||
```python
|
||||
# Always load tokenizer and model from same checkpoint
|
||||
tokenizer = AutoTokenizer.from_pretrained("model-id")
|
||||
model = AutoModel.from_pretrained("model-id")
|
||||
```
|
||||
|
||||
**Attention mask issues:**
|
||||
```python
|
||||
# Ensure attention_mask is passed
|
||||
outputs = model(
|
||||
input_ids=inputs["input_ids"],
|
||||
attention_mask=inputs["attention_mask"]
|
||||
)
|
||||
```
|
||||
|
||||
# Training and Fine-Tuning

## Overview

Fine-tune pre-trained models on custom datasets using the `Trainer` API. The Trainer handles training loops, gradient accumulation, mixed precision, logging, and checkpointing, and also supports distributed training and advanced optimization techniques.

## Basic Fine-Tuning Workflow

### Step 1: Load and Preprocess Data
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
Trainer,
|
||||
TrainingArguments
|
||||
)
|
||||
from datasets import load_dataset
|
||||
|
||||
# Load dataset
dataset = load_dataset("yelp_review_full")
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
|
||||
|
||||
# Tokenize
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
|
||||
```
|
||||
|
||||
### Step 2: Load Model

```python
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=5  # yelp_review_full has 5 rating classes
)
```
|
||||
|
||||
## TrainingArguments Configuration
|
||||
|
||||
### Essential Parameters
|
||||
|
||||
**Output and Logging:**
|
||||
- `output_dir`: Directory for checkpoints and outputs (required)
|
||||
- `logging_dir`: TensorBoard log directory (default: `{output_dir}/runs`)
|
||||
- `logging_steps`: Log every N steps (default: 500)
|
||||
- `logging_strategy`: "steps" or "epoch"
|
||||
|
||||
**Training Duration:**
|
||||
- `num_train_epochs`: Number of epochs (default: 3.0)
|
||||
- `max_steps`: Max training steps (overrides num_train_epochs if set)
|
||||
|
||||
**Batch Size and Gradient Accumulation:**
|
||||
- `per_device_train_batch_size`: Batch size per device (default: 8)
|
||||
- `per_device_eval_batch_size`: Eval batch size per device (default: 8)
|
||||
- `gradient_accumulation_steps`: Accumulate gradients over N steps (default: 1)
|
||||
- Effective batch size = `per_device_train_batch_size * gradient_accumulation_steps * num_gpus`
|
||||
|
||||
**Learning Rate:**
|
||||
- `learning_rate`: Peak learning rate (default: 5e-5)
|
||||
- `lr_scheduler_type`: Scheduler type ("linear", "cosine", "constant", etc.)
|
||||
- `warmup_steps`: Warmup steps (default: 0)
|
||||
- `warmup_ratio`: Warmup as fraction of total steps
|
||||
|
||||
**Evaluation:**
|
||||
- `eval_strategy`: "no", "steps", or "epoch" (default: "no")
|
||||
- `eval_steps`: Evaluate every N steps (if eval_strategy="steps")
|
||||
- `eval_delay`: Delay evaluation until N steps
|
||||
|
||||
**Checkpointing:**
|
||||
- `save_strategy`: "no", "steps", or "epoch" (default: "steps")
|
||||
- `save_steps`: Save checkpoint every N steps (default: 500)
|
||||
- `save_total_limit`: Keep only N most recent checkpoints
|
||||
- `load_best_model_at_end`: Load best checkpoint at end (default: False)
|
||||
- `metric_for_best_model`: Metric to determine best model
|
||||
|
||||
**Optimization:**
|
||||
- `optim`: Optimizer ("adamw_torch", "adamw_hf", "sgd", etc.)
|
||||
- `weight_decay`: Weight decay coefficient (default: 0.0)
|
||||
- `adam_beta1`, `adam_beta2`: Adam optimizer betas
|
||||
- `adam_epsilon`: Epsilon for Adam (default: 1e-8)
|
||||
- `max_grad_norm`: Max gradient norm for clipping (default: 1.0)
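A sketch that pulls a number of these arguments into one configuration; the specific values are illustrative starting points rather than recommendations:

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,  # effective batch size = 16 * 2 * num_gpus
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=1.0,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=100,
)
```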
|
||||
|
||||
### Mixed Precision Training
|
||||
|
||||
```python
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
fp16=True, # Use fp16 on NVIDIA GPUs
|
||||
fp16_opt_level="O1", # O0, O1, O2, O3 (Apex levels)
|
||||
# or
|
||||
bf16=True, # Use bf16 on Ampere+ GPUs (better than fp16)
|
||||
|
||||
)
|
||||
```
|
||||
|
||||
### Distributed Training
|
||||
|
||||
**DataParallel (single-node multi-GPU):**
|
||||
```python
|
||||
# Automatic with multiple GPUs
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
per_device_train_batch_size=16, # Per GPU
|
||||
)
|
||||
# Run: python script.py
|
||||
```
|
||||
|
||||
**DistributedDataParallel (multi-node or multi-GPU):**
|
||||
```bash
|
||||
# Single node, multiple GPUs
|
||||
python -m torch.distributed.launch --nproc_per_node=4 script.py
|
||||
|
||||
# Or use accelerate
|
||||
accelerate config
|
||||
accelerate launch script.py
|
||||
```
|
||||
|
||||
**DeepSpeed Integration:**
|
||||
```python
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
deepspeed="ds_config.json", # DeepSpeed config file
|
||||
)
|
||||
```
|
||||
|
||||
### Advanced Features
|
||||
|
||||
**Gradient Checkpointing (reduce memory):**
|
||||
```python
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
gradient_checkpointing=True,
|
||||
)
|
||||
```
|
||||
|
||||
**Compilation with torch.compile:**
|
||||
```python
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
torch_compile=True,
|
||||
torch_compile_backend="inductor", # or "cudagraphs"
|
||||
)
|
||||
```
|
||||
|
||||
**Push to Hub:**
|
||||
```python
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
push_to_hub=True,
|
||||
hub_model_id="username/model-name",
|
||||
hub_strategy="every_save", # or "end"
|
||||
)
|
||||
```
|
||||
|
||||
### Step 3: Define Metrics

```python
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
```
|
||||
|
||||
### Step 4: Configure Training
|
||||
|
||||
```python
|
||||
from transformers import TrainingArguments
|
||||
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
eval_strategy="epoch",
|
||||
save_strategy="epoch",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=8,
|
||||
per_device_eval_batch_size=8,
|
||||
num_train_epochs=3,
|
||||
weight_decay=0.01,
|
||||
logging_dir="./logs",
|
||||
logging_steps=10,
|
||||
load_best_model_at_end=True,
|
||||
metric_for_best_model="accuracy",
|
||||
)
|
||||
```
|
||||
|
||||
### Step 5: Create Trainer and Train
|
||||
|
||||
```python
|
||||
from transformers import Trainer
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
compute_metrics=compute_metrics,
|
||||
)
|
||||
|
||||
# Start training
|
||||
trainer.train()
|
||||
|
||||
# Evaluate
|
||||
results = trainer.evaluate()
|
||||
print(results)
|
||||
```
|
||||
|
||||
### Step 6: Save Model

```python
trainer.save_model("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Or push to Hub
trainer.push_to_hub("username/my-finetuned-model")
```
|
||||
|
||||
|
||||
## TrainingArguments Parameters
|
||||
|
||||
### Essential Parameters
|
||||
|
||||
**output_dir**: Directory for checkpoints and logs
|
||||
```python
|
||||
output_dir="./results"
|
||||
```
|
||||
|
||||
**num_train_epochs**: Number of training epochs
|
||||
```python
|
||||
num_train_epochs=3
|
||||
```
|
||||
|
||||
**per_device_train_batch_size**: Batch size per GPU/CPU
|
||||
```python
|
||||
per_device_train_batch_size=8
|
||||
```
|
||||
|
||||
**learning_rate**: Optimizer learning rate
|
||||
```python
|
||||
learning_rate=2e-5 # Common for BERT-style models
|
||||
learning_rate=5e-5 # Common for smaller models
|
||||
```
|
||||
|
||||
**weight_decay**: L2 regularization
|
||||
```python
|
||||
weight_decay=0.01
|
||||
```
|
||||
|
||||
### Evaluation and Saving
|
||||
|
||||
**eval_strategy**: When to evaluate ("no", "steps", "epoch")
|
||||
```python
|
||||
eval_strategy="epoch" # Evaluate after each epoch
|
||||
eval_strategy="steps" # Evaluate every eval_steps
|
||||
```
|
||||
|
||||
**save_strategy**: When to save checkpoints
|
||||
```python
|
||||
save_strategy="epoch"
|
||||
save_strategy="steps"
|
||||
save_steps=500
|
||||
```
|
||||
|
||||
**load_best_model_at_end**: Load best checkpoint after training
|
||||
```python
|
||||
load_best_model_at_end=True
|
||||
metric_for_best_model="accuracy" # Metric to compare
|
||||
```
|
||||
|
||||
### Optimization
|
||||
|
||||
**gradient_accumulation_steps**: Accumulate gradients over multiple steps
|
||||
```python
|
||||
gradient_accumulation_steps=4 # Effective batch size = batch_size * 4
|
||||
```
|
||||
|
||||
**fp16**: Enable mixed precision (NVIDIA GPUs)
|
||||
```python
|
||||
fp16=True
|
||||
```
|
||||
|
||||
**bf16**: Enable bfloat16 (newer GPUs)
|
||||
```python
|
||||
bf16=True
|
||||
```
|
||||
|
||||
**gradient_checkpointing**: Trade compute for memory
|
||||
```python
|
||||
gradient_checkpointing=True # Slower but uses less memory
|
||||
```
|
||||
|
||||
**optim**: Optimizer choice
|
||||
```python
|
||||
optim="adamw_torch" # Default
|
||||
optim="adamw_8bit" # 8-bit Adam (requires bitsandbytes)
|
||||
optim="adafactor" # Memory-efficient alternative
|
||||
```
|
||||
|
||||
### Learning Rate Scheduling
|
||||
|
||||
**lr_scheduler_type**: Learning rate schedule
|
||||
```python
|
||||
lr_scheduler_type="linear" # Linear decay
|
||||
lr_scheduler_type="cosine" # Cosine annealing
|
||||
lr_scheduler_type="constant" # No decay
|
||||
lr_scheduler_type="constant_with_warmup"
|
||||
```
|
||||
|
||||
**warmup_steps** or **warmup_ratio**: Warmup period
|
||||
```python
|
||||
warmup_steps=500
|
||||
# Or
|
||||
warmup_ratio=0.1 # 10% of total steps
|
||||
```
|
||||
|
||||
### Logging
|
||||
|
||||
**logging_dir**: TensorBoard logs directory
|
||||
```python
|
||||
logging_dir="./logs"
|
||||
```
|
||||
|
||||
**logging_steps**: Log every N steps
|
||||
```python
|
||||
logging_steps=10
|
||||
```
|
||||
|
||||
**report_to**: Logging integrations
|
||||
```python
|
||||
report_to=["tensorboard"]
|
||||
report_to=["wandb"]
|
||||
report_to=["tensorboard", "wandb"]
|
||||
```
|
||||
|
||||
### Distributed Training
|
||||
|
||||
**ddp_backend**: Distributed backend
|
||||
```python
|
||||
ddp_backend="nccl" # For multi-GPU
|
||||
```
|
||||
|
||||
**deepspeed**: DeepSpeed config file
|
||||
```python
|
||||
deepspeed="ds_config.json"
|
||||
```
|
||||
|
||||
## Data Collators
|
||||
|
||||
Handle dynamic padding and special preprocessing:
|
||||
|
||||
### DataCollatorWithPadding
|
||||
|
||||
Pad sequences to longest in batch:
|
||||
```python
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
```
|
||||
|
||||
|
||||
### DataCollatorForLanguageModeling
|
||||
|
||||
For masked language modeling:
|
||||
```python
|
||||
from transformers import DataCollatorForLanguageModeling
|
||||
|
||||
data_collator = DataCollatorForLanguageModeling(
|
||||
tokenizer=tokenizer,
|
||||
mlm=True,
|
||||
mlm_probability=0.15
|
||||
)
|
||||
```
|
||||
|
||||
### DataCollatorForSeq2Seq
|
||||
|
||||
For sequence-to-sequence tasks:
|
||||
```python
|
||||
from transformers import DataCollatorForSeq2Seq
|
||||
|
||||
data_collator = DataCollatorForSeq2Seq(
|
||||
tokenizer=tokenizer,
|
||||
model=model,
|
||||
padding=True
|
||||
)
|
||||
```
|
||||
|
||||
## Custom Training
|
||||
|
||||
### Custom Trainer
|
||||
|
||||
Override methods for custom behavior:
|
||||
|
||||
```python
|
||||
from transformers import Trainer
|
||||
|
||||
class CustomTrainer(Trainer):
|
||||
def compute_loss(self, model, inputs, return_outputs=False):
|
||||
labels = inputs.pop("labels")
|
||||
outputs = model(**inputs)
|
||||
logits = outputs.logits
|
||||
|
||||
# Custom loss computation
|
||||
loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
|
||||
loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
|
||||
|
||||
return (loss, outputs) if return_outputs else loss
|
||||
```
|
||||
|
||||
### Custom Callbacks
|
||||
|
||||
Monitor and control training:
|
||||
|
||||
```python
|
||||
from transformers import TrainerCallback
|
||||
|
||||
class CustomCallback(TrainerCallback):
|
||||
def on_epoch_end(self, args, state, control, **kwargs):
|
||||
print(f"Epoch {state.epoch} completed!")
|
||||
print(f"Epoch {state.epoch} completed")
|
||||
# Custom logic here
|
||||
return control
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
callbacks=[CustomCallback],
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
## Advanced Training Techniques
|
||||
|
||||
### Parameter-Efficient Fine-Tuning (PEFT)

Use LoRA for efficient fine-tuning:

```python
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Shows reduced parameter count

# Train normally with Trainer
trainer = Trainer(model=model, args=training_args, ...)
trainer.train()
```

### Gradient Checkpointing

Reduce memory usage at the cost of some speed. Enable it either directly on the model or through `TrainingArguments`:

```python
model.gradient_checkpointing_enable()

# or equivalently via TrainingArguments
training_args = TrainingArguments(
    gradient_checkpointing=True,
    ...
)
```

### Mixed Precision Training

```python
training_args = TrainingArguments(
    fp16=True,  # For NVIDIA GPUs with Tensor Cores
    # or
    bf16=True,  # For newer GPUs (A100, H100)
    ...
)
```

### DeepSpeed Integration

For very large models, create a DeepSpeed config file (`ds_config.json`):

```json
{
  "train_batch_size": 16,
  "gradient_accumulation_steps": 1,
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": 2e-5
    }
  },
  "fp16": {
    "enabled": true
  },
  "zero_optimization": {
    "stage": 2
  }
}
```

Then point `TrainingArguments` at it:

```python
training_args = TrainingArguments(
    deepspeed="ds_config.json",
    ...
)
```
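
Training is then typically started through the `deepspeed` launcher; a minimal sketch, assuming the Trainer script is named `train.py` (a placeholder name):

```bash
# Launch the Trainer script on 2 GPUs using the config above
deepspeed --num_gpus=2 train.py
```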

## Training Tips

### Hyperparameter Tuning

Common starting points (see the sketch after this list):

- **Learning rate**: 2e-5 to 5e-5 for BERT-like models, 1e-4 to 1e-3 for smaller models
- **Batch size**: 8-32 depending on GPU memory
- **Epochs**: 2-4 for fine-tuning, more for domain adaptation
- **Warmup**: 10% of total steps
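
A `TrainingArguments` sketch using these starting points (the values are illustrative defaults, not tuned for any particular task):

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,              # BERT-like starting point
    per_device_train_batch_size=16,  # adjust to available GPU memory
    num_train_epochs=3,              # 2-4 is typical for fine-tuning
    warmup_ratio=0.1,                # 10% of total steps
    weight_decay=0.01,
    eval_strategy="epoch",
)
```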

Use Optuna for hyperparameter search:

```python
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2
    )

def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 5),
    }

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=10,
)
```

### Monitoring Training

Use TensorBoard:

```bash
tensorboard --logdir ./logs
```

Or Weights & Biases:

```python
import wandb
wandb.init(project="my-project")

training_args = TrainingArguments(
    report_to=["wandb"],
    ...
)
```

### Resume Training

Resume from a checkpoint:

```python
trainer.train(resume_from_checkpoint="./results/checkpoint-1000")
```

## Common Training Patterns

### Classification

```python
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id
)
```

### Question Answering

```python
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")
```

### Token Classification (NER)

```python
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_tags,
    id2label=id2label,
    label2id=label2id
)
```

### Sequence-to-Sequence

```python
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
```

### Causal Language Modeling

```python
model = AutoModelForCausalLM.from_pretrained("gpt2")
```

### Masked Language Modeling

```python
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
```

## Common Issues

**CUDA out of memory** (see the sketch after this list):

- Reduce batch size
- Enable gradient checkpointing
- Use gradient accumulation
- Use 8-bit optimizers
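
A hedged `TrainingArguments` sketch combining these memory-saving options (exact values depend on the model and GPU; `adamw_bnb_8bit` additionally requires the `bitsandbytes` package):

```python
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,   # smaller per-device batches
    gradient_accumulation_steps=8,   # effective batch size of 32
    gradient_checkpointing=True,     # trade compute for memory
    optim="adamw_bnb_8bit",          # 8-bit optimizer states
    fp16=True,
)
```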

**Overfitting** (see the callback sketch after this list):

- Increase weight_decay
- Add dropout
- Use early stopping
- Reduce model size or training epochs
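
Early stopping is built into the library as `EarlyStoppingCallback`; a minimal sketch (it requires `load_best_model_at_end=True`, matching evaluation/save strategies, and a `metric_for_best_model`):

```python
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
```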

**Slow training:**

- Increase batch size
- Enable mixed precision (fp16/bf16)
- Use multiple GPUs
- Optimize data loading

## Best Practices

Several of these settings are combined in the sketch after this list.

1. **Start small**: Test on a small dataset subset first
2. **Start with small learning rates**: 2e-5 to 5e-5 for fine-tuning
3. **Use warmup**: 5-10% of total steps helps training stability
4. **Monitor training**: Use eval_strategy="epoch" or "steps" and track validation metrics
5. **Log extensively**: Use TensorBoard or W&B
6. **Save checkpoints**: Set save_strategy and save_total_limit
7. **Load best model**: Set load_best_model_at_end=True to avoid overfitting
8. **Enable mixed precision**: fp16 or bf16 for faster training
9. **Use gradient accumulation**: For large effective batch sizes on limited memory
10. **Consider PEFT**: For large models with limited resources
11. **Push to Hub**: Enable push_to_hub for easy model sharing and versioning
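
A consolidated `TrainingArguments` sketch showing several of these practices together (values are illustrative; `push_to_hub=True` assumes you are logged in to the Hugging Face Hub):

```python
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    warmup_ratio=0.1,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,   # effective batch size of 32
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./logs",
    report_to=["tensorboard"],
    push_to_hub=True,
)
```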