mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-28 07:33:45 +08:00
Add more scientific skills
This commit is contained in:
699
scientific-packages/transformers/references/api_reference.md
Normal file
699
scientific-packages/transformers/references/api_reference.md
Normal file
@@ -0,0 +1,699 @@
|
||||
# Transformers API Reference
|
||||
|
||||
This document provides comprehensive API reference for the most commonly used classes and methods in the Transformers library.
|
||||
|
||||
## Core Model Classes
|
||||
|
||||
### PreTrainedModel
|
||||
|
||||
Base class for all models. Handles loading, saving, and common model operations.
|
||||
|
||||
**Key Methods:**
|
||||
|
||||
```python
|
||||
from transformers import PreTrainedModel
|
||||
|
||||
# Load pretrained model
|
||||
model = ModelClass.from_pretrained(
|
||||
pretrained_model_name_or_path,
|
||||
config=None, # Custom config
|
||||
cache_dir=None, # Custom cache location
|
||||
force_download=False, # Force re-download
|
||||
resume_download=False, # Resume interrupted download
|
||||
proxies=None, # HTTP proxies
|
||||
local_files_only=False, # Only use cached files
|
||||
token=None, # HF auth token
|
||||
revision="main", # Git branch/tag
|
||||
trust_remote_code=False, # Allow custom model code
|
||||
device_map=None, # Device allocation ("auto", "cpu", "cuda:0", etc.)
|
||||
torch_dtype=None, # Model dtype (torch.float16, "auto", etc.)
|
||||
low_cpu_mem_usage=False, # Reduce CPU memory during loading
|
||||
**model_kwargs
|
||||
)
|
||||
|
||||
# Save model
|
||||
model.save_pretrained(
|
||||
save_directory,
|
||||
save_config=True, # Save config.json
|
||||
state_dict=None, # Custom state dict
|
||||
save_function=torch.save, # Custom save function
|
||||
push_to_hub=False, # Upload to Hub
|
||||
max_shard_size="5GB", # Max checkpoint size
|
||||
safe_serialization=True, # Use SafeTensors format
|
||||
variant=None, # Model variant name
|
||||
)
|
||||
|
||||
# Generate text (for generative models)
|
||||
outputs = model.generate(
|
||||
inputs=None, # Input token IDs
|
||||
max_length=20, # Max total length
|
||||
max_new_tokens=None, # Max new tokens to generate
|
||||
min_length=0, # Minimum length
|
||||
do_sample=False, # Enable sampling
|
||||
early_stopping=False, # Beam search: stop once num_beams complete candidates are found
|
||||
num_beams=1, # Beam search width
|
||||
temperature=1.0, # Sampling temperature
|
||||
top_k=50, # Top-k sampling
|
||||
top_p=1.0, # Nucleus sampling
|
||||
repetition_penalty=1.0, # Penalize repetition
|
||||
length_penalty=1.0, # Beam search length penalty
|
||||
no_repeat_ngram_size=0, # Block repeated n-grams
|
||||
num_return_sequences=1, # Number of sequences to return
|
||||
**model_kwargs
|
||||
)
|
||||
|
||||
# Resize token embeddings (after adding tokens)
|
||||
new_embeddings = model.resize_token_embeddings(
|
||||
new_num_tokens,
|
||||
pad_to_multiple_of=None
|
||||
)
|
||||
|
||||
# Utility methods
|
||||
num_params = model.num_parameters(only_trainable=False)
|
||||
model.gradient_checkpointing_enable() # Enable gradient checkpointing
|
||||
model.enable_input_require_grads() # For PEFT with frozen models
|
||||
```
|
||||
|
||||
### AutoModel Classes
|
||||
|
||||
Automatically instantiate the correct model architecture.
|
||||
|
||||
**Available Classes:**
|
||||
|
||||
- `AutoModel`: Base model (returns hidden states)
|
||||
- `AutoModelForCausalLM`: Causal language modeling (GPT-style)
|
||||
- `AutoModelForMaskedLM`: Masked language modeling (BERT-style)
|
||||
- `AutoModelForSeq2SeqLM`: Sequence-to-sequence (T5, BART)
|
||||
- `AutoModelForSequenceClassification`: Text classification
|
||||
- `AutoModelForTokenClassification`: Token classification (NER)
|
||||
- `AutoModelForQuestionAnswering`: Extractive QA
|
||||
- `AutoModelForImageClassification`: Image classification
|
||||
- `AutoModelForObjectDetection`: Object detection
|
||||
- `AutoModelForSemanticSegmentation`: Semantic segmentation
|
||||
- `AutoModelForAudioClassification`: Audio classification
|
||||
- `AutoModelForSpeechSeq2Seq`: Speech-to-text
|
||||
- `AutoModelForVision2Seq`: Image captioning, VQA
|
||||
|
||||
**Usage:**
|
||||
|
||||
```python
|
||||
from transformers import AutoModel, AutoConfig
|
||||
|
||||
# Load with default configuration
|
||||
model = AutoModel.from_pretrained("bert-base-uncased")
|
||||
|
||||
# Load with custom configuration
|
||||
config = AutoConfig.from_pretrained("bert-base-uncased")
|
||||
config.hidden_dropout_prob = 0.2
|
||||
model = AutoModel.from_pretrained("bert-base-uncased", config=config)
|
||||
|
||||
# Register custom models
|
||||
from transformers import AutoConfig, AutoModel
|
||||
|
||||
AutoConfig.register("my-model", MyModelConfig)
|
||||
AutoModel.register(MyModelConfig, MyModel)
|
||||
```
|
||||
|
||||
## Tokenizer Classes
|
||||
|
||||
### PreTrainedTokenizer / PreTrainedTokenizerFast
|
||||
|
||||
Convert text to token IDs and vice versa.
|
||||
|
||||
**Key Methods:**
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
pretrained_model_name_or_path,
|
||||
use_fast=True, # Use fast (Rust) tokenizer if available
|
||||
revision="main",
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Encoding (text → token IDs)
|
||||
encoded = tokenizer(
|
||||
text, # String or List[str]
|
||||
text_pair=None, # Second sequence for pairs
|
||||
add_special_tokens=True, # Add [CLS], [SEP], etc.
|
||||
padding=False, # True, False, "longest", "max_length"
|
||||
truncation=False, # True, False, "longest_first", "only_first", "only_second"
|
||||
max_length=None, # Max sequence length
|
||||
stride=0, # Overlap for split sequences
|
||||
return_tensors=None, # "pt" (PyTorch), "tf" (TensorFlow), "np" (NumPy)
|
||||
return_token_type_ids=None, # Return token type IDs
|
||||
return_attention_mask=None, # Return attention mask
|
||||
return_overflowing_tokens=False, # Return overflowing tokens
|
||||
return_special_tokens_mask=False, # Return special token mask
|
||||
return_offsets_mapping=False, # Return char-level offsets (fast only)
|
||||
return_length=False, # Return sequence lengths
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Decoding (token IDs → text)
|
||||
text = tokenizer.decode(
|
||||
token_ids,
|
||||
skip_special_tokens=False, # If True, strip special tokens from the decoded text
|
||||
clean_up_tokenization_spaces=True, # Clean up spacing
|
||||
)
|
||||
|
||||
# Batch decoding
|
||||
texts = tokenizer.batch_decode(
|
||||
sequences,
|
||||
skip_special_tokens=False, # If True, strip special tokens from each decoded text
|
||||
clean_up_tokenization_spaces=True,
|
||||
)
|
||||
|
||||
# Tokenization (text → tokens)
|
||||
tokens = tokenizer.tokenize(text, **kwargs)
|
||||
|
||||
# Convert tokens to IDs
|
||||
ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
|
||||
# Convert IDs to tokens
|
||||
tokens = tokenizer.convert_ids_to_tokens(ids)
|
||||
|
||||
# Add new tokens
|
||||
num_added = tokenizer.add_tokens(["[NEW_TOKEN1]", "[NEW_TOKEN2]"])
|
||||
|
||||
# Add special tokens
|
||||
tokenizer.add_special_tokens({
|
||||
"bos_token": "[BOS]",
|
||||
"eos_token": "[EOS]",
|
||||
"unk_token": "[UNK]",
|
||||
"sep_token": "[SEP]",
|
||||
"pad_token": "[PAD]",
|
||||
"cls_token": "[CLS]",
|
||||
"mask_token": "[MASK]",
|
||||
"additional_special_tokens": ["[SPECIAL1]", "[SPECIAL2]"],
|
||||
})
|
||||
|
||||
# Chat template formatting
|
||||
formatted = tokenizer.apply_chat_template(
|
||||
conversation, # List[Dict[str, str]] with "role" and "content"
|
||||
chat_template=None, # Custom template
|
||||
add_generation_prompt=False, # Add prompt for model to continue
|
||||
tokenize=True, # Return token IDs
|
||||
padding=False,
|
||||
truncation=False,
|
||||
max_length=None,
|
||||
return_tensors=None,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
# Save tokenizer
|
||||
tokenizer.save_pretrained(save_directory)
|
||||
|
||||
# Get vocab size
|
||||
vocab_size = len(tokenizer)
|
||||
|
||||
# Get special tokens
|
||||
pad_token = tokenizer.pad_token
|
||||
pad_token_id = tokenizer.pad_token_id
|
||||
# Similar for: bos, eos, unk, sep, cls, mask
|
||||
```
|
||||
|
||||
**Special Token Attributes:**
|
||||
|
||||
```python
|
||||
tokenizer.bos_token # Beginning of sequence
|
||||
tokenizer.eos_token # End of sequence
|
||||
tokenizer.unk_token # Unknown token
|
||||
tokenizer.sep_token # Separator token
|
||||
tokenizer.pad_token # Padding token
|
||||
tokenizer.cls_token # Classification token
|
||||
tokenizer.mask_token # Mask token
|
||||
|
||||
# Corresponding IDs
|
||||
tokenizer.bos_token_id
|
||||
tokenizer.eos_token_id
|
||||
# ... etc
|
||||
```
|
||||
|
||||
## Image Processors
|
||||
|
||||
### AutoImageProcessor
|
||||
|
||||
Preprocess images for vision models.
|
||||
|
||||
**Key Methods:**
|
||||
|
||||
```python
|
||||
from transformers import AutoImageProcessor
|
||||
|
||||
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
|
||||
|
||||
# Process images
|
||||
inputs = processor(
|
||||
images, # PIL Image, np.array, torch.Tensor, or List
|
||||
return_tensors="pt", # "pt", "tf", "np", None
|
||||
do_resize=True, # Resize to model size
|
||||
size=None, # Target size dict
|
||||
resample=None, # Resampling method
|
||||
do_rescale=True, # Rescale pixel values
|
||||
do_normalize=True, # Normalize with mean/std
|
||||
image_mean=None, # Custom mean
|
||||
image_std=None, # Custom std
|
||||
do_center_crop=False, # Center crop
|
||||
crop_size=None, # Crop size
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Returns: BatchFeature with 'pixel_values' key
|
||||
```
|
||||
|
||||
## Training Components
|
||||
|
||||
### TrainingArguments
|
||||
|
||||
Configuration for the Trainer class.
|
||||
|
||||
**Essential Arguments:**
|
||||
|
||||
```python
|
||||
from transformers import TrainingArguments
|
||||
|
||||
args = TrainingArguments(
|
||||
# ===== Output & Logging =====
|
||||
output_dir="./results", # REQUIRED: Output directory
|
||||
overwrite_output_dir=False, # Overwrite output directory
|
||||
|
||||
# ===== Training Parameters =====
|
||||
num_train_epochs=3.0, # Number of epochs
|
||||
max_steps=-1, # Max training steps (overrides epochs)
|
||||
per_device_train_batch_size=8, # Train batch size per device
|
||||
per_device_eval_batch_size=8, # Eval batch size per device
|
||||
gradient_accumulation_steps=1, # Accumulation steps
|
||||
|
||||
# ===== Learning Rate & Optimization =====
|
||||
learning_rate=5e-5, # Initial learning rate
|
||||
weight_decay=0.0, # Weight decay
|
||||
adam_beta1=0.9, # Adam beta1
|
||||
adam_beta2=0.999, # Adam beta2
|
||||
adam_epsilon=1e-8, # Adam epsilon
|
||||
max_grad_norm=1.0, # Gradient clipping
|
||||
optim="adamw_torch", # Optimizer ("adamw_torch", "adafactor", "adamw_8bit")
|
||||
|
||||
# ===== Learning Rate Scheduler =====
|
||||
lr_scheduler_type="linear", # Scheduler type
|
||||
warmup_steps=0, # Warmup steps
|
||||
warmup_ratio=0.0, # Warmup ratio (alternative to steps)
|
||||
|
||||
# ===== Evaluation =====
|
||||
eval_strategy="no", # "no", "steps", "epoch"
|
||||
eval_steps=None, # Eval every N steps
|
||||
eval_delay=0, # Delay first eval
|
||||
eval_accumulation_steps=None, # Accumulate eval outputs
|
||||
|
||||
# ===== Checkpointing =====
|
||||
save_strategy="steps", # "no", "steps", "epoch"
|
||||
save_steps=500, # Save every N steps
|
||||
save_total_limit=None, # Max checkpoints to keep
|
||||
save_safetensors=True, # Save as SafeTensors
|
||||
save_on_each_node=False, # Save on each node (distributed)
|
||||
|
||||
# ===== Best Model Selection =====
|
||||
load_best_model_at_end=False, # Load best checkpoint at end
|
||||
metric_for_best_model=None, # Metric to use
|
||||
greater_is_better=None, # True if higher is better
|
||||
|
||||
# ===== Logging =====
|
||||
logging_dir=None, # TensorBoard log directory
|
||||
logging_strategy="steps", # "no", "steps", "epoch"
|
||||
logging_steps=500, # Log every N steps
|
||||
logging_first_step=False, # Log first step
|
||||
logging_nan_inf_filter=True, # Filter NaN/Inf
|
||||
|
||||
# ===== Mixed Precision =====
|
||||
fp16=False, # Use fp16 training
|
||||
fp16_opt_level="O1", # Apex AMP optimization level
|
||||
fp16_backend="auto", # "auto", "apex", "cpu_amp"
|
||||
bf16=False, # Use bfloat16 training
|
||||
bf16_full_eval=False, # Use bf16 for evaluation
|
||||
tf32=None, # Use TF32 (Ampere+ GPUs)
|
||||
|
||||
# ===== Memory Optimization =====
|
||||
gradient_checkpointing=False, # Enable gradient checkpointing
|
||||
gradient_checkpointing_kwargs=None, # Kwargs for gradient checkpointing
|
||||
torch_empty_cache_steps=None, # Clear cache every N steps
|
||||
|
||||
# ===== Distributed Training =====
|
||||
local_rank=-1, # Local rank for distributed
|
||||
ddp_backend=None, # "nccl", "gloo", "mpi", "ccl"
|
||||
ddp_find_unused_parameters=None, # Find unused parameters
|
||||
ddp_bucket_cap_mb=None, # DDP bucket size
|
||||
fsdp="", # FSDP configuration
|
||||
fsdp_config=None, # FSDP config dict
|
||||
deepspeed=None, # DeepSpeed config
|
||||
|
||||
# ===== Hub Integration =====
|
||||
push_to_hub=False, # Push to Hugging Face Hub
|
||||
hub_model_id=None, # Hub model ID
|
||||
hub_strategy="every_save", # "every_save", "checkpoint", "end"
|
||||
hub_token=None, # Hub authentication token
|
||||
hub_private_repo=False, # Make repo private
|
||||
|
||||
# ===== Data Handling =====
|
||||
dataloader_num_workers=0, # DataLoader workers
|
||||
dataloader_pin_memory=True, # Pin memory
|
||||
dataloader_drop_last=False, # Drop last incomplete batch
|
||||
dataloader_prefetch_factor=None, # Prefetch factor
|
||||
remove_unused_columns=True, # Remove unused dataset columns
|
||||
label_names=None, # Label column names
|
||||
|
||||
# ===== Other =====
|
||||
seed=42, # Random seed
|
||||
data_seed=None, # Data sampling seed
|
||||
jit_mode_eval=False, # Use PyTorch JIT for eval
|
||||
use_ipex=False, # Use Intel Extension for PyTorch
|
||||
torch_compile=False, # Use torch.compile()
|
||||
torch_compile_backend=None, # Compile backend
|
||||
torch_compile_mode=None, # Compile mode
|
||||
include_inputs_for_metrics=False, # Pass inputs to compute_metrics
|
||||
skip_memory_metrics=True, # Skip memory profiling
|
||||
)
|
||||
```
|
||||
|
||||
### Trainer
|
||||
|
||||
Main training class with full training loop.
|
||||
|
||||
**Key Methods:**
|
||||
|
||||
```python
|
||||
from transformers import Trainer
|
||||
|
||||
trainer = Trainer(
|
||||
model=None, # Model to train
|
||||
args=None, # TrainingArguments
|
||||
data_collator=None, # Data collator
|
||||
train_dataset=None, # Training dataset
|
||||
eval_dataset=None, # Evaluation dataset
|
||||
tokenizer=None, # Tokenizer
|
||||
model_init=None, # Function to instantiate model
|
||||
compute_metrics=None, # Function to compute metrics
|
||||
callbacks=None, # List of callbacks
|
||||
optimizers=(None, None), # (optimizer, scheduler) tuple
|
||||
preprocess_logits_for_metrics=None, # Preprocess logits before metrics
|
||||
)
|
||||
|
||||
# Train model
|
||||
train_result = trainer.train(
|
||||
resume_from_checkpoint=None, # Resume from checkpoint
|
||||
trial=None, # Optuna/Ray trial
|
||||
ignore_keys_for_eval=None, # Keys to ignore in eval
|
||||
)
|
||||
|
||||
# Evaluate model
|
||||
eval_result = trainer.evaluate(
|
||||
eval_dataset=None, # Eval dataset (default: self.eval_dataset)
|
||||
ignore_keys=None, # Keys to ignore
|
||||
metric_key_prefix="eval", # Prefix for metric names
|
||||
)
|
||||
|
||||
# Make predictions
|
||||
predictions = trainer.predict(
|
||||
test_dataset, # Test dataset
|
||||
ignore_keys=None, # Keys to ignore
|
||||
metric_key_prefix="test", # Metric prefix
|
||||
)
|
||||
# Returns: PredictionOutput(predictions, label_ids, metrics)
|
||||
|
||||
# Save model
|
||||
trainer.save_model(output_dir=None)
|
||||
|
||||
# Push to Hub
|
||||
trainer.push_to_hub(
|
||||
commit_message="End of training",
|
||||
blocking=True,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Hyperparameter search
|
||||
best_trial = trainer.hyperparameter_search(
|
||||
hp_space=None, # Hyperparameter search space
|
||||
compute_objective=None, # Objective function
|
||||
n_trials=20, # Number of trials
|
||||
direction="minimize", # "minimize" or "maximize"
|
||||
backend=None, # "optuna", "ray", "sigopt"
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Create optimizer
|
||||
optimizer = trainer.create_optimizer()
|
||||
|
||||
# Create scheduler
|
||||
scheduler = trainer.create_scheduler(
|
||||
num_training_steps,
|
||||
optimizer=None
|
||||
)
|
||||
|
||||
# Log metrics
|
||||
trainer.log_metrics(split, metrics)
|
||||
trainer.save_metrics(split, metrics)
|
||||
|
||||
# Save checkpoint
|
||||
trainer.save_state()
|
||||
|
||||
# Access current step/epoch
|
||||
current_step = trainer.state.global_step
|
||||
current_epoch = trainer.state.epoch
|
||||
|
||||
# Access training logs
|
||||
logs = trainer.state.log_history
|
||||
```
|
||||
|
||||
### Seq2SeqTrainer
|
||||
|
||||
Specialized trainer for sequence-to-sequence models.
|
||||
|
||||
```python
|
||||
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
|
||||
|
||||
# Use Seq2SeqTrainingArguments with additional parameters
|
||||
training_args = Seq2SeqTrainingArguments(
|
||||
output_dir="./results",
|
||||
predict_with_generate=True, # Use generate() for evaluation
|
||||
generation_max_length=None, # Max length for generation
|
||||
generation_num_beams=None, # Num beams for generation
|
||||
**other_training_arguments
|
||||
)
|
||||
|
||||
# Trainer usage is identical to Trainer
|
||||
trainer = Seq2SeqTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics,
|
||||
)
|
||||
```
|
||||
|
||||
## Pipeline Classes
|
||||
|
||||
### pipeline()
|
||||
|
||||
Unified inference API for all tasks.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
pipe = pipeline(
|
||||
task=None, # Task name (required)
|
||||
model=None, # Model name/path or model object
|
||||
config=None, # Model config
|
||||
tokenizer=None, # Tokenizer
|
||||
feature_extractor=None, # Feature extractor
|
||||
image_processor=None, # Image processor
|
||||
framework=None, # "pt" or "tf"
|
||||
revision=None, # Model revision
|
||||
use_fast=True, # Use fast tokenizer
|
||||
token=None, # HF token
|
||||
device=None, # Device (-1 for CPU, 0+ for GPU)
|
||||
device_map=None, # Device map for multi-GPU
|
||||
torch_dtype=None, # Model dtype
|
||||
trust_remote_code=False, # Allow custom code
|
||||
model_kwargs=None, # Additional model kwargs
|
||||
pipeline_class=None, # Custom pipeline class
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Use pipeline
|
||||
results = pipe(
|
||||
inputs, # Input data
|
||||
**task_specific_parameters
|
||||
)
|
||||
```
|
||||
|
||||
## Data Collators
|
||||
|
||||
Batch and pad data for training.
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
DataCollatorWithPadding, # Dynamic padding for classification
|
||||
DataCollatorForTokenClassification, # Padding for token classification
|
||||
DataCollatorForSeq2Seq, # Padding for seq2seq
|
||||
DataCollatorForLanguageModeling, # MLM/CLM data collation
|
||||
default_data_collator, # Simple collator (no padding)
|
||||
)
|
||||
|
||||
# Text classification
|
||||
data_collator = DataCollatorWithPadding(
|
||||
tokenizer=tokenizer,
|
||||
padding=True,
|
||||
max_length=None,
|
||||
pad_to_multiple_of=None,
|
||||
)
|
||||
|
||||
# Token classification
|
||||
data_collator = DataCollatorForTokenClassification(
|
||||
tokenizer=tokenizer,
|
||||
padding=True,
|
||||
max_length=None,
|
||||
pad_to_multiple_of=None,
|
||||
label_pad_token_id=-100,
|
||||
)
|
||||
|
||||
# Seq2Seq
|
||||
data_collator = DataCollatorForSeq2Seq(
|
||||
tokenizer=tokenizer,
|
||||
model=None,
|
||||
padding=True,
|
||||
max_length=None,
|
||||
pad_to_multiple_of=None,
|
||||
label_pad_token_id=-100,
|
||||
)
|
||||
|
||||
# Language modeling
|
||||
data_collator = DataCollatorForLanguageModeling(
|
||||
tokenizer=tokenizer,
|
||||
mlm=True, # Masked LM (False for causal LM)
|
||||
mlm_probability=0.15, # Mask probability
|
||||
pad_to_multiple_of=None,
|
||||
)
|
||||
```
|
||||
|
||||
## Optimization & Scheduling
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AdamW, # AdamW optimizer (deprecated in transformers; prefer torch.optim.AdamW)
|
||||
Adafactor, # Adafactor optimizer
|
||||
get_scheduler, # Get LR scheduler
|
||||
get_linear_schedule_with_warmup,
|
||||
get_cosine_schedule_with_warmup,
|
||||
get_polynomial_decay_schedule_with_warmup,
|
||||
)
|
||||
|
||||
# Create optimizer
|
||||
optimizer = AdamW(
|
||||
model.parameters(),
|
||||
lr=5e-5,
|
||||
betas=(0.9, 0.999),
|
||||
eps=1e-8,
|
||||
weight_decay=0.01,
|
||||
)
|
||||
|
||||
# Create scheduler
|
||||
scheduler = get_scheduler(
|
||||
name="linear", # "linear", "cosine", "polynomial", "constant"
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=0,
|
||||
num_training_steps=total_steps,
|
||||
)
|
||||
|
||||
# Or use specific schedulers
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer,
|
||||
num_warmup_steps=warmup_steps,
|
||||
num_training_steps=total_steps,
|
||||
)
|
||||
|
||||
scheduler = get_cosine_schedule_with_warmup(
|
||||
optimizer,
|
||||
num_warmup_steps=warmup_steps,
|
||||
num_training_steps=total_steps,
|
||||
num_cycles=0.5,
|
||||
)
|
||||
```
|
||||
|
||||
## Configuration Classes
|
||||
|
||||
```python
|
||||
from transformers import AutoConfig
|
||||
|
||||
# Load configuration
|
||||
config = AutoConfig.from_pretrained(
|
||||
pretrained_model_name_or_path,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# Common configuration attributes
|
||||
config.vocab_size # Vocabulary size
|
||||
config.hidden_size # Hidden layer size
|
||||
config.num_hidden_layers # Number of layers
|
||||
config.num_attention_heads # Attention heads
|
||||
config.intermediate_size # FFN intermediate size
|
||||
config.hidden_dropout_prob # Dropout probability
|
||||
config.attention_probs_dropout_prob # Attention dropout
|
||||
config.max_position_embeddings # Max sequence length
|
||||
|
||||
# Save configuration
|
||||
config.save_pretrained(save_directory)
|
||||
|
||||
# Create model from config
|
||||
from transformers import AutoModel
|
||||
model = AutoModel.from_config(config)
|
||||
```
|
||||
|
||||
## Utility Functions
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
set_seed, # Set random seed
|
||||
logging, # Logging utilities
|
||||
)
|
||||
|
||||
# Set seed for reproducibility
|
||||
set_seed(42)
|
||||
|
||||
# Configure logging
|
||||
logging.set_verbosity_info()
|
||||
logging.set_verbosity_warning()
|
||||
logging.set_verbosity_error()
|
||||
logging.set_verbosity_debug()
|
||||
|
||||
# Get logger
|
||||
logger = logging.get_logger(__name__)
|
||||
```
|
||||
|
||||
## Model Outputs
|
||||
|
||||
All models return model-specific output classes (subclasses of `ModelOutput`):
|
||||
|
||||
```python
|
||||
# Common output attributes
|
||||
outputs.loss # Loss (if labels provided)
|
||||
outputs.logits # Model logits
|
||||
outputs.hidden_states # All hidden states (if output_hidden_states=True)
|
||||
outputs.attentions # Attention weights (if output_attentions=True)
|
||||
|
||||
# Seq2Seq specific
|
||||
outputs.encoder_last_hidden_state
|
||||
outputs.encoder_hidden_states
|
||||
outputs.encoder_attentions
|
||||
outputs.decoder_hidden_states
|
||||
outputs.decoder_attentions
|
||||
outputs.cross_attentions
|
||||
|
||||
# Access as dict or tuple
|
||||
logits = outputs.logits
|
||||
logits = outputs["logits"]
|
||||
loss, logits = outputs.to_tuple()[:2]
|
||||
```
|
||||
|
||||
This reference covers the most commonly used API components. For complete documentation, refer to https://huggingface.co/docs/transformers.
|
||||
@@ -0,0 +1,530 @@
|
||||
# Text Generation Strategies
|
||||
|
||||
Comprehensive guide to text generation methods in Transformers for controlling output quality, creativity, and diversity.
|
||||
|
||||
## Overview
|
||||
|
||||
Text generation is the process of predicting tokens sequentially using a language model. The choice of generation strategy significantly impacts output quality, diversity, and computational cost.
|
||||
|
||||
**When to use each strategy:**
|
||||
- **Greedy**: Fast, deterministic, good for short outputs or when consistency is critical
|
||||
- **Beam Search**: Better quality for tasks with clear "correct" answers (translation, summarization)
|
||||
- **Sampling**: Creative, diverse outputs for open-ended generation (stories, dialogue)
|
||||
- **Top-k/Top-p**: Balanced creativity and coherence
|
||||
|
||||
## Basic Generation Methods
|
||||
|
||||
### Greedy Decoding
|
||||
|
||||
Selects the highest probability token at each step. Fast but prone to repetition and suboptimal sequences.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
|
||||
inputs = tokenizer("The future of AI", return_tensors="pt")
|
||||
|
||||
# Greedy decoding (default)
|
||||
outputs = model.generate(**inputs, max_new_tokens=50)
|
||||
print(tokenizer.decode(outputs[0]))
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Deterministic (always same output for same input)
|
||||
- Fast (single forward pass per token)
|
||||
- Prone to repetition in longer sequences
|
||||
- Best for: Short generations, deterministic applications
|
||||
|
||||
**Parameters:**
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50, # Number of tokens to generate
|
||||
min_length=10, # Minimum total length
|
||||
pad_token_id=tokenizer.eos_token_id, # GPT-2 has no pad token; reuse EOS for padding
|
||||
)
|
||||
```
|
||||
|
||||
### Beam Search
|
||||
|
||||
Maintains multiple hypotheses (beams) and selects the sequence with highest overall probability.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
num_beams=5, # Number of beams
|
||||
early_stopping=True, # Stop when all beams finish
|
||||
no_repeat_ngram_size=2, # Prevent 2-gram repetition
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Higher quality than greedy for tasks with "correct" answers
|
||||
- Slower than greedy (num_beams forward passes per step)
|
||||
- Still can suffer from repetition
|
||||
- Best for: Translation, summarization, QA generation
|
||||
|
||||
**Advanced Parameters:**
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
num_beams=5,
|
||||
num_beam_groups=1, # Diverse beam search groups
|
||||
diversity_penalty=0.0, # Penalty for similar beams
|
||||
length_penalty=1.0, # >1: longer sequences, <1: shorter
|
||||
early_stopping=True, # Stop when num_beams sequences finish
|
||||
no_repeat_ngram_size=2, # Block repeating n-grams
|
||||
num_return_sequences=1, # Return top-k sequences (≤ num_beams)
|
||||
)
|
||||
```
|
||||
|
||||
**Length Penalty:**
|
||||
- `length_penalty > 1.0`: Favor longer sequences
|
||||
- `length_penalty = 1.0`: No penalty
|
||||
- `length_penalty < 1.0`: Favor shorter sequences
|
||||
|
||||
### Sampling (Multinomial)
|
||||
|
||||
Randomly sample tokens according to the probability distribution.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
do_sample=True, # Enable sampling
|
||||
temperature=1.0, # Sampling temperature
|
||||
num_beams=1, # 1 = pure multinomial sampling; >1 with do_sample=True gives beam-search multinomial sampling
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Non-deterministic (different output each time)
|
||||
- More diverse and creative than greedy/beam search
|
||||
- Can produce incoherent output if not controlled
|
||||
- Best for: Creative writing, dialogue, open-ended generation
|
||||
|
||||
**Temperature Parameter:**
|
||||
```python
|
||||
# Low temperature (0.1-0.7): More focused, less random
|
||||
outputs = model.generate(**inputs, do_sample=True, temperature=0.5)
|
||||
|
||||
# Medium temperature (0.7-1.0): Balanced
|
||||
outputs = model.generate(**inputs, do_sample=True, temperature=0.8)
|
||||
|
||||
# High temperature (1.0-2.0): More random, more creative
|
||||
outputs = model.generate(**inputs, do_sample=True, temperature=1.5)
|
||||
```
|
||||
|
||||
- `temperature → 0`: Approaches greedy decoding
|
||||
- `temperature = 1.0`: Sample from original distribution
|
||||
- `temperature > 1.0`: Flatter distribution, more random
|
||||
- `temperature < 1.0`: Sharper distribution, more confident
|
||||
|
||||
## Advanced Sampling Methods
|
||||
|
||||
### Top-k Sampling
|
||||
|
||||
Sample from only the k most likely tokens.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=50,
|
||||
top_k=50, # Consider top 50 tokens
|
||||
temperature=0.8,
|
||||
)
|
||||
```
|
||||
|
||||
**How it works:**
|
||||
1. Filter to top-k most probable tokens
|
||||
2. Renormalize probabilities
|
||||
3. Sample from filtered distribution
|
||||
|
||||
**Choosing k:**
|
||||
- `k=1`: Equivalent to greedy decoding
|
||||
- `k=10-50`: More focused, coherent output
|
||||
- `k=100-500`: More diverse output
|
||||
- Too high k: Includes low-probability tokens (noise)
|
||||
- Too low k: Less diverse, may miss good alternatives
|
||||
|
||||
### Top-p (Nucleus) Sampling
|
||||
|
||||
Sample from the smallest set of tokens whose cumulative probability ≥ p.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=50,
|
||||
top_p=0.95, # Nucleus probability
|
||||
temperature=0.8,
|
||||
)
|
||||
```
|
||||
|
||||
**How it works:**
|
||||
1. Sort tokens by probability
|
||||
2. Find smallest set with cumulative probability ≥ p
|
||||
3. Sample from this set
|
||||
|
||||
**Choosing p:**
|
||||
- `p=0.9-0.95`: Good balance (recommended)
|
||||
- `p=1.0`: Sample from full distribution
|
||||
- Higher p: More diverse, might include unlikely tokens
|
||||
- Lower p: More focused, like top-k with adaptive k
|
||||
|
||||
**Top-p vs Top-k:**
|
||||
- Top-p adapts to probability distribution shape
|
||||
- Top-k is fixed regardless of distribution
|
||||
- Top-p generally better for variable-quality contexts
|
||||
- Can combine: `top_k=50, top_p=0.95` (apply both filters)
|
||||
|
||||
### Combining Strategies
|
||||
|
||||
```python
|
||||
# Recommended for high-quality open-ended generation
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=100,
|
||||
temperature=0.8, # Moderate temperature
|
||||
top_k=50, # Limit to top 50 tokens
|
||||
top_p=0.95, # Nucleus sampling
|
||||
repetition_penalty=1.2, # Discourage repetition
|
||||
no_repeat_ngram_size=3, # Block 3-gram repetition
|
||||
)
|
||||
```
|
||||
|
||||
## Controlling Generation Quality
|
||||
|
||||
### Repetition Control
|
||||
|
||||
Prevent models from repeating themselves:
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=100,
|
||||
|
||||
# Method 1: Repetition penalty
|
||||
repetition_penalty=1.2, # Penalize repeated tokens (>1.0)
|
||||
|
||||
# Method 2: Block n-gram repetition
|
||||
no_repeat_ngram_size=3, # Never repeat 3-grams
|
||||
|
||||
# Method 3: Encoder repetition penalty (for seq2seq)
|
||||
encoder_repetition_penalty=1.0, # Penalize input tokens
|
||||
)
|
||||
```
|
||||
|
||||
**Repetition Penalty Values:**
|
||||
- `1.0`: No penalty
|
||||
- `1.0-1.5`: Mild penalty (recommended: 1.1-1.3)
|
||||
- `>1.5`: Strong penalty (may harm coherence)
|
||||
|
||||
### Length Control
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
|
||||
# Hard constraints
|
||||
min_length=20, # Minimum total length
|
||||
max_length=100, # Maximum total length
|
||||
max_new_tokens=50, # Maximum new tokens (excluding input)
|
||||
|
||||
# Soft constraints (with beam search)
|
||||
length_penalty=1.0, # Encourage longer/shorter outputs
|
||||
|
||||
# Early stopping
|
||||
early_stopping=True, # Stop when condition met
|
||||
)
|
||||
```
|
||||
|
||||
### Bad Words and Forced Tokens
|
||||
|
||||
```python
|
||||
# Prevent specific tokens
|
||||
bad_words_ids = [
|
||||
tokenizer.encode("badword1", add_special_tokens=False),
|
||||
tokenizer.encode("badword2", add_special_tokens=False),
|
||||
]
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
bad_words_ids=bad_words_ids,
|
||||
)
|
||||
|
||||
# Force specific tokens
|
||||
force_words_ids = [
|
||||
tokenizer.encode("important", add_special_tokens=False),
|
||||
]
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
force_words_ids=force_words_ids,
|
||||
)
|
||||
```
|
||||
|
||||
## Streaming Generation
|
||||
|
||||
Generate and process tokens as they're produced:
|
||||
|
||||
```python
|
||||
from transformers import TextStreamer, TextIteratorStreamer
|
||||
from threading import Thread
|
||||
|
||||
# Simple streaming (prints to stdout)
|
||||
streamer = TextStreamer(tokenizer, skip_prompt=True)
|
||||
outputs = model.generate(**inputs, streamer=streamer, max_new_tokens=100)
|
||||
|
||||
# Iterator streaming (for custom processing)
|
||||
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
|
||||
|
||||
generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=100)
|
||||
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
||||
thread.start()
|
||||
|
||||
for text in streamer:
|
||||
print(text, end="", flush=True)
|
||||
|
||||
thread.join()
|
||||
```
|
||||
|
||||
## Advanced Techniques
|
||||
|
||||
### Contrastive Search
|
||||
|
||||
Balance coherence and diversity using contrastive objective:
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
penalty_alpha=0.6, # Contrastive penalty
|
||||
top_k=4, # Consider top-4 tokens
|
||||
)
|
||||
```
|
||||
|
||||
**When to use:**
|
||||
- Open-ended text generation
|
||||
- Reduces repetition without sacrificing coherence
|
||||
- Good alternative to sampling
|
||||
|
||||
### Diverse Beam Search
|
||||
|
||||
Generate multiple diverse outputs:
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
num_beams=10,
|
||||
num_beam_groups=5, # 5 groups of 2 beams each
|
||||
diversity_penalty=1.0, # Penalty for similar beams
|
||||
num_return_sequences=5, # Return 5 diverse outputs
|
||||
)
|
||||
```
|
||||
|
||||
### Constrained Beam Search
|
||||
|
||||
Force output to include specific phrases:
|
||||
|
||||
```python
|
||||
from transformers import PhrasalConstraint
|
||||
|
||||
constraints = [
|
||||
PhrasalConstraint(
|
||||
tokenizer("machine learning", add_special_tokens=False).input_ids
|
||||
),
|
||||
]
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
constraints=constraints,
|
||||
num_beams=10, # Requires beam search
|
||||
)
|
||||
```
|
||||
|
||||
## Speculative Decoding
|
||||
|
||||
Accelerate generation using a smaller draft model:
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
# Load main and assistant models
|
||||
model = AutoModelForCausalLM.from_pretrained("large-model")
|
||||
assistant_model = AutoModelForCausalLM.from_pretrained("small-model")
|
||||
|
||||
# Generate with speculative decoding
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
assistant_model=assistant_model,
|
||||
do_sample=True,
|
||||
temperature=0.8,
|
||||
)
|
||||
```
|
||||
|
||||
**Benefits:**
|
||||
- 2-3x faster generation
|
||||
- Identical output distribution to regular generation
|
||||
- Works with sampling and greedy decoding
|
||||
|
||||
## Recipe: Recommended Settings by Task
|
||||
|
||||
### Creative Writing / Dialogue
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=200,
|
||||
temperature=0.9,
|
||||
top_p=0.95,
|
||||
top_k=50,
|
||||
repetition_penalty=1.2,
|
||||
no_repeat_ngram_size=3,
|
||||
)
|
||||
```
|
||||
|
||||
### Translation / Summarization
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
num_beams=5,
|
||||
max_new_tokens=150,
|
||||
early_stopping=True,
|
||||
length_penalty=1.0,
|
||||
no_repeat_ngram_size=2,
|
||||
)
|
||||
```
|
||||
|
||||
### Code Generation
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=300,
|
||||
temperature=0.2, # Low temperature for correctness
|
||||
top_p=0.95,
|
||||
do_sample=True,
|
||||
)
|
||||
```
|
||||
|
||||
### Chatbot / Instruction Following
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=256,
|
||||
temperature=0.7,
|
||||
top_p=0.9,
|
||||
repetition_penalty=1.15,
|
||||
)
|
||||
```
|
||||
|
||||
### Factual QA / Information Extraction
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
num_beams=3,
|
||||
early_stopping=True,
|
||||
# Or greedy for very short answers:
|
||||
# (no special parameters needed)
|
||||
)
|
||||
```
|
||||
|
||||
## Debugging Generation
|
||||
|
||||
### Check Token Probabilities
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=20,
|
||||
output_scores=True, # Return generation scores
|
||||
return_dict_in_generate=True, # Return as dict
|
||||
)
|
||||
|
||||
# Access generation scores
|
||||
scores = outputs.scores # Tuple of tensors (seq_len, vocab_size)
|
||||
|
||||
# Get token probabilities
|
||||
import torch
|
||||
probs = torch.softmax(scores[0], dim=-1)
|
||||
```
|
||||
|
||||
### Monitor Generation Process
|
||||
|
||||
```python
|
||||
from transformers import LogitsProcessor, LogitsProcessorList
|
||||
|
||||
class DebugLogitsProcessor(LogitsProcessor):
|
||||
def __call__(self, input_ids, scores):
|
||||
# Print top 5 tokens at each step
|
||||
top_tokens = scores[0].topk(5)
|
||||
print(f"Top 5 tokens: {top_tokens}")
|
||||
return scores
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=10,
|
||||
logits_processor=LogitsProcessorList([DebugLogitsProcessor()]),
|
||||
)
|
||||
```
|
||||
|
||||
## Common Issues and Solutions
|
||||
|
||||
**Issue: Repetitive output**
|
||||
- Solution: Increase `repetition_penalty` (1.2-1.5), set `no_repeat_ngram_size=3`
|
||||
- For sampling: Increase `temperature`, enable `top_p`
|
||||
|
||||
**Issue: Incoherent output**
|
||||
- Solution: Lower `temperature` (0.5-0.8), use beam search
|
||||
- Set `top_k=50` or `top_p=0.9` to filter unlikely tokens
|
||||
|
||||
**Issue: Too short output**
|
||||
- Solution: Increase `min_length`, set `length_penalty > 1.0` (beam search)
|
||||
- Check if EOS token is being generated early
|
||||
|
||||
**Issue: Too slow generation**
|
||||
- Solution: Use greedy instead of beam search
|
||||
- Reduce `num_beams`
|
||||
- Try speculative decoding with assistant model
|
||||
- Use smaller model variant
|
||||
|
||||
**Issue: Output doesn't follow format**
|
||||
- Solution: Use constrained beam search
|
||||
- Add format examples to prompt
|
||||
- Use `bad_words_ids` to prevent format-breaking tokens
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
```python
|
||||
# Use half precision
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"model-name",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
# Use KV cache optimization (default, but can be disabled)
|
||||
outputs = model.generate(**inputs, use_cache=True)
|
||||
|
||||
# Batch generation
|
||||
inputs = tokenizer(["Prompt 1", "Prompt 2"], return_tensors="pt", padding=True)
|
||||
outputs = model.generate(**inputs, max_new_tokens=50)
|
||||
|
||||
# Static cache for longer sequences (if supported)
|
||||
outputs = model.generate(**inputs, cache_implementation="static")
|
||||
```
|
||||
|
||||
This guide covers the main generation strategies. For task-specific examples, see `task_patterns.md`.
|
||||
504
scientific-packages/transformers/references/quantization.md
Normal file
504
scientific-packages/transformers/references/quantization.md
Normal file
@@ -0,0 +1,504 @@
|
||||
# Model Quantization Guide
|
||||
|
||||
Comprehensive guide to reducing model memory footprint through quantization while maintaining accuracy.
|
||||
|
||||
## Overview
|
||||
|
||||
Quantization reduces memory requirements by storing model weights in lower precision formats (int8, int4) instead of full precision (float32). This enables:
|
||||
- Running larger models on limited hardware
|
||||
- Faster inference (reduced memory bandwidth)
|
||||
- Lower deployment costs
|
||||
- Enabling fine-tuning of models that wouldn't fit in memory
|
||||
|
||||
**Tradeoffs:**
|
||||
- Slight accuracy loss (typically < 1-2%)
|
||||
- Initial quantization overhead
|
||||
- Some methods require calibration data
|
||||
|
||||
## Quick Comparison
|
||||
|
||||
| Method | Precision | Speed | Accuracy | Fine-tuning | Hardware | Setup |
|
||||
|--------|-----------|-------|----------|-------------|----------|-------|
|
||||
| **Bitsandbytes** | 4/8-bit | Fast | High | Yes (PEFT) | CUDA, CPU | Easy |
|
||||
| **GPTQ** | 2-8-bit | Very Fast | High | Limited | CUDA, ROCm, Metal | Medium |
|
||||
| **AWQ** | 4-bit | Very Fast | High | Yes (PEFT) | CUDA, ROCm | Medium |
|
||||
| **GGUF** | 1-8-bit | Medium | Variable | No | CPU-optimized | Easy |
|
||||
| **HQQ** | 1-8-bit | Fast | High | Yes | Multi-platform | Medium |
|
||||
|
||||
## Bitsandbytes (BnB)
|
||||
|
||||
On-the-fly quantization with excellent PEFT fine-tuning support.
|
||||
|
||||
### 8-bit Quantization
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
load_in_8bit=True, # Enable 8-bit quantization
|
||||
device_map="auto", # Automatic device placement
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
|
||||
# Use normally
|
||||
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
|
||||
outputs = model.generate(**inputs, max_new_tokens=50)
|
||||
```
|
||||
|
||||
**Memory Savings:**
|
||||
- 7B model: ~14GB → ~7GB (50% reduction)
|
||||
- 13B model: ~26GB → ~13GB
|
||||
- 70B model: ~140GB → ~70GB
|
||||
|
||||
**Characteristics:**
|
||||
- Fast inference
|
||||
- Minimal accuracy loss
|
||||
- Works with PEFT (LoRA, QLoRA)
|
||||
- Supports CPU and CUDA GPUs
|
||||
|
||||
### 4-bit Quantization (QLoRA)
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
|
||||
import torch
|
||||
|
||||
# Configure 4-bit quantization
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True, # Enable 4-bit quantization
|
||||
bnb_4bit_quant_type="nf4", # Quantization type ("nf4" or "fp4")
|
||||
bnb_4bit_compute_dtype=torch.float16, # Computation dtype
|
||||
bnb_4bit_use_double_quant=True, # Nested quantization for more savings
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=bnb_config,
|
||||
device_map="auto",
|
||||
)
|
||||
```
|
||||
|
||||
**Memory Savings:**
|
||||
- 7B model: ~14GB → ~4GB (70% reduction)
|
||||
- 13B model: ~26GB → ~7GB
|
||||
- 70B model: ~140GB → ~35GB
|
||||
|
||||
**Quantization Types:**
|
||||
- `nf4`: Normal Float 4 (recommended, better quality)
|
||||
- `fp4`: Floating Point 4 (slightly more memory efficient)
|
||||
|
||||
**Compute Dtype:**
|
||||
```python
|
||||
# For better quality
|
||||
bnb_4bit_compute_dtype=torch.float16
|
||||
|
||||
# For best performance on Ampere+ GPUs
|
||||
bnb_4bit_compute_dtype=torch.bfloat16
|
||||
```
|
||||
|
||||
**Double Quantization:**
|
||||
```python
|
||||
# Enable for additional ~0.4 bits/param savings
|
||||
bnb_4bit_use_double_quant=True # Quantize the quantization constants
|
||||
```
|
||||
|
||||
### Fine-tuning with QLoRA
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
|
||||
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
||||
import torch
|
||||
|
||||
# Load quantized model
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.float16,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=bnb_config,
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
# Prepare for training
|
||||
model = prepare_model_for_kbit_training(model)
|
||||
|
||||
# Configure LoRA
|
||||
lora_config = LoraConfig(
|
||||
r=16,
|
||||
lora_alpha=32,
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
|
||||
lora_dropout=0.05,
|
||||
bias="none",
|
||||
task_type="CAUSAL_LM"
|
||||
)
|
||||
|
||||
model = get_peft_model(model, lora_config)
|
||||
|
||||
# Train normally
|
||||
trainer = Trainer(model=model, args=training_args, ...)
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
## GPTQ
|
||||
|
||||
Post-training quantization requiring calibration, optimized for inference speed.
|
||||
|
||||
### Loading GPTQ Models
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
|
||||
|
||||
# Load pre-quantized GPTQ model
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"TheBloke/Llama-2-7B-GPTQ", # Pre-quantized model
|
||||
device_map="auto",
|
||||
revision="gptq-4bit-32g-actorder_True", # Specific quantization config
|
||||
)
|
||||
|
||||
# Or quantize yourself
|
||||
gptq_config = GPTQConfig(
|
||||
bits=4, # 2, 3, 4, 8 bits
|
||||
dataset="c4", # Calibration dataset
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
device_map="auto",
|
||||
quantization_config=gptq_config,
|
||||
)
|
||||
|
||||
# Save quantized model
|
||||
model.save_pretrained("llama-2-7b-gptq")
|
||||
```
|
||||
|
||||
**Configuration Options:**
|
||||
```python
|
||||
gptq_config = GPTQConfig(
|
||||
bits=4, # Quantization bits
|
||||
group_size=128, # Group size for quantization (128, 32, -1)
|
||||
dataset="c4", # Calibration dataset
|
||||
desc_act=False, # Activation order (can improve accuracy)
|
||||
sym=True, # Symmetric quantization
|
||||
damp_percent=0.1, # Dampening factor
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Fastest inference among quantization methods
|
||||
- Requires one-time calibration (slow)
|
||||
- Best when using pre-quantized models from Hub
|
||||
- Limited fine-tuning support
|
||||
- Excellent for production deployment
|
||||
|
||||
## AWQ (Activation-aware Weight Quantization)
|
||||
|
||||
Protects important weights for better quality.
|
||||
|
||||
### Loading AWQ Models
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AwqConfig
|
||||
|
||||
# Load pre-quantized AWQ model
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"TheBloke/Llama-2-7B-AWQ",
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
# Or quantize yourself
|
||||
awq_config = AwqConfig(
|
||||
bits=4, # 4-bit quantization
|
||||
group_size=128, # Quantization group size
|
||||
zero_point=True, # Use zero-point quantization
|
||||
version="GEMM", # Quantization version
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=awq_config,
|
||||
device_map="auto",
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Better accuracy than GPTQ at same bit width
|
||||
- Excellent inference speed
|
||||
- Supports PEFT fine-tuning
|
||||
- Requires calibration data
|
||||
|
||||
### Fine-tuning AWQ Models
|
||||
|
||||
```python
|
||||
from peft import LoraConfig, get_peft_model
|
||||
|
||||
# AWQ models support LoRA fine-tuning
|
||||
lora_config = LoraConfig(
|
||||
r=16,
|
||||
lora_alpha=32,
|
||||
target_modules=["q_proj", "v_proj"],
|
||||
lora_dropout=0.05,
|
||||
task_type="CAUSAL_LM"
|
||||
)
|
||||
|
||||
model = get_peft_model(model, lora_config)
|
||||
trainer = Trainer(model=model, ...)
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
## GGUF (GGML Format)
|
||||
|
||||
CPU-optimized quantization format, popular in llama.cpp ecosystem.
|
||||
|
||||
### Using GGUF Models
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# Load GGUF model
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"TheBloke/Llama-2-7B-GGUF",
|
||||
gguf_file="llama-2-7b.Q4_K_M.gguf", # Specific quantization file
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(
    "TheBloke/Llama-2-7B-GGUF",
    gguf_file="llama-2-7b.Q4_K_M.gguf",  # Required when loading from a GGUF repo
)
|
||||
```
|
||||
|
||||
**GGUF Quantization Types:**
|
||||
- `Q4_0`: 4-bit, smallest, lowest quality
|
||||
- `Q4_K_M`: 4-bit, medium quality (recommended)
|
||||
- `Q5_K_M`: 5-bit, good quality
|
||||
- `Q6_K`: 6-bit, high quality
|
||||
- `Q8_0`: 8-bit, very high quality
|
||||
|
||||
**Characteristics:**
|
||||
- Optimized for CPU inference
|
||||
- Wide range of bit depths (1-8)
|
||||
- Good for Apple Silicon (M1/M2)
|
||||
- No fine-tuning support
|
||||
- Excellent for local/edge deployment
|
||||
|
||||
## HQQ (Half-Quadratic Quantization)
|
||||
|
||||
Flexible quantization with good accuracy retention.
|
||||
|
||||
### Using HQQ
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, HqqConfig
|
||||
|
||||
hqq_config = HqqConfig(
|
||||
nbits=4, # Quantization bits
|
||||
group_size=64, # Group size
|
||||
quant_zero=False, # Quantize zero point
|
||||
quant_scale=False, # Quantize scale
|
||||
axis=0, # Quantization axis
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=hqq_config,
|
||||
device_map="auto",
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Very fast quantization
|
||||
- No calibration data needed
|
||||
- Support for 1-8 bits
|
||||
- Can serialize/deserialize
|
||||
- Good accuracy vs size tradeoff
|
||||
|
||||
## Choosing a Quantization Method
|
||||
|
||||
### Decision Tree
|
||||
|
||||
**For inference only:**
|
||||
1. Need fastest inference? → **GPTQ or AWQ** (use pre-quantized models)
|
||||
2. CPU-only deployment? → **GGUF**
|
||||
3. Want easiest setup? → **Bitsandbytes 8-bit**
|
||||
4. Need extreme compression? → **GGUF Q4_0 or HQQ 2-bit**
|
||||
|
||||
**For fine-tuning:**
|
||||
1. Limited VRAM? → **QLoRA (BnB 4-bit + LoRA)**
|
||||
2. Want best accuracy? → **Bitsandbytes 8-bit + LoRA**
|
||||
3. Need very large models? → **QLoRA with double quantization**
|
||||
|
||||
**For production:**
|
||||
1. Latency-critical? → **GPTQ or AWQ**
|
||||
2. Cost-optimized? → **Bitsandbytes 8-bit**
|
||||
3. CPU deployment? → **GGUF**
|
||||
|
||||
## Memory Requirements
|
||||
|
||||
Approximate memory for Llama-2 7B model:
|
||||
|
||||
| Method | Memory | vs FP16 |
|
||||
|--------|--------|---------|
|
||||
| FP32 | 28GB | 2x |
|
||||
| FP16 / BF16 | 14GB | 1x |
|
||||
| 8-bit (BnB) | 7GB | 0.5x |
|
||||
| 4-bit (QLoRA) | 3.5GB | 0.25x |
|
||||
| 4-bit Double Quant | 3GB | 0.21x |
|
||||
| GPTQ 4-bit | 4GB | 0.29x |
|
||||
| AWQ 4-bit | 4GB | 0.29x |
|
||||
|
||||
**Note:** Add ~1-2GB for inference activations, KV cache, and framework overhead.
|
||||
|
||||
## Best Practices
|
||||
|
||||
### For Training
|
||||
|
||||
```python
|
||||
# QLoRA recommended configuration
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.bfloat16, # BF16 if available
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
# LoRA configuration
|
||||
lora_config = LoraConfig(
|
||||
r=16, # Rank (8, 16, 32, 64)
|
||||
lora_alpha=32, # Scaling (typically 2*r)
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
|
||||
lora_dropout=0.05,
|
||||
bias="none",
|
||||
task_type="CAUSAL_LM"
|
||||
)
|
||||
```
|
||||
|
||||
### For Inference
|
||||
|
||||
```python
|
||||
# High-speed inference
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"TheBloke/Llama-2-7B-GPTQ",
|
||||
device_map="auto",
|
||||
torch_dtype=torch.float16, # Use FP16 for activations
|
||||
)
|
||||
|
||||
# Balanced quality/speed
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
load_in_8bit=True,
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
# Maximum compression
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.float16,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
),
|
||||
device_map="auto",
|
||||
)
|
||||
```
|
||||
|
||||
### Multi-GPU Setups
|
||||
|
||||
```python
|
||||
# Automatically distribute across GPUs
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-70b-hf",
|
||||
load_in_4bit=True,
|
||||
device_map="auto", # Automatic distribution
|
||||
max_memory={0: "20GB", 1: "20GB"}, # Optional: limit per GPU
|
||||
)
|
||||
|
||||
# Manual device map
|
||||
device_map = {
|
||||
"model.embed_tokens": 0,
|
||||
"model.layers.0": 0,
|
||||
"model.layers.1": 0,
|
||||
# ... distribute layers ...
|
||||
"model.norm": 1,
|
||||
"lm_head": 1,
|
||||
}
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-70b-hf",
|
||||
load_in_4bit=True,
|
||||
device_map=device_map,
|
||||
)
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Issue: OOM during quantization**
|
||||
```python
|
||||
# Solution: Use low_cpu_mem_usage
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"model-name",
|
||||
quantization_config=config,
|
||||
device_map="auto",
|
||||
low_cpu_mem_usage=True, # Reduce CPU memory during loading
|
||||
)
|
||||
```
|
||||
|
||||
**Issue: Slow quantization**
|
||||
```python
|
||||
# GPTQ/AWQ take time to calibrate
|
||||
# Solution: Use pre-quantized models from Hub
|
||||
model = AutoModelForCausalLM.from_pretrained("TheBloke/Model-GPTQ")
|
||||
|
||||
# Or use BnB for instant quantization
|
||||
model = AutoModelForCausalLM.from_pretrained("model-name", load_in_4bit=True)
|
||||
```
|
||||
|
||||
**Issue: Poor quality after quantization**
|
||||
```python
|
||||
# Try different quantization types
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4", # Try "nf4" instead of "fp4"
|
||||
bnb_4bit_compute_dtype=torch.bfloat16, # Use BF16 if available
|
||||
)
|
||||
|
||||
# Or use 8-bit instead of 4-bit
|
||||
model = AutoModelForCausalLM.from_pretrained("model-name", load_in_8bit=True)
|
||||
```
|
||||
|
||||
**Issue: Can't fine-tune quantized model**
|
||||
```python
|
||||
# Ensure using compatible quantization method
|
||||
from peft import prepare_model_for_kbit_training
|
||||
|
||||
model = prepare_model_for_kbit_training(model)
|
||||
|
||||
# Only BnB and AWQ support PEFT fine-tuning
|
||||
# GPTQ has limited support, GGUF doesn't support fine-tuning
|
||||
```
|
||||
|
||||
## Performance Benchmarks
|
||||
|
||||
Approximate generation speed (tokens/sec) for Llama-2 7B on A100 40GB:
|
||||
|
||||
| Method | Speed | Memory |
|
||||
|--------|-------|--------|
|
||||
| FP16 | 100 tok/s | 14GB |
|
||||
| 8-bit | 90 tok/s | 7GB |
|
||||
| 4-bit QLoRA | 70 tok/s | 4GB |
|
||||
| GPTQ 4-bit | 95 tok/s | 4GB |
|
||||
| AWQ 4-bit | 95 tok/s | 4GB |
|
||||
|
||||
**Note:** Actual performance varies by hardware, sequence length, and batch size.
|
||||
|
||||
## Resources
|
||||
|
||||
- **Pre-quantized models:** Search "GPTQ" or "AWQ" on Hugging Face Hub
|
||||
- **BnB documentation:** https://github.com/TimDettmers/bitsandbytes
|
||||
- **PEFT library:** https://github.com/huggingface/peft
|
||||
- **QLoRA paper:** https://arxiv.org/abs/2305.14314
|
||||
|
||||
For task-specific quantization examples, see `training_guide.md`.
|
||||
610
scientific-packages/transformers/references/task_patterns.md
Normal file
610
scientific-packages/transformers/references/task_patterns.md
Normal file
@@ -0,0 +1,610 @@
|
||||
# Task-Specific Patterns
|
||||
|
||||
Quick reference for implementing common tasks with Transformers. Each pattern includes the complete workflow from data loading to inference.
|
||||
|
||||
## Text Classification
|
||||
|
||||
Classify text into predefined categories (sentiment, topic, intent, etc.).
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoTokenizer, AutoModelForSequenceClassification,
|
||||
TrainingArguments, Trainer, DataCollatorWithPadding
|
||||
)
|
||||
from datasets import load_dataset
|
||||
|
||||
# 1. Load data
|
||||
dataset = load_dataset("imdb")
|
||||
|
||||
# 2. Preprocess
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
def preprocess(examples):
|
||||
return tokenizer(examples["text"], truncation=True, max_length=512)
|
||||
|
||||
tokenized = dataset.map(preprocess, batched=True)
|
||||
|
||||
# 3. Model
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
"bert-base-uncased",
|
||||
num_labels=2,
|
||||
id2label={0: "negative", 1: "positive"},
|
||||
label2id={"negative": 0, "positive": 1}
|
||||
)
|
||||
|
||||
# 4. Train
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=16,
|
||||
num_train_epochs=3,
|
||||
eval_strategy="epoch",
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized["train"],
|
||||
eval_dataset=tokenized["test"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# 5. Inference
|
||||
text = "This movie was fantastic!"
|
||||
inputs = tokenizer(text, return_tensors="pt")
|
||||
outputs = model(**inputs)
|
||||
predictions = outputs.logits.argmax(-1)
|
||||
print(model.config.id2label[predictions.item()]) # "positive"
|
||||
```
|
||||
|
||||
## Token Classification (NER)
|
||||
|
||||
Label each token in text (named entities, POS tags, etc.).
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
||||
from datasets import load_dataset
|
||||
|
||||
# Load data (tokens and NER tags)
|
||||
dataset = load_dataset("conll2003")
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
|
||||
def tokenize_and_align_labels(examples):
|
||||
tokenized_inputs = tokenizer(
|
||||
examples["tokens"],
|
||||
truncation=True,
|
||||
is_split_into_words=True
|
||||
)
|
||||
|
||||
labels = []
|
||||
for i, label in enumerate(examples["ner_tags"]):
|
||||
word_ids = tokenized_inputs.word_ids(batch_index=i)
|
||||
label_ids = []
|
||||
previous_word_idx = None
|
||||
for word_idx in word_ids:
|
||||
if word_idx is None:
|
||||
label_ids.append(-100) # Special tokens
|
||||
elif word_idx != previous_word_idx:
|
||||
label_ids.append(label[word_idx])
|
||||
else:
|
||||
label_ids.append(-100) # Subword tokens
|
||||
previous_word_idx = word_idx
|
||||
labels.append(label_ids)
|
||||
|
||||
tokenized_inputs["labels"] = labels
|
||||
return tokenized_inputs
|
||||
|
||||
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
|
||||
|
||||
# Model
|
||||
label_list = dataset["train"].features["ner_tags"].feature.names
|
||||
model = AutoModelForTokenClassification.from_pretrained(
|
||||
"bert-base-cased",
|
||||
num_labels=len(label_list),
|
||||
id2label={i: label for i, label in enumerate(label_list)},
|
||||
label2id={label: i for i, label in enumerate(label_list)}
|
||||
)
|
||||
|
||||
# Training similar to classification
|
||||
# ... (use Trainer with DataCollatorForTokenClassification)
|
||||
```
|
||||
|
||||
## Question Answering (Extractive)
|
||||
|
||||
Extract answer spans from context.
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
|
||||
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
|
||||
|
||||
question = "What is the capital of France?"
|
||||
context = "Paris is the capital and most populous city of France."
|
||||
|
||||
inputs = tokenizer(question, context, return_tensors="pt")
|
||||
outputs = model(**inputs)
|
||||
|
||||
# Get answer span
|
||||
answer_start = outputs.start_logits.argmax()
|
||||
answer_end = outputs.end_logits.argmax() + 1
|
||||
answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])
|
||||
print(answer) # "Paris"
|
||||
```
|
||||
|
||||
## Text Generation
|
||||
|
||||
Generate text continuations.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
|
||||
prompt = "In the future, artificial intelligence will"
|
||||
inputs = tokenizer(prompt, return_tensors="pt")
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=100,
|
||||
do_sample=True,
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
repetition_penalty=1.2,
|
||||
)
|
||||
|
||||
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
print(generated_text)
|
||||
```
|
||||
|
||||
## Summarization
|
||||
|
||||
Condense long text into summaries.
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoTokenizer, AutoModelForSeq2SeqLM,
|
||||
Seq2SeqTrainingArguments, Seq2SeqTrainer,
|
||||
DataCollatorForSeq2Seq
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("t5-small")
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
|
||||
|
||||
def preprocess(examples):
|
||||
inputs = ["summarize: " + doc for doc in examples["document"]]
|
||||
model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
|
||||
|
||||
labels = tokenizer(
|
||||
examples["summary"],
|
||||
max_length=128,
|
||||
truncation=True
|
||||
)
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
return model_inputs
|
||||
|
||||
tokenized_dataset = dataset.map(preprocess, batched=True)
|
||||
|
||||
# Training
|
||||
training_args = Seq2SeqTrainingArguments(
|
||||
output_dir="./results",
|
||||
predict_with_generate=True, # Important for seq2seq
|
||||
eval_strategy="epoch",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=8,
|
||||
num_train_epochs=3,
|
||||
)
|
||||
|
||||
trainer = Seq2SeqTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized_dataset["train"],
|
||||
eval_dataset=tokenized_dataset["validation"],
|
||||
tokenizer=tokenizer,
|
||||
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
# Inference
|
||||
text = "Long article text here..."
|
||||
inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
|
||||
outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
|
||||
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
```
|
||||
|
||||
## Translation
|
||||
|
||||
Translate text between languages.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
|
||||
result = translator("Hello, how are you?")
|
||||
print(result[0]["translation_text"]) # "Bonjour, comment allez-vous?"
|
||||
|
||||
# For fine-tuning, similar to summarization with Seq2SeqTrainer
|
||||
```
|
||||
|
||||
## Image Classification

Classify images into categories.

```python
from transformers import (
    AutoImageProcessor, AutoModelForImageClassification,
    TrainingArguments, Trainer
)
from datasets import load_dataset
from PIL import Image

# Load data
dataset = load_dataset("food101", split="train[:1000]")

# Preprocess
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

def transform(examples):
    examples["pixel_values"] = [
        processor(img.convert("RGB"), return_tensors="pt")["pixel_values"][0]
        for img in examples["image"]
    ]
    return examples

dataset = dataset.with_transform(transform)

# Model
model = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=101,
    ignore_mismatched_sizes=True
)

# Training
training_args = TrainingArguments(
    output_dir="./results",
    remove_unused_columns=False,  # Keep image data
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor,
)

trainer.train()

# Inference
image = Image.open("food.jpg")
inputs = processor(image, return_tensors="pt")
outputs = model(**inputs)
predicted_class = outputs.logits.argmax(-1).item()
print(model.config.id2label[predicted_class])
```

## Object Detection

Detect and localize objects in images.

```python
from transformers import pipeline
from PIL import Image

detector = pipeline("object-detection", model="facebook/detr-resnet-50")

image = Image.open("street.jpg")
results = detector(image)

for result in results:
    print(f"{result['label']}: {result['score']:.2f} at {result['box']}")
    # car: 0.98 at {'xmin': 123, 'ymin': 456, 'xmax': 789, 'ymax': 1011}
```

## Image Segmentation

Segment images into regions.

```python
from transformers import pipeline

segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")

image = "path/to/image.jpg"
segments = segmenter(image)

for segment in segments:
    print(f"{segment['label']}: {segment['score']:.2f}")
    # Access mask: segment['mask']
```

## Image Captioning

Generate textual descriptions of images.

```python
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image

processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")

image = Image.open("photo.jpg")
inputs = processor(images=image, return_tensors="pt")

outputs = model.generate(**inputs, max_length=50)
caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]
print(caption)  # "a dog sitting on grass"
```

## Speech Recognition (ASR)

Transcribe speech to text.

```python
from transformers import pipeline

transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base"
)

result = transcriber("audio.mp3")
print(result["text"])  # "Hello, this is a test."

# With timestamps
result = transcriber("audio.mp3", return_timestamps=True)
for chunk in result["chunks"]:
    print(f"[{chunk['timestamp'][0]:.1f}s - {chunk['timestamp'][1]:.1f}s]: {chunk['text']}")
```

## Text-to-Speech

Generate speech from text.

```python
from transformers import pipeline

synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")

result = synthesizer("Hello, how are you today?")
# result["audio"] contains the waveform
# result["sampling_rate"] contains the sample rate

# Save audio
import scipy
scipy.io.wavfile.write("output.wav", result["sampling_rate"], result["audio"][0])
```

## Visual Question Answering

Answer questions about images.

```python
from transformers import pipeline
from PIL import Image

vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")

image = Image.open("photo.jpg")
question = "What color is the car?"

result = vqa(image=image, question=question)
print(result[0]["answer"])  # "red"
```

## Document Question Answering

Extract information from documents (PDFs, images with text).

```python
from transformers import pipeline

doc_qa = pipeline("document-question-answering", model="impira/layoutlm-document-qa")

result = doc_qa(
    image="invoice.png",
    question="What is the total amount?"
)

print(result["answer"])  # "$1,234.56"
```

## Zero-Shot Classification

Classify without training data.

```python
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

text = "This is a delicious Italian restaurant with great pasta."
candidate_labels = ["food", "travel", "technology", "sports"]

result = classifier(text, candidate_labels)
print(result["labels"][0])  # "food"
print(result["scores"][0])  # 0.95
```

## Few-Shot Learning with LLMs

Use large language models for few-shot tasks.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Few-shot prompt
prompt = """
Classify the sentiment: positive, negative, or neutral.

Text: "I love this product!"
Sentiment: positive

Text: "This is terrible."
Sentiment: negative

Text: "It's okay, nothing special."
Sentiment: neutral

Text: "Best purchase ever!"
Sentiment:"""

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=5, temperature=0.1)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response.split("Sentiment:")[-1].strip())  # "positive"
```

## Instruction-Following / Chat

Use instruction-tuned models.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is machine learning?"},
]

formatted = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

inputs = tokenizer(formatted, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=200, temperature=0.7)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract assistant response
assistant_response = response.split("[/INST]")[-1].strip()
print(assistant_response)
```

## Embeddings / Semantic Search

Generate embeddings for semantic similarity.

```python
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Mean pooling
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Get embeddings
text1 = "Machine learning is a subset of AI"
text2 = "AI includes machine learning"

emb1 = get_embedding(text1)
emb2 = get_embedding(text2)

# Compute similarity
similarity = torch.nn.functional.cosine_similarity(emb1, emb2)
print(f"Similarity: {similarity.item():.4f}")  # ~0.85
```

## Multimodal Understanding (CLIP)

Connect vision and language.

```python
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("photo.jpg")
texts = ["a dog", "a cat", "a car", "a house"]

inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# Get similarity scores
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)

for text, prob in zip(texts, probs[0]):
    print(f"{text}: {prob.item():.4f}")
```

## Common Evaluation Metrics

```python
# NOTE: `datasets.load_metric` was deprecated and removed; use the `evaluate` library.
import evaluate

# Accuracy (classification)
metric = evaluate.load("accuracy")
predictions = [0, 1, 1, 0]
references = [0, 1, 0, 0]
result = metric.compute(predictions=predictions, references=references)

# F1 Score (classification, NER)
metric = evaluate.load("f1")
result = metric.compute(predictions=predictions, references=references)

# BLEU (translation)
metric = evaluate.load("bleu")
predictions = ["hello there general kenobi"]
references = [["hello there general kenobi", "hello there!"]]
result = metric.compute(predictions=predictions, references=references)

# ROUGE (summarization)
metric = evaluate.load("rouge")
predictions = ["summary text"]
references = ["reference summary"]
result = metric.compute(predictions=predictions, references=references)
```

## Common Data Collators

```python
from transformers import (
    DataCollatorWithPadding,
    DataCollatorForTokenClassification,
    DataCollatorForSeq2Seq,
    DataCollatorForLanguageModeling,
)

# Classification: dynamic padding
DataCollatorWithPadding(tokenizer=tokenizer)

# NER: pad labels too
DataCollatorForTokenClassification(tokenizer=tokenizer)

# Seq2Seq: pad inputs and labels
DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Language modeling: create MLM masks
DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
```

This covers the most common task patterns. For detailed parameter tuning, see `api_reference.md` and `generation_strategies.md`.