From 11da596765e6d22ac1a78104953dbc5c8be2d1f7 Mon Sep 17 00:00:00 2001 From: Timothy Kassis Date: Tue, 21 Oct 2025 10:30:38 -0700 Subject: [PATCH] Update Huggingface Transformer --- .claude-plugin/marketplace.json | 2 +- docs/scientific-packages.md | 2 +- scientific-packages/transformers/SKILL.md | 965 ++++------------- .../transformers/references/api_reference.md | 986 +++++++----------- .../references/generation_strategies.md | 659 +++++------- .../transformers/references/pipelines.md | 234 +++++ .../transformers/references/quantization.md | 504 --------- .../transformers/references/task_patterns.md | 941 +++++++++-------- .../transformers/references/training.md | 328 ++++++ .../scripts/fine_tune_classifier.py | 337 +++--- .../transformers/scripts/generate_text.py | 309 +++--- .../transformers/scripts/quick_inference.py | 209 ++-- 12 files changed, 2328 insertions(+), 3148 deletions(-) create mode 100644 scientific-packages/transformers/references/pipelines.md delete mode 100644 scientific-packages/transformers/references/quantization.md create mode 100644 scientific-packages/transformers/references/training.md mode change 100755 => 100644 scientific-packages/transformers/scripts/fine_tune_classifier.py mode change 100755 => 100644 scientific-packages/transformers/scripts/generate_text.py mode change 100755 => 100644 scientific-packages/transformers/scripts/quick_inference.py diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index f68789c..927fd1d 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -7,7 +7,7 @@ }, "metadata": { "description": "Claude scientific skills from K-Dense Inc", - "version": "1.37.0" + "version": "1.38.0" }, "plugins": [ { diff --git a/docs/scientific-packages.md b/docs/scientific-packages.md index 2902a02..dc77903 100644 --- a/docs/scientific-packages.md +++ b/docs/scientific-packages.md @@ -32,7 +32,7 @@ - **scikit-learn** - Machine learning algorithms, preprocessing, and model selection - **statsmodels** - Statistical modeling and econometrics (OLS, GLM, logit/probit, ARIMA, time series forecasting, hypothesis testing, diagnostics) - **Torch Geometric** - Graph Neural Networks for molecular and geometric data -- **Transformers** - Hugging Face transformers for NLU, image classification, and generation +- **Transformers** - State-of-the-art machine learning models for NLP, computer vision, audio, and multimodal tasks. Provides 1M+ pre-trained models accessible via pipelines (text-classification, NER, QA, summarization, translation, text-generation, image-classification, object-detection, ASR, VQA), comprehensive training via Trainer API with distributed training and mixed precision, flexible text generation with multiple decoding strategies (greedy, beam search, sampling), and Auto classes for automatic architecture selection (BERT, GPT, T5, ViT, BART, etc.) - **UMAP-learn** - Dimensionality reduction and manifold learning ## Materials Science & Chemistry diff --git a/scientific-packages/transformers/SKILL.md b/scientific-packages/transformers/SKILL.md index ef50437..a5fdf15 100644 --- a/scientific-packages/transformers/SKILL.md +++ b/scientific-packages/transformers/SKILL.md @@ -1,860 +1,351 @@ --- name: transformers -description: "Hugging Face Transformers. Load BERT, GPT, T5, ViT, CLIP, Llama models, fine-tune, text generation, classification, NER, pipelines, LoRA, for NLP/vision/audio tasks." 
+description: Work with state-of-the-art machine learning models for NLP, computer vision, audio, and multimodal tasks using HuggingFace Transformers. This skill should be used when fine-tuning pre-trained models, performing inference with pipelines, generating text, training sequence models, or working with BERT, GPT, T5, ViT, and other transformer architectures. Covers model loading, tokenization, training with Trainer API, text generation strategies, and task-specific patterns for classification, NER, QA, summarization, translation, and image tasks. (plugin:scientific-packages@claude-scientific-skills) --- # Transformers ## Overview -Transformers is Hugging Face's flagship library providing unified access to over 1 million pretrained models for machine learning across text, vision, audio, and multimodal domains. The library serves as a standardized model-definition framework compatible with PyTorch, TensorFlow, and JAX, emphasizing ease of use through three core components: +The Transformers library provides state-of-the-art machine learning models for natural language processing (NLP), computer vision, audio processing, and multimodal tasks. It offers over 1 million pre-trained model checkpoints and supports quick inference through pipelines, comprehensive training via the Trainer API, and flexible text generation with various decoding strategies. -- **Pipeline**: Simple, optimized inference API for common tasks -- **AutoClasses**: Automatic model/tokenizer selection from pretrained checkpoints -- **Trainer**: Full-featured training loop with distributed training, mixed precision, and optimization +This skill provides comprehensive guidance on working with Transformers across all major task types and modalities. -The library prioritizes accessibility with pretrained models that reduce computational costs and carbon footprint while providing compatibility across major training frameworks (PyTorch-Lightning, DeepSpeed, vLLM, etc.). +## Core Capabilities -## Quick Start with Pipelines +### 1. Quick Inference with Pipelines -Use pipelines for simple, efficient inference without managing models, tokenizers, or preprocessing manually. Pipelines abstract complexity into a single function call. - -### Basic Pipeline Usage +For rapid inference without complex setup, use the `pipeline()` API. Pipelines abstract away tokenization, model invocation, and post-processing. 
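+
+The task snippets below use each task's default checkpoint. A pipeline can also be pinned to a specific model, device, and batch size; a minimal sketch (the checkpoint name, GPU index, and batch size are illustrative assumptions — use `device=-1` or omit `device` to stay on CPU):
+
+```python
+from transformers import pipeline
+
+# Pin an explicit checkpoint and run on GPU 0
+classifier = pipeline(
+    "text-classification",
+    model="distilbert-base-uncased-finetuned-sst-2-english",
+    device=0,
+)
+
+# Passing a list of inputs batches them internally
+results = classifier(
+    ["Great service!", "Terrible experience.", "It was okay."],
+    batch_size=8,
+)
+```
+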
```python from transformers import pipeline # Text classification classifier = pipeline("text-classification") -result = classifier("This restaurant is awesome") -# [{'label': 'POSITIVE', 'score': 0.9998}] +result = classifier("This product is amazing!") -# Text generation -generator = pipeline("text-generation", model="meta-llama/Llama-2-7b-hf") -generator("The secret to baking a good cake is", max_length=50) +# Named entity recognition +ner = pipeline("token-classification") +entities = ner("Sarah works at Microsoft in Seattle") # Question answering qa = pipeline("question-answering") -qa(question="What is extractive QA?", context="Extractive QA is...") +answer = qa(question="What is the capital?", context="Paris is the capital of France.") + +# Text generation +generator = pipeline("text-generation", model="gpt2") +text = generator("Once upon a time", max_length=50) # Image classification -img_classifier = pipeline("image-classification") -img_classifier("path/to/image.jpg") - -# Automatic speech recognition -transcriber = pipeline("automatic-speech-recognition") -transcriber("audio_file.mp3") +image_classifier = pipeline("image-classification") +predictions = image_classifier("image.jpg") ``` -### Available Pipeline Tasks +**When to use pipelines:** +- Quick prototyping and testing +- Simple inference tasks without custom logic +- Demonstrations and examples +- Production inference for standard tasks -**NLP Tasks:** -- `text-classification`, `token-classification`, `question-answering` -- `fill-mask`, `summarization`, `translation` -- `text-generation`, `conversational` -- `zero-shot-classification`, `sentiment-analysis` +**Available pipeline tasks:** +- **NLP**: text-classification, token-classification, question-answering, summarization, translation, text-generation, fill-mask, zero-shot-classification +- **Vision**: image-classification, object-detection, image-segmentation, depth-estimation, zero-shot-image-classification +- **Audio**: automatic-speech-recognition, audio-classification, text-to-audio +- **Multimodal**: image-to-text, visual-question-answering, image-text-to-text -**Vision Tasks:** -- `image-classification`, `image-segmentation`, `object-detection` -- `depth-estimation`, `image-to-image`, `zero-shot-image-classification` +For comprehensive pipeline documentation, see `references/pipelines.md`. -**Audio Tasks:** -- `automatic-speech-recognition`, `audio-classification` -- `text-to-audio`, `zero-shot-audio-classification` +### 2. Model Training and Fine-Tuning -**Multimodal Tasks:** -- `visual-question-answering`, `document-question-answering` -- `image-to-text`, `zero-shot-object-detection` +Use the Trainer API for comprehensive model training with support for distributed training, mixed precision, and advanced optimization. 
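+
+Task metrics are reported through a `compute_metrics` callback passed to the Trainer; the basic workflow below omits it for brevity. A minimal accuracy sketch, assuming the `evaluate` package is installed:
+
+```python
+import numpy as np
+import evaluate
+
+accuracy = evaluate.load("accuracy")
+
+def compute_metrics(eval_pred):
+    # eval_pred is a (logits, labels) pair produced during evaluation
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    return accuracy.compute(predictions=predictions, references=labels)
+
+# Pass it as Trainer(..., compute_metrics=compute_metrics)
+```
+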
-### Pipeline Best Practices - -**Device Management:** -```python -from transformers import pipeline, infer_device - -device = infer_device() # Auto-detect best device -pipe = pipeline("text-generation", model="...", device=device) -``` - -**Batch Processing:** -```python -# Process multiple inputs efficiently -results = classifier(["Text 1", "Text 2", "Text 3"]) - -# Use KeyDataset for large datasets -from transformers.pipelines.pt_utils import KeyDataset -from datasets import load_dataset - -dataset = load_dataset("imdb", split="test") -for result in pipe(KeyDataset(dataset, "text")): - print(result) -``` - -**Memory Optimization:** -```python -# Use half-precision for faster inference -pipe = pipeline("text-generation", model="...", - torch_dtype=torch.float16, device="cuda") -``` - -## Core Components - -### AutoClasses for Model Loading - -AutoClasses automatically select the correct architecture based on pretrained checkpoints. +**Basic training workflow:** ```python from transformers import ( - AutoModel, AutoTokenizer, AutoConfig, - AutoModelForCausalLM, AutoModelForSequenceClassification + AutoTokenizer, + AutoModelForSequenceClassification, + TrainingArguments, + Trainer ) +from datasets import load_dataset -# Load any model by checkpoint name +# 1. Load and tokenize data +dataset = load_dataset("imdb") tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -model = AutoModel.from_pretrained("bert-base-uncased") -# Task-specific model classes -causal_lm = AutoModelForCausalLM.from_pretrained("gpt2") -classifier = AutoModelForSequenceClassification.from_pretrained( +def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) + +tokenized_datasets = dataset.map(tokenize_function, batched=True) + +# 2. Load model +model = AutoModelForSequenceClassification.from_pretrained( "bert-base-uncased", - num_labels=3 + num_labels=2 ) -# Load with device and dtype optimization -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - device_map="auto", # Automatically distribute across devices - torch_dtype="auto" # Use optimal dtype -) -``` - -**Key Parameters:** -- `device_map="auto"`: Optimal device allocation (CPU/GPU/multi-GPU) -- `torch_dtype`: Control precision (torch.float16, torch.bfloat16, "auto") -- `trust_remote_code`: Enable custom model code (use cautiously) -- `use_fast`: Enable Rust-backed fast tokenizers (default True) - -### Tokenization - -Tokenizers convert text to model-compatible tensor inputs. - -```python -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") - -# Basic tokenization -tokens = tokenizer.tokenize("Hello, how are you?") -# ['hello', ',', 'how', 'are', 'you', '?'] - -# Encoding (text → token IDs) -encoded = tokenizer("Hello, how are you?", return_tensors="pt") -# {'input_ids': tensor([[...]], 'attention_mask': tensor([[...]])} - -# Batch encoding with padding and truncation -batch = tokenizer( - ["Short text", "This is a much longer text..."], - padding=True, # Pad to longest in batch - truncation=True, # Truncate to model's max length - max_length=512, - return_tensors="pt" +# 3. 
Configure training +training_args = TrainingArguments( + output_dir="./results", + num_train_epochs=3, + per_device_train_batch_size=16, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, ) -# Decoding (token IDs → text) -text = tokenizer.decode(encoded['input_ids'][0]) -``` - -**Special Tokens:** -```python -# Access special tokens -tokenizer.pad_token # Padding token -tokenizer.cls_token # Classification token -tokenizer.sep_token # Separator token -tokenizer.mask_token # Mask token (for MLM) - -# Add custom tokens -tokenizer.add_tokens(["[CUSTOM]"]) -tokenizer.add_special_tokens({'additional_special_tokens': ['[NEW]']}) - -# Resize model embeddings to match new vocabulary -model.resize_token_embeddings(len(tokenizer)) -``` - -### Image Processors - -For vision tasks, use image processors instead of tokenizers. - -```python -from transformers import AutoImageProcessor - -processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") - -# Process single image -from PIL import Image -image = Image.open("path/to/image.jpg") -inputs = processor(image, return_tensors="pt") -# Returns: {'pixel_values': tensor([[...]])} - -# Batch processing -images = [Image.open(f"img{i}.jpg") for i in range(3)] -inputs = processor(images, return_tensors="pt") -``` - -### Processors for Multimodal Models - -Multimodal models use processors that combine image and text processing. - -```python -from transformers import AutoProcessor - -processor = AutoProcessor.from_pretrained("microsoft/git-base") - -# Process image + text caption -inputs = processor( - images=image, - text="A description of the image", - return_tensors="pt", - padding=True +# 4. Create trainer and train +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], ) + +trainer.train() ``` -## Model Inference +**Key training features:** +- Mixed precision training (fp16/bf16) +- Distributed training (multi-GPU, multi-node) +- Gradient accumulation +- Learning rate scheduling with warmup +- Checkpoint management +- Hyperparameter search +- Push to Hugging Face Hub -### Basic Inference Pattern +For detailed training documentation, see `references/training.md`. + +### 3. Text Generation + +Generate text using various decoding strategies including greedy decoding, beam search, sampling, and more. 
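+
+The strategy snippets below focus on `generate()` parameters; for context, a complete prompt-to-text round trip with greedy decoding (a minimal sketch using GPT-2):
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+model = AutoModelForCausalLM.from_pretrained("gpt2")
+
+inputs = tokenizer("Once upon a time", return_tensors="pt")
+outputs = model.generate(
+    **inputs,
+    max_new_tokens=30,
+    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no dedicated pad token
+)
+
+# Decode only the newly generated tokens (drop the prompt)
+new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
+print(tokenizer.decode(new_tokens, skip_special_tokens=True))
+```
+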
+ +**Generation strategies:** ```python from transformers import AutoModelForCausalLM, AutoTokenizer -# Load model and tokenizer model = AutoModelForCausalLM.from_pretrained("gpt2") tokenizer = AutoTokenizer.from_pretrained("gpt2") +inputs = tokenizer("Once upon a time", return_tensors="pt") -# Tokenize input -inputs = tokenizer("The future of AI is", return_tensors="pt") +# Greedy decoding (deterministic) +outputs = model.generate(**inputs, max_new_tokens=50) -# Generate (for causal LM) -outputs = model.generate(**inputs, max_length=50) -text = tokenizer.decode(outputs[0]) - -# Or get model outputs directly -outputs = model(**inputs) -logits = outputs.logits # Shape: (batch_size, seq_len, vocab_size) -``` - -### Text Generation Strategies - -For generative models, control generation behavior with parameters: - -```python -# Greedy decoding (default) -output = model.generate(inputs, max_length=50) - -# Beam search (multiple hypothesis) -output = model.generate( - inputs, - max_length=50, - num_beams=5, # Keep top 5 beams +# Beam search (explores multiple hypotheses) +outputs = model.generate( + **inputs, + max_new_tokens=50, + num_beams=5, early_stopping=True ) -# Sampling with temperature -output = model.generate( - inputs, - max_length=50, +# Sampling (creative, diverse) +outputs = model.generate( + **inputs, + max_new_tokens=50, do_sample=True, - temperature=0.7, # Lower = more focused, higher = more random - top_k=50, # Sample from top 50 tokens - top_p=0.95 # Nucleus sampling + temperature=0.7, + top_p=0.9, + top_k=50 ) - -# Streaming generation -from transformers import TextStreamer - -streamer = TextStreamer(tokenizer) -model.generate(**inputs, streamer=streamer, max_length=100) ``` -**Generation Parameters:** -- `max_length` / `max_new_tokens`: Control output length -- `num_beams`: Beam search width (1 = greedy) -- `temperature`: Randomness (0.7-1.0 typical) -- `top_k`: Sample from top k tokens +**Generation parameters:** +- `temperature`: Controls randomness (0.1-2.0) +- `top_k`: Sample from top-k tokens - `top_p`: Nucleus sampling threshold -- `repetition_penalty`: Discourage repetition (>1.0) +- `num_beams`: Number of beams for beam search +- `repetition_penalty`: Discourage repetition +- `no_repeat_ngram_size`: Prevent repeating n-grams -Refer to `references/generation_strategies.md` for detailed information on choosing appropriate strategies. +For comprehensive generation documentation, see `references/generation_strategies.md`. -## Training and Fine-Tuning +### 4. Task-Specific Patterns -### Training Workflow Overview - -1. **Load dataset** → 2. **Preprocess** → 3. **Configure training** → 4. **Train** → 5. **Evaluate** → 6. **Save/Share** - -### Text Classification Example +Common task patterns with appropriate model classes: +**Text Classification:** ```python -from transformers import ( - AutoTokenizer, AutoModelForSequenceClassification, - TrainingArguments, Trainer, DataCollatorWithPadding -) -from datasets import load_dataset +from transformers import AutoModelForSequenceClassification -# 1. Load dataset -dataset = load_dataset("imdb") - -# 2. Preprocess -tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") - -def preprocess(examples): - return tokenizer(examples["text"], truncation=True) - -tokenized = dataset.map(preprocess, batched=True) -data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - -# 3. 
Load model model = AutoModelForSequenceClassification.from_pretrained( "bert-base-uncased", - num_labels=2, - id2label={0: "negative", 1: "positive"}, - label2id={"negative": 0, "positive": 1} + num_labels=3, + id2label={0: "negative", 1: "neutral", 2: "positive"} ) - -# 4. Configure training -training_args = TrainingArguments( - output_dir="./results", - learning_rate=2e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=3, - weight_decay=0.01, - eval_strategy="epoch", - save_strategy="epoch", - load_best_model_at_end=True, - push_to_hub=False, -) - -# 5. Train -trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized["train"], - eval_dataset=tokenized["test"], - tokenizer=tokenizer, - data_collator=data_collator, -) - -trainer.train() - -# 6. Evaluate and save -metrics = trainer.evaluate() -trainer.save_model("./my-finetuned-model") -trainer.push_to_hub() # Share to Hugging Face Hub ``` -### Vision Task Fine-Tuning - +**Named Entity Recognition (Token Classification):** ```python -from transformers import ( - AutoImageProcessor, AutoModelForImageClassification, - TrainingArguments, Trainer +from transformers import AutoModelForTokenClassification + +model = AutoModelForTokenClassification.from_pretrained( + "bert-base-uncased", + num_labels=9 # Number of entity types ) -from datasets import load_dataset +``` -# Load dataset -dataset = load_dataset("food101", split="train[:5000]") +**Question Answering:** +```python +from transformers import AutoModelForQuestionAnswering -# Image preprocessing -processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased") +``` -def transform(examples): - examples["pixel_values"] = [ - processor(img.convert("RGB"), return_tensors="pt")["pixel_values"][0] - for img in examples["image"] - ] - return examples +**Summarization and Translation (Seq2Seq):** +```python +from transformers import AutoModelForSeq2SeqLM -dataset = dataset.with_transform(transform) +model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") +``` + +**Image Classification:** +```python +from transformers import AutoModelForImageClassification -# Load model model = AutoModelForImageClassification.from_pretrained( "google/vit-base-patch16-224", - num_labels=101, # 101 food categories - ignore_mismatched_sizes=True + num_labels=num_classes ) - -# Training (similar pattern to text) -training_args = TrainingArguments( - output_dir="./vit-food101", - remove_unused_columns=False, # Keep image data - eval_strategy="epoch", - save_strategy="epoch", - learning_rate=5e-5, - per_device_train_batch_size=32, - num_train_epochs=3, -) - -trainer = Trainer( - model=model, - args=training_args, - train_dataset=dataset, - tokenizer=processor, -) - -trainer.train() ``` -### Sequence-to-Sequence Tasks +For detailed task-specific workflows including data preprocessing, training, and evaluation, see `references/task_patterns.md`. 
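+
+As an example of turning raw outputs from these task heads into a prediction, a minimal extractive-QA sketch (assuming the SQuAD-fine-tuned checkpoint `distilbert-base-cased-distilled-squad`); the answer span is read off the start/end logits:
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+
+checkpoint = "distilbert-base-cased-distilled-squad"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
+
+question = "Where does Sarah work?"
+context = "Sarah works at Microsoft in Seattle."
+inputs = tokenizer(question, context, return_tensors="pt")
+
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# The predicted span runs from the argmax start logit to the argmax end logit
+start = outputs.start_logits.argmax()
+end = outputs.end_logits.argmax() + 1
+answer = tokenizer.decode(inputs["input_ids"][0][start:end])
+```
+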
-For tasks like summarization, translation, use Seq2SeqTrainer: +## Auto Classes + +Use Auto classes for automatic architecture selection based on model checkpoints: ```python from transformers import ( - AutoTokenizer, AutoModelForSeq2SeqLM, - Seq2SeqTrainingArguments, Seq2SeqTrainer, - DataCollatorForSeq2Seq + AutoTokenizer, # Tokenization + AutoModel, # Base model (hidden states) + AutoModelForSequenceClassification, + AutoModelForTokenClassification, + AutoModelForQuestionAnswering, + AutoModelForCausalLM, # GPT-style + AutoModelForMaskedLM, # BERT-style + AutoModelForSeq2SeqLM, # T5, BART + AutoProcessor, # For multimodal models + AutoImageProcessor, # For vision models ) -tokenizer = AutoTokenizer.from_pretrained("t5-small") -model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") - -def preprocess(examples): - # Prefix input for T5 - inputs = ["summarize: " + doc for doc in examples["text"]] - model_inputs = tokenizer(inputs, max_length=1024, truncation=True) - - # Tokenize targets - labels = tokenizer( - examples["summary"], - max_length=128, - truncation=True - ) - model_inputs["labels"] = labels["input_ids"] - return model_inputs - -tokenized_dataset = dataset.map(preprocess, batched=True) - -training_args = Seq2SeqTrainingArguments( - output_dir="./t5-summarization", - eval_strategy="epoch", - learning_rate=2e-5, - per_device_train_batch_size=8, - num_train_epochs=3, - predict_with_generate=True, # Important for seq2seq -) - -trainer = Seq2SeqTrainer( - model=model, - args=training_args, - train_dataset=tokenized_dataset["train"], - eval_dataset=tokenized_dataset["test"], - tokenizer=tokenizer, - data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer), -) - -trainer.train() +# Load any model by name +tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") +model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") ``` -### Important TrainingArguments +For comprehensive API documentation, see `references/api_reference.md`. +## Model Loading and Optimization + +**Device placement:** ```python -TrainingArguments( - # Essential - output_dir="./results", - num_train_epochs=3, - per_device_train_batch_size=8, - learning_rate=2e-5, +model = AutoModel.from_pretrained("bert-base-uncased", device_map="auto") +``` - # Evaluation - eval_strategy="epoch", # or "steps" - eval_steps=500, # if eval_strategy="steps" - - # Checkpointing - save_strategy="epoch", - save_steps=500, - save_total_limit=2, # Keep only 2 best checkpoints - load_best_model_at_end=True, - metric_for_best_model="accuracy", - - # Optimization - gradient_accumulation_steps=4, - warmup_steps=500, - weight_decay=0.01, - max_grad_norm=1.0, - - # Mixed Precision - fp16=True, # For Nvidia GPUs - bf16=True, # For Ampere+ GPUs (better) - - # Logging - logging_steps=100, - report_to="tensorboard", # or "wandb", "mlflow" - - # Memory Optimization - gradient_checkpointing=True, - optim="adamw_torch", # or "adafactor" for memory - - # Distributed Training - ddp_find_unused_parameters=False, +**Mixed precision:** +```python +model = AutoModel.from_pretrained( + "model-name", + torch_dtype=torch.float16 # or torch.bfloat16 ) ``` -Refer to `references/training_guide.md` for comprehensive training patterns and optimization strategies. 
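+
+**Manual device placement:**
+
+When `device_map="auto"` is not used, the model and its inputs must be moved to the same device explicitly; a short sketch:
+
+```python
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased").to(device)
+
+inputs = tokenizer("Device placement example", return_tensors="pt").to(device)
+with torch.no_grad():
+    logits = model(**inputs).logits
+```
+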
- -## Performance Optimization - -### Model Quantization - -Reduce memory footprint while maintaining accuracy: - +**Quantization:** ```python -from transformers import AutoModelForCausalLM, BitsAndBytesConfig +from transformers import BitsAndBytesConfig -# 8-bit quantization -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - load_in_8bit=True, - device_map="auto" -) - -# 4-bit quantization (even smaller) -bnb_config = BitsAndBytesConfig( +quantization_config = BitsAndBytesConfig( load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.float16 ) model = AutoModelForCausalLM.from_pretrained( "meta-llama/Llama-2-7b-hf", - quantization_config=bnb_config, + quantization_config=quantization_config, device_map="auto" ) ``` -**Quantization Methods:** -- **Bitsandbytes**: 4/8-bit on-the-fly quantization, supports PEFT fine-tuning -- **GPTQ**: 2/3/4/8-bit, requires calibration, very fast inference -- **AWQ**: 4-bit activation-aware, balanced speed/accuracy - -Refer to `references/quantization.md` for detailed comparison and usage patterns. - -### Training Optimization - -```python -# Gradient accumulation (simulate larger batch) -training_args = TrainingArguments( - per_device_train_batch_size=4, - gradient_accumulation_steps=8, # Effective batch = 4 * 8 = 32 -) - -# Gradient checkpointing (reduce memory, slower) -training_args = TrainingArguments( - gradient_checkpointing=True, -) - -# Mixed precision training -training_args = TrainingArguments( - bf16=True, # or fp16=True -) - -# Efficient optimizer -training_args = TrainingArguments( - optim="adafactor", # Lower memory than AdamW -) -``` - -**Key Strategies:** -- **Batch sizes**: Use powers of 2 (8, 16, 32, 64, 128) -- **Gradient accumulation**: Enables larger effective batch sizes -- **Gradient checkpointing**: Reduces memory ~60%, increases time ~20% -- **Mixed precision**: bf16 for Ampere+ GPUs, fp16 for older -- **torch.compile**: Optimize model graph (PyTorch 2.0+) - -## Advanced Features - -### Custom Training Loop - -For maximum control, bypass Trainer: - -```python -from torch.utils.data import DataLoader -from transformers import AdamW, get_scheduler - -# Prepare data -train_dataloader = DataLoader(tokenized_dataset, batch_size=8, shuffle=True) - -# Setup optimizer and scheduler -optimizer = AdamW(model.parameters(), lr=5e-5) -scheduler = get_scheduler( - "linear", - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=len(train_dataloader) * num_epochs -) - -# Training loop -model.train() -for epoch in range(num_epochs): - for batch in train_dataloader: - batch = {k: v.to(device) for k, v in batch.items()} - - outputs = model(**batch) - loss = outputs.loss - loss.backward() - - optimizer.step() - scheduler.step() - optimizer.zero_grad() -``` - -### Parameter-Efficient Fine-Tuning (PEFT) - -Use PEFT library with transformers for efficient fine-tuning: - -```python -from peft import LoraConfig, get_peft_model - -# Configure LoRA -lora_config = LoraConfig( - r=16, # Low-rank dimension - lora_alpha=32, - target_modules=["q_proj", "v_proj"], # Which layers to adapt - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM" -) - -# Apply to model -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") -model = get_peft_model(model, lora_config) - -# Now train as usual - only LoRA parameters train -trainer = Trainer(model=model, ...) 
-trainer.train() -``` - -### Chat Templates - -Apply chat templates for instruction-tuned models: - -```python -tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") - -messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "What is machine learning?"}, -] - -# Format according to model's chat template -formatted = tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True -) - -# Tokenize and generate -inputs = tokenizer(formatted, return_tensors="pt") -outputs = model.generate(**inputs, max_length=200) -response = tokenizer.decode(outputs[0]) -``` - -### Multi-GPU Training - -```python -# Automatic with Trainer - no code changes needed -# Just run with: accelerate launch train.py - -# Or use PyTorch DDP explicitly -training_args = TrainingArguments( - output_dir="./results", - ddp_find_unused_parameters=False, - # ... other args -) - -# For larger models, use FSDP -training_args = TrainingArguments( - output_dir="./results", - fsdp="full_shard auto_wrap", - fsdp_config={ - "fsdp_transformer_layer_cls_to_wrap": ["BertLayer"], - }, -) -``` - -## Task-Specific Patterns - -### Question Answering (Extractive) - -```python -from transformers import pipeline - -qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad") - -result = qa( - question="What is extractive QA?", - context="Extractive QA extracts the answer from the given context..." -) -# {'answer': 'extracts the answer from the given context', 'score': 0.97, ...} -``` - -### Named Entity Recognition - -```python -ner = pipeline("token-classification", model="dslim/bert-base-NER") - -result = ner("My name is John and I live in New York") -# [{'entity': 'B-PER', 'word': 'John', ...}, {'entity': 'B-LOC', 'word': 'New York', ...}] -``` - -### Image Captioning - -```python -from transformers import AutoProcessor, AutoModelForCausalLM - -processor = AutoProcessor.from_pretrained("microsoft/git-base") -model = AutoModelForCausalLM.from_pretrained("microsoft/git-base") - -from PIL import Image -image = Image.open("image.jpg") - -inputs = processor(images=image, return_tensors="pt") -outputs = model.generate(**inputs, max_length=50) -caption = processor.batch_decode(outputs, skip_special_tokens=True)[0] -``` - -### Speech Recognition - -```python -transcriber = pipeline( - "automatic-speech-recognition", - model="openai/whisper-base" -) - -result = transcriber("audio.mp3") -# {'text': 'This is the transcribed text...'} - -# With timestamps -result = transcriber("audio.mp3", return_timestamps=True) -``` - -## Common Patterns and Best Practices - -### Saving and Loading Models - -```python -# Save entire model -model.save_pretrained("./my-model") -tokenizer.save_pretrained("./my-model") - -# Load later -model = AutoModel.from_pretrained("./my-model") -tokenizer = AutoTokenizer.from_pretrained("./my-model") - -# Push to Hugging Face Hub -model.push_to_hub("username/my-model") -tokenizer.push_to_hub("username/my-model") - -# Load from Hub -model = AutoModel.from_pretrained("username/my-model") -``` - -### Error Handling - -```python -from transformers import AutoModel -import torch - -try: - model = AutoModel.from_pretrained("model-name") -except OSError: - print("Model not found - check internet connection or model name") -except torch.cuda.OutOfMemoryError: - print("GPU memory exceeded - try quantization or smaller batch size") -``` - -### Device Management - -```python -import torch - -# Check device availability 
-device = "cuda" if torch.cuda.is_available() else "cpu" - -# Move model to device -model = model.to(device) - -# Or use device_map for automatic distribution -model = AutoModel.from_pretrained("model-name", device_map="auto") - -# For inputs -inputs = tokenizer(text, return_tensors="pt").to(device) -``` - -### Memory Management - -```python -import torch - -# Clear CUDA cache -torch.cuda.empty_cache() - -# Use context manager for inference -with torch.no_grad(): - outputs = model(**inputs) - -# Delete unused models -del model -torch.cuda.empty_cache() -``` +## Common Workflows + +### Quick Inference Workflow +1. Choose appropriate pipeline for task +2. Load pipeline with optional model specification +3. Pass inputs and get results +4. For batch processing, pass list of inputs + +**See:** `scripts/quick_inference.py` for comprehensive pipeline examples + +### Training Workflow +1. Load and preprocess dataset using 🤗 Datasets +2. Tokenize data with appropriate tokenizer +3. Load pre-trained model for specific task +4. Configure TrainingArguments +5. Create Trainer with model, data, and compute_metrics +6. Train with `trainer.train()` +7. Evaluate with `trainer.evaluate()` +8. Save model and optionally push to Hub + +**See:** `scripts/fine_tune_classifier.py` for complete training example + +### Text Generation Workflow +1. Load causal or seq2seq language model +2. Load tokenizer and tokenize prompt +3. Choose generation strategy (greedy, beam search, sampling) +4. Configure generation parameters +5. Generate with `model.generate()` +6. Decode output tokens to text + +**See:** `scripts/generate_text.py` for generation strategy examples + +## Best Practices + +1. **Use Auto classes** for flexibility across different model architectures +2. **Batch processing** for efficiency - process multiple inputs at once +3. **Device management** - use `device_map="auto"` for automatic placement +4. **Memory optimization** - enable fp16/bf16 or quantization for large models +5. **Checkpoint management** - save checkpoints regularly and load best model +6. **Pipeline for quick tasks** - use pipelines for standard inference tasks +7. **Custom metrics** - define compute_metrics for task-specific evaluation +8. **Gradient accumulation** - use for large effective batch sizes on limited memory +9. **Learning rate warmup** - typically 5-10% of total training steps +10. **Hub integration** - push trained models to Hub for sharing and versioning ## Resources -This skill includes comprehensive reference documentation and example scripts: - ### scripts/ +Executable Python scripts demonstrating common Transformers workflows: -- `quick_inference.py`: Ready-to-use script for running inference with pipelines -- `fine_tune_classifier.py`: Complete example for fine-tuning a text classifier -- `generate_text.py`: Text generation with various strategies +- `quick_inference.py` - Pipeline examples for NLP, vision, audio, and multimodal tasks +- `fine_tune_classifier.py` - Complete fine-tuning workflow with Trainer API +- `generate_text.py` - Text generation with various decoding strategies -Execute scripts directly or read them as implementation templates. 
- -### references/ - -- `api_reference.md`: Comprehensive API documentation for key classes -- `training_guide.md`: Detailed training patterns, optimization, and troubleshooting -- `generation_strategies.md`: In-depth guide to text generation methods -- `quantization.md`: Model quantization techniques comparison and usage -- `task_patterns.md`: Quick reference for common task implementations - -Load reference files when you need detailed information on specific topics. References contain extensive examples, parameter explanations, and best practices. - -## Troubleshooting - -**Import errors:** +Run scripts directly to see examples in action: ```bash -pip install transformers -pip install accelerate # For device_map="auto" -pip install bitsandbytes # For quantization +python scripts/quick_inference.py +python scripts/fine_tune_classifier.py +python scripts/generate_text.py ``` -**CUDA out of memory:** -- Reduce batch size -- Enable gradient checkpointing -- Use gradient accumulation -- Try quantization (8-bit or 4-bit) -- Use smaller model variant +### references/ +Comprehensive reference documentation loaded into context as needed: -**Slow training:** -- Enable mixed precision (fp16/bf16) -- Increase batch size (if memory allows) -- Use torch.compile (PyTorch 2.0+) -- Check data loading isn't bottleneck +- `api_reference.md` - Core classes and APIs (Auto classes, Trainer, GenerationConfig, etc.) +- `pipelines.md` - All available pipelines organized by modality with examples +- `training.md` - Training patterns, TrainingArguments, distributed training, callbacks +- `generation_strategies.md` - Text generation methods, decoding strategies, parameters +- `task_patterns.md` - Complete workflows for common tasks (classification, NER, QA, summarization, etc.) -**Poor generation quality:** -- Adjust temperature (lower = more focused) -- Try different decoding strategies (beam search vs sampling) -- Increase max_length if outputs cut off -- Use repetition_penalty to reduce repetition +When working on specific tasks or features, load the relevant reference file for detailed guidance. -For task-specific guidance, consult the appropriate reference file in the `references/` directory. +## Additional Information + +- **Official Documentation**: https://huggingface.co/docs/transformers/index +- **Model Hub**: https://huggingface.co/models (1M+ pre-trained models) +- **Datasets Hub**: https://huggingface.co/datasets +- **Installation**: `pip install transformers datasets evaluate accelerate` +- **GPU Support**: Requires PyTorch or TensorFlow with CUDA +- **Framework Support**: PyTorch (primary), TensorFlow, JAX/Flax diff --git a/scientific-packages/transformers/references/api_reference.md b/scientific-packages/transformers/references/api_reference.md index d43397a..51a4695 100644 --- a/scientific-packages/transformers/references/api_reference.md +++ b/scientific-packages/transformers/references/api_reference.md @@ -1,699 +1,485 @@ # Transformers API Reference -This document provides comprehensive API reference for the most commonly used classes and methods in the Transformers library. +This reference covers the core classes and APIs in the Transformers library. -## Core Model Classes +## Core Auto Classes -### PreTrainedModel +Auto classes provide a convenient way to automatically select the appropriate architecture based on model name or checkpoint. -Base class for all models. Handles loading, saving, and common model operations. 
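+
+Because the concrete architecture is resolved from the checkpoint's `config.json`, the same call works unchanged across model families; an illustrative sketch:
+
+```python
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+# The same two lines load BERT, DistilBERT, or RoBERTa depending on the checkpoint.
+# For base checkpoints, the classification head is newly initialized (a warning is expected).
+for checkpoint in ["bert-base-uncased", "distilbert-base-uncased", "roberta-base"]:
+    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+    print(checkpoint, "->", type(model).__name__)
+```
+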
- -**Key Methods:** - -```python -from transformers import PreTrainedModel - -# Load pretrained model -model = ModelClass.from_pretrained( - pretrained_model_name_or_path, - config=None, # Custom config - cache_dir=None, # Custom cache location - force_download=False, # Force re-download - resume_download=False, # Resume interrupted download - proxies=None, # HTTP proxies - local_files_only=False, # Only use cached files - token=None, # HF auth token - revision="main", # Git branch/tag - trust_remote_code=False, # Allow custom model code - device_map=None, # Device allocation ("auto", "cpu", "cuda:0", etc.) - torch_dtype=None, # Model dtype (torch.float16, "auto", etc.) - low_cpu_mem_usage=False, # Reduce CPU memory during loading - **model_kwargs -) - -# Save model -model.save_pretrained( - save_directory, - save_config=True, # Save config.json - state_dict=None, # Custom state dict - save_function=torch.save, # Custom save function - push_to_hub=False, # Upload to Hub - max_shard_size="5GB", # Max checkpoint size - safe_serialization=True, # Use SafeTensors format - variant=None, # Model variant name -) - -# Generate text (for generative models) -outputs = model.generate( - inputs=None, # Input token IDs - max_length=20, # Max total length - max_new_tokens=None, # Max new tokens to generate - min_length=0, # Minimum length - do_sample=False, # Enable sampling - early_stopping=False, # Stop when num_beams finish - num_beams=1, # Beam search width - temperature=1.0, # Sampling temperature - top_k=50, # Top-k sampling - top_p=1.0, # Nucleus sampling - repetition_penalty=1.0, # Penalize repetition - length_penalty=1.0, # Beam search length penalty - no_repeat_ngram_size=0, # Block repeated n-grams - num_return_sequences=1, # Number of sequences to return - **model_kwargs -) - -# Resize token embeddings (after adding tokens) -new_embeddings = model.resize_token_embeddings( - new_num_tokens, - pad_to_multiple_of=None -) - -# Utility methods -num_params = model.num_parameters(only_trainable=False) -model.gradient_checkpointing_enable() # Enable gradient checkpointing -model.enable_input_require_grads() # For PEFT with frozen models -``` - -### AutoModel Classes - -Automatically instantiate the correct model architecture. 
- -**Available Classes:** - -- `AutoModel`: Base model (returns hidden states) -- `AutoModelForCausalLM`: Causal language modeling (GPT-style) -- `AutoModelForMaskedLM`: Masked language modeling (BERT-style) -- `AutoModelForSeq2SeqLM`: Sequence-to-sequence (T5, BART) -- `AutoModelForSequenceClassification`: Text classification -- `AutoModelForTokenClassification`: Token classification (NER) -- `AutoModelForQuestionAnswering`: Extractive QA -- `AutoModelForImageClassification`: Image classification -- `AutoModelForObjectDetection`: Object detection -- `AutoModelForSemanticSegmentation`: Semantic segmentation -- `AutoModelForAudioClassification`: Audio classification -- `AutoModelForSpeechSeq2Seq`: Speech-to-text -- `AutoModelForVision2Seq`: Image captioning, VQA - -**Usage:** - -```python -from transformers import AutoModel, AutoConfig - -# Load with default configuration -model = AutoModel.from_pretrained("bert-base-uncased") - -# Load with custom configuration -config = AutoConfig.from_pretrained("bert-base-uncased") -config.hidden_dropout_prob = 0.2 -model = AutoModel.from_pretrained("bert-base-uncased", config=config) - -# Register custom models -from transformers import AutoConfig, AutoModel - -AutoConfig.register("my-model", MyModelConfig) -AutoModel.register(MyModelConfig, MyModel) -``` - -## Tokenizer Classes - -### PreTrainedTokenizer / PreTrainedTokenizerFast - -Convert text to token IDs and vice versa. - -**Key Methods:** +### AutoTokenizer ```python from transformers import AutoTokenizer -tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path, - use_fast=True, # Use fast (Rust) tokenizer if available - revision="main", - **kwargs -) +# Load tokenizer +tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -# Encoding (text → token IDs) +# Tokenize single text +encoded = tokenizer("Hello, how are you?") +# Returns: {'input_ids': [...], 'attention_mask': [...]} + +# Tokenize with options encoded = tokenizer( - text, # String or List[str] - text_pair=None, # Second sequence for pairs - add_special_tokens=True, # Add [CLS], [SEP], etc. - padding=False, # True, False, "longest", "max_length" - truncation=False, # True, False, "longest_first", "only_first", "only_second" - max_length=None, # Max sequence length - stride=0, # Overlap for split sequences - return_tensors=None, # "pt" (PyTorch), "tf" (TensorFlow), "np" (NumPy) - return_token_type_ids=None, # Return token type IDs - return_attention_mask=None, # Return attention mask - return_overflowing_tokens=False, # Return overflowing tokens - return_special_tokens_mask=False, # Return special token mask - return_offsets_mapping=False, # Return char-level offsets (fast only) - return_length=False, # Return sequence lengths - **kwargs + "Hello, how are you?", + padding="max_length", + truncation=True, + max_length=512, + return_tensors="pt" # "pt" for PyTorch, "tf" for TensorFlow ) -# Decoding (token IDs → text) -text = tokenizer.decode( - token_ids, - skip_special_tokens=False, # Remove special tokens - clean_up_tokenization_spaces=True, # Clean up spacing +# Tokenize pairs (for classification, QA, etc.) 
+encoded = tokenizer( + "Question or sentence A", + "Context or sentence B", + padding=True, + truncation=True ) -# Batch decoding -texts = tokenizer.batch_decode( - sequences, - skip_special_tokens=False, - clean_up_tokenization_spaces=True, -) +# Batch tokenization +texts = ["Text 1", "Text 2", "Text 3"] +encoded = tokenizer(texts, padding=True, truncation=True) -# Tokenization (text → tokens) -tokens = tokenizer.tokenize(text, **kwargs) +# Decode tokens back to text +text = tokenizer.decode(token_ids, skip_special_tokens=True) -# Convert tokens to IDs -ids = tokenizer.convert_tokens_to_ids(tokens) - -# Convert IDs to tokens -tokens = tokenizer.convert_ids_to_tokens(ids) - -# Add new tokens -num_added = tokenizer.add_tokens(["[NEW_TOKEN1]", "[NEW_TOKEN2]"]) - -# Add special tokens -tokenizer.add_special_tokens({ - "bos_token": "[BOS]", - "eos_token": "[EOS]", - "unk_token": "[UNK]", - "sep_token": "[SEP]", - "pad_token": "[PAD]", - "cls_token": "[CLS]", - "mask_token": "[MASK]", - "additional_special_tokens": ["[SPECIAL1]", "[SPECIAL2]"], -}) - -# Chat template formatting -formatted = tokenizer.apply_chat_template( - conversation, # List[Dict[str, str]] with "role" and "content" - chat_template=None, # Custom template - add_generation_prompt=False, # Add prompt for model to continue - tokenize=True, # Return token IDs - padding=False, - truncation=False, - max_length=None, - return_tensors=None, - return_dict=True, -) - -# Save tokenizer -tokenizer.save_pretrained(save_directory) - -# Get vocab size -vocab_size = len(tokenizer) - -# Get special tokens -pad_token = tokenizer.pad_token -pad_token_id = tokenizer.pad_token_id -# Similar for: bos, eos, unk, sep, cls, mask +# Batch decode +texts = tokenizer.batch_decode(batch_token_ids, skip_special_tokens=True) ``` -**Special Token Attributes:** +**Key Parameters:** +- `padding`: "max_length", "longest", or True (pad to max in batch) +- `truncation`: True or strategy ("longest_first", "only_first", "only_second") +- `max_length`: Maximum sequence length +- `return_tensors`: "pt" (PyTorch), "tf" (TensorFlow), "np" (NumPy) +- `return_attention_mask`: Return attention masks (default True) +- `return_token_type_ids`: Return token type IDs for pairs (default True) +- `add_special_tokens`: Add special tokens like [CLS], [SEP] (default True) + +**Special Properties:** +- `tokenizer.vocab_size`: Size of vocabulary +- `tokenizer.pad_token_id`: ID of padding token +- `tokenizer.eos_token_id`: ID of end-of-sequence token +- `tokenizer.bos_token_id`: ID of beginning-of-sequence token +- `tokenizer.unk_token_id`: ID of unknown token + +### AutoModel + +Base model class that outputs hidden states. ```python -tokenizer.bos_token # Beginning of sequence -tokenizer.eos_token # End of sequence -tokenizer.unk_token # Unknown token -tokenizer.sep_token # Separator token -tokenizer.pad_token # Padding token -tokenizer.cls_token # Classification token -tokenizer.mask_token # Mask token +from transformers import AutoModel -# Corresponding IDs -tokenizer.bos_token_id -tokenizer.eos_token_id -# ... 
etc +model = AutoModel.from_pretrained("bert-base-uncased") + +# Forward pass +outputs = model(**inputs) + +# Access hidden states +last_hidden_state = outputs.last_hidden_state # [batch_size, seq_length, hidden_size] +pooler_output = outputs.pooler_output # [batch_size, hidden_size] + +# Get all hidden states +model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True) +outputs = model(**inputs) +all_hidden_states = outputs.hidden_states # Tuple of tensors ``` -## Image Processors +### Task-Specific Auto Classes + +```python +from transformers import ( + AutoModelForSequenceClassification, + AutoModelForTokenClassification, + AutoModelForQuestionAnswering, + AutoModelForCausalLM, + AutoModelForMaskedLM, + AutoModelForSeq2SeqLM, + AutoModelForImageClassification, + AutoModelForObjectDetection, + AutoModelForVision2Seq, +) + +# Sequence classification (sentiment, topic, etc.) +model = AutoModelForSequenceClassification.from_pretrained( + "bert-base-uncased", + num_labels=3, + id2label={0: "negative", 1: "neutral", 2: "positive"}, + label2id={"negative": 0, "neutral": 1, "positive": 2} +) + +# Token classification (NER, POS tagging) +model = AutoModelForTokenClassification.from_pretrained( + "bert-base-uncased", + num_labels=9 # Number of entity types +) + +# Question answering +model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased") + +# Causal language modeling (GPT-style) +model = AutoModelForCausalLM.from_pretrained("gpt2") + +# Masked language modeling (BERT-style) +model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased") + +# Sequence-to-sequence (T5, BART) +model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + +# Image classification +model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224") + +# Object detection +model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50") + +# Vision-to-text (image captioning, VQA) +model = AutoModelForVision2Seq.from_pretrained("microsoft/git-base") +``` + +### AutoProcessor + +For multimodal models that need both text and image processing. + +```python +from transformers import AutoProcessor + +# For vision-language models +processor = AutoProcessor.from_pretrained("microsoft/git-base") + +# Process image and text +from PIL import Image +image = Image.open("image.jpg") +inputs = processor(images=image, text="caption", return_tensors="pt") + +# For audio models +processor = AutoProcessor.from_pretrained("openai/whisper-base") +inputs = processor(audio, sampling_rate=16000, return_tensors="pt") +``` ### AutoImageProcessor -Preprocess images for vision models. - -**Key Methods:** +For vision-only models. 
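+
+The processor output feeds directly into a vision model; for context, a minimal end-to-end classification sketch that combines the preprocessing shown below with an image-classification head (the image path is illustrative):
+
+```python
+import torch
+from PIL import Image
+from transformers import AutoImageProcessor, AutoModelForImageClassification
+
+checkpoint = "google/vit-base-patch16-224"
+processor = AutoImageProcessor.from_pretrained(checkpoint)
+model = AutoModelForImageClassification.from_pretrained(checkpoint)
+
+image = Image.open("image.jpg")
+inputs = processor(image, return_tensors="pt")
+
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+predicted_id = logits.argmax(-1).item()
+print(model.config.id2label[predicted_id])
+```
+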
```python from transformers import AutoImageProcessor processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") -# Process images -inputs = processor( - images, # PIL Image, np.array, torch.Tensor, or List - return_tensors="pt", # "pt", "tf", "np", None - do_resize=True, # Resize to model size - size=None, # Target size dict - resample=None, # Resampling method - do_rescale=True, # Rescale pixel values - do_normalize=True, # Normalize with mean/std - image_mean=None, # Custom mean - image_std=None, # Custom std - do_center_crop=False, # Center crop - crop_size=None, # Crop size - **kwargs +# Process single image +from PIL import Image +image = Image.open("image.jpg") +inputs = processor(image, return_tensors="pt") + +# Batch processing +images = [Image.open(f"image{i}.jpg") for i in range(10)] +inputs = processor(images, return_tensors="pt") +``` + +## Model Loading Options + +### from_pretrained Parameters + +```python +model = AutoModel.from_pretrained( + "model-name", + # Device and precision + device_map="auto", # Automatic device placement + torch_dtype=torch.float16, # Use fp16 + low_cpu_mem_usage=True, # Reduce CPU memory during loading + + # Quantization + load_in_8bit=True, # 8-bit quantization + load_in_4bit=True, # 4-bit quantization + + # Model configuration + num_labels=3, # For classification + id2label={...}, # Label mapping + label2id={...}, + + # Outputs + output_hidden_states=True, + output_attentions=True, + + # Trust remote code + trust_remote_code=True, # For custom models + + # Caching + cache_dir="./cache", + force_download=False, + resume_download=True, +) +``` + +### Quantization with BitsAndBytes + +```python +from transformers import BitsAndBytesConfig, AutoModelForCausalLM + +# 4-bit quantization +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4" ) -# Returns: BatchFeature with 'pixel_values' key +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", + quantization_config=quantization_config, + device_map="auto" +) ``` ## Training Components ### TrainingArguments -Configuration for the Trainer class. - -**Essential Arguments:** +See `training.md` for comprehensive coverage. 
Key parameters: ```python from transformers import TrainingArguments args = TrainingArguments( - # ===== Output & Logging ===== - output_dir="./results", # REQUIRED: Output directory - overwrite_output_dir=False, # Overwrite output directory - - # ===== Training Parameters ===== - num_train_epochs=3.0, # Number of epochs - max_steps=-1, # Max training steps (overrides epochs) - per_device_train_batch_size=8, # Train batch size per device - per_device_eval_batch_size=8, # Eval batch size per device - gradient_accumulation_steps=1, # Accumulation steps - - # ===== Learning Rate & Optimization ===== - learning_rate=5e-5, # Initial learning rate - weight_decay=0.0, # Weight decay - adam_beta1=0.9, # Adam beta1 - adam_beta2=0.999, # Adam beta2 - adam_epsilon=1e-8, # Adam epsilon - max_grad_norm=1.0, # Gradient clipping - optim="adamw_torch", # Optimizer ("adamw_torch", "adafactor", "adamw_8bit") - - # ===== Learning Rate Scheduler ===== - lr_scheduler_type="linear", # Scheduler type - warmup_steps=0, # Warmup steps - warmup_ratio=0.0, # Warmup ratio (alternative to steps) - - # ===== Evaluation ===== - eval_strategy="no", # "no", "steps", "epoch" - eval_steps=None, # Eval every N steps - eval_delay=0, # Delay first eval - eval_accumulation_steps=None, # Accumulate eval outputs - - # ===== Checkpointing ===== - save_strategy="steps", # "no", "steps", "epoch" - save_steps=500, # Save every N steps - save_total_limit=None, # Max checkpoints to keep - save_safetensors=True, # Save as SafeTensors - save_on_each_node=False, # Save on each node (distributed) - - # ===== Best Model Selection ===== - load_best_model_at_end=False, # Load best checkpoint at end - metric_for_best_model=None, # Metric to use - greater_is_better=None, # True if higher is better - - # ===== Logging ===== - logging_dir=None, # TensorBoard log directory - logging_strategy="steps", # "no", "steps", "epoch" - logging_steps=500, # Log every N steps - logging_first_step=False, # Log first step - logging_nan_inf_filter=True, # Filter NaN/Inf - - # ===== Mixed Precision ===== - fp16=False, # Use fp16 training - fp16_opt_level="O1", # Apex AMP optimization level - fp16_backend="auto", # "auto", "apex", "cpu_amp" - bf16=False, # Use bfloat16 training - bf16_full_eval=False, # Use bf16 for evaluation - tf32=None, # Use TF32 (Ampere+ GPUs) - - # ===== Memory Optimization ===== - gradient_checkpointing=False, # Enable gradient checkpointing - gradient_checkpointing_kwargs=None, # Kwargs for gradient checkpointing - torch_empty_cache_steps=None, # Clear cache every N steps - - # ===== Distributed Training ===== - local_rank=-1, # Local rank for distributed - ddp_backend=None, # "nccl", "gloo", "mpi", "ccl" - ddp_find_unused_parameters=None, # Find unused parameters - ddp_bucket_cap_mb=None, # DDP bucket size - fsdp="", # FSDP configuration - fsdp_config=None, # FSDP config dict - deepspeed=None, # DeepSpeed config - - # ===== Hub Integration ===== - push_to_hub=False, # Push to Hugging Face Hub - hub_model_id=None, # Hub model ID - hub_strategy="every_save", # "every_save", "checkpoint", "end" - hub_token=None, # Hub authentication token - hub_private_repo=False, # Make repo private - - # ===== Data Handling ===== - dataloader_num_workers=0, # DataLoader workers - dataloader_pin_memory=True, # Pin memory - dataloader_drop_last=False, # Drop last incomplete batch - dataloader_prefetch_factor=None, # Prefetch factor - remove_unused_columns=True, # Remove unused dataset columns - label_names=None, # Label column names - - # ===== Other ===== 
- seed=42, # Random seed - data_seed=None, # Data sampling seed - jit_mode_eval=False, # Use PyTorch JIT for eval - use_ipex=False, # Use Intel Extension for PyTorch - torch_compile=False, # Use torch.compile() - torch_compile_backend=None, # Compile backend - torch_compile_mode=None, # Compile mode - include_inputs_for_metrics=False, # Pass inputs to compute_metrics - skip_memory_metrics=True, # Skip memory profiling + output_dir="./results", + num_train_epochs=3, + per_device_train_batch_size=16, + per_device_eval_batch_size=64, + learning_rate=2e-5, + weight_decay=0.01, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + metric_for_best_model="accuracy", + fp16=True, + logging_steps=100, + save_total_limit=2, ) ``` ### Trainer -Main training class with full training loop. - -**Key Methods:** - ```python from transformers import Trainer trainer = Trainer( - model=None, # Model to train - args=None, # TrainingArguments - data_collator=None, # Data collator - train_dataset=None, # Training dataset - eval_dataset=None, # Evaluation dataset - tokenizer=None, # Tokenizer - model_init=None, # Function to instantiate model - compute_metrics=None, # Function to compute metrics - callbacks=None, # List of callbacks - optimizers=(None, None), # (optimizer, scheduler) tuple - preprocess_logits_for_metrics=None, # Preprocess logits before metrics -) - -# Train model -train_result = trainer.train( - resume_from_checkpoint=None, # Resume from checkpoint - trial=None, # Optuna/Ray trial - ignore_keys_for_eval=None, # Keys to ignore in eval -) - -# Evaluate model -eval_result = trainer.evaluate( - eval_dataset=None, # Eval dataset (default: self.eval_dataset) - ignore_keys=None, # Keys to ignore - metric_key_prefix="eval", # Prefix for metric names -) - -# Make predictions -predictions = trainer.predict( - test_dataset, # Test dataset - ignore_keys=None, # Keys to ignore - metric_key_prefix="test", # Metric prefix -) -# Returns: PredictionOutput(predictions, label_ids, metrics) - -# Save model -trainer.save_model(output_dir=None) - -# Push to Hub -trainer.push_to_hub( - commit_message="End of training", - blocking=True, - **kwargs -) - -# Hyperparameter search -best_trial = trainer.hyperparameter_search( - hp_space=None, # Hyperparameter search space - compute_objective=None, # Objective function - n_trials=20, # Number of trials - direction="minimize", # "minimize" or "maximize" - backend=None, # "optuna", "ray", "sigopt" - **kwargs -) - -# Create optimizer -optimizer = trainer.create_optimizer() - -# Create scheduler -scheduler = trainer.create_scheduler( - num_training_steps, - optimizer=None -) - -# Log metrics -trainer.log_metrics(split, metrics) -trainer.save_metrics(split, metrics) - -# Save checkpoint -trainer.save_state() - -# Access current step/epoch -current_step = trainer.state.global_step -current_epoch = trainer.state.epoch - -# Access training logs -logs = trainer.state.log_history -``` - -### Seq2SeqTrainer - -Specialized trainer for sequence-to-sequence models. 
- -```python -from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments - -# Use Seq2SeqTrainingArguments with additional parameters -training_args = Seq2SeqTrainingArguments( - output_dir="./results", - predict_with_generate=True, # Use generate() for evaluation - generation_max_length=None, # Max length for generation - generation_num_beams=None, # Num beams for generation - **other_training_arguments -) - -# Trainer usage is identical to Trainer -trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, - tokenizer=tokenizer, - data_collator=data_collator, compute_metrics=compute_metrics, + data_collator=data_collator, + callbacks=[callback1, callback2], +) + +# Train +trainer.train() + +# Resume from checkpoint +trainer.train(resume_from_checkpoint=True) + +# Evaluate +metrics = trainer.evaluate() + +# Predict +predictions = trainer.predict(test_dataset) + +# Hyperparameter search +best_trial = trainer.hyperparameter_search( + direction="maximize", + backend="optuna", + n_trials=10, +) + +# Save model +trainer.save_model("./final_model") + +# Push to Hub +trainer.push_to_hub(commit_message="Training complete") +``` + +### Data Collators + +```python +from transformers import ( + DataCollatorWithPadding, + DataCollatorForTokenClassification, + DataCollatorForSeq2Seq, + DataCollatorForLanguageModeling, + DefaultDataCollator, +) + +# For classification/regression with dynamic padding +data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + +# For token classification (NER) +data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) + +# For seq2seq tasks +data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) + +# For language modeling +data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, + mlm=True, # True for masked LM, False for causal LM + mlm_probability=0.15 +) + +# Default (no special handling) +data_collator = DefaultDataCollator() +``` + +## Generation Components + +### GenerationConfig + +See `generation_strategies.md` for comprehensive coverage. + +```python +from transformers import GenerationConfig + +config = GenerationConfig( + max_new_tokens=100, + do_sample=True, + temperature=0.7, + top_p=0.9, + top_k=50, + num_beams=5, + repetition_penalty=1.2, + no_repeat_ngram_size=3, +) + +# Use with model +outputs = model.generate(**inputs, generation_config=config) +``` + +### generate() Method + +```python +outputs = model.generate( + input_ids=inputs.input_ids, + attention_mask=inputs.attention_mask, + max_new_tokens=100, + do_sample=True, + temperature=0.7, + top_p=0.9, + num_return_sequences=3, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id, ) ``` -## Pipeline Classes +## Pipeline API -### pipeline() - -Unified inference API for all tasks. +See `pipelines.md` for comprehensive coverage. 
```python from transformers import pipeline -pipe = pipeline( - task=None, # Task name (required) - model=None, # Model name/path or model object - config=None, # Model config - tokenizer=None, # Tokenizer - feature_extractor=None, # Feature extractor - image_processor=None, # Image processor - framework=None, # "pt" or "tf" - revision=None, # Model revision - use_fast=True, # Use fast tokenizer - token=None, # HF token - device=None, # Device (-1 for CPU, 0+ for GPU) - device_map=None, # Device map for multi-GPU - torch_dtype=None, # Model dtype - trust_remote_code=False, # Allow custom code - model_kwargs=None, # Additional model kwargs - pipeline_class=None, # Custom pipeline class - **kwargs -) +# Basic usage +pipe = pipeline("task-name", model="model-name", device=0) +results = pipe(inputs) -# Use pipeline -results = pipe( - inputs, # Input data - **task_specific_parameters -) -``` +# With custom model +from transformers import AutoModelForSequenceClassification, AutoTokenizer -## Data Collators - -Batch and pad data for training. - -```python -from transformers import ( - DataCollatorWithPadding, # Dynamic padding for classification - DataCollatorForTokenClassification, # Padding for token classification - DataCollatorForSeq2Seq, # Padding for seq2seq - DataCollatorForLanguageModeling, # MLM/CLM data collation - default_data_collator, # Simple collator (no padding) -) - -# Text classification -data_collator = DataCollatorWithPadding( - tokenizer=tokenizer, - padding=True, - max_length=None, - pad_to_multiple_of=None, -) - -# Token classification -data_collator = DataCollatorForTokenClassification( - tokenizer=tokenizer, - padding=True, - max_length=None, - pad_to_multiple_of=None, - label_pad_token_id=-100, -) - -# Seq2Seq -data_collator = DataCollatorForSeq2Seq( - tokenizer=tokenizer, - model=None, - padding=True, - max_length=None, - pad_to_multiple_of=None, - label_pad_token_id=-100, -) - -# Language modeling -data_collator = DataCollatorForLanguageModeling( - tokenizer=tokenizer, - mlm=True, # Masked LM (False for causal LM) - mlm_probability=0.15, # Mask probability - pad_to_multiple_of=None, -) -``` - -## Optimization & Scheduling - -```python -from transformers import ( - AdamW, # AdamW optimizer - Adafactor, # Adafactor optimizer - get_scheduler, # Get LR scheduler - get_linear_schedule_with_warmup, - get_cosine_schedule_with_warmup, - get_polynomial_decay_schedule_with_warmup, -) - -# Create optimizer -optimizer = AdamW( - model.parameters(), - lr=5e-5, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0.01, -) - -# Create scheduler -scheduler = get_scheduler( - name="linear", # "linear", "cosine", "polynomial", "constant" - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=total_steps, -) - -# Or use specific schedulers -scheduler = get_linear_schedule_with_warmup( - optimizer, - num_warmup_steps=warmup_steps, - num_training_steps=total_steps, -) - -scheduler = get_cosine_schedule_with_warmup( - optimizer, - num_warmup_steps=warmup_steps, - num_training_steps=total_steps, - num_cycles=0.5, -) +model = AutoModelForSequenceClassification.from_pretrained("model-name") +tokenizer = AutoTokenizer.from_pretrained("model-name") +pipe = pipeline("text-classification", model=model, tokenizer=tokenizer) ``` ## Configuration Classes +### Model Configuration + ```python from transformers import AutoConfig # Load configuration -config = AutoConfig.from_pretrained( - pretrained_model_name_or_path, - **kwargs -) +config = AutoConfig.from_pretrained("bert-base-uncased") -# 
Common configuration attributes -config.vocab_size # Vocabulary size -config.hidden_size # Hidden layer size -config.num_hidden_layers # Number of layers -config.num_attention_heads # Attention heads -config.intermediate_size # FFN intermediate size -config.hidden_dropout_prob # Dropout probability -config.attention_probs_dropout_prob # Attention dropout -config.max_position_embeddings # Max sequence length +# Access configuration +print(config.hidden_size) +print(config.num_attention_heads) +print(config.num_hidden_layers) + +# Modify configuration +config.num_labels = 5 +config.output_hidden_states = True + +# Create model with config +model = AutoModel.from_config(config) # Save configuration -config.save_pretrained(save_directory) - -# Create model from config -from transformers import AutoModel -model = AutoModel.from_config(config) +config.save_pretrained("./config") ``` -## Utility Functions +## Utilities + +### Hub Utilities ```python -from transformers import ( - set_seed, # Set random seed - logging, # Logging utilities -) +from huggingface_hub import login, snapshot_download -# Set seed for reproducibility -set_seed(42) +# Login +login(token="hf_...") -# Configure logging -logging.set_verbosity_info() -logging.set_verbosity_warning() -logging.set_verbosity_error() -logging.set_verbosity_debug() +# Download model +snapshot_download(repo_id="model-name", cache_dir="./cache") -# Get logger -logger = logging.get_logger(__name__) +# Push to Hub +model.push_to_hub("username/model-name", commit_message="Initial commit") +tokenizer.push_to_hub("username/model-name") +``` + +### Evaluation Metrics + +```python +import evaluate + +# Load metric +metric = evaluate.load("accuracy") + +# Compute metric +results = metric.compute(predictions=predictions, references=labels) + +# Common metrics +accuracy = evaluate.load("accuracy") +precision = evaluate.load("precision") +recall = evaluate.load("recall") +f1 = evaluate.load("f1") +bleu = evaluate.load("bleu") +rouge = evaluate.load("rouge") ``` ## Model Outputs -All models return model-specific output classes (subclasses of `ModelOutput`): +All models return dataclass objects with named attributes: ```python -# Common output attributes -outputs.loss # Loss (if labels provided) -outputs.logits # Model logits -outputs.hidden_states # All hidden states (if output_hidden_states=True) -outputs.attentions # Attention weights (if output_attentions=True) +# Sequence classification output +outputs = model(**inputs) +logits = outputs.logits # [batch_size, num_labels] +loss = outputs.loss # If labels provided -# Seq2Seq specific -outputs.encoder_last_hidden_state -outputs.encoder_hidden_states -outputs.encoder_attentions -outputs.decoder_hidden_states -outputs.decoder_attentions -outputs.cross_attentions +# Causal LM output +outputs = model(**inputs) +logits = outputs.logits # [batch_size, seq_length, vocab_size] +past_key_values = outputs.past_key_values # KV cache -# Access as dict or tuple +# Seq2Seq output +outputs = model(**inputs, labels=labels) +loss = outputs.loss logits = outputs.logits -logits = outputs["logits"] -loss, logits = outputs.to_tuple()[:2] +encoder_last_hidden_state = outputs.encoder_last_hidden_state + +# Access as dict +outputs_dict = outputs.to_tuple() # or dict(outputs) ``` -This reference covers the most commonly used API components. For complete documentation, refer to https://huggingface.co/docs/transformers. +## Best Practices + +1. **Use Auto classes**: AutoModel, AutoTokenizer for flexibility +2. 
**Device management**: Use `device_map="auto"` for multi-GPU +3. **Memory optimization**: Use `torch_dtype=torch.float16` and quantization +4. **Caching**: Set `cache_dir` to avoid re-downloading +5. **Batch processing**: Process multiple inputs at once for efficiency +6. **Trust remote code**: Only set `trust_remote_code=True` for trusted sources diff --git a/scientific-packages/transformers/references/generation_strategies.md b/scientific-packages/transformers/references/generation_strategies.md index 9ad4486..79ff40b 100644 --- a/scientific-packages/transformers/references/generation_strategies.md +++ b/scientific-packages/transformers/references/generation_strategies.md @@ -1,22 +1,8 @@ # Text Generation Strategies -Comprehensive guide to text generation methods in Transformers for controlling output quality, creativity, and diversity. +Transformers provides flexible text generation capabilities through the `generate()` method, supporting multiple decoding strategies and configuration options. -## Overview - -Text generation is the process of predicting tokens sequentially using a language model. The choice of generation strategy significantly impacts output quality, diversity, and computational cost. - -**When to use each strategy:** -- **Greedy**: Fast, deterministic, good for short outputs or when consistency is critical -- **Beam Search**: Better quality for tasks with clear "correct" answers (translation, summarization) -- **Sampling**: Creative, diverse outputs for open-ended generation (stories, dialogue) -- **Top-k/Top-p**: Balanced creativity and coherence - -## Basic Generation Methods - -### Greedy Decoding - -Selects the highest probability token at each step. Fast but prone to repetition and suboptimal sequences. +## Basic Generation ```python from transformers import AutoModelForCausalLM, AutoTokenizer @@ -24,507 +10,364 @@ from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained("gpt2") tokenizer = AutoTokenizer.from_pretrained("gpt2") -inputs = tokenizer("The future of AI", return_tensors="pt") - -# Greedy decoding (default) +inputs = tokenizer("Once upon a time", return_tensors="pt") outputs = model.generate(**inputs, max_new_tokens=50) -print(tokenizer.decode(outputs[0])) +generated_text = tokenizer.decode(outputs[0]) ``` -**Characteristics:** -- Deterministic (always same output for same input) -- Fast (single forward pass per token) -- Prone to repetition in longer sequences -- Best for: Short generations, deterministic applications +## Decoding Strategies -**Parameters:** -```python -outputs = model.generate( - **inputs, - max_new_tokens=50, # Number of tokens to generate - min_length=10, # Minimum total length - pad_token_id=tokenizer.pad_token_id, -) -``` +### 1. Greedy Decoding -### Beam Search - -Maintains multiple hypotheses (beams) and selects the sequence with highest overall probability. +Selects the token with highest probability at each step. Deterministic but can be repetitive. 
```python outputs = model.generate( **inputs, max_new_tokens=50, - num_beams=5, # Number of beams - early_stopping=True, # Stop when all beams finish - no_repeat_ngram_size=2, # Prevent 2-gram repetition + do_sample=False, + num_beams=1 # Greedy is default when num_beams=1 and do_sample=False ) ``` -**Characteristics:** -- Higher quality than greedy for tasks with "correct" answers -- Slower than greedy (num_beams forward passes per step) -- Still can suffer from repetition -- Best for: Translation, summarization, QA generation +### 2. Beam Search + +Explores multiple hypotheses simultaneously, keeping top-k candidates at each step. -**Advanced Parameters:** ```python outputs = model.generate( **inputs, + max_new_tokens=50, + num_beams=5, # Number of beams + early_stopping=True, # Stop when all beams reach EOS + no_repeat_ngram_size=2, # Prevent repeating n-grams +) +``` + +**Key parameters:** +- `num_beams`: Number of beams (higher = more thorough but slower) +- `early_stopping`: Stop when all beams finish (True/False) +- `length_penalty`: Exponential penalty for length (>1.0 favors longer sequences) +- `no_repeat_ngram_size`: Prevent repeating n-grams + +### 3. Sampling (Multinomial) + +Samples from probability distribution, introducing randomness and diversity. + +```python +outputs = model.generate( + **inputs, + max_new_tokens=50, + do_sample=True, + temperature=0.7, # Controls randomness (lower = more focused) + top_k=50, # Consider only top-k tokens + top_p=0.9, # Nucleus sampling (cumulative probability threshold) +) +``` + +**Key parameters:** +- `temperature`: Scales logits before softmax (0.1-2.0 typical range) + - Lower (0.1-0.7): More focused, deterministic + - Higher (0.8-1.5): More creative, random +- `top_k`: Sample from top-k tokens only +- `top_p`: Nucleus sampling - sample from smallest set with cumulative probability > p + +### 4. Beam Search with Sampling + +Combines beam search with sampling for diverse but coherent outputs. + +```python +outputs = model.generate( + **inputs, + max_new_tokens=50, num_beams=5, - num_beam_groups=1, # Diverse beam search groups - diversity_penalty=0.0, # Penalty for similar beams - length_penalty=1.0, # >1: longer sequences, <1: shorter - early_stopping=True, # Stop when num_beams sequences finish - no_repeat_ngram_size=2, # Block repeating n-grams - num_return_sequences=1, # Return top-k sequences (≤ num_beams) -) -``` - -**Length Penalty:** -- `length_penalty > 1.0`: Favor longer sequences -- `length_penalty = 1.0`: No penalty -- `length_penalty < 1.0`: Favor shorter sequences - -### Sampling (Multinomial) - -Randomly sample tokens according to the probability distribution. 
- -```python -outputs = model.generate( - **inputs, - max_new_tokens=50, - do_sample=True, # Enable sampling - temperature=1.0, # Sampling temperature - num_beams=1, # Must be 1 for sampling -) -``` - -**Characteristics:** -- Non-deterministic (different output each time) -- More diverse and creative than greedy/beam search -- Can produce incoherent output if not controlled -- Best for: Creative writing, dialogue, open-ended generation - -**Temperature Parameter:** -```python -# Low temperature (0.1-0.7): More focused, less random -outputs = model.generate(**inputs, do_sample=True, temperature=0.5) - -# Medium temperature (0.7-1.0): Balanced -outputs = model.generate(**inputs, do_sample=True, temperature=0.8) - -# High temperature (1.0-2.0): More random, more creative -outputs = model.generate(**inputs, do_sample=True, temperature=1.5) -``` - -- `temperature → 0`: Approaches greedy decoding -- `temperature = 1.0`: Sample from original distribution -- `temperature > 1.0`: Flatter distribution, more random -- `temperature < 1.0`: Sharper distribution, more confident - -## Advanced Sampling Methods - -### Top-k Sampling - -Sample from only the k most likely tokens. - -```python -outputs = model.generate( - **inputs, do_sample=True, - max_new_tokens=50, - top_k=50, # Consider top 50 tokens temperature=0.8, + top_k=50, ) ``` -**How it works:** -1. Filter to top-k most probable tokens -2. Renormalize probabilities -3. Sample from filtered distribution +### 5. Contrastive Search -**Choosing k:** -- `k=1`: Equivalent to greedy decoding -- `k=10-50`: More focused, coherent output -- `k=100-500`: More diverse output -- Too high k: Includes low-probability tokens (noise) -- Too low k: Less diverse, may miss good alternatives - -### Top-p (Nucleus) Sampling - -Sample from the smallest set of tokens whose cumulative probability ≥ p. +Balances coherence and diversity using contrastive objective. ```python outputs = model.generate( **inputs, - do_sample=True, max_new_tokens=50, - top_p=0.95, # Nucleus probability - temperature=0.8, + penalty_alpha=0.6, # Contrastive penalty + top_k=4, # Consider top-k candidates ) ``` -**How it works:** -1. Sort tokens by probability -2. Find smallest set with cumulative probability ≥ p -3. Sample from this set +### 6. Assisted Decoding -**Choosing p:** -- `p=0.9-0.95`: Good balance (recommended) -- `p=1.0`: Sample from full distribution -- Higher p: More diverse, might include unlikely tokens -- Lower p: More focused, like top-k with adaptive k - -**Top-p vs Top-k:** -- Top-p adapts to probability distribution shape -- Top-k is fixed regardless of distribution -- Top-p generally better for variable-quality contexts -- Can combine: `top_k=50, top_p=0.95` (apply both filters) - -### Combining Strategies +Uses a smaller "assistant" model to speed up generation of larger model. ```python -# Recommended for high-quality open-ended generation +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("gpt2-large") +assistant_model = AutoModelForCausalLM.from_pretrained("gpt2") + outputs = model.generate( **inputs, - do_sample=True, + assistant_model=assistant_model, + max_new_tokens=50, +) +``` + +## GenerationConfig + +Configure generation parameters with `GenerationConfig` for reusability. 
+ +```python +from transformers import GenerationConfig + +generation_config = GenerationConfig( max_new_tokens=100, - temperature=0.8, # Moderate temperature - top_k=50, # Limit to top 50 tokens - top_p=0.95, # Nucleus sampling - repetition_penalty=1.2, # Discourage repetition - no_repeat_ngram_size=3, # Block 3-gram repetition + do_sample=True, + temperature=0.7, + top_p=0.9, + top_k=50, + repetition_penalty=1.2, + no_repeat_ngram_size=3, ) + +# Use with model +outputs = model.generate(**inputs, generation_config=generation_config) + +# Save and load +generation_config.save_pretrained("./config") +loaded_config = GenerationConfig.from_pretrained("./config") ``` -## Controlling Generation Quality +## Key Parameters Reference + +### Output Length Control + +- `max_length`: Maximum total tokens (input + output) +- `max_new_tokens`: Maximum new tokens to generate (recommended over max_length) +- `min_length`: Minimum total tokens +- `min_new_tokens`: Minimum new tokens to generate + +### Sampling Parameters + +- `temperature`: Sampling temperature (0.1-2.0, default 1.0) +- `top_k`: Top-k sampling (1-100, typically 50) +- `top_p`: Nucleus sampling (0.0-1.0, typically 0.9) +- `do_sample`: Enable sampling (True/False) + +### Beam Search Parameters + +- `num_beams`: Number of beams (1-20, typically 5) +- `early_stopping`: Stop when beams finish (True/False) +- `length_penalty`: Length penalty (>1.0 favors longer, <1.0 favors shorter) +- `num_beam_groups`: Diverse beam search groups +- `diversity_penalty`: Penalty for similar beams ### Repetition Control -Prevent models from repeating themselves: +- `repetition_penalty`: Penalty for repeating tokens (1.0-2.0, default 1.0) +- `no_repeat_ngram_size`: Prevent repeating n-grams (2-5 typical) +- `encoder_repetition_penalty`: Penalty for repeating encoder tokens -```python -outputs = model.generate( - **inputs, - max_new_tokens=100, +### Special Tokens - # Method 1: Repetition penalty - repetition_penalty=1.2, # Penalize repeated tokens (>1.0) +- `bos_token_id`: Beginning of sequence token +- `eos_token_id`: End of sequence token (or list of tokens) +- `pad_token_id`: Padding token +- `forced_bos_token_id`: Force specific token at beginning +- `forced_eos_token_id`: Force specific token at end - # Method 2: Block n-gram repetition - no_repeat_ngram_size=3, # Never repeat 3-grams +### Multiple Sequences - # Method 3: Encoder repetition penalty (for seq2seq) - encoder_repetition_penalty=1.0, # Penalize input tokens -) -``` +- `num_return_sequences`: Number of sequences to return +- `num_beam_groups`: Number of diverse beam groups -**Repetition Penalty Values:** -- `1.0`: No penalty -- `1.0-1.5`: Mild penalty (recommended: 1.1-1.3) -- `>1.5`: Strong penalty (may harm coherence) +## Advanced Generation Techniques -### Length Control +### Constrained Generation -```python -outputs = model.generate( - **inputs, - - # Hard constraints - min_length=20, # Minimum total length - max_length=100, # Maximum total length - max_new_tokens=50, # Maximum new tokens (excluding input) - - # Soft constraints (with beam search) - length_penalty=1.0, # Encourage longer/shorter outputs - - # Early stopping - early_stopping=True, # Stop when condition met -) -``` - -### Bad Words and Forced Tokens - -```python -# Prevent specific tokens -bad_words_ids = [ - tokenizer.encode("badword1", add_special_tokens=False), - tokenizer.encode("badword2", add_special_tokens=False), -] - -outputs = model.generate( - **inputs, - bad_words_ids=bad_words_ids, -) - -# Force specific tokens 
-force_words_ids = [ - tokenizer.encode("important", add_special_tokens=False), -] - -outputs = model.generate( - **inputs, - force_words_ids=force_words_ids, -) -``` - -## Streaming Generation - -Generate and process tokens as they're produced: - -```python -from transformers import TextStreamer, TextIteratorStreamer -from threading import Thread - -# Simple streaming (prints to stdout) -streamer = TextStreamer(tokenizer, skip_prompt=True) -outputs = model.generate(**inputs, streamer=streamer, max_new_tokens=100) - -# Iterator streaming (for custom processing) -streamer = TextIteratorStreamer(tokenizer, skip_prompt=True) - -generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=100) -thread = Thread(target=model.generate, kwargs=generation_kwargs) -thread.start() - -for text in streamer: - print(text, end="", flush=True) - -thread.join() -``` - -## Advanced Techniques - -### Contrastive Search - -Balance coherence and diversity using contrastive objective: - -```python -outputs = model.generate( - **inputs, - max_new_tokens=50, - penalty_alpha=0.6, # Contrastive penalty - top_k=4, # Consider top-4 tokens -) -``` - -**When to use:** -- Open-ended text generation -- Reduces repetition without sacrificing coherence -- Good alternative to sampling - -### Diverse Beam Search - -Generate multiple diverse outputs: - -```python -outputs = model.generate( - **inputs, - max_new_tokens=50, - num_beams=10, - num_beam_groups=5, # 5 groups of 2 beams each - diversity_penalty=1.0, # Penalty for similar beams - num_return_sequences=5, # Return 5 diverse outputs -) -``` - -### Constrained Beam Search - -Force output to include specific phrases: +Force generation to include specific tokens or follow patterns. ```python from transformers import PhrasalConstraint constraints = [ - PhrasalConstraint( - tokenizer("machine learning", add_special_tokens=False).input_ids - ), + PhrasalConstraint(tokenizer("New York", add_special_tokens=False).input_ids) ] outputs = model.generate( **inputs, constraints=constraints, - num_beams=10, # Requires beam search + num_beams=5, ) ``` -## Speculative Decoding +### Streaming Generation -Accelerate generation using a smaller draft model: +Generate tokens one at a time for real-time display. ```python -from transformers import AutoModelForCausalLM +from transformers import TextIteratorStreamer +from threading import Thread -# Load main and assistant models -model = AutoModelForCausalLM.from_pretrained("large-model") -assistant_model = AutoModelForCausalLM.from_pretrained("small-model") +streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True) + +generation_kwargs = dict( + **inputs, + max_new_tokens=100, + streamer=streamer, +) + +thread = Thread(target=model.generate, kwargs=generation_kwargs) +thread.start() + +for new_text in streamer: + print(new_text, end="", flush=True) + +thread.join() +``` + +### Logit Processors + +Customize token selection with custom logit processors. + +```python +from transformers import LogitsProcessor, LogitsProcessorList + +class CustomLogitsProcessor(LogitsProcessor): + def __call__(self, input_ids, scores): + # Modify scores here + return scores + +logits_processor = LogitsProcessorList([CustomLogitsProcessor()]) -# Generate with speculative decoding outputs = model.generate( **inputs, - assistant_model=assistant_model, + logits_processor=logits_processor, +) +``` + +### Stopping Criteria + +Define custom stopping conditions. 
+ +```python +from transformers import StoppingCriteria, StoppingCriteriaList + +class CustomStoppingCriteria(StoppingCriteria): + def __call__(self, input_ids, scores, **kwargs): + # Return True to stop generation + return False + +stopping_criteria = StoppingCriteriaList([CustomStoppingCriteria()]) + +outputs = model.generate( + **inputs, + stopping_criteria=stopping_criteria, +) +``` + +## Best Practices + +### For Creative Tasks (Stories, Dialogue) +```python +outputs = model.generate( + **inputs, + max_new_tokens=200, do_sample=True, temperature=0.8, -) -``` - -**Benefits:** -- 2-3x faster generation -- Identical output distribution to regular generation -- Works with sampling and greedy decoding - -## Recipe: Recommended Settings by Task - -### Creative Writing / Dialogue - -```python -outputs = model.generate( - **inputs, - do_sample=True, - max_new_tokens=200, - temperature=0.9, top_p=0.95, - top_k=50, repetition_penalty=1.2, no_repeat_ngram_size=3, ) ``` -### Translation / Summarization - +### For Factual Tasks (Summaries, QA) ```python outputs = model.generate( **inputs, - num_beams=5, - max_new_tokens=150, + max_new_tokens=100, + num_beams=4, early_stopping=True, - length_penalty=1.0, no_repeat_ngram_size=2, + length_penalty=1.0, ) ``` -### Code Generation - +### For Chat/Instruction Following ```python outputs = model.generate( **inputs, - max_new_tokens=300, - temperature=0.2, # Low temperature for correctness - top_p=0.95, + max_new_tokens=512, do_sample=True, -) -``` - -### Chatbot / Instruction Following - -```python -outputs = model.generate( - **inputs, - do_sample=True, - max_new_tokens=256, temperature=0.7, top_p=0.9, - repetition_penalty=1.15, + repetition_penalty=1.1, ) ``` -### Factual QA / Information Extraction +## Vision-Language Model Generation + +For models like LLaVA, BLIP-2, etc.: ```python -outputs = model.generate( - **inputs, - max_new_tokens=50, - num_beams=3, - early_stopping=True, - # Or greedy for very short answers: - # (no special parameters needed) -) -``` +from transformers import AutoProcessor, AutoModelForVision2Seq +from PIL import Image -## Debugging Generation +model = AutoModelForVision2Seq.from_pretrained("llava-hf/llava-1.5-7b-hf") +processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") -### Check Token Probabilities - -```python -outputs = model.generate( - **inputs, - max_new_tokens=20, - output_scores=True, # Return generation scores - return_dict_in_generate=True, # Return as dict -) - -# Access generation scores -scores = outputs.scores # Tuple of tensors (seq_len, vocab_size) - -# Get token probabilities -import torch -probs = torch.softmax(scores[0], dim=-1) -``` - -### Monitor Generation Process - -```python -from transformers import LogitsProcessor, LogitsProcessorList - -class DebugLogitsProcessor(LogitsProcessor): - def __call__(self, input_ids, scores): - # Print top 5 tokens at each step - top_tokens = scores[0].topk(5) - print(f"Top 5 tokens: {top_tokens}") - return scores +image = Image.open("image.jpg") +inputs = processor(text="Describe this image", images=image, return_tensors="pt") outputs = model.generate( **inputs, - max_new_tokens=10, - logits_processor=LogitsProcessorList([DebugLogitsProcessor()]), + max_new_tokens=100, + do_sample=True, + temperature=0.7, ) + +generated_text = processor.decode(outputs[0], skip_special_tokens=True) ``` -## Common Issues and Solutions - -**Issue: Repetitive output** -- Solution: Increase `repetition_penalty` (1.2-1.5), set `no_repeat_ngram_size=3` -- For sampling: 
Increase `temperature`, enable `top_p` - -**Issue: Incoherent output** -- Solution: Lower `temperature` (0.5-0.8), use beam search -- Set `top_k=50` or `top_p=0.9` to filter unlikely tokens - -**Issue: Too short output** -- Solution: Increase `min_length`, set `length_penalty > 1.0` (beam search) -- Check if EOS token is being generated early - -**Issue: Too slow generation** -- Solution: Use greedy instead of beam search -- Reduce `num_beams` -- Try speculative decoding with assistant model -- Use smaller model variant - -**Issue: Output doesn't follow format** -- Solution: Use constrained beam search -- Add format examples to prompt -- Use `bad_words_ids` to prevent format-breaking tokens - ## Performance Optimization +### Use KV Cache ```python -# Use half precision -model = AutoModelForCausalLM.from_pretrained( - "model-name", - torch_dtype=torch.float16, - device_map="auto" -) - -# Use KV cache optimization (default, but can be disabled) +# KV cache is enabled by default outputs = model.generate(**inputs, use_cache=True) - -# Batch generation -inputs = tokenizer(["Prompt 1", "Prompt 2"], return_tensors="pt", padding=True) -outputs = model.generate(**inputs, max_new_tokens=50) - -# Static cache for longer sequences (if supported) -outputs = model.generate(**inputs, cache_implementation="static") ``` -This guide covers the main generation strategies. For task-specific examples, see `task_patterns.md`. +### Mixed Precision +```python +import torch + +with torch.cuda.amp.autocast(): + outputs = model.generate(**inputs, max_new_tokens=100) +``` + +### Batch Generation +```python +texts = ["Prompt 1", "Prompt 2", "Prompt 3"] +inputs = tokenizer(texts, return_tensors="pt", padding=True) +outputs = model.generate(**inputs, max_new_tokens=50) +``` + +### Quantization +```python +from transformers import BitsAndBytesConfig + +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16 +) + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", + quantization_config=quantization_config, + device_map="auto" +) +``` diff --git a/scientific-packages/transformers/references/pipelines.md b/scientific-packages/transformers/references/pipelines.md new file mode 100644 index 0000000..8ac74a3 --- /dev/null +++ b/scientific-packages/transformers/references/pipelines.md @@ -0,0 +1,234 @@ +# Transformers Pipelines + +Pipelines provide a simple and optimized interface for inference across many machine learning tasks. They abstract away the complexity of tokenization, model invocation, and post-processing. 
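+
+As an illustrative sketch (not the library's exact internal code), the three steps a pipeline abstracts away look roughly like this, using the same sentiment checkpoint referenced in the usage pattern below:
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+# 1. Tokenization: raw text -> input tensors
+inputs = tokenizer("This movie was amazing!", return_tensors="pt")
+
+# 2. Model invocation: input tensors -> logits
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# 3. Post-processing: logits -> human-readable label and score
+probs = logits.softmax(dim=-1)
+predicted_id = probs.argmax(dim=-1).item()
+print(model.config.id2label[predicted_id], round(probs[0, predicted_id].item(), 4))
+```
+
+The `pipeline()` call shown in the next section performs these same steps in a single call.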
+ +## Usage Pattern + +```python +from transformers import pipeline + +# Basic usage +classifier = pipeline("text-classification") +result = classifier("This movie was amazing!") + +# With specific model +classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english") +result = classifier("This movie was amazing!") +``` + +## Natural Language Processing Pipelines + +### Text Classification +```python +classifier = pipeline("text-classification") +classifier("I love this product!") +# [{'label': 'POSITIVE', 'score': 0.9998}] +``` + +### Zero-Shot Classification +```python +classifier = pipeline("zero-shot-classification") +classifier("This is about climate change", candidate_labels=["politics", "science", "sports"]) +``` + +### Token Classification (NER) +```python +ner = pipeline("token-classification") +ner("My name is Sarah and I work at Microsoft in Seattle") +``` + +### Question Answering +```python +qa = pipeline("question-answering") +qa(question="What is the capital?", context="The capital of France is Paris.") +``` + +### Text Generation +```python +generator = pipeline("text-generation") +generator("Once upon a time", max_length=50) +``` + +### Text2Text Generation +```python +generator = pipeline("text2text-generation", model="t5-base") +generator("translate English to French: Hello") +``` + +### Summarization +```python +summarizer = pipeline("summarization") +summarizer("Long article text here...", max_length=130, min_length=30) +``` + +### Translation +```python +translator = pipeline("translation_en_to_fr") +translator("Hello, how are you?") +``` + +### Fill Mask +```python +unmasker = pipeline("fill-mask") +unmasker("Paris is the [MASK] of France.") +``` + +### Feature Extraction +```python +extractor = pipeline("feature-extraction") +embeddings = extractor("This is a sentence") +``` + +### Document Question Answering +```python +doc_qa = pipeline("document-question-answering") +doc_qa(image="document.png", question="What is the invoice number?") +``` + +### Table Question Answering +```python +table_qa = pipeline("table-question-answering") +table_qa(table=data, query="How many employees?") +``` + +## Computer Vision Pipelines + +### Image Classification +```python +classifier = pipeline("image-classification") +classifier("cat.jpg") +``` + +### Zero-Shot Image Classification +```python +classifier = pipeline("zero-shot-image-classification") +classifier("cat.jpg", candidate_labels=["cat", "dog", "bird"]) +``` + +### Object Detection +```python +detector = pipeline("object-detection") +detector("street.jpg") +``` + +### Image Segmentation +```python +segmenter = pipeline("image-segmentation") +segmenter("image.jpg") +``` + +### Image-to-Image +```python +img2img = pipeline("image-to-image", model="lllyasviel/sd-controlnet-canny") +img2img("input.jpg") +``` + +### Depth Estimation +```python +depth = pipeline("depth-estimation") +depth("image.jpg") +``` + +### Video Classification +```python +classifier = pipeline("video-classification") +classifier("video.mp4") +``` + +### Keypoint Matching +```python +matcher = pipeline("keypoint-matching") +matcher(image1="img1.jpg", image2="img2.jpg") +``` + +## Audio Pipelines + +### Automatic Speech Recognition +```python +asr = pipeline("automatic-speech-recognition") +asr("audio.wav") +``` + +### Audio Classification +```python +classifier = pipeline("audio-classification") +classifier("audio.wav") +``` + +### Zero-Shot Audio Classification +```python +classifier = 
pipeline("zero-shot-audio-classification") +classifier("audio.wav", candidate_labels=["speech", "music", "noise"]) +``` + +### Text-to-Audio/Text-to-Speech +```python +synthesizer = pipeline("text-to-audio") +audio = synthesizer("Hello, how are you today?") +``` + +## Multimodal Pipelines + +### Image-to-Text (Image Captioning) +```python +captioner = pipeline("image-to-text") +captioner("image.jpg") +``` + +### Visual Question Answering +```python +vqa = pipeline("visual-question-answering") +vqa(image="image.jpg", question="What color is the car?") +``` + +### Image-Text-to-Text (VLMs) +```python +vlm = pipeline("image-text-to-text") +vlm(images="image.jpg", text="Describe this image in detail") +``` + +### Zero-Shot Object Detection +```python +detector = pipeline("zero-shot-object-detection") +detector("image.jpg", candidate_labels=["car", "person", "tree"]) +``` + +## Pipeline Configuration + +### Common Parameters + +- `model`: Specify model identifier or path +- `device`: Set device (0 for GPU, -1 for CPU, or "cuda:0") +- `batch_size`: Process multiple inputs at once +- `torch_dtype`: Set precision (torch.float16, torch.bfloat16) + +```python +# GPU with half precision +pipe = pipeline("text-generation", model="gpt2", device=0, torch_dtype=torch.float16) + +# Batch processing +pipe(["text 1", "text 2", "text 3"], batch_size=8) +``` + +### Task-Specific Parameters + +Each pipeline accepts task-specific parameters in the call: + +```python +# Text generation +generator("prompt", max_length=100, temperature=0.7, top_p=0.9, num_return_sequences=3) + +# Summarization +summarizer("text", max_length=130, min_length=30, do_sample=False) + +# Translation +translator("text", max_length=512, num_beams=4) +``` + +## Best Practices + +1. **Reuse pipelines**: Create once, use multiple times for efficiency +2. **Batch processing**: Use batches for multiple inputs to maximize throughput +3. **GPU acceleration**: Set `device=0` for GPU when available +4. **Model selection**: Choose task-specific models for best results +5. **Memory management**: Use `torch_dtype=torch.float16` for large models diff --git a/scientific-packages/transformers/references/quantization.md b/scientific-packages/transformers/references/quantization.md deleted file mode 100644 index a6a3fc4..0000000 --- a/scientific-packages/transformers/references/quantization.md +++ /dev/null @@ -1,504 +0,0 @@ -# Model Quantization Guide - -Comprehensive guide to reducing model memory footprint through quantization while maintaining accuracy. - -## Overview - -Quantization reduces memory requirements by storing model weights in lower precision formats (int8, int4) instead of full precision (float32). 
This enables: -- Running larger models on limited hardware -- Faster inference (reduced memory bandwidth) -- Lower deployment costs -- Enabling fine-tuning of models that wouldn't fit in memory - -**Tradeoffs:** -- Slight accuracy loss (typically < 1-2%) -- Initial quantization overhead -- Some methods require calibration data - -## Quick Comparison - -| Method | Precision | Speed | Accuracy | Fine-tuning | Hardware | Setup | -|--------|-----------|-------|----------|-------------|----------|-------| -| **Bitsandbytes** | 4/8-bit | Fast | High | Yes (PEFT) | CUDA, CPU | Easy | -| **GPTQ** | 2-8-bit | Very Fast | High | Limited | CUDA, ROCm, Metal | Medium | -| **AWQ** | 4-bit | Very Fast | High | Yes (PEFT) | CUDA, ROCm | Medium | -| **GGUF** | 1-8-bit | Medium | Variable | No | CPU-optimized | Easy | -| **HQQ** | 1-8-bit | Fast | High | Yes | Multi-platform | Medium | - -## Bitsandbytes (BnB) - -On-the-fly quantization with excellent PEFT fine-tuning support. - -### 8-bit Quantization - -```python -from transformers import AutoModelForCausalLM, AutoTokenizer - -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - load_in_8bit=True, # Enable 8-bit quantization - device_map="auto", # Automatic device placement -) - -tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - -# Use normally -inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda") -outputs = model.generate(**inputs, max_new_tokens=50) -``` - -**Memory Savings:** -- 7B model: ~14GB → ~7GB (50% reduction) -- 13B model: ~26GB → ~13GB -- 70B model: ~140GB → ~70GB - -**Characteristics:** -- Fast inference -- Minimal accuracy loss -- Works with PEFT (LoRA, QLoRA) -- Supports CPU and CUDA GPUs - -### 4-bit Quantization (QLoRA) - -```python -from transformers import AutoModelForCausalLM, BitsAndBytesConfig -import torch - -# Configure 4-bit quantization -bnb_config = BitsAndBytesConfig( - load_in_4bit=True, # Enable 4-bit quantization - bnb_4bit_quant_type="nf4", # Quantization type ("nf4" or "fp4") - bnb_4bit_compute_dtype=torch.float16, # Computation dtype - bnb_4bit_use_double_quant=True, # Nested quantization for more savings -) - -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - quantization_config=bnb_config, - device_map="auto", -) -``` - -**Memory Savings:** -- 7B model: ~14GB → ~4GB (70% reduction) -- 13B model: ~26GB → ~7GB -- 70B model: ~140GB → ~35GB - -**Quantization Types:** -- `nf4`: Normal Float 4 (recommended, better quality) -- `fp4`: Float Point 4 (slightly more memory efficient) - -**Compute Dtype:** -```python -# For better quality -bnb_4bit_compute_dtype=torch.float16 - -# For best performance on Ampere+ GPUs -bnb_4bit_compute_dtype=torch.bfloat16 -``` - -**Double Quantization:** -```python -# Enable for additional ~0.4 bits/param savings -bnb_4bit_use_double_quant=True # Quantize the quantization constants -``` - -### Fine-tuning with QLoRA - -```python -from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer -from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training -import torch - -# Load quantized model -bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, -) - -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - quantization_config=bnb_config, - device_map="auto", -) - -# Prepare for training -model = prepare_model_for_kbit_training(model) 
- -# Configure LoRA -lora_config = LoraConfig( - r=16, - lora_alpha=32, - target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM" -) - -model = get_peft_model(model, lora_config) - -# Train normally -trainer = Trainer(model=model, args=training_args, ...) -trainer.train() -``` - -## GPTQ - -Post-training quantization requiring calibration, optimized for inference speed. - -### Loading GPTQ Models - -```python -from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig - -# Load pre-quantized GPTQ model -model = AutoModelForCausalLM.from_pretrained( - "TheBloke/Llama-2-7B-GPTQ", # Pre-quantized model - device_map="auto", - revision="gptq-4bit-32g-actorder_True", # Specific quantization config -) - -# Or quantize yourself -gptq_config = GPTQConfig( - bits=4, # 2, 3, 4, 8 bits - dataset="c4", # Calibration dataset - tokenizer=tokenizer, -) - -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - device_map="auto", - quantization_config=gptq_config, -) - -# Save quantized model -model.save_pretrained("llama-2-7b-gptq") -``` - -**Configuration Options:** -```python -gptq_config = GPTQConfig( - bits=4, # Quantization bits - group_size=128, # Group size for quantization (128, 32, -1) - dataset="c4", # Calibration dataset - desc_act=False, # Activation order (can improve accuracy) - sym=True, # Symmetric quantization - damp_percent=0.1, # Dampening factor -) -``` - -**Characteristics:** -- Fastest inference among quantization methods -- Requires one-time calibration (slow) -- Best when using pre-quantized models from Hub -- Limited fine-tuning support -- Excellent for production deployment - -## AWQ (Activation-aware Weight Quantization) - -Protects important weights for better quality. - -### Loading AWQ Models - -```python -from transformers import AutoModelForCausalLM, AwqConfig - -# Load pre-quantized AWQ model -model = AutoModelForCausalLM.from_pretrained( - "TheBloke/Llama-2-7B-AWQ", - device_map="auto", -) - -# Or quantize yourself -awq_config = AwqConfig( - bits=4, # 4-bit quantization - group_size=128, # Quantization group size - zero_point=True, # Use zero-point quantization - version="GEMM", # Quantization version -) - -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - quantization_config=awq_config, - device_map="auto", -) -``` - -**Characteristics:** -- Better accuracy than GPTQ at same bit width -- Excellent inference speed -- Supports PEFT fine-tuning -- Requires calibration data - -### Fine-tuning AWQ Models - -```python -from peft import LoraConfig, get_peft_model - -# AWQ models support LoRA fine-tuning -lora_config = LoraConfig( - r=16, - lora_alpha=32, - target_modules=["q_proj", "v_proj"], - lora_dropout=0.05, - task_type="CAUSAL_LM" -) - -model = get_peft_model(model, lora_config) -trainer = Trainer(model=model, ...) -trainer.train() -``` - -## GGUF (GGML Format) - -CPU-optimized quantization format, popular in llama.cpp ecosystem. 
- -### Using GGUF Models - -```python -from transformers import AutoModelForCausalLM, AutoTokenizer - -# Load GGUF model -model = AutoModelForCausalLM.from_pretrained( - "TheBloke/Llama-2-7B-GGUF", - gguf_file="llama-2-7b.Q4_K_M.gguf", # Specific quantization file - device_map="auto", -) - -tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7B-GGUF") -``` - -**GGUF Quantization Types:** -- `Q4_0`: 4-bit, smallest, lowest quality -- `Q4_K_M`: 4-bit, medium quality (recommended) -- `Q5_K_M`: 5-bit, good quality -- `Q6_K`: 6-bit, high quality -- `Q8_0`: 8-bit, very high quality - -**Characteristics:** -- Optimized for CPU inference -- Wide range of bit depths (1-8) -- Good for Apple Silicon (M1/M2) -- No fine-tuning support -- Excellent for local/edge deployment - -## HQQ (Half-Quadratic Quantization) - -Flexible quantization with good accuracy retention. - -### Using HQQ - -```python -from transformers import AutoModelForCausalLM, HqqConfig - -hqq_config = HqqConfig( - nbits=4, # Quantization bits - group_size=64, # Group size - quant_zero=False, # Quantize zero point - quant_scale=False, # Quantize scale - axis=0, # Quantization axis -) - -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - quantization_config=hqq_config, - device_map="auto", -) -``` - -**Characteristics:** -- Very fast quantization -- No calibration data needed -- Support for 1-8 bits -- Can serialize/deserialize -- Good accuracy vs size tradeoff - -## Choosing a Quantization Method - -### Decision Tree - -**For inference only:** -1. Need fastest inference? → **GPTQ or AWQ** (use pre-quantized models) -2. CPU-only deployment? → **GGUF** -3. Want easiest setup? → **Bitsandbytes 8-bit** -4. Need extreme compression? → **GGUF Q4_0 or HQQ 2-bit** - -**For fine-tuning:** -1. Limited VRAM? → **QLoRA (BnB 4-bit + LoRA)** -2. Want best accuracy? → **Bitsandbytes 8-bit + LoRA** -3. Need very large models? → **QLoRA with double quantization** - -**For production:** -1. Latency-critical? → **GPTQ or AWQ** -2. Cost-optimized? → **Bitsandbytes 8-bit** -3. CPU deployment? → **GGUF** - -## Memory Requirements - -Approximate memory for Llama-2 7B model: - -| Method | Memory | vs FP16 | -|--------|--------|---------| -| FP32 | 28GB | 2x | -| FP16 / BF16 | 14GB | 1x | -| 8-bit (BnB) | 7GB | 0.5x | -| 4-bit (QLoRA) | 3.5GB | 0.25x | -| 4-bit Double Quant | 3GB | 0.21x | -| GPTQ 4-bit | 4GB | 0.29x | -| AWQ 4-bit | 4GB | 0.29x | - -**Note:** Add ~1-2GB for inference activations, KV cache, and framework overhead. 
- -## Best Practices - -### For Training - -```python -# QLoRA recommended configuration -bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, # BF16 if available - bnb_4bit_use_double_quant=True, -) - -# LoRA configuration -lora_config = LoraConfig( - r=16, # Rank (8, 16, 32, 64) - lora_alpha=32, # Scaling (typically 2*r) - target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM" -) -``` - -### For Inference - -```python -# High-speed inference -model = AutoModelForCausalLM.from_pretrained( - "TheBloke/Llama-2-7B-GPTQ", - device_map="auto", - torch_dtype=torch.float16, # Use FP16 for activations -) - -# Balanced quality/speed -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - load_in_8bit=True, - device_map="auto", -) - -# Maximum compression -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - quantization_config=BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, - ), - device_map="auto", -) -``` - -### Multi-GPU Setups - -```python -# Automatically distribute across GPUs -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-70b-hf", - load_in_4bit=True, - device_map="auto", # Automatic distribution - max_memory={0: "20GB", 1: "20GB"}, # Optional: limit per GPU -) - -# Manual device map -device_map = { - "model.embed_tokens": 0, - "model.layers.0": 0, - "model.layers.1": 0, - # ... distribute layers ... - "model.norm": 1, - "lm_head": 1, -} - -model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-2-70b-hf", - load_in_4bit=True, - device_map=device_map, -) -``` - -## Troubleshooting - -**Issue: OOM during quantization** -```python -# Solution: Use low_cpu_mem_usage -model = AutoModelForCausalLM.from_pretrained( - "model-name", - quantization_config=config, - device_map="auto", - low_cpu_mem_usage=True, # Reduce CPU memory during loading -) -``` - -**Issue: Slow quantization** -```python -# GPTQ/AWQ take time to calibrate -# Solution: Use pre-quantized models from Hub -model = AutoModelForCausalLM.from_pretrained("TheBloke/Model-GPTQ") - -# Or use BnB for instant quantization -model = AutoModelForCausalLM.from_pretrained("model-name", load_in_4bit=True) -``` - -**Issue: Poor quality after quantization** -```python -# Try different quantization types -bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", # Try "nf4" instead of "fp4" - bnb_4bit_compute_dtype=torch.bfloat16, # Use BF16 if available -) - -# Or use 8-bit instead of 4-bit -model = AutoModelForCausalLM.from_pretrained("model-name", load_in_8bit=True) -``` - -**Issue: Can't fine-tune quantized model** -```python -# Ensure using compatible quantization method -from peft import prepare_model_for_kbit_training - -model = prepare_model_for_kbit_training(model) - -# Only BnB and AWQ support PEFT fine-tuning -# GPTQ has limited support, GGUF doesn't support fine-tuning -``` - -## Performance Benchmarks - -Approximate generation speed (tokens/sec) for Llama-2 7B on A100 40GB: - -| Method | Speed | Memory | -|--------|-------|--------| -| FP16 | 100 tok/s | 14GB | -| 8-bit | 90 tok/s | 7GB | -| 4-bit QLoRA | 70 tok/s | 4GB | -| GPTQ 4-bit | 95 tok/s | 4GB | -| AWQ 4-bit | 95 tok/s | 4GB | - -**Note:** Actual performance varies by hardware, sequence length, and batch size. 
- -## Resources - -- **Pre-quantized models:** Search "GPTQ" or "AWQ" on Hugging Face Hub -- **BnB documentation:** https://github.com/TimDettmers/bitsandbytes -- **PEFT library:** https://github.com/huggingface/peft -- **QLoRA paper:** https://arxiv.org/abs/2305.14314 - -For task-specific quantization examples, see `training_guide.md`. diff --git a/scientific-packages/transformers/references/task_patterns.md b/scientific-packages/transformers/references/task_patterns.md index 3ebf89a..fe8cff7 100644 --- a/scientific-packages/transformers/references/task_patterns.md +++ b/scientific-packages/transformers/references/task_patterns.md @@ -1,77 +1,101 @@ -# Task-Specific Patterns +# Common Task Patterns -Quick reference for implementing common tasks with Transformers. Each pattern includes the complete workflow from data loading to inference. +This document provides common patterns and workflows for typical tasks using Transformers. ## Text Classification -Classify text into predefined categories (sentiment, topic, intent, etc.). +### Binary or Multi-class Classification ```python from transformers import ( - AutoTokenizer, AutoModelForSequenceClassification, - TrainingArguments, Trainer, DataCollatorWithPadding + AutoTokenizer, + AutoModelForSequenceClassification, + TrainingArguments, + Trainer ) from datasets import load_dataset +import evaluate +import numpy as np -# 1. Load data +# Load dataset dataset = load_dataset("imdb") -# 2. Preprocess +# Tokenize tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -def preprocess(examples): - return tokenizer(examples["text"], truncation=True, max_length=512) +def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512) -tokenized = dataset.map(preprocess, batched=True) +tokenized_datasets = dataset.map(tokenize_function, batched=True) + +# Load model +id2label = {0: "negative", 1: "positive"} +label2id = {"negative": 0, "positive": 1} -# 3. Model model = AutoModelForSequenceClassification.from_pretrained( "bert-base-uncased", num_labels=2, - id2label={0: "negative", 1: "positive"}, - label2id={"negative": 0, "positive": 1} + id2label=id2label, + label2id=label2id ) -# 4. Train +# Metrics +metric = evaluate.load("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +# Train training_args = TrainingArguments( output_dir="./results", + eval_strategy="epoch", learning_rate=2e-5, per_device_train_batch_size=16, + per_device_eval_batch_size=64, num_train_epochs=3, - eval_strategy="epoch", + weight_decay=0.01, + load_best_model_at_end=True, ) trainer = Trainer( model=model, args=training_args, - train_dataset=tokenized["train"], - eval_dataset=tokenized["test"], - tokenizer=tokenizer, - data_collator=DataCollatorWithPadding(tokenizer=tokenizer), + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], + compute_metrics=compute_metrics, ) trainer.train() -# 5. Inference +# Inference text = "This movie was fantastic!" inputs = tokenizer(text, return_tensors="pt") outputs = model(**inputs) predictions = outputs.logits.argmax(-1) -print(model.config.id2label[predictions.item()]) # "positive" +print(id2label[predictions.item()]) ``` -## Token Classification (NER) - -Label each token in text (named entities, POS tags, etc.). 
+## Named Entity Recognition (Token Classification) ```python -from transformers import AutoTokenizer, AutoModelForTokenClassification +from transformers import ( + AutoTokenizer, + AutoModelForTokenClassification, + TrainingArguments, + Trainer, + DataCollatorForTokenClassification +) from datasets import load_dataset +import evaluate +import numpy as np -# Load data (tokens and NER tags) +# Load dataset dataset = load_dataset("conll2003") -tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") +# Tokenize (align labels with tokenized words) +tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( @@ -87,524 +111,489 @@ def tokenize_and_align_labels(examples): previous_word_idx = None for word_idx in word_ids: if word_idx is None: - label_ids.append(-100) # Special tokens + label_ids.append(-100) elif word_idx != previous_word_idx: label_ids.append(label[word_idx]) else: - label_ids.append(-100) # Subword tokens + label_ids.append(-100) previous_word_idx = word_idx labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs -tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True) +tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True) # Model label_list = dataset["train"].features["ner_tags"].feature.names model = AutoModelForTokenClassification.from_pretrained( - "bert-base-cased", - num_labels=len(label_list), - id2label={i: label for i, label in enumerate(label_list)}, - label2id={label: i for i, label in enumerate(label_list)} + "bert-base-uncased", + num_labels=len(label_list) ) -# Training similar to classification -# ... (use Trainer with DataCollatorForTokenClassification) -``` +# Data collator +data_collator = DataCollatorForTokenClassification(tokenizer) -## Question Answering (Extractive) +# Metrics +metric = evaluate.load("seqeval") -Extract answer spans from context. +def compute_metrics(eval_pred): + predictions, labels = eval_pred + predictions = np.argmax(predictions, axis=2) -```python -from transformers import AutoTokenizer, AutoModelForQuestionAnswering + true_labels = [[label_list[l] for l in label if l != -100] for label in labels] + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] -tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad") -model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad") + return metric.compute(predictions=true_predictions, references=true_labels) -question = "What is the capital of France?" -context = "Paris is the capital and most populous city of France." - -inputs = tokenizer(question, context, return_tensors="pt") -outputs = model(**inputs) - -# Get answer span -answer_start = outputs.start_logits.argmax() -answer_end = outputs.end_logits.argmax() + 1 -answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end]) -print(answer) # "Paris" -``` - -## Text Generation - -Generate text continuations. 
- -```python -from transformers import AutoModelForCausalLM, AutoTokenizer - -model = AutoModelForCausalLM.from_pretrained("gpt2") -tokenizer = AutoTokenizer.from_pretrained("gpt2") - -prompt = "In the future, artificial intelligence will" -inputs = tokenizer(prompt, return_tensors="pt") - -outputs = model.generate( - **inputs, - max_new_tokens=100, - do_sample=True, - temperature=0.8, - top_p=0.95, - repetition_penalty=1.2, -) - -generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) -print(generated_text) -``` - -## Summarization - -Condense long text into summaries. - -```python -from transformers import ( - AutoTokenizer, AutoModelForSeq2SeqLM, - Seq2SeqTrainingArguments, Seq2SeqTrainer, - DataCollatorForSeq2Seq -) - -tokenizer = AutoTokenizer.from_pretrained("t5-small") -model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") - -def preprocess(examples): - inputs = ["summarize: " + doc for doc in examples["document"]] - model_inputs = tokenizer(inputs, max_length=1024, truncation=True) - - labels = tokenizer( - examples["summary"], - max_length=128, - truncation=True - ) - model_inputs["labels"] = labels["input_ids"] - return model_inputs - -tokenized_dataset = dataset.map(preprocess, batched=True) - -# Training -training_args = Seq2SeqTrainingArguments( +# Train +training_args = TrainingArguments( output_dir="./results", - predict_with_generate=True, # Important for seq2seq eval_strategy="epoch", learning_rate=2e-5, - per_device_train_batch_size=8, + per_device_train_batch_size=16, num_train_epochs=3, + weight_decay=0.01, ) -trainer = Seq2SeqTrainer( +trainer = Trainer( model=model, args=training_args, - train_dataset=tokenized_dataset["train"], - eval_dataset=tokenized_dataset["validation"], - tokenizer=tokenizer, - data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer), + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + compute_metrics=compute_metrics, ) trainer.train() - -# Inference -text = "Long article text here..." -inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True) -outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True) -summary = tokenizer.decode(outputs[0], skip_special_tokens=True) ``` -## Translation - -Translate text between languages. - -```python -from transformers import pipeline - -translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr") -result = translator("Hello, how are you?") -print(result[0]["translation_text"]) # "Bonjour, comment allez-vous?" - -# For fine-tuning, similar to summarization with Seq2SeqTrainer -``` - -## Image Classification - -Classify images into categories. 
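+After training, a convenient way to run inference with the fine-tuned token-classification model is through a pipeline, which groups word-piece predictions back into entity spans. The snippet below is a minimal sketch under two assumptions: the checkpoint was saved to a hypothetical `./ner_model` directory (for example via `trainer.save_model("./ner_model")` plus `tokenizer.save_pretrained("./ner_model")`), and `id2label`/`label2id` were set on the model config so the output shows entity names rather than generic `LABEL_i` ids.
+
+```python
+from transformers import pipeline
+
+# Load the fine-tuned checkpoint and merge sub-word predictions into entities
+ner_pipeline = pipeline(
+    "token-classification",
+    model="./ner_model",
+    tokenizer="./ner_model",
+    aggregation_strategy="simple",
+)
+
+for entity in ner_pipeline("Hugging Face is based in New York City"):
+    print(entity["entity_group"], entity["word"], round(entity["score"], 3))
+```
+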
+## Question Answering ```python from transformers import ( - AutoImageProcessor, AutoModelForImageClassification, - TrainingArguments, Trainer + AutoTokenizer, + AutoModelForQuestionAnswering, + TrainingArguments, + Trainer, + DefaultDataCollator ) from datasets import load_dataset -from PIL import Image -# Load data -dataset = load_dataset("food101", split="train[:1000]") +# Load dataset +dataset = load_dataset("squad") -# Preprocess -processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +# Tokenize +tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") -def transform(examples): - examples["pixel_values"] = [ - processor(img.convert("RGB"), return_tensors="pt")["pixel_values"][0] - for img in examples["image"] - ] - return examples +def preprocess_function(examples): + questions = [q.strip() for q in examples["question"]] + inputs = tokenizer( + questions, + examples["context"], + max_length=384, + truncation="only_second", + return_offsets_mapping=True, + padding="max_length", + ) -dataset = dataset.with_transform(transform) + offset_mapping = inputs.pop("offset_mapping") + answers = examples["answers"] + start_positions = [] + end_positions = [] + + for i, offset in enumerate(offset_mapping): + answer = answers[i] + start_char = answer["answer_start"][0] + end_char = start_char + len(answer["text"][0]) + + # Find start and end token positions + sequence_ids = inputs.sequence_ids(i) + context_start = sequence_ids.index(1) + context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1) + + if offset[context_start][0] > end_char or offset[context_end][1] < start_char: + start_positions.append(0) + end_positions.append(0) + else: + idx = context_start + while idx <= context_end and offset[idx][0] <= start_char: + idx += 1 + start_positions.append(idx - 1) + + idx = context_end + while idx >= context_start and offset[idx][1] >= end_char: + idx -= 1 + end_positions.append(idx + 1) + + inputs["start_positions"] = start_positions + inputs["end_positions"] = end_positions + return inputs + +tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names) # Model -model = AutoModelForImageClassification.from_pretrained( - "google/vit-base-patch16-224", - num_labels=101, - ignore_mismatched_sizes=True -) +model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased") -# Training +# Train training_args = TrainingArguments( output_dir="./results", - remove_unused_columns=False, # Keep image data eval_strategy="epoch", - learning_rate=5e-5, - per_device_train_batch_size=32, + learning_rate=2e-5, + per_device_train_batch_size=16, num_train_epochs=3, ) trainer = Trainer( model=model, args=training_args, - train_dataset=dataset, - tokenizer=processor, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=DefaultDataCollator(), ) trainer.train() # Inference -image = Image.open("food.jpg") -inputs = processor(image, return_tensors="pt") -outputs = model(**inputs) -predicted_class = outputs.logits.argmax(-1).item() -print(model.config.id2label[predicted_class]) -``` - -## Object Detection - -Detect and localize objects in images. 
- -```python -from transformers import pipeline -from PIL import Image - -detector = pipeline("object-detection", model="facebook/detr-resnet-50") - -image = Image.open("street.jpg") -results = detector(image) - -for result in results: - print(f"{result['label']}: {result['score']:.2f} at {result['box']}") - # car: 0.98 at {'xmin': 123, 'ymin': 456, 'xmax': 789, 'ymax': 1011} -``` - -## Image Segmentation - -Segment images into regions. - -```python -from transformers import pipeline - -segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic") - -image = "path/to/image.jpg" -segments = segmenter(image) - -for segment in segments: - print(f"{segment['label']}: {segment['score']:.2f}") - # Access mask: segment['mask'] -``` - -## Image Captioning - -Generate textual descriptions of images. - -```python -from transformers import AutoProcessor, AutoModelForCausalLM -from PIL import Image - -processor = AutoProcessor.from_pretrained("microsoft/git-base") -model = AutoModelForCausalLM.from_pretrained("microsoft/git-base") - -image = Image.open("photo.jpg") -inputs = processor(images=image, return_tensors="pt") - -outputs = model.generate(**inputs, max_length=50) -caption = processor.batch_decode(outputs, skip_special_tokens=True)[0] -print(caption) # "a dog sitting on grass" -``` - -## Speech Recognition (ASR) - -Transcribe speech to text. - -```python -from transformers import pipeline - -transcriber = pipeline( - "automatic-speech-recognition", - model="openai/whisper-base" -) - -result = transcriber("audio.mp3") -print(result["text"]) # "Hello, this is a test." - -# With timestamps -result = transcriber("audio.mp3", return_timestamps=True) -for chunk in result["chunks"]: - print(f"[{chunk['timestamp'][0]:.1f}s - {chunk['timestamp'][1]:.1f}s]: {chunk['text']}") -``` - -## Text-to-Speech - -Generate speech from text. - -```python -from transformers import pipeline - -synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts") - -result = synthesizer("Hello, how are you today?") -# result["audio"] contains the waveform -# result["sampling_rate"] contains the sample rate - -# Save audio -import scipy -scipy.io.wavfile.write("output.wav", result["sampling_rate"], result["audio"][0]) -``` - -## Visual Question Answering - -Answer questions about images. - -```python -from transformers import pipeline -from PIL import Image - -vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa") - -image = Image.open("photo.jpg") -question = "What color is the car?" - -result = vqa(image=image, question=question) -print(result[0]["answer"]) # "red" -``` - -## Document Question Answering - -Extract information from documents (PDFs, images with text). - -```python -from transformers import pipeline - -doc_qa = pipeline("document-question-answering", model="impira/layoutlm-document-qa") - -result = doc_qa( - image="invoice.png", - question="What is the total amount?" -) - -print(result["answer"]) # "$1,234.56" -``` - -## Zero-Shot Classification - -Classify without training data. - -```python -from transformers import pipeline - -classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli") - -text = "This is a delicious Italian restaurant with great pasta." -candidate_labels = ["food", "travel", "technology", "sports"] - -result = classifier(text, candidate_labels) -print(result["labels"][0]) # "food" -print(result["scores"][0]) # 0.95 -``` - -## Few-Shot Learning with LLMs - -Use large language models for few-shot tasks. 
- -```python -from transformers import AutoModelForCausalLM, AutoTokenizer - -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") -tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - -# Few-shot prompt -prompt = """ -Classify the sentiment: positive, negative, or neutral. - -Text: "I love this product!" -Sentiment: positive - -Text: "This is terrible." -Sentiment: negative - -Text: "It's okay, nothing special." -Sentiment: neutral - -Text: "Best purchase ever!" -Sentiment:""" - -inputs = tokenizer(prompt, return_tensors="pt") -outputs = model.generate(**inputs, max_new_tokens=5, temperature=0.1) -response = tokenizer.decode(outputs[0], skip_special_tokens=True) -print(response.split("Sentiment:")[-1].strip()) # "positive" -``` - -## Instruction-Following / Chat - -Use instruction-tuned models. - -```python -from transformers import AutoModelForCausalLM, AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf") - -messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "What is machine learning?"}, -] - -formatted = tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True -) - -inputs = tokenizer(formatted, return_tensors="pt") -outputs = model.generate(**inputs, max_new_tokens=200, temperature=0.7) -response = tokenizer.decode(outputs[0], skip_special_tokens=True) - -# Extract assistant response -assistant_response = response.split("[/INST]")[-1].strip() -print(assistant_response) -``` - -## Embeddings / Semantic Search - -Generate embeddings for semantic similarity. - -```python -from transformers import AutoTokenizer, AutoModel -import torch - -tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") -model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") - -def get_embedding(text): - inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) - with torch.no_grad(): - outputs = model(**inputs) - - # Mean pooling - embeddings = outputs.last_hidden_state.mean(dim=1) - return embeddings - -# Get embeddings -text1 = "Machine learning is a subset of AI" -text2 = "AI includes machine learning" - -emb1 = get_embedding(text1) -emb2 = get_embedding(text2) - -# Compute similarity -similarity = torch.nn.functional.cosine_similarity(emb1, emb2) -print(f"Similarity: {similarity.item():.4f}") # ~0.85 -``` - -## Multimodal Understanding (CLIP) - -Connect vision and language. - -```python -from transformers import CLIPProcessor, CLIPModel -from PIL import Image - -model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") -processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - -image = Image.open("photo.jpg") -texts = ["a dog", "a cat", "a car", "a house"] - -inputs = processor(text=texts, images=image, return_tensors="pt", padding=True) +question = "What is the capital of France?" +context = "Paris is the capital and most populous city of France." 
+inputs = tokenizer(question, context, return_tensors="pt")
 outputs = model(**inputs)
 
-# Get similarity scores
-logits_per_image = outputs.logits_per_image
-probs = logits_per_image.softmax(dim=1)
-
-for text, prob in zip(texts, probs[0]):
-    print(f"{text}: {prob.item():.4f}")
+start_pos = outputs.start_logits.argmax()
+end_pos = outputs.end_logits.argmax()
+answer_tokens = inputs.input_ids[0][start_pos:end_pos+1]
+answer = tokenizer.decode(answer_tokens)
 ```
 
-## Common Evaluation Metrics
-
-```python
-from datasets import load_metric
-
-# Accuracy (classification)
-metric = load_metric("accuracy")
-predictions = [0, 1, 1, 0]
-references = [0, 1, 0, 0]
-result = metric.compute(predictions=predictions, references=references)
-
-# F1 Score (classification, NER)
-metric = load_metric("f1")
-result = metric.compute(predictions=predictions, references=references)
-
-# BLEU (translation)
-metric = load_metric("bleu")
-predictions = ["hello there general kenobi"]
-references = [["hello there general kenobi", "hello there!"]]
-result = metric.compute(predictions=predictions, references=references)
-
-# ROUGE (summarization)
-metric = load_metric("rouge")
-predictions = ["summary text"]
-references = ["reference summary"]
-result = metric.compute(predictions=predictions, references=references)
-```
-
-## Common Data Collators
+## Text Summarization
 
 ```python
 from transformers import (
-    DataCollatorWithPadding,
-    DataCollatorForTokenClassification,
-    DataCollatorForSeq2Seq,
-    DataCollatorForLanguageModeling,
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    Seq2SeqTrainingArguments,
+    Seq2SeqTrainer,
+    DataCollatorForSeq2Seq
+)
+from datasets import load_dataset
+import evaluate
+import numpy as np
+
+# Load dataset
+dataset = load_dataset("cnn_dailymail", "3.0.0")
+
+# Tokenize
+tokenizer = AutoTokenizer.from_pretrained("t5-small")
+
+def preprocess_function(examples):
+    inputs = ["summarize: " + doc for doc in examples["article"]]
+    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
+
+    labels = tokenizer(
+        text_target=examples["highlights"],
+        max_length=128,
+        truncation=True
+    )
+
+    model_inputs["labels"] = labels["input_ids"]
+    return model_inputs
+
+tokenized_datasets = dataset.map(preprocess_function, batched=True)
+
+# Model
+model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
+
+# Data collator
+data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+
+# Metrics
+rouge = evaluate.load("rouge")
+
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+    result = rouge.compute(
+        predictions=decoded_preds,
+        references=decoded_labels,
+        use_stemmer=True
+    )
+
+    return {k: round(v, 4) for k, v in result.items()}
+
+# Train (the Seq2Seq variants are required for predict_with_generate)
+training_args = Seq2SeqTrainingArguments(
+    output_dir="./results",
+    eval_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=8,
+    num_train_epochs=3,
+    predict_with_generate=True,
 )
 
-# Classification: dynamic padding
-DataCollatorWithPadding(tokenizer=tokenizer)
+trainer = Seq2SeqTrainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_datasets["train"],
+    eval_dataset=tokenized_datasets["validation"],
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+)
 
-# NER: pad labels too
-DataCollatorForTokenClassification(tokenizer=tokenizer)
+trainer.train()
 
-# Seq2Seq: pad inputs and labels
-DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
-
-# Language modeling: create MLM masks
-DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
+# Inference
+text = "Long article text..."
+inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
+outputs = model.generate(**inputs, max_length=128, num_beams=4)
+summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
 ```
 
-This covers the most common task patterns. For detailed parameter tuning, see `api_reference.md` and `generation_strategies.md`.
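+Once training finishes, the fine-tuned checkpoint can also be served through the summarization pipeline, which wraps tokenization and generation in a single call. This is a minimal sketch; `./summarizer` is a hypothetical path assumed to hold the model saved with `trainer.save_model("./summarizer")` and the tokenizer saved with `tokenizer.save_pretrained("./summarizer")`.
+
+```python
+from transformers import pipeline
+
+# Load the fine-tuned seq2seq checkpoint for end-to-end summarization
+summarizer = pipeline("summarization", model="./summarizer", tokenizer="./summarizer")
+
+result = summarizer("Long article text...", max_length=128, min_length=30, do_sample=False)
+print(result[0]["summary_text"])
+```
+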
+## Translation
+
+```python
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    Seq2SeqTrainingArguments,
+    Seq2SeqTrainer,
+    DataCollatorForSeq2Seq
+)
+from datasets import load_dataset
+
+# Load dataset
+dataset = load_dataset("wmt16", "de-en")
+
+# Tokenize (wmt16 stores each sentence pair as a dict under the "translation" field)
+tokenizer = AutoTokenizer.from_pretrained("t5-small")
+
+def preprocess_function(examples):
+    inputs = [f"translate German to English: {pair['de']}" for pair in examples["translation"]]
+    targets = [pair["en"] for pair in examples["translation"]]
+    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
+
+    labels = tokenizer(
+        text_target=targets,
+        max_length=128,
+        truncation=True
+    )
+
+    model_inputs["labels"] = labels["input_ids"]
+    return model_inputs
+
+tokenized_datasets = dataset.map(preprocess_function, batched=True)
+
+# Model and training (similar to summarization)
+model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
+
+# Inference
+text = "Guten Tag, wie geht es Ihnen?"
+inputs = tokenizer(f"translate German to English: {text}", return_tensors="pt")
+outputs = model.generate(**inputs, max_length=128)
+translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
+```
+
+## Causal Language Modeling (Training from Scratch or Fine-tuning)
+
+```python
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    TrainingArguments,
+    Trainer,
+    DataCollatorForLanguageModeling
+)
+from datasets import load_dataset
+
+# Load dataset
+dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
+
+# Tokenize
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+tokenizer.pad_token = tokenizer.eos_token
+
+def tokenize_function(examples):
+    return tokenizer(examples["text"], truncation=True, max_length=512)
+
+tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+
+# Group texts into chunks
+block_size = 128
+
+def group_texts(examples):
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    total_length = (total_length // block_size) * block_size
+    result = {
+        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+lm_datasets = tokenized_datasets.map(group_texts, batched=True)
+
+# Model
+model = AutoModelForCausalLM.from_pretrained("gpt2")
+
+# Data collator
+data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+# Train
+training_args = TrainingArguments(
+    output_dir="./results",
+    eval_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=8,
+    num_train_epochs=3,
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=lm_datasets["train"],
+    eval_dataset=lm_datasets["validation"],
+    data_collator=data_collator,
+)
+
+trainer.train()
+```
+
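+A common way to report quality for a causal language model is perplexity, which is just the exponential of the evaluation cross-entropy loss. The short sketch below assumes the `trainer` object from the example above and mirrors the standard evaluation step rather than adding any new training logic.
+
+```python
+import math
+
+# Perplexity = exp(average cross-entropy loss) on the held-out split
+eval_results = trainer.evaluate()
+print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
+```
+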
+## Image Classification
+
+```python
+from transformers import (
+    AutoImageProcessor,
+    AutoModelForImageClassification,
+    TrainingArguments,
+    Trainer
+)
+from datasets import load_dataset
+from torchvision.transforms import Compose, Resize, ToTensor, Normalize
+import numpy as np
+import evaluate
+
+# Load dataset (keep both the train and validation splits)
+dataset = load_dataset("food101")
+
+# Optionally work with small subsets for a quicker run
+dataset["train"] = dataset["train"].select(range(5000))
+dataset["validation"] = dataset["validation"].select(range(1000))
+
+# Prepare image transforms
+image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
+
+normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
+size = image_processor.size["height"]
+
+transforms = Compose([
+    Resize((size, size)),
+    ToTensor(),
+    normalize,
+])
+
+def preprocess_function(examples):
+    examples["pixel_values"] = [transforms(img.convert("RGB")) for img in examples["image"]]
+    del examples["image"]  # drop the raw PIL images so batches collate cleanly
+    return examples
+
+dataset = dataset.with_transform(preprocess_function)
+
+# Model
+model = AutoModelForImageClassification.from_pretrained(
+    "google/vit-base-patch16-224",
+    num_labels=len(dataset["train"].features["label"].names),
+    ignore_mismatched_sizes=True
+)
+
+# Metrics
+metric = evaluate.load("accuracy")
+
+def compute_metrics(eval_pred):
+    predictions = np.argmax(eval_pred.predictions, axis=1)
+    return metric.compute(predictions=predictions, references=eval_pred.label_ids)
+
+# Train
+training_args = TrainingArguments(
+    output_dir="./results",
+    remove_unused_columns=False,  # keep the image column for the on-the-fly transform
+    eval_strategy="epoch",
+    learning_rate=5e-5,
+    per_device_train_batch_size=16,
+    num_train_epochs=3,
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=dataset["train"],
+    eval_dataset=dataset["validation"],
+    compute_metrics=compute_metrics,
+)
+
+trainer.train()
+```
+
+## Vision-Language Tasks (Image Captioning)
+
+```python
+from transformers import (
+    AutoProcessor,
+    AutoModelForVision2Seq,
+    TrainingArguments,
+    Trainer
+)
+from datasets import load_dataset
+from PIL import Image
+
+# Load dataset
+dataset = load_dataset("ybelkada/football-dataset")
+
+# Processor
+processor = AutoProcessor.from_pretrained("microsoft/git-base")
+
+def preprocess_function(examples):
+    images = [Image.open(img).convert("RGB") for img in examples["image"]]
+    texts = examples["caption"]
+
+    inputs = processor(images=images, text=texts, padding="max_length", truncation=True)
+    inputs["labels"] = inputs["input_ids"]
+    return inputs
+
+dataset = dataset.map(preprocess_function, batched=True)
+
+# Model
+model = AutoModelForVision2Seq.from_pretrained("microsoft/git-base")
+
+# Train
+training_args = TrainingArguments(
+    output_dir="./results",
+    eval_strategy="epoch",
+    learning_rate=5e-5,
+    per_device_train_batch_size=8,
+    num_train_epochs=3,
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=dataset["train"],
+    eval_dataset=dataset["test"],
+)
+
+trainer.train()
+
+# Inference
+image = Image.open("image.jpg")
+inputs = processor(images=image, return_tensors="pt")
+outputs = model.generate(**inputs)
+caption = processor.decode(outputs[0], skip_special_tokens=True)
+```
+
+## Best Practices Summary
+
+1. **Use appropriate Auto* classes**: AutoTokenizer, AutoModel, etc. for model loading
+2. **Proper preprocessing**: Tokenize, align labels, handle special cases
+3. **Data collators**: Use appropriate collators for dynamic padding
+4. **Metrics**: Load and compute relevant metrics for evaluation
+5. **Training arguments**: Configure properly for task and hardware
+6. 
**Inference**: Use pipeline() for quick inference, or manual tokenization for custom needs diff --git a/scientific-packages/transformers/references/training.md b/scientific-packages/transformers/references/training.md new file mode 100644 index 0000000..f589192 --- /dev/null +++ b/scientific-packages/transformers/references/training.md @@ -0,0 +1,328 @@ +# Training with Transformers + +Transformers provides comprehensive training capabilities through the `Trainer` API, supporting distributed training, mixed precision, and advanced optimization techniques. + +## Basic Training Workflow + +```python +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + Trainer, + TrainingArguments +) +from datasets import load_dataset + +# 1. Load and preprocess data +dataset = load_dataset("imdb") +tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + +def tokenize_function(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) + +tokenized_datasets = dataset.map(tokenize_function, batched=True) + +# 2. Load model +model = AutoModelForSequenceClassification.from_pretrained( + "bert-base-uncased", + num_labels=2 +) + +# 3. Define training arguments +training_args = TrainingArguments( + output_dir="./results", + num_train_epochs=3, + per_device_train_batch_size=16, + per_device_eval_batch_size=64, + learning_rate=2e-5, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, +) + +# 4. Create trainer +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], +) + +# 5. Train +trainer.train() + +# 6. Evaluate +trainer.evaluate() + +# 7. Save model +trainer.save_model("./final_model") +``` + +## TrainingArguments Configuration + +### Essential Parameters + +**Output and Logging:** +- `output_dir`: Directory for checkpoints and outputs (required) +- `logging_dir`: TensorBoard log directory (default: `{output_dir}/runs`) +- `logging_steps`: Log every N steps (default: 500) +- `logging_strategy`: "steps" or "epoch" + +**Training Duration:** +- `num_train_epochs`: Number of epochs (default: 3.0) +- `max_steps`: Max training steps (overrides num_train_epochs if set) + +**Batch Size and Gradient Accumulation:** +- `per_device_train_batch_size`: Batch size per device (default: 8) +- `per_device_eval_batch_size`: Eval batch size per device (default: 8) +- `gradient_accumulation_steps`: Accumulate gradients over N steps (default: 1) +- Effective batch size = `per_device_train_batch_size * gradient_accumulation_steps * num_gpus` + +**Learning Rate:** +- `learning_rate`: Peak learning rate (default: 5e-5) +- `lr_scheduler_type`: Scheduler type ("linear", "cosine", "constant", etc.) +- `warmup_steps`: Warmup steps (default: 0) +- `warmup_ratio`: Warmup as fraction of total steps + +**Evaluation:** +- `eval_strategy`: "no", "steps", or "epoch" (default: "no") +- `eval_steps`: Evaluate every N steps (if eval_strategy="steps") +- `eval_delay`: Delay evaluation until N steps + +**Checkpointing:** +- `save_strategy`: "no", "steps", or "epoch" (default: "steps") +- `save_steps`: Save checkpoint every N steps (default: 500) +- `save_total_limit`: Keep only N most recent checkpoints +- `load_best_model_at_end`: Load best checkpoint at end (default: False) +- `metric_for_best_model`: Metric to determine best model + +**Optimization:** +- `optim`: Optimizer ("adamw_torch", "adamw_hf", "sgd", etc.) 
+- `weight_decay`: Weight decay coefficient (default: 0.0) +- `adam_beta1`, `adam_beta2`: Adam optimizer betas +- `adam_epsilon`: Epsilon for Adam (default: 1e-8) +- `max_grad_norm`: Max gradient norm for clipping (default: 1.0) + +### Mixed Precision Training + +```python +training_args = TrainingArguments( + output_dir="./results", + fp16=True, # Use fp16 on NVIDIA GPUs + fp16_opt_level="O1", # O0, O1, O2, O3 (Apex levels) + # or + bf16=True, # Use bf16 on Ampere+ GPUs (better than fp16) +) +``` + +### Distributed Training + +**DataParallel (single-node multi-GPU):** +```python +# Automatic with multiple GPUs +training_args = TrainingArguments( + output_dir="./results", + per_device_train_batch_size=16, # Per GPU +) +# Run: python script.py +``` + +**DistributedDataParallel (multi-node or multi-GPU):** +```bash +# Single node, multiple GPUs +python -m torch.distributed.launch --nproc_per_node=4 script.py + +# Or use accelerate +accelerate config +accelerate launch script.py +``` + +**DeepSpeed Integration:** +```python +training_args = TrainingArguments( + output_dir="./results", + deepspeed="ds_config.json", # DeepSpeed config file +) +``` + +### Advanced Features + +**Gradient Checkpointing (reduce memory):** +```python +training_args = TrainingArguments( + output_dir="./results", + gradient_checkpointing=True, +) +``` + +**Compilation with torch.compile:** +```python +training_args = TrainingArguments( + output_dir="./results", + torch_compile=True, + torch_compile_backend="inductor", # or "cudagraphs" +) +``` + +**Push to Hub:** +```python +training_args = TrainingArguments( + output_dir="./results", + push_to_hub=True, + hub_model_id="username/model-name", + hub_strategy="every_save", # or "end" +) +``` + +## Custom Training Components + +### Custom Metrics + +```python +import evaluate +import numpy as np + +metric = evaluate.load("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +trainer = Trainer( + model=model, + args=training_args, + compute_metrics=compute_metrics, +) +``` + +### Custom Loss Function + +```python +class CustomTrainer(Trainer): + def compute_loss(self, model, inputs, return_outputs=False): + labels = inputs.pop("labels") + outputs = model(**inputs) + logits = outputs.logits + + # Custom loss calculation + loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights) + loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1)) + + return (loss, outputs) if return_outputs else loss +``` + +### Data Collator + +```python +from transformers import DataCollatorWithPadding + +data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + data_collator=data_collator, +) +``` + +### Callbacks + +```python +from transformers import TrainerCallback + +class CustomCallback(TrainerCallback): + def on_epoch_end(self, args, state, control, **kwargs): + print(f"Epoch {state.epoch} completed!") + return control + +trainer = Trainer( + model=model, + args=training_args, + callbacks=[CustomCallback], +) +``` + +## Hyperparameter Search + +```python +def model_init(): + return AutoModelForSequenceClassification.from_pretrained( + "bert-base-uncased", + num_labels=2 + ) + +trainer = Trainer( + model_init=model_init, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + compute_metrics=compute_metrics, +) + +# 
Optuna-based search +best_trial = trainer.hyperparameter_search( + direction="maximize", + backend="optuna", + n_trials=10, + hp_space=lambda trial: { + "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True), + "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]), + "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 5), + } +) +``` + +## Training Best Practices + +1. **Start with small learning rates**: 2e-5 to 5e-5 for fine-tuning +2. **Use warmup**: 5-10% of total steps for learning rate warmup +3. **Monitor training**: Use eval_strategy="epoch" or "steps" to track progress +4. **Save checkpoints**: Set save_strategy and save_total_limit +5. **Use mixed precision**: Enable fp16 or bf16 for faster training +6. **Gradient accumulation**: For large effective batch sizes on limited memory +7. **Load best model**: Set load_best_model_at_end=True to avoid overfitting +8. **Push to Hub**: Enable push_to_hub for easy model sharing and versioning + +## Common Training Patterns + +### Classification +```python +model = AutoModelForSequenceClassification.from_pretrained( + "bert-base-uncased", + num_labels=num_classes, + id2label=id2label, + label2id=label2id +) +``` + +### Question Answering +```python +model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased") +``` + +### Token Classification (NER) +```python +model = AutoModelForTokenClassification.from_pretrained( + "bert-base-uncased", + num_labels=num_tags, + id2label=id2label, + label2id=label2id +) +``` + +### Sequence-to-Sequence +```python +model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") +``` + +### Causal Language Modeling +```python +model = AutoModelForCausalLM.from_pretrained("gpt2") +``` + +### Masked Language Modeling +```python +model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased") +``` diff --git a/scientific-packages/transformers/scripts/fine_tune_classifier.py b/scientific-packages/transformers/scripts/fine_tune_classifier.py old mode 100755 new mode 100644 index 6eba340..46f9500 --- a/scientific-packages/transformers/scripts/fine_tune_classifier.py +++ b/scientific-packages/transformers/scripts/fine_tune_classifier.py @@ -1,19 +1,12 @@ #!/usr/bin/env python3 """ -Complete example for fine-tuning a text classification model. +Fine-tune a transformer model for text classification. -This script demonstrates the full workflow: -1. Load dataset -2. Preprocess with tokenizer -3. Configure model -4. Train with Trainer -5. Evaluate and save - -Usage: - python fine_tune_classifier.py --model bert-base-uncased --dataset imdb --epochs 3 +This script demonstrates the complete workflow for fine-tuning a pre-trained +model on a classification task using the Trainer API. """ -import argparse +import numpy as np from datasets import load_dataset from transformers import ( AutoTokenizer, @@ -23,189 +16,225 @@ from transformers import ( DataCollatorWithPadding, ) import evaluate -import numpy as np -def compute_metrics(eval_pred): - """Compute accuracy and F1 score.""" - metric_accuracy = evaluate.load("accuracy") - metric_f1 = evaluate.load("f1") +def load_and_prepare_data(dataset_name="imdb", model_name="distilbert-base-uncased", max_samples=None): + """ + Load dataset and tokenize. 
- logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) + Args: + dataset_name: Name of the dataset to load + model_name: Name of the model/tokenizer to use + max_samples: Limit number of samples (for quick testing) - accuracy = metric_accuracy.compute(predictions=predictions, references=labels) - f1 = metric_f1.compute(predictions=predictions, references=labels) + Returns: + tokenized_datasets, tokenizer + """ + print(f"Loading dataset: {dataset_name}") + dataset = load_dataset(dataset_name) - return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]} + # Optionally limit samples for quick testing + if max_samples: + dataset["train"] = dataset["train"].select(range(max_samples)) + dataset["test"] = dataset["test"].select(range(min(max_samples, len(dataset["test"])))) + + print(f"Loading tokenizer: {model_name}") + tokenizer = AutoTokenizer.from_pretrained(model_name) + + def tokenize_function(examples): + return tokenizer( + examples["text"], + padding="max_length", + truncation=True, + max_length=512 + ) + + print("Tokenizing dataset...") + tokenized_datasets = dataset.map(tokenize_function, batched=True) + + return tokenized_datasets, tokenizer -def main(): - parser = argparse.ArgumentParser(description="Fine-tune a text classification model") - parser.add_argument( - "--model", - type=str, - default="bert-base-uncased", - help="Pretrained model name or path", - ) - parser.add_argument( - "--dataset", - type=str, - default="imdb", - help="Dataset name from Hugging Face Hub", - ) - parser.add_argument( - "--max-samples", - type=int, - default=None, - help="Maximum samples to use (for quick testing)", - ) - parser.add_argument( - "--output-dir", - type=str, - default="./results", - help="Output directory for checkpoints", - ) - parser.add_argument( - "--epochs", - type=int, - default=3, - help="Number of training epochs", - ) - parser.add_argument( - "--batch-size", - type=int, - default=16, - help="Batch size per device", - ) - parser.add_argument( - "--learning-rate", - type=float, - default=2e-5, - help="Learning rate", - ) - parser.add_argument( - "--push-to-hub", - action="store_true", - help="Push model to Hugging Face Hub after training", - ) +def create_model(model_name, num_labels, id2label, label2id): + """ + Create classification model. - args = parser.parse_args() - - print("=" * 60) - print("Text Classification Fine-Tuning") - print("=" * 60) - print(f"Model: {args.model}") - print(f"Dataset: {args.dataset}") - print(f"Epochs: {args.epochs}") - print(f"Batch size: {args.batch_size}") - print(f"Learning rate: {args.learning_rate}") - print("=" * 60) - - # 1. Load dataset - print("\n[1/5] Loading dataset...") - dataset = load_dataset(args.dataset) - - if args.max_samples: - dataset["train"] = dataset["train"].select(range(args.max_samples)) - dataset["test"] = dataset["test"].select(range(args.max_samples // 5)) - - print(f"Train samples: {len(dataset['train'])}") - print(f"Test samples: {len(dataset['test'])}") - - # 2. Preprocess - print("\n[2/5] Preprocessing data...") - tokenizer = AutoTokenizer.from_pretrained(args.model) - - def preprocess_function(examples): - return tokenizer(examples["text"], truncation=True, max_length=512) - - tokenized_dataset = dataset.map(preprocess_function, batched=True) - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - - # 3. 
Load model - print("\n[3/5] Loading model...") - - # Determine number of labels - num_labels = len(set(dataset["train"]["label"])) + Args: + model_name: Name of the pre-trained model + num_labels: Number of classification labels + id2label: Dictionary mapping label IDs to names + label2id: Dictionary mapping label names to IDs + Returns: + model + """ + print(f"Loading model: {model_name}") model = AutoModelForSequenceClassification.from_pretrained( - args.model, + model_name, num_labels=num_labels, + id2label=id2label, + label2id=label2id ) + return model - print(f"Number of labels: {num_labels}") - print(f"Model parameters: {model.num_parameters():,}") - # 4. Configure training - print("\n[4/5] Configuring training...") +def define_compute_metrics(metric_name="accuracy"): + """ + Define function to compute metrics during evaluation. + + Args: + metric_name: Name of the metric to use + + Returns: + compute_metrics function + """ + metric = evaluate.load(metric_name) + + def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + return compute_metrics + + +def train_model(model, tokenizer, train_dataset, eval_dataset, output_dir="./results"): + """ + Train the model. + + Args: + model: The model to train + tokenizer: The tokenizer + train_dataset: Training dataset + eval_dataset: Evaluation dataset + output_dir: Directory for checkpoints and logs + + Returns: + trained model, trainer + """ + # Define training arguments training_args = TrainingArguments( - output_dir=args.output_dir, - learning_rate=args.learning_rate, - per_device_train_batch_size=args.batch_size, - per_device_eval_batch_size=args.batch_size, - num_train_epochs=args.epochs, + output_dir=output_dir, + num_train_epochs=3, + per_device_train_batch_size=16, + per_device_eval_batch_size=64, + learning_rate=2e-5, weight_decay=0.01, eval_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True, - push_to_hub=args.push_to_hub, + metric_for_best_model="accuracy", + logging_dir=f"{output_dir}/logs", logging_steps=100, + save_total_limit=2, + fp16=False, # Set to True if using GPU with fp16 support ) + # Create data collator + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # Create trainer trainer = Trainer( model=model, args=training_args, - train_dataset=tokenized_dataset["train"], - eval_dataset=tokenized_dataset["test"], - tokenizer=tokenizer, + train_dataset=train_dataset, + eval_dataset=eval_dataset, data_collator=data_collator, - compute_metrics=compute_metrics, + compute_metrics=define_compute_metrics("accuracy"), ) - # 5. Train - print("\n[5/5] Training...") - print("-" * 60) + # Train + print("\nStarting training...") trainer.train() # Evaluate - print("\n" + "=" * 60) - print("Final Evaluation") - print("=" * 60) - metrics = trainer.evaluate() + print("\nEvaluating model...") + eval_results = trainer.evaluate() + print(f"Evaluation results: {eval_results}") - print(f"Accuracy: {metrics['eval_accuracy']:.4f}") - print(f"F1 Score: {metrics['eval_f1']:.4f}") - print(f"Loss: {metrics['eval_loss']:.4f}") + return model, trainer - # Save - print("\n" + "=" * 60) - print(f"Saving model to {args.output_dir}") - trainer.save_model(args.output_dir) - tokenizer.save_pretrained(args.output_dir) - if args.push_to_hub: - print("Pushing to Hugging Face Hub...") - trainer.push_to_hub() +def test_inference(model, tokenizer, id2label): + """ + Test the trained model with sample texts. 
+ + Args: + model: Trained model + tokenizer: Tokenizer + id2label: Dictionary mapping label IDs to names + """ + print("\n=== Testing Inference ===") + + test_texts = [ + "This movie was absolutely fantastic! I loved every minute of it.", + "Terrible film. Waste of time and money.", + "It was okay, nothing special but not bad either." + ] + + for text in test_texts: + inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512) + outputs = model(**inputs) + predictions = outputs.logits.argmax(-1) + predicted_label = id2label[predictions.item()] + confidence = outputs.logits.softmax(-1).max().item() + + print(f"\nText: {text}") + print(f"Prediction: {predicted_label} (confidence: {confidence:.3f})") + + +def main(): + """Main training pipeline.""" + # Configuration + DATASET_NAME = "imdb" + MODEL_NAME = "distilbert-base-uncased" + OUTPUT_DIR = "./results" + MAX_SAMPLES = None # Set to a small number (e.g., 1000) for quick testing + + # Label mapping + id2label = {0: "negative", 1: "positive"} + label2id = {"negative": 0, "positive": 1} + num_labels = len(id2label) print("=" * 60) - print("Training complete!") + print("Fine-Tuning Text Classification Model") print("=" * 60) - # Quick inference example - print("\nQuick inference example:") - from transformers import pipeline - - classifier = pipeline( - "text-classification", - model=args.output_dir, - tokenizer=args.output_dir, + # Load and prepare data + tokenized_datasets, tokenizer = load_and_prepare_data( + dataset_name=DATASET_NAME, + model_name=MODEL_NAME, + max_samples=MAX_SAMPLES ) - example_text = "This is a great example of how to use transformers!" - result = classifier(example_text) - print(f"Text: {example_text}") - print(f"Prediction: {result[0]['label']} (score: {result[0]['score']:.4f})") + # Create model + model = create_model( + model_name=MODEL_NAME, + num_labels=num_labels, + id2label=id2label, + label2id=label2id + ) + + # Train model + model, trainer = train_model( + model=model, + tokenizer=tokenizer, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], + output_dir=OUTPUT_DIR + ) + + # Save final model + print(f"\nSaving model to {OUTPUT_DIR}/final_model") + trainer.save_model(f"{OUTPUT_DIR}/final_model") + tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model") + + # Test inference + test_inference(model, tokenizer, id2label) + + print("\n" + "=" * 60) + print("Training completed successfully!") + print("=" * 60) if __name__ == "__main__": diff --git a/scientific-packages/transformers/scripts/generate_text.py b/scientific-packages/transformers/scripts/generate_text.py old mode 100755 new mode 100644 index f813a9c..1b52a49 --- a/scientific-packages/transformers/scripts/generate_text.py +++ b/scientific-packages/transformers/scripts/generate_text.py @@ -1,231 +1,188 @@ #!/usr/bin/env python3 """ -Text generation with various strategies. +Text generation with different decoding strategies. -This script demonstrates different generation strategies: -- Greedy decoding -- Beam search -- Sampling with temperature -- Top-k and top-p sampling - -Usage: - python generate_text.py --model gpt2 --prompt "The future of AI" --strategy sampling +This script demonstrates various text generation approaches using +different sampling and decoding strategies. 
""" -import argparse -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig -def generate_with_greedy(model, tokenizer, prompt, max_length): - """Greedy decoding (deterministic).""" - print("\n" + "=" * 60) - print("GREEDY DECODING") - print("=" * 60) +def load_model_and_tokenizer(model_name="gpt2"): + """ + Load model and tokenizer. - inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + Args: + model_name: Name of the model to load + Returns: + model, tokenizer + """ + print(f"Loading model: {model_name}") + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(model_name) + + # Set pad token if not already set + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + return model, tokenizer + + +def generate_with_greedy(model, tokenizer, prompt, max_new_tokens=50): + """Greedy decoding - always picks highest probability token.""" + print("\n=== Greedy Decoding ===") + print(f"Prompt: {prompt}") + + inputs = tokenizer(prompt, return_tensors="pt") outputs = model.generate( **inputs, - max_new_tokens=max_length, - pad_token_id=tokenizer.eos_token_id, + max_new_tokens=max_new_tokens, + do_sample=False, + num_beams=1, + pad_token_id=tokenizer.pad_token_id ) - text = tokenizer.decode(outputs[0], skip_special_tokens=True) - print(f"\nPrompt: {prompt}") - print(f"\nGenerated:\n{text}") + generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) + print(f"Generated: {generated_text}\n") -def generate_with_beam_search(model, tokenizer, prompt, max_length, num_beams=5): - """Beam search for higher quality.""" - print("\n" + "=" * 60) - print(f"BEAM SEARCH (num_beams={num_beams})") - print("=" * 60) - - inputs = tokenizer(prompt, return_tensors="pt").to(model.device) +def generate_with_beam_search(model, tokenizer, prompt, max_new_tokens=50, num_beams=5): + """Beam search - explores multiple hypotheses.""" + print("\n=== Beam Search ===") + print(f"Prompt: {prompt}") + inputs = tokenizer(prompt, return_tensors="pt") outputs = model.generate( **inputs, - max_new_tokens=max_length, + max_new_tokens=max_new_tokens, num_beams=num_beams, early_stopping=True, no_repeat_ngram_size=2, - pad_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id ) - text = tokenizer.decode(outputs[0], skip_special_tokens=True) - print(f"\nPrompt: {prompt}") - print(f"\nGenerated:\n{text}") + generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) + print(f"Generated: {generated_text}\n") -def generate_with_sampling(model, tokenizer, prompt, max_length, temperature=0.8): - """Sampling with temperature.""" - print("\n" + "=" * 60) - print(f"SAMPLING (temperature={temperature})") - print("=" * 60) - - inputs = tokenizer(prompt, return_tensors="pt").to(model.device) +def generate_with_sampling(model, tokenizer, prompt, max_new_tokens=50, + temperature=0.7, top_k=50, top_p=0.9): + """Sampling with temperature, top-k, and nucleus (top-p) sampling.""" + print("\n=== Sampling (Temperature + Top-K + Top-P) ===") + print(f"Prompt: {prompt}") + print(f"Parameters: temperature={temperature}, top_k={top_k}, top_p={top_p}") + inputs = tokenizer(prompt, return_tensors="pt") outputs = model.generate( **inputs, - max_new_tokens=max_length, + max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, - pad_token_id=tokenizer.eos_token_id, - ) - - text = tokenizer.decode(outputs[0], 
skip_special_tokens=True) - print(f"\nPrompt: {prompt}") - print(f"\nGenerated:\n{text}") - - -def generate_with_top_k_top_p(model, tokenizer, prompt, max_length, top_k=50, top_p=0.95, temperature=0.8): - """Top-k and top-p (nucleus) sampling.""" - print("\n" + "=" * 60) - print(f"TOP-K TOP-P SAMPLING (k={top_k}, p={top_p}, temp={temperature})") - print("=" * 60) - - inputs = tokenizer(prompt, return_tensors="pt").to(model.device) - - outputs = model.generate( - **inputs, - max_new_tokens=max_length, - do_sample=True, top_k=top_k, top_p=top_p, - temperature=temperature, - repetition_penalty=1.2, - no_repeat_ngram_size=3, - pad_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id ) - text = tokenizer.decode(outputs[0], skip_special_tokens=True) - print(f"\nPrompt: {prompt}") - print(f"\nGenerated:\n{text}") + generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) + print(f"Generated: {generated_text}\n") -def generate_multiple(model, tokenizer, prompt, max_length, num_sequences=3): +def generate_multiple_sequences(model, tokenizer, prompt, max_new_tokens=50, + num_return_sequences=3): """Generate multiple diverse sequences.""" - print("\n" + "=" * 60) - print(f"MULTIPLE SEQUENCES (n={num_sequences})") - print("=" * 60) - - inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + print("\n=== Multiple Sequences (with Sampling) ===") + print(f"Prompt: {prompt}") + print(f"Generating {num_return_sequences} sequences...") + inputs = tokenizer(prompt, return_tensors="pt") outputs = model.generate( **inputs, - max_new_tokens=max_length, + max_new_tokens=max_new_tokens, do_sample=True, - num_return_sequences=num_sequences, - temperature=0.9, + temperature=0.8, top_p=0.95, - pad_token_id=tokenizer.eos_token_id, + num_return_sequences=num_return_sequences, + pad_token_id=tokenizer.pad_token_id ) - print(f"\nPrompt: {prompt}\n") - for i, output in enumerate(outputs, 1): - text = tokenizer.decode(output, skip_special_tokens=True) - print(f"\n--- Sequence {i} ---\n{text}\n") + for i, output in enumerate(outputs): + generated_text = tokenizer.decode(output, skip_special_tokens=True) + print(f"\nSequence {i+1}: {generated_text}") + print() + + +def generate_with_config(model, tokenizer, prompt): + """Use GenerationConfig for reusable configuration.""" + print("\n=== Using GenerationConfig ===") + print(f"Prompt: {prompt}") + + # Create a generation config + generation_config = GenerationConfig( + max_new_tokens=50, + do_sample=True, + temperature=0.7, + top_p=0.9, + top_k=50, + repetition_penalty=1.2, + no_repeat_ngram_size=3, + pad_token_id=tokenizer.pad_token_id + ) + + inputs = tokenizer(prompt, return_tensors="pt") + outputs = model.generate(**inputs, generation_config=generation_config) + + generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) + print(f"Generated: {generated_text}\n") + + +def compare_temperatures(model, tokenizer, prompt, max_new_tokens=50): + """Compare different temperature settings.""" + print("\n=== Temperature Comparison ===") + print(f"Prompt: {prompt}\n") + + temperatures = [0.3, 0.7, 1.0, 1.5] + + for temp in temperatures: + inputs = tokenizer(prompt, return_tensors="pt") + outputs = model.generate( + **inputs, + max_new_tokens=max_new_tokens, + do_sample=True, + temperature=temp, + top_p=0.9, + pad_token_id=tokenizer.pad_token_id + ) + + generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) + print(f"Temperature {temp}: {generated_text}\n") def main(): - parser = 
argparse.ArgumentParser(description="Text generation with various strategies") - parser.add_argument( - "--model", - type=str, - default="gpt2", - help="Model name or path", - ) - parser.add_argument( - "--prompt", - type=str, - required=True, - help="Input prompt for generation", - ) - parser.add_argument( - "--strategy", - type=str, - default="all", - choices=["greedy", "beam", "sampling", "top_k_top_p", "multiple", "all"], - help="Generation strategy to use", - ) - parser.add_argument( - "--max-length", - type=int, - default=100, - help="Maximum number of new tokens to generate", - ) - parser.add_argument( - "--device", - type=str, - default="auto", - help="Device (cuda, cpu, or auto)", - ) - parser.add_argument( - "--temperature", - type=float, - default=0.8, - help="Sampling temperature", - ) - parser.add_argument( - "--quantize", - action="store_true", - help="Use 8-bit quantization", - ) - - args = parser.parse_args() - - print("=" * 60) - print("Text Generation Demo") - print("=" * 60) - print(f"Model: {args.model}") - print(f"Strategy: {args.strategy}") - print(f"Max length: {args.max_length}") - print(f"Device: {args.device}") - print("=" * 60) + """Run all generation examples.""" + print("=" * 70) + print("Text Generation Examples") + print("=" * 70) # Load model and tokenizer - print("\nLoading model...") + model, tokenizer = load_model_and_tokenizer("gpt2") - if args.device == "auto": - device_map = "auto" - device = None - else: - device_map = None - device = args.device + # Example prompts + story_prompt = "Once upon a time in a distant galaxy" + factual_prompt = "The three branches of the US government are" - model_kwargs = {"device_map": device_map} if device_map else {} + # Demonstrate different strategies + generate_with_greedy(model, tokenizer, story_prompt) + generate_with_beam_search(model, tokenizer, factual_prompt) + generate_with_sampling(model, tokenizer, story_prompt) + generate_multiple_sequences(model, tokenizer, story_prompt, num_return_sequences=3) + generate_with_config(model, tokenizer, story_prompt) + compare_temperatures(model, tokenizer, story_prompt) - if args.quantize: - print("Using 8-bit quantization...") - model_kwargs["load_in_8bit"] = True - - model = AutoModelForCausalLM.from_pretrained(args.model, **model_kwargs) - tokenizer = AutoTokenizer.from_pretrained(args.model) - - if device and not device_map: - model = model.to(device) - - print(f"Model loaded on: {model.device if hasattr(model, 'device') else 'multiple devices'}") - - # Generate based on strategy - strategies = { - "greedy": lambda: generate_with_greedy(model, tokenizer, args.prompt, args.max_length), - "beam": lambda: generate_with_beam_search(model, tokenizer, args.prompt, args.max_length), - "sampling": lambda: generate_with_sampling(model, tokenizer, args.prompt, args.max_length, args.temperature), - "top_k_top_p": lambda: generate_with_top_k_top_p(model, tokenizer, args.prompt, args.max_length), - "multiple": lambda: generate_multiple(model, tokenizer, args.prompt, args.max_length), - } - - if args.strategy == "all": - for strategy_fn in strategies.values(): - strategy_fn() - else: - strategies[args.strategy]() - - print("\n" + "=" * 60) - print("Generation complete!") - print("=" * 60) + print("=" * 70) + print("All generation examples completed!") + print("=" * 70) if __name__ == "__main__": diff --git a/scientific-packages/transformers/scripts/quick_inference.py b/scientific-packages/transformers/scripts/quick_inference.py old mode 100755 new mode 100644 index 8f931f5..36f80ff 
--- a/scientific-packages/transformers/scripts/quick_inference.py +++ b/scientific-packages/transformers/scripts/quick_inference.py @@ -1,105 +1,132 @@ #!/usr/bin/env python3 """ -Quick inference script using Transformers pipelines. +Quick inference using Transformers pipelines. -This script demonstrates how to use various pipeline tasks for quick inference -without manually managing models, tokenizers, or preprocessing. - -Usage: - python quick_inference.py --task text-generation --model gpt2 --input "Hello world" - python quick_inference.py --task sentiment-analysis --input "I love this!" +This script demonstrates how to quickly use pre-trained models for inference +across various tasks using the pipeline API. """ -import argparse -from transformers import pipeline, infer_device +from transformers import pipeline + + +def text_classification_example(): + """Sentiment analysis example.""" + print("=== Text Classification ===") + classifier = pipeline("text-classification") + result = classifier("I love using Transformers! It makes NLP so easy.") + print(f"Result: {result}\n") + + +def named_entity_recognition_example(): + """Named Entity Recognition example.""" + print("=== Named Entity Recognition ===") + ner = pipeline("token-classification", aggregation_strategy="simple") + text = "My name is Sarah and I work at Microsoft in Seattle" + entities = ner(text) + for entity in entities: + print(f"{entity['word']}: {entity['entity_group']} (score: {entity['score']:.3f})") + print() + + +def question_answering_example(): + """Question Answering example.""" + print("=== Question Answering ===") + qa = pipeline("question-answering") + context = "Paris is the capital and most populous city of France. It is located in northern France." + question = "What is the capital of France?" + answer = qa(question=question, context=context) + print(f"Question: {question}") + print(f"Answer: {answer['answer']} (score: {answer['score']:.3f})\n") + + +def text_generation_example(): + """Text generation example.""" + print("=== Text Generation ===") + generator = pipeline("text-generation", model="gpt2") + prompt = "Once upon a time in a land far away" + generated = generator(prompt, max_length=50, num_return_sequences=1) + print(f"Prompt: {prompt}") + print(f"Generated: {generated[0]['generated_text']}\n") + + +def summarization_example(): + """Text summarization example.""" + print("=== Summarization ===") + summarizer = pipeline("summarization") + article = """ + The Transformers library provides thousands of pretrained models to perform tasks + on texts such as classification, information extraction, question answering, + summarization, translation, text generation, etc in over 100 languages. Its aim + is to make cutting-edge NLP easier to use for everyone. The library provides APIs + to quickly download and use pretrained models on a given text, fine-tune them on + your own datasets then share them with the community on the model hub. + """ + summary = summarizer(article, max_length=50, min_length=25, do_sample=False) + print(f"Summary: {summary[0]['summary_text']}\n") + + +def translation_example(): + """Translation example.""" + print("=== Translation ===") + translator = pipeline("translation_en_to_fr") + text = "Hello, how are you today?" 
+    translation = translator(text)
+    print(f"English: {text}")
+    print(f"French: {translation[0]['translation_text']}\n")
+
+
+def zero_shot_classification_example():
+    """Zero-shot classification example."""
+    print("=== Zero-Shot Classification ===")
+    classifier = pipeline("zero-shot-classification")
+    text = "This is a breaking news story about a major earthquake."
+    candidate_labels = ["politics", "sports", "science", "breaking news"]
+    result = classifier(text, candidate_labels)
+    print(f"Text: {text}")
+    print("Predictions:")
+    for label, score in zip(result['labels'], result['scores']):
+        print(f"  {label}: {score:.3f}")
+    print()
+
+
+def image_classification_example():
+    """Image classification example (requires PIL and requests)."""
+    print("=== Image Classification ===")
+    try:
+        from PIL import Image
+        import requests
+
+        classifier = pipeline("image-classification")
+        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        image = Image.open(requests.get(url, stream=True).raw)
+        predictions = classifier(image)
+        print("Top predictions:")
+        for pred in predictions[:3]:
+            print(f"  {pred['label']}: {pred['score']:.3f}")
+        print()
+    except ImportError:
+        print("PIL or requests not installed. Skipping image classification example.\n")
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Quick inference with Transformers pipelines")
-    parser.add_argument(
-        "--task",
-        type=str,
-        required=True,
-        help="Pipeline task (text-generation, sentiment-analysis, question-answering, etc.)",
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        default=None,
-        help="Model name or path (default: use task default)",
-    )
-    parser.add_argument(
-        "--input",
-        type=str,
-        required=True,
-        help="Input text for inference",
-    )
-    parser.add_argument(
-        "--context",
-        type=str,
-        default=None,
-        help="Context for question-answering tasks",
-    )
-    parser.add_argument(
-        "--max-length",
-        type=int,
-        default=50,
-        help="Maximum generation length",
-    )
-    parser.add_argument(
-        "--device",
-        type=str,
-        default=None,
-        help="Device (cuda, cpu, or auto-detect)",
-    )
+    """Run all examples."""
+    print("Transformers Quick Inference Examples")
+    print("=" * 50 + "\n")
 
-    args = parser.parse_args()
+    # Text tasks
+    text_classification_example()
+    named_entity_recognition_example()
+    question_answering_example()
+    text_generation_example()
+    summarization_example()
+    translation_example()
+    zero_shot_classification_example()
 
-    # Auto-detect device if not specified
-    if args.device is None:
-        device = infer_device()
-    else:
-        device = args.device
+    # Vision task (optional)
+    image_classification_example()
 
-    print(f"Using device: {device}")
-    print(f"Task: {args.task}")
-    print(f"Model: {args.model or 'default'}")
-    print("-" * 50)
-
-    # Create pipeline
-    pipe = pipeline(
-        args.task,
-        model=args.model,
-        device=device,
-    )
-
-    # Run inference based on task
-    if args.task == "question-answering":
-        if args.context is None:
-            print("Error: --context required for question-answering")
-            return
-        result = pipe(question=args.input, context=args.context)
-        print(f"Question: {args.input}")
-        print(f"Context: {args.context}")
-        print(f"\nAnswer: {result['answer']}")
-        print(f"Score: {result['score']:.4f}")
-
-    elif args.task == "text-generation":
-        result = pipe(args.input, max_length=args.max_length)
-        print(f"Prompt: {args.input}")
-        print(f"\nGenerated: {result[0]['generated_text']}")
-
-    elif args.task in ["sentiment-analysis", "text-classification"]:
-        result = pipe(args.input)
-        print(f"Text: {args.input}")
-        print(f"\nLabel: {result[0]['label']}")
-        print(f"Score: {result[0]['score']:.4f}")
-
-    else:
-        # Generic handling for other tasks
-        result = pipe(args.input)
-        print(f"Input: {args.input}")
-        print(f"\nResult: {result}")
+    print("=" * 50)
+    print("All examples completed!")
 
 
 if __name__ == "__main__":