mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-01-26 16:58:56 +08:00
Update Huggingface Transformer
This commit is contained in:
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,22 +1,8 @@
|
||||
# Text Generation Strategies
|
||||
|
||||
Comprehensive guide to text generation methods in Transformers for controlling output quality, creativity, and diversity.
|
||||
Transformers provides flexible text generation capabilities through the `generate()` method, supporting multiple decoding strategies and configuration options.
|
||||
|
||||
## Overview
|
||||
|
||||
Text generation is the process of predicting tokens sequentially using a language model. The choice of generation strategy significantly impacts output quality, diversity, and computational cost.
|
||||
|
||||
**When to use each strategy:**
|
||||
- **Greedy**: Fast, deterministic, good for short outputs or when consistency is critical
|
||||
- **Beam Search**: Better quality for tasks with clear "correct" answers (translation, summarization)
|
||||
- **Sampling**: Creative, diverse outputs for open-ended generation (stories, dialogue)
|
||||
- **Top-k/Top-p**: Balanced creativity and coherence
|
||||
|
||||
## Basic Generation Methods
|
||||
|
||||
### Greedy Decoding
|
||||
|
||||
Selects the highest probability token at each step. Fast but prone to repetition and suboptimal sequences.
|
||||
## Basic Generation
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
@@ -24,507 +10,364 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
|
||||
inputs = tokenizer("The future of AI", return_tensors="pt")
|
||||
|
||||
# Greedy decoding (default)
|
||||
inputs = tokenizer("Once upon a time", return_tensors="pt")
|
||||
outputs = model.generate(**inputs, max_new_tokens=50)
|
||||
print(tokenizer.decode(outputs[0]))
|
||||
generated_text = tokenizer.decode(outputs[0])
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Deterministic (always same output for same input)
|
||||
- Fast (single forward pass per token)
|
||||
- Prone to repetition in longer sequences
|
||||
- Best for: Short generations, deterministic applications
|
||||
## Decoding Strategies
|
||||
|
||||
**Parameters:**
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50, # Number of tokens to generate
|
||||
min_length=10, # Minimum total length
|
||||
pad_token_id=tokenizer.pad_token_id,
|
||||
)
|
||||
```
|
||||
### 1. Greedy Decoding
|
||||
|
||||
### Beam Search
|
||||
|
||||
Maintains multiple hypotheses (beams) and selects the sequence with highest overall probability.
|
||||
Selects the token with highest probability at each step. Deterministic but can be repetitive.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
num_beams=5, # Number of beams
|
||||
early_stopping=True, # Stop when all beams finish
|
||||
no_repeat_ngram_size=2, # Prevent 2-gram repetition
|
||||
do_sample=False,
|
||||
num_beams=1 # Greedy is default when num_beams=1 and do_sample=False
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Higher quality than greedy for tasks with "correct" answers
|
||||
- Slower than greedy (num_beams forward passes per step)
|
||||
- Still can suffer from repetition
|
||||
- Best for: Translation, summarization, QA generation
|
||||
### 2. Beam Search
|
||||
|
||||
Explores multiple hypotheses simultaneously, keeping top-k candidates at each step.
|
||||
|
||||
**Advanced Parameters:**
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
num_beams=5, # Number of beams
|
||||
early_stopping=True, # Stop when all beams reach EOS
|
||||
no_repeat_ngram_size=2, # Prevent repeating n-grams
|
||||
)
|
||||
```
|
||||
|
||||
**Key parameters:**
|
||||
- `num_beams`: Number of beams (higher = more thorough but slower)
|
||||
- `early_stopping`: Stop when all beams finish (True/False)
|
||||
- `length_penalty`: Exponential penalty for length (>1.0 favors longer sequences)
|
||||
- `no_repeat_ngram_size`: Prevent repeating n-grams
|
||||
|
||||
### 3. Sampling (Multinomial)
|
||||
|
||||
Samples from probability distribution, introducing randomness and diversity.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
do_sample=True,
|
||||
temperature=0.7, # Controls randomness (lower = more focused)
|
||||
top_k=50, # Consider only top-k tokens
|
||||
top_p=0.9, # Nucleus sampling (cumulative probability threshold)
|
||||
)
|
||||
```
|
||||
|
||||
**Key parameters:**
|
||||
- `temperature`: Scales logits before softmax (0.1-2.0 typical range)
|
||||
- Lower (0.1-0.7): More focused, deterministic
|
||||
- Higher (0.8-1.5): More creative, random
|
||||
- `top_k`: Sample from top-k tokens only
|
||||
- `top_p`: Nucleus sampling - sample from smallest set with cumulative probability > p
|
||||
|
||||
### 4. Beam Search with Sampling
|
||||
|
||||
Combines beam search with sampling for diverse but coherent outputs.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
num_beams=5,
|
||||
num_beam_groups=1, # Diverse beam search groups
|
||||
diversity_penalty=0.0, # Penalty for similar beams
|
||||
length_penalty=1.0, # >1: longer sequences, <1: shorter
|
||||
early_stopping=True, # Stop when num_beams sequences finish
|
||||
no_repeat_ngram_size=2, # Block repeating n-grams
|
||||
num_return_sequences=1, # Return top-k sequences (≤ num_beams)
|
||||
)
|
||||
```
|
||||
|
||||
**Length Penalty:**
|
||||
- `length_penalty > 1.0`: Favor longer sequences
|
||||
- `length_penalty = 1.0`: No penalty
|
||||
- `length_penalty < 1.0`: Favor shorter sequences
|
||||
|
||||
### Sampling (Multinomial)
|
||||
|
||||
Randomly sample tokens according to the probability distribution.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
do_sample=True, # Enable sampling
|
||||
temperature=1.0, # Sampling temperature
|
||||
num_beams=1, # Must be 1 for sampling
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Non-deterministic (different output each time)
|
||||
- More diverse and creative than greedy/beam search
|
||||
- Can produce incoherent output if not controlled
|
||||
- Best for: Creative writing, dialogue, open-ended generation
|
||||
|
||||
**Temperature Parameter:**
|
||||
```python
|
||||
# Low temperature (0.1-0.7): More focused, less random
|
||||
outputs = model.generate(**inputs, do_sample=True, temperature=0.5)
|
||||
|
||||
# Medium temperature (0.7-1.0): Balanced
|
||||
outputs = model.generate(**inputs, do_sample=True, temperature=0.8)
|
||||
|
||||
# High temperature (1.0-2.0): More random, more creative
|
||||
outputs = model.generate(**inputs, do_sample=True, temperature=1.5)
|
||||
```
|
||||
|
||||
- `temperature → 0`: Approaches greedy decoding
|
||||
- `temperature = 1.0`: Sample from original distribution
|
||||
- `temperature > 1.0`: Flatter distribution, more random
|
||||
- `temperature < 1.0`: Sharper distribution, more confident
|
||||
|
||||
## Advanced Sampling Methods
|
||||
|
||||
### Top-k Sampling
|
||||
|
||||
Sample from only the k most likely tokens.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=50,
|
||||
top_k=50, # Consider top 50 tokens
|
||||
temperature=0.8,
|
||||
top_k=50,
|
||||
)
|
||||
```
|
||||
|
||||
**How it works:**
|
||||
1. Filter to top-k most probable tokens
|
||||
2. Renormalize probabilities
|
||||
3. Sample from filtered distribution
|
||||
### 5. Contrastive Search
|
||||
|
||||
**Choosing k:**
|
||||
- `k=1`: Equivalent to greedy decoding
|
||||
- `k=10-50`: More focused, coherent output
|
||||
- `k=100-500`: More diverse output
|
||||
- Too high k: Includes low-probability tokens (noise)
|
||||
- Too low k: Less diverse, may miss good alternatives
|
||||
|
||||
### Top-p (Nucleus) Sampling
|
||||
|
||||
Sample from the smallest set of tokens whose cumulative probability ≥ p.
|
||||
Balances coherence and diversity using contrastive objective.
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=50,
|
||||
top_p=0.95, # Nucleus probability
|
||||
temperature=0.8,
|
||||
penalty_alpha=0.6, # Contrastive penalty
|
||||
top_k=4, # Consider top-k candidates
|
||||
)
|
||||
```
|
||||
|
||||
**How it works:**
|
||||
1. Sort tokens by probability
|
||||
2. Find smallest set with cumulative probability ≥ p
|
||||
3. Sample from this set
|
||||
### 6. Assisted Decoding
|
||||
|
||||
**Choosing p:**
|
||||
- `p=0.9-0.95`: Good balance (recommended)
|
||||
- `p=1.0`: Sample from full distribution
|
||||
- Higher p: More diverse, might include unlikely tokens
|
||||
- Lower p: More focused, like top-k with adaptive k
|
||||
|
||||
**Top-p vs Top-k:**
|
||||
- Top-p adapts to probability distribution shape
|
||||
- Top-k is fixed regardless of distribution
|
||||
- Top-p generally better for variable-quality contexts
|
||||
- Can combine: `top_k=50, top_p=0.95` (apply both filters)
|
||||
|
||||
### Combining Strategies
|
||||
Uses a smaller "assistant" model to speed up generation of larger model.
|
||||
|
||||
```python
|
||||
# Recommended for high-quality open-ended generation
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
|
||||
assistant_model = AutoModelForCausalLM.from_pretrained("gpt2")
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
assistant_model=assistant_model,
|
||||
max_new_tokens=50,
|
||||
)
|
||||
```
|
||||
|
||||
## GenerationConfig
|
||||
|
||||
Configure generation parameters with `GenerationConfig` for reusability.
|
||||
|
||||
```python
|
||||
from transformers import GenerationConfig
|
||||
|
||||
generation_config = GenerationConfig(
|
||||
max_new_tokens=100,
|
||||
temperature=0.8, # Moderate temperature
|
||||
top_k=50, # Limit to top 50 tokens
|
||||
top_p=0.95, # Nucleus sampling
|
||||
repetition_penalty=1.2, # Discourage repetition
|
||||
no_repeat_ngram_size=3, # Block 3-gram repetition
|
||||
do_sample=True,
|
||||
temperature=0.7,
|
||||
top_p=0.9,
|
||||
top_k=50,
|
||||
repetition_penalty=1.2,
|
||||
no_repeat_ngram_size=3,
|
||||
)
|
||||
|
||||
# Use with model
|
||||
outputs = model.generate(**inputs, generation_config=generation_config)
|
||||
|
||||
# Save and load
|
||||
generation_config.save_pretrained("./config")
|
||||
loaded_config = GenerationConfig.from_pretrained("./config")
|
||||
```
|
||||
|
||||
## Controlling Generation Quality
|
||||
## Key Parameters Reference
|
||||
|
||||
### Output Length Control
|
||||
|
||||
- `max_length`: Maximum total tokens (input + output)
|
||||
- `max_new_tokens`: Maximum new tokens to generate (recommended over max_length)
|
||||
- `min_length`: Minimum total tokens
|
||||
- `min_new_tokens`: Minimum new tokens to generate
|
||||
|
||||
### Sampling Parameters
|
||||
|
||||
- `temperature`: Sampling temperature (0.1-2.0, default 1.0)
|
||||
- `top_k`: Top-k sampling (1-100, typically 50)
|
||||
- `top_p`: Nucleus sampling (0.0-1.0, typically 0.9)
|
||||
- `do_sample`: Enable sampling (True/False)
|
||||
|
||||
### Beam Search Parameters
|
||||
|
||||
- `num_beams`: Number of beams (1-20, typically 5)
|
||||
- `early_stopping`: Stop when beams finish (True/False)
|
||||
- `length_penalty`: Length penalty (>1.0 favors longer, <1.0 favors shorter)
|
||||
- `num_beam_groups`: Diverse beam search groups
|
||||
- `diversity_penalty`: Penalty for similar beams
|
||||
|
||||
### Repetition Control
|
||||
|
||||
Prevent models from repeating themselves:
|
||||
- `repetition_penalty`: Penalty for repeating tokens (1.0-2.0, default 1.0)
|
||||
- `no_repeat_ngram_size`: Prevent repeating n-grams (2-5 typical)
|
||||
- `encoder_repetition_penalty`: Penalty for repeating encoder tokens
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=100,
|
||||
### Special Tokens
|
||||
|
||||
# Method 1: Repetition penalty
|
||||
repetition_penalty=1.2, # Penalize repeated tokens (>1.0)
|
||||
- `bos_token_id`: Beginning of sequence token
|
||||
- `eos_token_id`: End of sequence token (or list of tokens)
|
||||
- `pad_token_id`: Padding token
|
||||
- `forced_bos_token_id`: Force specific token at beginning
|
||||
- `forced_eos_token_id`: Force specific token at end
|
||||
|
||||
# Method 2: Block n-gram repetition
|
||||
no_repeat_ngram_size=3, # Never repeat 3-grams
|
||||
### Multiple Sequences
|
||||
|
||||
# Method 3: Encoder repetition penalty (for seq2seq)
|
||||
encoder_repetition_penalty=1.0, # Penalize input tokens
|
||||
)
|
||||
```
|
||||
- `num_return_sequences`: Number of sequences to return
|
||||
- `num_beam_groups`: Number of diverse beam groups
|
||||
|
||||
**Repetition Penalty Values:**
|
||||
- `1.0`: No penalty
|
||||
- `1.0-1.5`: Mild penalty (recommended: 1.1-1.3)
|
||||
- `>1.5`: Strong penalty (may harm coherence)
|
||||
## Advanced Generation Techniques
|
||||
|
||||
### Length Control
|
||||
### Constrained Generation
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
|
||||
# Hard constraints
|
||||
min_length=20, # Minimum total length
|
||||
max_length=100, # Maximum total length
|
||||
max_new_tokens=50, # Maximum new tokens (excluding input)
|
||||
|
||||
# Soft constraints (with beam search)
|
||||
length_penalty=1.0, # Encourage longer/shorter outputs
|
||||
|
||||
# Early stopping
|
||||
early_stopping=True, # Stop when condition met
|
||||
)
|
||||
```
|
||||
|
||||
### Bad Words and Forced Tokens
|
||||
|
||||
```python
|
||||
# Prevent specific tokens
|
||||
bad_words_ids = [
|
||||
tokenizer.encode("badword1", add_special_tokens=False),
|
||||
tokenizer.encode("badword2", add_special_tokens=False),
|
||||
]
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
bad_words_ids=bad_words_ids,
|
||||
)
|
||||
|
||||
# Force specific tokens
|
||||
force_words_ids = [
|
||||
tokenizer.encode("important", add_special_tokens=False),
|
||||
]
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
force_words_ids=force_words_ids,
|
||||
)
|
||||
```
|
||||
|
||||
## Streaming Generation
|
||||
|
||||
Generate and process tokens as they're produced:
|
||||
|
||||
```python
|
||||
from transformers import TextStreamer, TextIteratorStreamer
|
||||
from threading import Thread
|
||||
|
||||
# Simple streaming (prints to stdout)
|
||||
streamer = TextStreamer(tokenizer, skip_prompt=True)
|
||||
outputs = model.generate(**inputs, streamer=streamer, max_new_tokens=100)
|
||||
|
||||
# Iterator streaming (for custom processing)
|
||||
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
|
||||
|
||||
generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=100)
|
||||
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
||||
thread.start()
|
||||
|
||||
for text in streamer:
|
||||
print(text, end="", flush=True)
|
||||
|
||||
thread.join()
|
||||
```
|
||||
|
||||
## Advanced Techniques
|
||||
|
||||
### Contrastive Search
|
||||
|
||||
Balance coherence and diversity using contrastive objective:
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
penalty_alpha=0.6, # Contrastive penalty
|
||||
top_k=4, # Consider top-4 tokens
|
||||
)
|
||||
```
|
||||
|
||||
**When to use:**
|
||||
- Open-ended text generation
|
||||
- Reduces repetition without sacrificing coherence
|
||||
- Good alternative to sampling
|
||||
|
||||
### Diverse Beam Search
|
||||
|
||||
Generate multiple diverse outputs:
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
num_beams=10,
|
||||
num_beam_groups=5, # 5 groups of 2 beams each
|
||||
diversity_penalty=1.0, # Penalty for similar beams
|
||||
num_return_sequences=5, # Return 5 diverse outputs
|
||||
)
|
||||
```
|
||||
|
||||
### Constrained Beam Search
|
||||
|
||||
Force output to include specific phrases:
|
||||
Force generation to include specific tokens or follow patterns.
|
||||
|
||||
```python
|
||||
from transformers import PhrasalConstraint
|
||||
|
||||
constraints = [
|
||||
PhrasalConstraint(
|
||||
tokenizer("machine learning", add_special_tokens=False).input_ids
|
||||
),
|
||||
PhrasalConstraint(tokenizer("New York", add_special_tokens=False).input_ids)
|
||||
]
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
constraints=constraints,
|
||||
num_beams=10, # Requires beam search
|
||||
num_beams=5,
|
||||
)
|
||||
```
|
||||
|
||||
## Speculative Decoding
|
||||
### Streaming Generation
|
||||
|
||||
Accelerate generation using a smaller draft model:
|
||||
Generate tokens one at a time for real-time display.
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM
|
||||
from transformers import TextIteratorStreamer
|
||||
from threading import Thread
|
||||
|
||||
# Load main and assistant models
|
||||
model = AutoModelForCausalLM.from_pretrained("large-model")
|
||||
assistant_model = AutoModelForCausalLM.from_pretrained("small-model")
|
||||
streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
|
||||
|
||||
generation_kwargs = dict(
|
||||
**inputs,
|
||||
max_new_tokens=100,
|
||||
streamer=streamer,
|
||||
)
|
||||
|
||||
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
||||
thread.start()
|
||||
|
||||
for new_text in streamer:
|
||||
print(new_text, end="", flush=True)
|
||||
|
||||
thread.join()
|
||||
```
|
||||
|
||||
### Logit Processors
|
||||
|
||||
Customize token selection with custom logit processors.
|
||||
|
||||
```python
|
||||
from transformers import LogitsProcessor, LogitsProcessorList
|
||||
|
||||
class CustomLogitsProcessor(LogitsProcessor):
|
||||
def __call__(self, input_ids, scores):
|
||||
# Modify scores here
|
||||
return scores
|
||||
|
||||
logits_processor = LogitsProcessorList([CustomLogitsProcessor()])
|
||||
|
||||
# Generate with speculative decoding
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
assistant_model=assistant_model,
|
||||
logits_processor=logits_processor,
|
||||
)
|
||||
```
|
||||
|
||||
### Stopping Criteria
|
||||
|
||||
Define custom stopping conditions.
|
||||
|
||||
```python
|
||||
from transformers import StoppingCriteria, StoppingCriteriaList
|
||||
|
||||
class CustomStoppingCriteria(StoppingCriteria):
|
||||
def __call__(self, input_ids, scores, **kwargs):
|
||||
# Return True to stop generation
|
||||
return False
|
||||
|
||||
stopping_criteria = StoppingCriteriaList([CustomStoppingCriteria()])
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
stopping_criteria=stopping_criteria,
|
||||
)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### For Creative Tasks (Stories, Dialogue)
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=200,
|
||||
do_sample=True,
|
||||
temperature=0.8,
|
||||
)
|
||||
```
|
||||
|
||||
**Benefits:**
|
||||
- 2-3x faster generation
|
||||
- Identical output distribution to regular generation
|
||||
- Works with sampling and greedy decoding
|
||||
|
||||
## Recipe: Recommended Settings by Task
|
||||
|
||||
### Creative Writing / Dialogue
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=200,
|
||||
temperature=0.9,
|
||||
top_p=0.95,
|
||||
top_k=50,
|
||||
repetition_penalty=1.2,
|
||||
no_repeat_ngram_size=3,
|
||||
)
|
||||
```
|
||||
|
||||
### Translation / Summarization
|
||||
|
||||
### For Factual Tasks (Summaries, QA)
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
num_beams=5,
|
||||
max_new_tokens=150,
|
||||
max_new_tokens=100,
|
||||
num_beams=4,
|
||||
early_stopping=True,
|
||||
length_penalty=1.0,
|
||||
no_repeat_ngram_size=2,
|
||||
length_penalty=1.0,
|
||||
)
|
||||
```
|
||||
|
||||
### Code Generation
|
||||
|
||||
### For Chat/Instruction Following
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=300,
|
||||
temperature=0.2, # Low temperature for correctness
|
||||
top_p=0.95,
|
||||
max_new_tokens=512,
|
||||
do_sample=True,
|
||||
)
|
||||
```
|
||||
|
||||
### Chatbot / Instruction Following
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=True,
|
||||
max_new_tokens=256,
|
||||
temperature=0.7,
|
||||
top_p=0.9,
|
||||
repetition_penalty=1.15,
|
||||
repetition_penalty=1.1,
|
||||
)
|
||||
```
|
||||
|
||||
### Factual QA / Information Extraction
|
||||
## Vision-Language Model Generation
|
||||
|
||||
For models like LLaVA, BLIP-2, etc.:
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=50,
|
||||
num_beams=3,
|
||||
early_stopping=True,
|
||||
# Or greedy for very short answers:
|
||||
# (no special parameters needed)
|
||||
)
|
||||
```
|
||||
from transformers import AutoProcessor, AutoModelForVision2Seq
|
||||
from PIL import Image
|
||||
|
||||
## Debugging Generation
|
||||
model = AutoModelForVision2Seq.from_pretrained("llava-hf/llava-1.5-7b-hf")
|
||||
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
|
||||
|
||||
### Check Token Probabilities
|
||||
|
||||
```python
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=20,
|
||||
output_scores=True, # Return generation scores
|
||||
return_dict_in_generate=True, # Return as dict
|
||||
)
|
||||
|
||||
# Access generation scores
|
||||
scores = outputs.scores # Tuple of tensors (seq_len, vocab_size)
|
||||
|
||||
# Get token probabilities
|
||||
import torch
|
||||
probs = torch.softmax(scores[0], dim=-1)
|
||||
```
|
||||
|
||||
### Monitor Generation Process
|
||||
|
||||
```python
|
||||
from transformers import LogitsProcessor, LogitsProcessorList
|
||||
|
||||
class DebugLogitsProcessor(LogitsProcessor):
|
||||
def __call__(self, input_ids, scores):
|
||||
# Print top 5 tokens at each step
|
||||
top_tokens = scores[0].topk(5)
|
||||
print(f"Top 5 tokens: {top_tokens}")
|
||||
return scores
|
||||
image = Image.open("image.jpg")
|
||||
inputs = processor(text="Describe this image", images=image, return_tensors="pt")
|
||||
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=10,
|
||||
logits_processor=LogitsProcessorList([DebugLogitsProcessor()]),
|
||||
max_new_tokens=100,
|
||||
do_sample=True,
|
||||
temperature=0.7,
|
||||
)
|
||||
|
||||
generated_text = processor.decode(outputs[0], skip_special_tokens=True)
|
||||
```
|
||||
|
||||
## Common Issues and Solutions
|
||||
|
||||
**Issue: Repetitive output**
|
||||
- Solution: Increase `repetition_penalty` (1.2-1.5), set `no_repeat_ngram_size=3`
|
||||
- For sampling: Increase `temperature`, enable `top_p`
|
||||
|
||||
**Issue: Incoherent output**
|
||||
- Solution: Lower `temperature` (0.5-0.8), use beam search
|
||||
- Set `top_k=50` or `top_p=0.9` to filter unlikely tokens
|
||||
|
||||
**Issue: Too short output**
|
||||
- Solution: Increase `min_length`, set `length_penalty > 1.0` (beam search)
|
||||
- Check if EOS token is being generated early
|
||||
|
||||
**Issue: Too slow generation**
|
||||
- Solution: Use greedy instead of beam search
|
||||
- Reduce `num_beams`
|
||||
- Try speculative decoding with assistant model
|
||||
- Use smaller model variant
|
||||
|
||||
**Issue: Output doesn't follow format**
|
||||
- Solution: Use constrained beam search
|
||||
- Add format examples to prompt
|
||||
- Use `bad_words_ids` to prevent format-breaking tokens
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
### Use KV Cache
|
||||
```python
|
||||
# Use half precision
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"model-name",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
# Use KV cache optimization (default, but can be disabled)
|
||||
# KV cache is enabled by default
|
||||
outputs = model.generate(**inputs, use_cache=True)
|
||||
|
||||
# Batch generation
|
||||
inputs = tokenizer(["Prompt 1", "Prompt 2"], return_tensors="pt", padding=True)
|
||||
outputs = model.generate(**inputs, max_new_tokens=50)
|
||||
|
||||
# Static cache for longer sequences (if supported)
|
||||
outputs = model.generate(**inputs, cache_implementation="static")
|
||||
```
|
||||
|
||||
This guide covers the main generation strategies. For task-specific examples, see `task_patterns.md`.
|
||||
### Mixed Precision
|
||||
```python
|
||||
import torch
|
||||
|
||||
with torch.cuda.amp.autocast():
|
||||
outputs = model.generate(**inputs, max_new_tokens=100)
|
||||
```
|
||||
|
||||
### Batch Generation
|
||||
```python
|
||||
texts = ["Prompt 1", "Prompt 2", "Prompt 3"]
|
||||
inputs = tokenizer(texts, return_tensors="pt", padding=True)
|
||||
outputs = model.generate(**inputs, max_new_tokens=50)
|
||||
```
|
||||
|
||||
### Quantization
|
||||
```python
|
||||
from transformers import BitsAndBytesConfig
|
||||
|
||||
quantization_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_compute_dtype=torch.float16
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=quantization_config,
|
||||
device_map="auto"
|
||||
)
|
||||
```
|
||||
|
||||
234
scientific-packages/transformers/references/pipelines.md
Normal file
234
scientific-packages/transformers/references/pipelines.md
Normal file
@@ -0,0 +1,234 @@
|
||||
# Transformers Pipelines
|
||||
|
||||
Pipelines provide a simple and optimized interface for inference across many machine learning tasks. They abstract away the complexity of tokenization, model invocation, and post-processing.
|
||||
|
||||
## Usage Pattern
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
# Basic usage
|
||||
classifier = pipeline("text-classification")
|
||||
result = classifier("This movie was amazing!")
|
||||
|
||||
# With specific model
|
||||
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
|
||||
result = classifier("This movie was amazing!")
|
||||
```
|
||||
|
||||
## Natural Language Processing Pipelines
|
||||
|
||||
### Text Classification
|
||||
```python
|
||||
classifier = pipeline("text-classification")
|
||||
classifier("I love this product!")
|
||||
# [{'label': 'POSITIVE', 'score': 0.9998}]
|
||||
```
|
||||
|
||||
### Zero-Shot Classification
|
||||
```python
|
||||
classifier = pipeline("zero-shot-classification")
|
||||
classifier("This is about climate change", candidate_labels=["politics", "science", "sports"])
|
||||
```
|
||||
|
||||
### Token Classification (NER)
|
||||
```python
|
||||
ner = pipeline("token-classification")
|
||||
ner("My name is Sarah and I work at Microsoft in Seattle")
|
||||
```
|
||||
|
||||
### Question Answering
|
||||
```python
|
||||
qa = pipeline("question-answering")
|
||||
qa(question="What is the capital?", context="The capital of France is Paris.")
|
||||
```
|
||||
|
||||
### Text Generation
|
||||
```python
|
||||
generator = pipeline("text-generation")
|
||||
generator("Once upon a time", max_length=50)
|
||||
```
|
||||
|
||||
### Text2Text Generation
|
||||
```python
|
||||
generator = pipeline("text2text-generation", model="t5-base")
|
||||
generator("translate English to French: Hello")
|
||||
```
|
||||
|
||||
### Summarization
|
||||
```python
|
||||
summarizer = pipeline("summarization")
|
||||
summarizer("Long article text here...", max_length=130, min_length=30)
|
||||
```
|
||||
|
||||
### Translation
|
||||
```python
|
||||
translator = pipeline("translation_en_to_fr")
|
||||
translator("Hello, how are you?")
|
||||
```
|
||||
|
||||
### Fill Mask
|
||||
```python
|
||||
unmasker = pipeline("fill-mask")
|
||||
unmasker("Paris is the [MASK] of France.")
|
||||
```
|
||||
|
||||
### Feature Extraction
|
||||
```python
|
||||
extractor = pipeline("feature-extraction")
|
||||
embeddings = extractor("This is a sentence")
|
||||
```
|
||||
|
||||
### Document Question Answering
|
||||
```python
|
||||
doc_qa = pipeline("document-question-answering")
|
||||
doc_qa(image="document.png", question="What is the invoice number?")
|
||||
```
|
||||
|
||||
### Table Question Answering
|
||||
```python
|
||||
table_qa = pipeline("table-question-answering")
|
||||
table_qa(table=data, query="How many employees?")
|
||||
```
|
||||
|
||||
## Computer Vision Pipelines
|
||||
|
||||
### Image Classification
|
||||
```python
|
||||
classifier = pipeline("image-classification")
|
||||
classifier("cat.jpg")
|
||||
```
|
||||
|
||||
### Zero-Shot Image Classification
|
||||
```python
|
||||
classifier = pipeline("zero-shot-image-classification")
|
||||
classifier("cat.jpg", candidate_labels=["cat", "dog", "bird"])
|
||||
```
|
||||
|
||||
### Object Detection
|
||||
```python
|
||||
detector = pipeline("object-detection")
|
||||
detector("street.jpg")
|
||||
```
|
||||
|
||||
### Image Segmentation
|
||||
```python
|
||||
segmenter = pipeline("image-segmentation")
|
||||
segmenter("image.jpg")
|
||||
```
|
||||
|
||||
### Image-to-Image
|
||||
```python
|
||||
img2img = pipeline("image-to-image", model="lllyasviel/sd-controlnet-canny")
|
||||
img2img("input.jpg")
|
||||
```
|
||||
|
||||
### Depth Estimation
|
||||
```python
|
||||
depth = pipeline("depth-estimation")
|
||||
depth("image.jpg")
|
||||
```
|
||||
|
||||
### Video Classification
|
||||
```python
|
||||
classifier = pipeline("video-classification")
|
||||
classifier("video.mp4")
|
||||
```
|
||||
|
||||
### Keypoint Matching
|
||||
```python
|
||||
matcher = pipeline("keypoint-matching")
|
||||
matcher(image1="img1.jpg", image2="img2.jpg")
|
||||
```
|
||||
|
||||
## Audio Pipelines
|
||||
|
||||
### Automatic Speech Recognition
|
||||
```python
|
||||
asr = pipeline("automatic-speech-recognition")
|
||||
asr("audio.wav")
|
||||
```
|
||||
|
||||
### Audio Classification
|
||||
```python
|
||||
classifier = pipeline("audio-classification")
|
||||
classifier("audio.wav")
|
||||
```
|
||||
|
||||
### Zero-Shot Audio Classification
|
||||
```python
|
||||
classifier = pipeline("zero-shot-audio-classification")
|
||||
classifier("audio.wav", candidate_labels=["speech", "music", "noise"])
|
||||
```
|
||||
|
||||
### Text-to-Audio/Text-to-Speech
|
||||
```python
|
||||
synthesizer = pipeline("text-to-audio")
|
||||
audio = synthesizer("Hello, how are you today?")
|
||||
```
|
||||
|
||||
## Multimodal Pipelines
|
||||
|
||||
### Image-to-Text (Image Captioning)
|
||||
```python
|
||||
captioner = pipeline("image-to-text")
|
||||
captioner("image.jpg")
|
||||
```
|
||||
|
||||
### Visual Question Answering
|
||||
```python
|
||||
vqa = pipeline("visual-question-answering")
|
||||
vqa(image="image.jpg", question="What color is the car?")
|
||||
```
|
||||
|
||||
### Image-Text-to-Text (VLMs)
|
||||
```python
|
||||
vlm = pipeline("image-text-to-text")
|
||||
vlm(images="image.jpg", text="Describe this image in detail")
|
||||
```
|
||||
|
||||
### Zero-Shot Object Detection
|
||||
```python
|
||||
detector = pipeline("zero-shot-object-detection")
|
||||
detector("image.jpg", candidate_labels=["car", "person", "tree"])
|
||||
```
|
||||
|
||||
## Pipeline Configuration
|
||||
|
||||
### Common Parameters
|
||||
|
||||
- `model`: Specify model identifier or path
|
||||
- `device`: Set device (0 for GPU, -1 for CPU, or "cuda:0")
|
||||
- `batch_size`: Process multiple inputs at once
|
||||
- `torch_dtype`: Set precision (torch.float16, torch.bfloat16)
|
||||
|
||||
```python
|
||||
# GPU with half precision
|
||||
pipe = pipeline("text-generation", model="gpt2", device=0, torch_dtype=torch.float16)
|
||||
|
||||
# Batch processing
|
||||
pipe(["text 1", "text 2", "text 3"], batch_size=8)
|
||||
```
|
||||
|
||||
### Task-Specific Parameters
|
||||
|
||||
Each pipeline accepts task-specific parameters in the call:
|
||||
|
||||
```python
|
||||
# Text generation
|
||||
generator("prompt", max_length=100, temperature=0.7, top_p=0.9, num_return_sequences=3)
|
||||
|
||||
# Summarization
|
||||
summarizer("text", max_length=130, min_length=30, do_sample=False)
|
||||
|
||||
# Translation
|
||||
translator("text", max_length=512, num_beams=4)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Reuse pipelines**: Create once, use multiple times for efficiency
|
||||
2. **Batch processing**: Use batches for multiple inputs to maximize throughput
|
||||
3. **GPU acceleration**: Set `device=0` for GPU when available
|
||||
4. **Model selection**: Choose task-specific models for best results
|
||||
5. **Memory management**: Use `torch_dtype=torch.float16` for large models
|
||||
@@ -1,504 +0,0 @@
|
||||
# Model Quantization Guide
|
||||
|
||||
Comprehensive guide to reducing model memory footprint through quantization while maintaining accuracy.
|
||||
|
||||
## Overview
|
||||
|
||||
Quantization reduces memory requirements by storing model weights in lower precision formats (int8, int4) instead of full precision (float32). This enables:
|
||||
- Running larger models on limited hardware
|
||||
- Faster inference (reduced memory bandwidth)
|
||||
- Lower deployment costs
|
||||
- Enabling fine-tuning of models that wouldn't fit in memory
|
||||
|
||||
**Tradeoffs:**
|
||||
- Slight accuracy loss (typically < 1-2%)
|
||||
- Initial quantization overhead
|
||||
- Some methods require calibration data
|
||||
|
||||
## Quick Comparison
|
||||
|
||||
| Method | Precision | Speed | Accuracy | Fine-tuning | Hardware | Setup |
|
||||
|--------|-----------|-------|----------|-------------|----------|-------|
|
||||
| **Bitsandbytes** | 4/8-bit | Fast | High | Yes (PEFT) | CUDA, CPU | Easy |
|
||||
| **GPTQ** | 2-8-bit | Very Fast | High | Limited | CUDA, ROCm, Metal | Medium |
|
||||
| **AWQ** | 4-bit | Very Fast | High | Yes (PEFT) | CUDA, ROCm | Medium |
|
||||
| **GGUF** | 1-8-bit | Medium | Variable | No | CPU-optimized | Easy |
|
||||
| **HQQ** | 1-8-bit | Fast | High | Yes | Multi-platform | Medium |
|
||||
|
||||
## Bitsandbytes (BnB)
|
||||
|
||||
On-the-fly quantization with excellent PEFT fine-tuning support.
|
||||
|
||||
### 8-bit Quantization
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
load_in_8bit=True, # Enable 8-bit quantization
|
||||
device_map="auto", # Automatic device placement
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
|
||||
# Use normally
|
||||
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
|
||||
outputs = model.generate(**inputs, max_new_tokens=50)
|
||||
```
|
||||
|
||||
**Memory Savings:**
|
||||
- 7B model: ~14GB → ~7GB (50% reduction)
|
||||
- 13B model: ~26GB → ~13GB
|
||||
- 70B model: ~140GB → ~70GB
|
||||
|
||||
**Characteristics:**
|
||||
- Fast inference
|
||||
- Minimal accuracy loss
|
||||
- Works with PEFT (LoRA, QLoRA)
|
||||
- Supports CPU and CUDA GPUs
|
||||
|
||||
### 4-bit Quantization (QLoRA)
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
|
||||
import torch
|
||||
|
||||
# Configure 4-bit quantization
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True, # Enable 4-bit quantization
|
||||
bnb_4bit_quant_type="nf4", # Quantization type ("nf4" or "fp4")
|
||||
bnb_4bit_compute_dtype=torch.float16, # Computation dtype
|
||||
bnb_4bit_use_double_quant=True, # Nested quantization for more savings
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=bnb_config,
|
||||
device_map="auto",
|
||||
)
|
||||
```
|
||||
|
||||
**Memory Savings:**
|
||||
- 7B model: ~14GB → ~4GB (70% reduction)
|
||||
- 13B model: ~26GB → ~7GB
|
||||
- 70B model: ~140GB → ~35GB
|
||||
|
||||
**Quantization Types:**
|
||||
- `nf4`: Normal Float 4 (recommended, better quality)
|
||||
- `fp4`: Float Point 4 (slightly more memory efficient)
|
||||
|
||||
**Compute Dtype:**
|
||||
```python
|
||||
# For better quality
|
||||
bnb_4bit_compute_dtype=torch.float16
|
||||
|
||||
# For best performance on Ampere+ GPUs
|
||||
bnb_4bit_compute_dtype=torch.bfloat16
|
||||
```
|
||||
|
||||
**Double Quantization:**
|
||||
```python
|
||||
# Enable for additional ~0.4 bits/param savings
|
||||
bnb_4bit_use_double_quant=True # Quantize the quantization constants
|
||||
```
|
||||
|
||||
### Fine-tuning with QLoRA
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
|
||||
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
||||
import torch
|
||||
|
||||
# Load quantized model
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.float16,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=bnb_config,
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
# Prepare for training
|
||||
model = prepare_model_for_kbit_training(model)
|
||||
|
||||
# Configure LoRA
|
||||
lora_config = LoraConfig(
|
||||
r=16,
|
||||
lora_alpha=32,
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
|
||||
lora_dropout=0.05,
|
||||
bias="none",
|
||||
task_type="CAUSAL_LM"
|
||||
)
|
||||
|
||||
model = get_peft_model(model, lora_config)
|
||||
|
||||
# Train normally
|
||||
trainer = Trainer(model=model, args=training_args, ...)
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
## GPTQ
|
||||
|
||||
Post-training quantization requiring calibration, optimized for inference speed.
|
||||
|
||||
### Loading GPTQ Models
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
|
||||
|
||||
# Load pre-quantized GPTQ model
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"TheBloke/Llama-2-7B-GPTQ", # Pre-quantized model
|
||||
device_map="auto",
|
||||
revision="gptq-4bit-32g-actorder_True", # Specific quantization config
|
||||
)
|
||||
|
||||
# Or quantize yourself
|
||||
gptq_config = GPTQConfig(
|
||||
bits=4, # 2, 3, 4, 8 bits
|
||||
dataset="c4", # Calibration dataset
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
device_map="auto",
|
||||
quantization_config=gptq_config,
|
||||
)
|
||||
|
||||
# Save quantized model
|
||||
model.save_pretrained("llama-2-7b-gptq")
|
||||
```
|
||||
|
||||
**Configuration Options:**
|
||||
```python
|
||||
gptq_config = GPTQConfig(
|
||||
bits=4, # Quantization bits
|
||||
group_size=128, # Group size for quantization (128, 32, -1)
|
||||
dataset="c4", # Calibration dataset
|
||||
desc_act=False, # Activation order (can improve accuracy)
|
||||
sym=True, # Symmetric quantization
|
||||
damp_percent=0.1, # Dampening factor
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Fastest inference among quantization methods
|
||||
- Requires one-time calibration (slow)
|
||||
- Best when using pre-quantized models from Hub
|
||||
- Limited fine-tuning support
|
||||
- Excellent for production deployment
|
||||
|
||||
## AWQ (Activation-aware Weight Quantization)
|
||||
|
||||
Protects important weights for better quality.
|
||||
|
||||
### Loading AWQ Models
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AwqConfig
|
||||
|
||||
# Load pre-quantized AWQ model
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"TheBloke/Llama-2-7B-AWQ",
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
# Or quantize yourself
|
||||
awq_config = AwqConfig(
|
||||
bits=4, # 4-bit quantization
|
||||
group_size=128, # Quantization group size
|
||||
zero_point=True, # Use zero-point quantization
|
||||
version="GEMM", # Quantization version
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=awq_config,
|
||||
device_map="auto",
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Better accuracy than GPTQ at same bit width
|
||||
- Excellent inference speed
|
||||
- Supports PEFT fine-tuning
|
||||
- Requires calibration data
|
||||
|
||||
### Fine-tuning AWQ Models
|
||||
|
||||
```python
|
||||
from peft import LoraConfig, get_peft_model
|
||||
|
||||
# AWQ models support LoRA fine-tuning
|
||||
lora_config = LoraConfig(
|
||||
r=16,
|
||||
lora_alpha=32,
|
||||
target_modules=["q_proj", "v_proj"],
|
||||
lora_dropout=0.05,
|
||||
task_type="CAUSAL_LM"
|
||||
)
|
||||
|
||||
model = get_peft_model(model, lora_config)
|
||||
trainer = Trainer(model=model, ...)
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
## GGUF (GGML Format)
|
||||
|
||||
CPU-optimized quantization format, popular in llama.cpp ecosystem.
|
||||
|
||||
### Using GGUF Models
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# Load GGUF model
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"TheBloke/Llama-2-7B-GGUF",
|
||||
gguf_file="llama-2-7b.Q4_K_M.gguf", # Specific quantization file
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7B-GGUF")
|
||||
```
|
||||
|
||||
**GGUF Quantization Types:**
|
||||
- `Q4_0`: 4-bit, smallest, lowest quality
|
||||
- `Q4_K_M`: 4-bit, medium quality (recommended)
|
||||
- `Q5_K_M`: 5-bit, good quality
|
||||
- `Q6_K`: 6-bit, high quality
|
||||
- `Q8_0`: 8-bit, very high quality
|
||||
|
||||
**Characteristics:**
|
||||
- Optimized for CPU inference
|
||||
- Wide range of bit depths (1-8)
|
||||
- Good for Apple Silicon (M1/M2)
|
||||
- No fine-tuning support
|
||||
- Excellent for local/edge deployment
|
||||
|
||||
## HQQ (Half-Quadratic Quantization)
|
||||
|
||||
Flexible quantization with good accuracy retention.
|
||||
|
||||
### Using HQQ
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, HqqConfig
|
||||
|
||||
hqq_config = HqqConfig(
|
||||
nbits=4, # Quantization bits
|
||||
group_size=64, # Group size
|
||||
quant_zero=False, # Quantize zero point
|
||||
quant_scale=False, # Quantize scale
|
||||
axis=0, # Quantization axis
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=hqq_config,
|
||||
device_map="auto",
|
||||
)
|
||||
```
|
||||
|
||||
**Characteristics:**
|
||||
- Very fast quantization
|
||||
- No calibration data needed
|
||||
- Support for 1-8 bits
|
||||
- Can serialize/deserialize
|
||||
- Good accuracy vs size tradeoff
|
||||
|
||||
## Choosing a Quantization Method
|
||||
|
||||
### Decision Tree
|
||||
|
||||
**For inference only:**
|
||||
1. Need fastest inference? → **GPTQ or AWQ** (use pre-quantized models)
|
||||
2. CPU-only deployment? → **GGUF**
|
||||
3. Want easiest setup? → **Bitsandbytes 8-bit**
|
||||
4. Need extreme compression? → **GGUF Q4_0 or HQQ 2-bit**
|
||||
|
||||
**For fine-tuning:**
|
||||
1. Limited VRAM? → **QLoRA (BnB 4-bit + LoRA)**
|
||||
2. Want best accuracy? → **Bitsandbytes 8-bit + LoRA**
|
||||
3. Need very large models? → **QLoRA with double quantization**
|
||||
|
||||
**For production:**
|
||||
1. Latency-critical? → **GPTQ or AWQ**
|
||||
2. Cost-optimized? → **Bitsandbytes 8-bit**
|
||||
3. CPU deployment? → **GGUF**
|
||||
|
||||
## Memory Requirements
|
||||
|
||||
Approximate memory for Llama-2 7B model:
|
||||
|
||||
| Method | Memory | vs FP16 |
|
||||
|--------|--------|---------|
|
||||
| FP32 | 28GB | 2x |
|
||||
| FP16 / BF16 | 14GB | 1x |
|
||||
| 8-bit (BnB) | 7GB | 0.5x |
|
||||
| 4-bit (QLoRA) | 3.5GB | 0.25x |
|
||||
| 4-bit Double Quant | 3GB | 0.21x |
|
||||
| GPTQ 4-bit | 4GB | 0.29x |
|
||||
| AWQ 4-bit | 4GB | 0.29x |
|
||||
|
||||
**Note:** Add ~1-2GB for inference activations, KV cache, and framework overhead.
|
||||
|
||||
## Best Practices
|
||||
|
||||
### For Training
|
||||
|
||||
```python
|
||||
# QLoRA recommended configuration
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.bfloat16, # BF16 if available
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
# LoRA configuration
|
||||
lora_config = LoraConfig(
|
||||
r=16, # Rank (8, 16, 32, 64)
|
||||
lora_alpha=32, # Scaling (typically 2*r)
|
||||
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
|
||||
lora_dropout=0.05,
|
||||
bias="none",
|
||||
task_type="CAUSAL_LM"
|
||||
)
|
||||
```
|
||||
|
||||
### For Inference
|
||||
|
||||
```python
|
||||
# High-speed inference
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"TheBloke/Llama-2-7B-GPTQ",
|
||||
device_map="auto",
|
||||
torch_dtype=torch.float16, # Use FP16 for activations
|
||||
)
|
||||
|
||||
# Balanced quality/speed
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
load_in_8bit=True,
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
# Maximum compression
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.float16,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
),
|
||||
device_map="auto",
|
||||
)
|
||||
```
|
||||
|
||||
### Multi-GPU Setups
|
||||
|
||||
```python
|
||||
# Automatically distribute across GPUs
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-70b-hf",
|
||||
load_in_4bit=True,
|
||||
device_map="auto", # Automatic distribution
|
||||
max_memory={0: "20GB", 1: "20GB"}, # Optional: limit per GPU
|
||||
)
|
||||
|
||||
# Manual device map
|
||||
device_map = {
|
||||
"model.embed_tokens": 0,
|
||||
"model.layers.0": 0,
|
||||
"model.layers.1": 0,
|
||||
# ... distribute layers ...
|
||||
"model.norm": 1,
|
||||
"lm_head": 1,
|
||||
}
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"meta-llama/Llama-2-70b-hf",
|
||||
load_in_4bit=True,
|
||||
device_map=device_map,
|
||||
)
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Issue: OOM during quantization**
|
||||
```python
|
||||
# Solution: Use low_cpu_mem_usage
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"model-name",
|
||||
quantization_config=config,
|
||||
device_map="auto",
|
||||
low_cpu_mem_usage=True, # Reduce CPU memory during loading
|
||||
)
|
||||
```
|
||||
|
||||
**Issue: Slow quantization**
|
||||
```python
|
||||
# GPTQ/AWQ take time to calibrate
|
||||
# Solution: Use pre-quantized models from Hub
|
||||
model = AutoModelForCausalLM.from_pretrained("TheBloke/Model-GPTQ")
|
||||
|
||||
# Or use BnB for instant quantization
|
||||
model = AutoModelForCausalLM.from_pretrained("model-name", load_in_4bit=True)
|
||||
```
|
||||
|
||||
**Issue: Poor quality after quantization**
|
||||
```python
|
||||
# Try different quantization types
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4", # Try "nf4" instead of "fp4"
|
||||
bnb_4bit_compute_dtype=torch.bfloat16, # Use BF16 if available
|
||||
)
|
||||
|
||||
# Or use 8-bit instead of 4-bit
|
||||
model = AutoModelForCausalLM.from_pretrained("model-name", load_in_8bit=True)
|
||||
```
|
||||
|
||||
**Issue: Can't fine-tune quantized model**
|
||||
```python
|
||||
# Ensure using compatible quantization method
|
||||
from peft import prepare_model_for_kbit_training
|
||||
|
||||
model = prepare_model_for_kbit_training(model)
|
||||
|
||||
# Only BnB and AWQ support PEFT fine-tuning
|
||||
# GPTQ has limited support, GGUF doesn't support fine-tuning
|
||||
```
|
||||
|
||||
## Performance Benchmarks
|
||||
|
||||
Approximate generation speed (tokens/sec) for Llama-2 7B on A100 40GB:
|
||||
|
||||
| Method | Speed | Memory |
|
||||
|--------|-------|--------|
|
||||
| FP16 | 100 tok/s | 14GB |
|
||||
| 8-bit | 90 tok/s | 7GB |
|
||||
| 4-bit QLoRA | 70 tok/s | 4GB |
|
||||
| GPTQ 4-bit | 95 tok/s | 4GB |
|
||||
| AWQ 4-bit | 95 tok/s | 4GB |
|
||||
|
||||
**Note:** Actual performance varies by hardware, sequence length, and batch size.
|
||||
|
||||
## Resources
|
||||
|
||||
- **Pre-quantized models:** Search "GPTQ" or "AWQ" on Hugging Face Hub
|
||||
- **BnB documentation:** https://github.com/TimDettmers/bitsandbytes
|
||||
- **PEFT library:** https://github.com/huggingface/peft
|
||||
- **QLoRA paper:** https://arxiv.org/abs/2305.14314
|
||||
|
||||
For task-specific quantization examples, see `training_guide.md`.
|
||||
File diff suppressed because it is too large
Load Diff
328
scientific-packages/transformers/references/training.md
Normal file
328
scientific-packages/transformers/references/training.md
Normal file
@@ -0,0 +1,328 @@
|
||||
# Training with Transformers
|
||||
|
||||
Transformers provides comprehensive training capabilities through the `Trainer` API, supporting distributed training, mixed precision, and advanced optimization techniques.
|
||||
|
||||
## Basic Training Workflow
|
||||
|
||||
```python
|
||||
from transformers import (
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
Trainer,
|
||||
TrainingArguments
|
||||
)
|
||||
from datasets import load_dataset
|
||||
|
||||
# 1. Load and preprocess data
|
||||
dataset = load_dataset("imdb")
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
||||
|
||||
def tokenize_function(examples):
|
||||
return tokenizer(examples["text"], padding="max_length", truncation=True)
|
||||
|
||||
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
||||
|
||||
# 2. Load model
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
"bert-base-uncased",
|
||||
num_labels=2
|
||||
)
|
||||
|
||||
# 3. Define training arguments
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
num_train_epochs=3,
|
||||
per_device_train_batch_size=16,
|
||||
per_device_eval_batch_size=64,
|
||||
learning_rate=2e-5,
|
||||
eval_strategy="epoch",
|
||||
save_strategy="epoch",
|
||||
load_best_model_at_end=True,
|
||||
)
|
||||
|
||||
# 4. Create trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=tokenized_datasets["train"],
|
||||
eval_dataset=tokenized_datasets["test"],
|
||||
)
|
||||
|
||||
# 5. Train
|
||||
trainer.train()
|
||||
|
||||
# 6. Evaluate
|
||||
trainer.evaluate()
|
||||
|
||||
# 7. Save model
|
||||
trainer.save_model("./final_model")
|
||||
```
|
||||
|
||||
## TrainingArguments Configuration
|
||||
|
||||
### Essential Parameters
|
||||
|
||||
**Output and Logging:**
|
||||
- `output_dir`: Directory for checkpoints and outputs (required)
|
||||
- `logging_dir`: TensorBoard log directory (default: `{output_dir}/runs`)
|
||||
- `logging_steps`: Log every N steps (default: 500)
|
||||
- `logging_strategy`: "steps" or "epoch"
|
||||
|
||||
**Training Duration:**
|
||||
- `num_train_epochs`: Number of epochs (default: 3.0)
|
||||
- `max_steps`: Max training steps (overrides num_train_epochs if set)
|
||||
|
||||
**Batch Size and Gradient Accumulation:**
|
||||
- `per_device_train_batch_size`: Batch size per device (default: 8)
|
||||
- `per_device_eval_batch_size`: Eval batch size per device (default: 8)
|
||||
- `gradient_accumulation_steps`: Accumulate gradients over N steps (default: 1)
|
||||
- Effective batch size = `per_device_train_batch_size * gradient_accumulation_steps * num_gpus`
|
||||
|
||||
**Learning Rate:**
|
||||
- `learning_rate`: Peak learning rate (default: 5e-5)
|
||||
- `lr_scheduler_type`: Scheduler type ("linear", "cosine", "constant", etc.)
|
||||
- `warmup_steps`: Warmup steps (default: 0)
|
||||
- `warmup_ratio`: Warmup as fraction of total steps
|
||||
|
||||
**Evaluation:**
|
||||
- `eval_strategy`: "no", "steps", or "epoch" (default: "no")
|
||||
- `eval_steps`: Evaluate every N steps (if eval_strategy="steps")
|
||||
- `eval_delay`: Delay evaluation until N steps
|
||||
|
||||
**Checkpointing:**
|
||||
- `save_strategy`: "no", "steps", or "epoch" (default: "steps")
|
||||
- `save_steps`: Save checkpoint every N steps (default: 500)
|
||||
- `save_total_limit`: Keep only N most recent checkpoints
|
||||
- `load_best_model_at_end`: Load best checkpoint at end (default: False)
|
||||
- `metric_for_best_model`: Metric to determine best model
|
||||
|
||||
**Optimization:**
|
||||
- `optim`: Optimizer ("adamw_torch", "adamw_hf", "sgd", etc.)
|
||||
- `weight_decay`: Weight decay coefficient (default: 0.0)
|
||||
- `adam_beta1`, `adam_beta2`: Adam optimizer betas
|
||||
- `adam_epsilon`: Epsilon for Adam (default: 1e-8)
|
||||
- `max_grad_norm`: Max gradient norm for clipping (default: 1.0)
|
||||
|
||||
### Mixed Precision Training
|
||||
|
||||
```python
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
fp16=True, # Use fp16 on NVIDIA GPUs
|
||||
fp16_opt_level="O1", # O0, O1, O2, O3 (Apex levels)
|
||||
# or
|
||||
bf16=True, # Use bf16 on Ampere+ GPUs (better than fp16)
|
||||
)
|
||||
```
|
||||
|
||||
### Distributed Training
|
||||
|
||||
**DataParallel (single-node multi-GPU):**
|
||||
```python
|
||||
# Automatic with multiple GPUs
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
per_device_train_batch_size=16, # Per GPU
|
||||
)
|
||||
# Run: python script.py
|
||||
```
|
||||
|
||||
**DistributedDataParallel (multi-node or multi-GPU):**
|
||||
```bash
|
||||
# Single node, multiple GPUs
|
||||
python -m torch.distributed.launch --nproc_per_node=4 script.py
|
||||
|
||||
# Or use accelerate
|
||||
accelerate config
|
||||
accelerate launch script.py
|
||||
```
|
||||
|
||||
**DeepSpeed Integration:**
|
||||
```python
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
deepspeed="ds_config.json", # DeepSpeed config file
|
||||
)
|
||||
```
|
||||
|
||||
### Advanced Features
|
||||
|
||||
**Gradient Checkpointing (reduce memory):**
|
||||
```python
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
gradient_checkpointing=True,
|
||||
)
|
||||
```
|
||||
|
||||
**Compilation with torch.compile:**
|
||||
```python
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
torch_compile=True,
|
||||
torch_compile_backend="inductor", # or "cudagraphs"
|
||||
)
|
||||
```
|
||||
|
||||
**Push to Hub:**
|
||||
```python
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./results",
|
||||
push_to_hub=True,
|
||||
hub_model_id="username/model-name",
|
||||
hub_strategy="every_save", # or "end"
|
||||
)
|
||||
```
|
||||
|
||||
## Custom Training Components
|
||||
|
||||
### Custom Metrics
|
||||
|
||||
```python
|
||||
import evaluate
|
||||
import numpy as np
|
||||
|
||||
metric = evaluate.load("accuracy")
|
||||
|
||||
def compute_metrics(eval_pred):
|
||||
logits, labels = eval_pred
|
||||
predictions = np.argmax(logits, axis=-1)
|
||||
return metric.compute(predictions=predictions, references=labels)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
compute_metrics=compute_metrics,
|
||||
)
|
||||
```
|
||||
|
||||
### Custom Loss Function
|
||||
|
||||
```python
|
||||
class CustomTrainer(Trainer):
|
||||
def compute_loss(self, model, inputs, return_outputs=False):
|
||||
labels = inputs.pop("labels")
|
||||
outputs = model(**inputs)
|
||||
logits = outputs.logits
|
||||
|
||||
# Custom loss calculation
|
||||
loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
|
||||
loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
|
||||
|
||||
return (loss, outputs) if return_outputs else loss
|
||||
```
|
||||
|
||||
### Data Collator
|
||||
|
||||
```python
|
||||
from transformers import DataCollatorWithPadding
|
||||
|
||||
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
data_collator=data_collator,
|
||||
)
|
||||
```
|
||||
|
||||
### Callbacks

```python
from transformers import TrainerCallback

class CustomCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Epoch {state.epoch} completed!")
        return control

trainer = Trainer(
    model=model,
    args=training_args,
    callbacks=[CustomCallback],
)
```
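Transformers also ships built-in callbacks. For example, early stopping can be added without any custom code; it requires `load_best_model_at_end=True`, a `metric_for_best_model`, and a matching evaluation strategy in the training arguments:

```python
from transformers import EarlyStoppingCallback

# Stop training if the tracked metric has not improved for 3 evaluations
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))
```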
## Hyperparameter Search

```python
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2
    )

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Optuna-based search
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    n_trials=10,
    hp_space=lambda trial: {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 5),
    }
)
```
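`hyperparameter_search` returns a `BestRun` object. One possible follow-up, sketched here under the assumption that the search above has finished, is to copy the winning values back into the training arguments and retrain:

```python
print(best_trial.objective, best_trial.hyperparameters)

# Apply the best hyperparameters to the existing trainer and run a final training pass
for name, value in best_trial.hyperparameters.items():
    setattr(trainer.args, name, value)

trainer.train()
```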
## Training Best Practices

1. **Start with small learning rates**: 2e-5 to 5e-5 for fine-tuning
2. **Use warmup**: 5-10% of total steps for learning rate warmup
3. **Monitor training**: Use `eval_strategy="epoch"` or `"steps"` to track progress
4. **Save checkpoints**: Set `save_strategy` and `save_total_limit`
5. **Use mixed precision**: Enable `fp16` or `bf16` for faster training
6. **Gradient accumulation**: For large effective batch sizes on limited memory
7. **Load best model**: Set `load_best_model_at_end=True` to avoid overfitting
8. **Push to Hub**: Enable `push_to_hub` for easy model sharing and versioning
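A configuration sketch that applies most of these practices in one place (the hyperparameter values are illustrative starting points, not universal recommendations):

```python
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    warmup_ratio=0.1,                 # ~10% of steps used for warmup
    num_train_epochs=3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,    # effective batch size of 32 per device
    bf16=True,                        # or fp16=True on pre-Ampere GPUs
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,                # set True to share the model on the Hub
)
```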
## Common Training Patterns

### Classification
```python
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id
)
```

### Question Answering
```python
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")
```

### Token Classification (NER)
```python
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_tags,
    id2label=id2label,
    label2id=label2id
)
```

### Sequence-to-Sequence
```python
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
```
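Sequence-to-sequence fine-tuning usually swaps in the seq2seq-specific helpers. A minimal sketch, assuming datasets and a tokenizer prepared as in the earlier sections:

```python
from transformers import (
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

tokenizer = AutoTokenizer.from_pretrained("t5-base")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    predict_with_generate=True,   # run generate() during evaluation
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
```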
### Causal Language Modeling
```python
model = AutoModelForCausalLM.from_pretrained("gpt2")
```

### Masked Language Modeling
```python
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
```
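For both language-modeling objectives, `DataCollatorForLanguageModeling` builds the labels on the fly. A short sketch (tokenizers assumed to match the models chosen above):

```python
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# Masked LM: randomly mask 15% of tokens
mlm_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
mlm_collator = DataCollatorForLanguageModeling(
    tokenizer=mlm_tokenizer, mlm=True, mlm_probability=0.15
)

# Causal LM: labels are the inputs (shifted inside the model), so mlm=False
clm_tokenizer = AutoTokenizer.from_pretrained("gpt2")
clm_tokenizer.pad_token = clm_tokenizer.eos_token  # GPT-2 has no pad token
clm_collator = DataCollatorForLanguageModeling(tokenizer=clm_tokenizer, mlm=False)
```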
**scientific-packages/transformers/scripts/fine_tune_classifier.py**

```python
#!/usr/bin/env python3
"""
Fine-tune a transformer model for text classification.

This script demonstrates the complete workflow for fine-tuning a pre-trained
model on a classification task using the Trainer API.
"""

import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate


def load_and_prepare_data(dataset_name="imdb", model_name="distilbert-base-uncased", max_samples=None):
    """
    Load dataset and tokenize.

    Args:
        dataset_name: Name of the dataset to load
        model_name: Name of the model/tokenizer to use
        max_samples: Limit number of samples (for quick testing)

    Returns:
        tokenized_datasets, tokenizer
    """
    print(f"Loading dataset: {dataset_name}")
    dataset = load_dataset(dataset_name)

    # Optionally limit samples for quick testing
    if max_samples:
        dataset["train"] = dataset["train"].select(range(max_samples))
        dataset["test"] = dataset["test"].select(range(min(max_samples, len(dataset["test"]))))

    print(f"Loading tokenizer: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512
        )

    print("Tokenizing dataset...")
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    return tokenized_datasets, tokenizer


def create_model(model_name, num_labels, id2label, label2id):
    """
    Create classification model.

    Args:
        model_name: Name of the pre-trained model
        num_labels: Number of classification labels
        id2label: Dictionary mapping label IDs to names
        label2id: Dictionary mapping label names to IDs

    Returns:
        model
    """
    print(f"Loading model: {model_name}")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )
    return model


def define_compute_metrics(metric_name="accuracy"):
    """
    Define function to compute metrics during evaluation.

    Args:
        metric_name: Name of the metric to use

    Returns:
        compute_metrics function
    """
    metric = evaluate.load(metric_name)

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    return compute_metrics


def train_model(model, tokenizer, train_dataset, eval_dataset, output_dir="./results"):
    """
    Train the model.

    Args:
        model: The model to train
        tokenizer: The tokenizer
        train_dataset: Training dataset
        eval_dataset: Evaluation dataset
        output_dir: Directory for checkpoints and logs

    Returns:
        trained model, trainer
    """
    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        learning_rate=2e-5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
        save_total_limit=2,
        fp16=False,  # Set to True if using GPU with fp16 support
    )

    # Create data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=define_compute_metrics("accuracy"),
    )

    # Train
    print("\nStarting training...")
    trainer.train()

    # Evaluate
    print("\nEvaluating model...")
    eval_results = trainer.evaluate()
    print(f"Evaluation results: {eval_results}")

    return model, trainer


def test_inference(model, tokenizer, id2label):
    """
    Test the trained model with sample texts.

    Args:
        model: Trained model
        tokenizer: Tokenizer
        id2label: Dictionary mapping label IDs to names
    """
    print("\n=== Testing Inference ===")

    test_texts = [
        "This movie was absolutely fantastic! I loved every minute of it.",
        "Terrible film. Waste of time and money.",
        "It was okay, nothing special but not bad either."
    ]

    for text in test_texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        outputs = model(**inputs)
        predictions = outputs.logits.argmax(-1)
        predicted_label = id2label[predictions.item()]
        confidence = outputs.logits.softmax(-1).max().item()

        print(f"\nText: {text}")
        print(f"Prediction: {predicted_label} (confidence: {confidence:.3f})")


def main():
    """Main training pipeline."""
    # Configuration
    DATASET_NAME = "imdb"
    MODEL_NAME = "distilbert-base-uncased"
    OUTPUT_DIR = "./results"
    MAX_SAMPLES = None  # Set to a small number (e.g., 1000) for quick testing

    # Label mapping
    id2label = {0: "negative", 1: "positive"}
    label2id = {"negative": 0, "positive": 1}
    num_labels = len(id2label)

    print("=" * 60)
    print("Fine-Tuning Text Classification Model")
    print("=" * 60)

    # Load and prepare data
    tokenized_datasets, tokenizer = load_and_prepare_data(
        dataset_name=DATASET_NAME,
        model_name=MODEL_NAME,
        max_samples=MAX_SAMPLES
    )

    # Create model
    model = create_model(
        model_name=MODEL_NAME,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )

    # Train model
    model, trainer = train_model(
        model=model,
        tokenizer=tokenizer,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        output_dir=OUTPUT_DIR
    )

    # Save final model
    print(f"\nSaving model to {OUTPUT_DIR}/final_model")
    trainer.save_model(f"{OUTPUT_DIR}/final_model")
    tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model")

    # Test inference
    test_inference(model, tokenizer, id2label)

    print("\n" + "=" * 60)
    print("Training completed successfully!")
    print("=" * 60)


if __name__ == "__main__":
    main()
```
**scientific-packages/transformers/scripts/generate_text.py**

```python
#!/usr/bin/env python3
"""
Text generation with different decoding strategies.

This script demonstrates various text generation approaches using
different sampling and decoding strategies.
"""

from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig


def load_model_and_tokenizer(model_name="gpt2"):
    """
    Load model and tokenizer.

    Args:
        model_name: Name of the model to load

    Returns:
        model, tokenizer
    """
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Set pad token if not already set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


def generate_with_greedy(model, tokenizer, prompt, max_new_tokens=50):
    """Greedy decoding - always picks highest probability token."""
    print("\n=== Greedy Decoding ===")
    print(f"Prompt: {prompt}")

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_beams=1,
        pad_token_id=tokenizer.pad_token_id
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated: {generated_text}\n")


def generate_with_beam_search(model, tokenizer, prompt, max_new_tokens=50, num_beams=5):
    """Beam search - explores multiple hypotheses."""
    print("\n=== Beam Search ===")
    print(f"Prompt: {prompt}")

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        early_stopping=True,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.pad_token_id
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated: {generated_text}\n")


def generate_with_sampling(model, tokenizer, prompt, max_new_tokens=50,
                           temperature=0.7, top_k=50, top_p=0.9):
    """Sampling with temperature, top-k, and nucleus (top-p) sampling."""
    print("\n=== Sampling (Temperature + Top-K + Top-P) ===")
    print(f"Prompt: {prompt}")
    print(f"Parameters: temperature={temperature}, top_k={top_k}, top_p={top_p}")

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        pad_token_id=tokenizer.pad_token_id
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated: {generated_text}\n")


def generate_multiple_sequences(model, tokenizer, prompt, max_new_tokens=50,
                                num_return_sequences=3):
    """Generate multiple diverse sequences."""
    print("\n=== Multiple Sequences (with Sampling) ===")
    print(f"Prompt: {prompt}")
    print(f"Generating {num_return_sequences} sequences...")

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        num_return_sequences=num_return_sequences,
        pad_token_id=tokenizer.pad_token_id
    )

    for i, output in enumerate(outputs):
        generated_text = tokenizer.decode(output, skip_special_tokens=True)
        print(f"\nSequence {i+1}: {generated_text}")
    print()


def generate_with_config(model, tokenizer, prompt):
    """Use GenerationConfig for reusable configuration."""
    print("\n=== Using GenerationConfig ===")
    print(f"Prompt: {prompt}")

    # Create a generation config
    generation_config = GenerationConfig(
        max_new_tokens=50,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.pad_token_id
    )

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, generation_config=generation_config)

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated: {generated_text}\n")


def compare_temperatures(model, tokenizer, prompt, max_new_tokens=50):
    """Compare different temperature settings."""
    print("\n=== Temperature Comparison ===")
    print(f"Prompt: {prompt}\n")

    temperatures = [0.3, 0.7, 1.0, 1.5]

    for temp in temperatures:
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temp,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id
        )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Temperature {temp}: {generated_text}\n")


def main():
    """Run all generation examples."""
    print("=" * 70)
    print("Text Generation Examples")
    print("=" * 70)

    # Load model and tokenizer
    print("\nLoading model...")
    model, tokenizer = load_model_and_tokenizer("gpt2")

    # Example prompts
    story_prompt = "Once upon a time in a distant galaxy"
    factual_prompt = "The three branches of the US government are"

    # Demonstrate different strategies
    generate_with_greedy(model, tokenizer, story_prompt)
    generate_with_beam_search(model, tokenizer, factual_prompt)
    generate_with_sampling(model, tokenizer, story_prompt)
    generate_multiple_sequences(model, tokenizer, story_prompt, num_return_sequences=3)
    generate_with_config(model, tokenizer, story_prompt)
    compare_temperatures(model, tokenizer, story_prompt)

    print("=" * 70)
    print("All generation examples completed!")
    print("=" * 70)


if __name__ == "__main__":
    main()
```
**scientific-packages/transformers/scripts/quick_inference.py**

```python
#!/usr/bin/env python3
"""
Quick inference using Transformers pipelines.

This script demonstrates how to quickly use pre-trained models for inference
across various tasks using the pipeline API.
"""

from transformers import pipeline


def text_classification_example():
    """Sentiment analysis example."""
    print("=== Text Classification ===")
    classifier = pipeline("text-classification")
    result = classifier("I love using Transformers! It makes NLP so easy.")
    print(f"Result: {result}\n")


def named_entity_recognition_example():
    """Named Entity Recognition example."""
    print("=== Named Entity Recognition ===")
    ner = pipeline("token-classification", aggregation_strategy="simple")
    text = "My name is Sarah and I work at Microsoft in Seattle"
    entities = ner(text)
    for entity in entities:
        print(f"{entity['word']}: {entity['entity_group']} (score: {entity['score']:.3f})")
    print()


def question_answering_example():
    """Question Answering example."""
    print("=== Question Answering ===")
    qa = pipeline("question-answering")
    context = "Paris is the capital and most populous city of France. It is located in northern France."
    question = "What is the capital of France?"
    answer = qa(question=question, context=context)
    print(f"Question: {question}")
    print(f"Answer: {answer['answer']} (score: {answer['score']:.3f})\n")


def text_generation_example():
    """Text generation example."""
    print("=== Text Generation ===")
    generator = pipeline("text-generation", model="gpt2")
    prompt = "Once upon a time in a land far away"
    generated = generator(prompt, max_length=50, num_return_sequences=1)
    print(f"Prompt: {prompt}")
    print(f"Generated: {generated[0]['generated_text']}\n")


def summarization_example():
    """Text summarization example."""
    print("=== Summarization ===")
    summarizer = pipeline("summarization")
    article = """
    The Transformers library provides thousands of pretrained models to perform tasks
    on texts such as classification, information extraction, question answering,
    summarization, translation, text generation, etc in over 100 languages. Its aim
    is to make cutting-edge NLP easier to use for everyone. The library provides APIs
    to quickly download and use pretrained models on a given text, fine-tune them on
    your own datasets then share them with the community on the model hub.
    """
    summary = summarizer(article, max_length=50, min_length=25, do_sample=False)
    print(f"Summary: {summary[0]['summary_text']}\n")


def translation_example():
    """Translation example."""
    print("=== Translation ===")
    translator = pipeline("translation_en_to_fr")
    text = "Hello, how are you today?"
    translation = translator(text)
    print(f"English: {text}")
    print(f"French: {translation[0]['translation_text']}\n")


def zero_shot_classification_example():
    """Zero-shot classification example."""
    print("=== Zero-Shot Classification ===")
    classifier = pipeline("zero-shot-classification")
    text = "This is a breaking news story about a major earthquake."
    candidate_labels = ["politics", "sports", "science", "breaking news"]
    result = classifier(text, candidate_labels)
    print(f"Text: {text}")
    print("Predictions:")
    for label, score in zip(result['labels'], result['scores']):
        print(f"  {label}: {score:.3f}")
    print()


def image_classification_example():
    """Image classification example (requires PIL)."""
    print("=== Image Classification ===")
    try:
        from PIL import Image
        import requests

        classifier = pipeline("image-classification")
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
        predictions = classifier(image)
        print("Top predictions:")
        for pred in predictions[:3]:
            print(f"  {pred['label']}: {pred['score']:.3f}")
        print()
    except ImportError:
        print("PIL not installed. Skipping image classification example.\n")


def main():
    """Run all examples."""
    print("Transformers Quick Inference Examples")
    print("=" * 50 + "\n")

    # Text tasks
    text_classification_example()
    named_entity_recognition_example()
    question_answering_example()
    text_generation_example()
    summarization_example()
    translation_example()
    zero_shot_classification_example()

    # Vision task (optional)
    image_classification_example()

    print("=" * 50)
    print("All examples completed!")


if __name__ == "__main__":
    main()
```