# Transformers API Reference

This reference covers the core classes and APIs in the Transformers library.

## Core Auto Classes

Auto classes provide a convenient way to automatically select the appropriate architecture based on the model name or checkpoint.

### AutoTokenizer

```python
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize single text
encoded = tokenizer("Hello, how are you?")
# Returns: {'input_ids': [...], 'attention_mask': [...]}

# Tokenize with options
encoded = tokenizer(
    "Hello, how are you?",
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt"  # "pt" for PyTorch, "tf" for TensorFlow
)

# Tokenize pairs (for classification, QA, etc.)
encoded = tokenizer(
    "Question or sentence A",
    "Context or sentence B",
    padding=True,
    truncation=True
)

# Batch tokenization
texts = ["Text 1", "Text 2", "Text 3"]
encoded = tokenizer(texts, padding=True, truncation=True)

# Decode tokens back to text
text = tokenizer.decode(token_ids, skip_special_tokens=True)

# Batch decode
texts = tokenizer.batch_decode(batch_token_ids, skip_special_tokens=True)
```

**Key Parameters:**
- `padding`: "max_length", "longest", or True (pad to the longest sequence in the batch)
- `truncation`: True or strategy ("longest_first", "only_first", "only_second")
- `max_length`: Maximum sequence length
- `return_tensors`: "pt" (PyTorch), "tf" (TensorFlow), "np" (NumPy)
- `return_attention_mask`: Return attention masks (default True)
- `return_token_type_ids`: Return token type IDs for pairs (default True)
- `add_special_tokens`: Add special tokens like [CLS], [SEP] (default True)

**Special Properties:**
- `tokenizer.vocab_size`: Size of vocabulary
- `tokenizer.pad_token_id`: ID of padding token
- `tokenizer.eos_token_id`: ID of end-of-sequence token
- `tokenizer.bos_token_id`: ID of beginning-of-sequence token
- `tokenizer.unk_token_id`: ID of unknown token

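To make the padding strategies concrete, here is a minimal sketch comparing `padding="longest"` with `padding="max_length"` (the exact shapes in the comments are illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
texts = ["Hi", "A somewhat longer sentence for comparison"]

batch_longest = tokenizer(texts, padding="longest", return_tensors="pt")
batch_fixed = tokenizer(texts, padding="max_length", max_length=16, return_tensors="pt")

print(batch_longest["input_ids"].shape)  # padded to the longest example, e.g. [2, 9]
print(batch_fixed["input_ids"].shape)    # always [2, 16]
```
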
### AutoModel

Base model class that outputs hidden states.

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")

# Forward pass
outputs = model(**inputs)

# Access hidden states
last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_length, hidden_size]
pooler_output = outputs.pooler_output  # [batch_size, hidden_size]

# Get all hidden states
model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
outputs = model(**inputs)
all_hidden_states = outputs.hidden_states  # Tuple of tensors
```

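A common use of the base model is mean-pooling `last_hidden_state` over non-padding tokens to get a fixed-size sentence embedding. A minimal sketch of that pattern (standard practice, not a dedicated library API):

```python
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer(["Hello, how are you?"], return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Zero out padding positions, then average the remaining token vectors
mask = inputs["attention_mask"].unsqueeze(-1)           # [batch, seq_len, 1]
summed = (outputs.last_hidden_state * mask).sum(dim=1)  # [batch, hidden_size]
embedding = summed / mask.sum(dim=1)                    # mean over real tokens
```
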
### Task-Specific Auto Classes

```python
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoModelForQuestionAnswering,
    AutoModelForCausalLM,
    AutoModelForMaskedLM,
    AutoModelForSeq2SeqLM,
    AutoModelForImageClassification,
    AutoModelForObjectDetection,
    AutoModelForVision2Seq,
)

# Sequence classification (sentiment, topic, etc.)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    label2id={"negative": 0, "neutral": 1, "positive": 2}
)

# Token classification (NER, POS tagging)
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=9  # Number of entity types
)

# Question answering
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Causal language modeling (GPT-style)
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Masked language modeling (BERT-style)
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

# Sequence-to-sequence (T5, BART)
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Image classification
model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224")

# Object detection
model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50")

# Vision-to-text (image captioning, VQA)
model = AutoModelForVision2Seq.from_pretrained("microsoft/git-base")
```

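To show how the `id2label` mapping is used at inference time, here is a minimal classification sketch (in practice you would load a fine-tuned checkpoint rather than the base model):

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    label2id={"negative": 0, "neutral": 1, "positive": 2},
)

inputs = tokenizer("Great movie!", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # [1, num_labels]
pred_id = logits.argmax(dim=-1).item()
print(model.config.id2label[pred_id])  # meaningful only after fine-tuning
```
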
### AutoProcessor

For multimodal models that need both text and image processing.

```python
from transformers import AutoProcessor

# For vision-language models
processor = AutoProcessor.from_pretrained("microsoft/git-base")

# Process image and text
from PIL import Image
image = Image.open("image.jpg")
inputs = processor(images=image, text="caption", return_tensors="pt")

# For audio models
processor = AutoProcessor.from_pretrained("openai/whisper-base")
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
```

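A processor is typically paired with the matching model for end-to-end inference. A captioning sketch, assuming `microsoft/git-base` and a local `image.jpg`:

```python
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForVision2Seq.from_pretrained("microsoft/git-base")

image = Image.open("image.jpg")
inputs = processor(images=image, return_tensors="pt")

# Generate a caption from the image features
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_new_tokens=30)
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(caption)
```
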
### AutoImageProcessor

For vision-only models.

```python
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

# Process single image
from PIL import Image
image = Image.open("image.jpg")
inputs = processor(image, return_tensors="pt")

# Batch processing
images = [Image.open(f"image{i}.jpg") for i in range(10)]
inputs = processor(images, return_tensors="pt")
```

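Putting the image processor and model together, a minimal classification sketch (assumes a local `image.jpg`):

```python
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification

processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224")

image = Image.open("image.jpg")
inputs = processor(image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits
pred_id = logits.argmax(dim=-1).item()
print(model.config.id2label[pred_id])  # ImageNet class name
```
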
## Model Loading Options

### from_pretrained Parameters

```python
import torch

model = AutoModel.from_pretrained(
    "model-name",
    # Device and precision
    device_map="auto",  # Automatic device placement
    torch_dtype=torch.float16,  # Use fp16
    low_cpu_mem_usage=True,  # Reduce CPU memory during loading

    # Quantization (mutually exclusive -- set at most one)
    load_in_8bit=True,  # 8-bit quantization
    load_in_4bit=True,  # 4-bit quantization

    # Model configuration
    num_labels=3,  # For classification
    id2label={...},  # Label mapping
    label2id={...},

    # Outputs
    output_hidden_states=True,
    output_attentions=True,

    # Trust remote code
    trust_remote_code=True,  # For custom models

    # Caching
    cache_dir="./cache",
    force_download=False,
    resume_download=True,
)
```

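With `device_map="auto"`, the resulting placement can be inspected after loading. A minimal sketch (the attributes come from the Accelerate integration; exact output depends on your hardware):

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    device_map="auto",
    torch_dtype=torch.float16,
)

print(model.dtype)          # torch.float16
print(model.hf_device_map)  # e.g. {'': 0} on a single-GPU machine
print(f"{model.get_memory_footprint() / 1e6:.0f} MB")  # approximate in-memory size
```
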
### Quantization with BitsAndBytes

```python
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM

# 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto"
)
```

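A quick sanity check that quantization took effect is the model's memory footprint (rough numbers; a 7B model is ~13 GB in fp16 and roughly a quarter of that in 4-bit):

```python
# Continues from the block above
print(f"{model.get_memory_footprint() / 1e9:.1f} GB")
```
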
## Training Components

### TrainingArguments

See `training.md` for comprehensive coverage. Key parameters:

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    logging_steps=100,
    save_total_limit=2,
)
```

### Trainer

```python
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[callback1, callback2],
)

# Train
trainer.train()

# Resume from checkpoint
trainer.train(resume_from_checkpoint=True)

# Evaluate
metrics = trainer.evaluate()

# Predict
predictions = trainer.predict(test_dataset)

# Hyperparameter search (requires constructing the Trainer with
# model_init=... instead of a fixed model)
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    n_trials=10,
)

# Save model
trainer.save_model("./final_model")

# Push to Hub
trainer.push_to_hub(commit_message="Training complete")
```

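The `compute_metrics` callable above receives an `EvalPrediction` tuple of model predictions and label IDs. A minimal sketch for classification:

```python
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)
```
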
### Data Collators

```python
from transformers import (
    DataCollatorWithPadding,
    DataCollatorForTokenClassification,
    DataCollatorForSeq2Seq,
    DataCollatorForLanguageModeling,
    DefaultDataCollator,
)

# For classification/regression with dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# For token classification (NER)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# For seq2seq tasks
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# For language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,  # True for masked LM, False for causal LM
    mlm_probability=0.15
)

# Default (no special handling)
data_collator = DefaultDataCollator()
```

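To see dynamic padding in action, a collator can be called directly on a list of tokenized examples (a minimal sketch; `return_tensors` defaults to `"pt"`):

```python
features = [
    tokenizer("Short text"),
    tokenizer("A considerably longer piece of text to pad against"),
]
batch = DataCollatorWithPadding(tokenizer=tokenizer)(features)
print(batch["input_ids"].shape)  # both rows padded to the longest example
```
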
## Generation Components

### GenerationConfig

See `generation_strategies.md` for comprehensive coverage.

```python
from transformers import GenerationConfig

config = GenerationConfig(
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    num_beams=5,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
)

# Use with model
outputs = model.generate(**inputs, generation_config=config)
```

### generate() Method

```python
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    num_return_sequences=3,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
```

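`generate()` returns token IDs, with the `num_return_sequences` sequences per input stacked along the batch dimension; decode them with the tokenizer:

```python
# outputs: [batch_size * num_return_sequences, seq_length]
texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for text in texts:
    print(text)
```
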
## Pipeline API

See `pipelines.md` for comprehensive coverage.

```python
from transformers import pipeline

# Basic usage
pipe = pipeline("task-name", model="model-name", device=0)
results = pipe(inputs)

# With custom model
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("model-name")
tokenizer = AutoTokenizer.from_pretrained("model-name")
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
```

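Pipelines accept lists as well as single inputs. A minimal sketch of the returned structure for text classification (the label names depend on the model):

```python
results = pipe(["I love this!", "This is terrible."])
# e.g. [{'label': 'POSITIVE', 'score': 0.999}, {'label': 'NEGATIVE', 'score': 0.998}]
for r in results:
    print(r["label"], round(r["score"], 3))
```
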
## Configuration Classes

### Model Configuration

```python
from transformers import AutoConfig

# Load configuration
config = AutoConfig.from_pretrained("bert-base-uncased")

# Access configuration
print(config.hidden_size)
print(config.num_attention_heads)
print(config.num_hidden_layers)

# Modify configuration
config.num_labels = 5
config.output_hidden_states = True

# Create model with config (randomly initialized, not pretrained weights)
model = AutoModel.from_config(config)

# Save configuration
config.save_pretrained("./config")
```

## Utilities

### Hub Utilities

```python
from huggingface_hub import login, snapshot_download

# Login
login(token="hf_...")

# Download model
snapshot_download(repo_id="model-name", cache_dir="./cache")

# Push to Hub
model.push_to_hub("username/model-name", commit_message="Initial commit")
tokenizer.push_to_hub("username/model-name")
```

### Evaluation Metrics

```python
import evaluate

# Load metric
metric = evaluate.load("accuracy")

# Compute metric
results = metric.compute(predictions=predictions, references=labels)

# Common metrics
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
```

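Several metrics can also be bundled into one object with `evaluate.combine`:

```python
import evaluate

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
results = clf_metrics.compute(predictions=[0, 1, 1], references=[0, 1, 0])
# {'accuracy': 0.666..., 'f1': ..., 'precision': ..., 'recall': ...}
```
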
## Model Outputs

All models return dataclass objects with named attributes:

```python
# Sequence classification output
outputs = model(**inputs)
logits = outputs.logits  # [batch_size, num_labels]
loss = outputs.loss  # If labels provided

# Causal LM output
outputs = model(**inputs)
logits = outputs.logits  # [batch_size, seq_length, vocab_size]
past_key_values = outputs.past_key_values  # KV cache

# Seq2Seq output
outputs = model(**inputs, labels=labels)
loss = outputs.loss
logits = outputs.logits
encoder_last_hidden_state = outputs.encoder_last_hidden_state

# Access as dict or tuple
outputs_dict = dict(outputs)        # ModelOutput also behaves like a dict
outputs_tuple = outputs.to_tuple()  # plain tuple of the set fields
```

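Question-answering models return separate start and end logits rather than a single classification tensor. A minimal span-extraction sketch (`qa_model` and `qa_inputs` are hypothetical names for a fine-tuned QA checkpoint and its tokenized question/context pair):

```python
import torch

outputs = qa_model(**qa_inputs)  # qa_model: an AutoModelForQuestionAnswering
start = torch.argmax(outputs.start_logits, dim=-1).item()
end = torch.argmax(outputs.end_logits, dim=-1).item()
answer_ids = qa_inputs["input_ids"][0, start : end + 1]
print(tokenizer.decode(answer_ids, skip_special_tokens=True))
```
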
## Best Practices

1. **Use Auto classes**: AutoModel, AutoTokenizer for flexibility
2. **Device management**: Use `device_map="auto"` for multi-GPU
3. **Memory optimization**: Use `torch_dtype=torch.float16` and quantization
4. **Caching**: Set `cache_dir` to avoid re-downloading
5. **Batch processing**: Process multiple inputs at once for efficiency
6. **Trust remote code**: Only set `trust_remote_code=True` for sources you trust, since it executes arbitrary code from the Hub