Add more scientific skills

Timothy Kassis
2025-10-19 14:12:02 -07:00
parent 78d5ac2b56
commit 660c8574d0
210 changed files with 88957 additions and 1 deletion

# Transformers API Reference
This document provides a comprehensive API reference for the most commonly used classes and methods in the Transformers library.
## Core Model Classes
### PreTrainedModel
Base class for all PyTorch models. Handles loading, saving, and other operations common to every model.
**Key Methods:**
```python
from transformers import PreTrainedModel
# Load pretrained model
model = ModelClass.from_pretrained(
pretrained_model_name_or_path,
config=None, # Custom config
cache_dir=None, # Custom cache location
force_download=False, # Force re-download
resume_download=False, # Resume interrupted download
proxies=None, # HTTP proxies
local_files_only=False, # Only use cached files
token=None, # HF auth token
revision="main", # Git branch/tag
trust_remote_code=False, # Allow custom model code
device_map=None, # Device allocation ("auto", "cpu", "cuda:0", etc.)
torch_dtype=None, # Model dtype (torch.float16, "auto", etc.)
low_cpu_mem_usage=False, # Reduce CPU memory during loading
**model_kwargs
)
# Save model
model.save_pretrained(
save_directory,
save_config=True, # Save config.json
state_dict=None, # Custom state dict
save_function=torch.save, # Custom save function
push_to_hub=False, # Upload to Hub
max_shard_size="5GB", # Max checkpoint size
safe_serialization=True, # Use SafeTensors format
variant=None, # Model variant name
)
# Generate text (for generative models)
outputs = model.generate(
inputs=None, # Input token IDs
max_length=20, # Max total length
max_new_tokens=None, # Max new tokens to generate
min_length=0, # Minimum length
do_sample=False, # Enable sampling
early_stopping=False, # Stop beam search once num_beams candidates finish
num_beams=1, # Beam search width
temperature=1.0, # Sampling temperature
top_k=50, # Top-k sampling
top_p=1.0, # Nucleus sampling
repetition_penalty=1.0, # Penalize repetition
length_penalty=1.0, # Beam search length penalty
no_repeat_ngram_size=0, # Block repeated n-grams
num_return_sequences=1, # Number of sequences to return
**model_kwargs
)
# Resize token embeddings (after adding tokens)
new_embeddings = model.resize_token_embeddings(
new_num_tokens,
pad_to_multiple_of=None
)
# Utility methods
num_params = model.num_parameters(only_trainable=False)
model.gradient_checkpointing_enable() # Enable gradient checkpointing
model.enable_input_require_grads() # For PEFT with frozen models
```
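The listing above is a parameter reference; for orientation, here is a minimal end-to-end sketch that loads a checkpoint, generates, and saves (using `gpt2` purely as an illustrative model):
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float32)

# Greedy generation of 20 new tokens beyond the prompt
inputs = tokenizer("The Transformers library", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

print(f"Parameters: {model.num_parameters():,}")
model.save_pretrained("./gpt2-local")  # writes config.json + model.safetensors
```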
### AutoModel Classes
Automatically instantiate the correct model architecture.
**Available Classes:**
- `AutoModel`: Base model (returns hidden states)
- `AutoModelForCausalLM`: Causal language modeling (GPT-style)
- `AutoModelForMaskedLM`: Masked language modeling (BERT-style)
- `AutoModelForSeq2SeqLM`: Sequence-to-sequence (T5, BART)
- `AutoModelForSequenceClassification`: Text classification
- `AutoModelForTokenClassification`: Token classification (NER)
- `AutoModelForQuestionAnswering`: Extractive QA
- `AutoModelForImageClassification`: Image classification
- `AutoModelForObjectDetection`: Object detection
- `AutoModelForSemanticSegmentation`: Semantic segmentation
- `AutoModelForAudioClassification`: Audio classification
- `AutoModelForSpeechSeq2Seq`: Speech-to-text
- `AutoModelForVision2Seq`: Image captioning, VQA
**Usage:**
```python
from transformers import AutoModel, AutoConfig
# Load with default configuration
model = AutoModel.from_pretrained("bert-base-uncased")
# Load with custom configuration
config = AutoConfig.from_pretrained("bert-base-uncased")
config.hidden_dropout_prob = 0.2
model = AutoModel.from_pretrained("bert-base-uncased", config=config)
# Register custom models
from transformers import AutoConfig, AutoModel
AutoConfig.register("my-model", MyModelConfig)
AutoModel.register(MyModelConfig, MyModel)
```
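Task-specific Auto classes also accept head-specific config kwargs. A hedged sketch (the label names here are made up for illustration):
```python
from transformers import AutoModelForSequenceClassification

# num_labels sizes the (randomly initialized) classification head;
# id2label/label2id are stored in the config for readable predictions.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    label2id={"negative": 0, "neutral": 1, "positive": 2},
)
```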
## Tokenizer Classes
### PreTrainedTokenizer / PreTrainedTokenizerFast
Convert text to token IDs and vice versa.
**Key Methods:**
```python
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path,
use_fast=True, # Use fast (Rust) tokenizer if available
revision="main",
**kwargs
)
# Encoding (text → token IDs)
encoded = tokenizer(
text, # String or List[str]
text_pair=None, # Second sequence for pairs
add_special_tokens=True, # Add [CLS], [SEP], etc.
padding=False, # True, False, "longest", "max_length"
truncation=False, # True, False, "longest_first", "only_first", "only_second"
max_length=None, # Max sequence length
stride=0, # Overlap for split sequences
return_tensors=None, # "pt" (PyTorch), "tf" (TensorFlow), "np" (NumPy)
return_token_type_ids=None, # Return token type IDs
return_attention_mask=None, # Return attention mask
return_overflowing_tokens=False, # Return overflowing tokens
return_special_tokens_mask=False, # Return special token mask
return_offsets_mapping=False, # Return char-level offsets (fast only)
return_length=False, # Return sequence lengths
**kwargs
)
# Decoding (token IDs → text)
text = tokenizer.decode(
token_ids,
skip_special_tokens=False, # Remove special tokens
clean_up_tokenization_spaces=True, # Clean up spacing
)
# Batch decoding
texts = tokenizer.batch_decode(
sequences,
skip_special_tokens=False,
clean_up_tokenization_spaces=True,
)
# Tokenization (text → tokens)
tokens = tokenizer.tokenize(text, **kwargs)
# Convert tokens to IDs
ids = tokenizer.convert_tokens_to_ids(tokens)
# Convert IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(ids)
# Add new tokens
num_added = tokenizer.add_tokens(["[NEW_TOKEN1]", "[NEW_TOKEN2]"])
# Add special tokens
tokenizer.add_special_tokens({
"bos_token": "[BOS]",
"eos_token": "[EOS]",
"unk_token": "[UNK]",
"sep_token": "[SEP]",
"pad_token": "[PAD]",
"cls_token": "[CLS]",
"mask_token": "[MASK]",
"additional_special_tokens": ["[SPECIAL1]", "[SPECIAL2]"],
})
# Chat template formatting
formatted = tokenizer.apply_chat_template(
conversation, # List[Dict[str, str]] with "role" and "content"
chat_template=None, # Custom template
add_generation_prompt=False, # Add prompt for model to continue
tokenize=True, # Return token IDs
padding=False,
truncation=False,
max_length=None,
return_tensors=None,
return_dict=True,
)
# Save tokenizer
tokenizer.save_pretrained(save_directory)
# Get vocab size
vocab_size = len(tokenizer)
# Get special tokens
pad_token = tokenizer.pad_token
pad_token_id = tokenizer.pad_token_id
# Similar for: bos, eos, unk, sep, cls, mask
```
**Special Token Attributes:**
```python
tokenizer.bos_token # Beginning of sequence
tokenizer.eos_token # End of sequence
tokenizer.unk_token # Unknown token
tokenizer.sep_token # Separator token
tokenizer.pad_token # Padding token
tokenizer.cls_token # Classification token
tokenizer.mask_token # Mask token
# Corresponding IDs
tokenizer.bos_token_id
tokenizer.eos_token_id
# ... etc
```
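A short round-trip sketch tying encoding and decoding together (`bert-base-uncased` is just an example checkpoint):
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Batch encode with dynamic padding, then decode back to text
batch = tokenizer(
    ["Hello world!", "A somewhat longer second sentence."],
    padding=True,
    return_tensors="pt",
)
print(batch["input_ids"].shape)  # (2, padded_length)
print(tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True))

# After add_tokens()/add_special_tokens(), remember to resize the model:
# model.resize_token_embeddings(len(tokenizer))
```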
## Image Processors
### AutoImageProcessor
Preprocess images for vision models.
**Key Methods:**
```python
from transformers import AutoImageProcessor
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
# Process images
inputs = processor(
images, # PIL Image, np.array, torch.Tensor, or List
return_tensors="pt", # "pt", "tf", "np", None
do_resize=True, # Resize to model size
size=None, # Target size dict
resample=None, # Resampling method
do_rescale=True, # Rescale pixel values
do_normalize=True, # Normalize with mean/std
image_mean=None, # Custom mean
image_std=None, # Custom std
do_center_crop=False, # Center crop
crop_size=None, # Crop size
**kwargs
)
# Returns: BatchFeature with 'pixel_values' key
```
## Training Components
### TrainingArguments
Configuration for the Trainer class.
**Essential Arguments:**
```python
from transformers import TrainingArguments
args = TrainingArguments(
# ===== Output & Logging =====
output_dir="./results", # REQUIRED: Output directory
overwrite_output_dir=False, # Overwrite output directory
# ===== Training Parameters =====
num_train_epochs=3.0, # Number of epochs
max_steps=-1, # Max training steps (overrides epochs)
per_device_train_batch_size=8, # Train batch size per device
per_device_eval_batch_size=8, # Eval batch size per device
gradient_accumulation_steps=1, # Accumulation steps
# ===== Learning Rate & Optimization =====
learning_rate=5e-5, # Initial learning rate
weight_decay=0.0, # Weight decay
adam_beta1=0.9, # Adam beta1
adam_beta2=0.999, # Adam beta2
adam_epsilon=1e-8, # Adam epsilon
max_grad_norm=1.0, # Gradient clipping
optim="adamw_torch", # Optimizer ("adamw_torch", "adafactor", "adamw_8bit")
# ===== Learning Rate Scheduler =====
lr_scheduler_type="linear", # Scheduler type
warmup_steps=0, # Warmup steps
warmup_ratio=0.0, # Warmup ratio (alternative to steps)
# ===== Evaluation =====
eval_strategy="no", # "no", "steps", "epoch"
eval_steps=None, # Eval every N steps
eval_delay=0, # Delay first eval
eval_accumulation_steps=None, # Accumulate eval outputs
# ===== Checkpointing =====
save_strategy="steps", # "no", "steps", "epoch"
save_steps=500, # Save every N steps
save_total_limit=None, # Max checkpoints to keep
save_safetensors=True, # Save as SafeTensors
save_on_each_node=False, # Save on each node (distributed)
# ===== Best Model Selection =====
load_best_model_at_end=False, # Load best checkpoint at end
metric_for_best_model=None, # Metric to use
greater_is_better=None, # True if higher is better
# ===== Logging =====
logging_dir=None, # TensorBoard log directory
logging_strategy="steps", # "no", "steps", "epoch"
logging_steps=500, # Log every N steps
logging_first_step=False, # Log first step
logging_nan_inf_filter=True, # Filter NaN/Inf
# ===== Mixed Precision =====
fp16=False, # Use fp16 training
fp16_opt_level="O1", # Apex AMP optimization level
fp16_backend="auto", # "auto", "apex", "cpu_amp"
bf16=False, # Use bfloat16 training
bf16_full_eval=False, # Use bf16 for evaluation
tf32=None, # Use TF32 (Ampere+ GPUs)
# ===== Memory Optimization =====
gradient_checkpointing=False, # Enable gradient checkpointing
gradient_checkpointing_kwargs=None, # Kwargs for gradient checkpointing
torch_empty_cache_steps=None, # Clear cache every N steps
# ===== Distributed Training =====
local_rank=-1, # Local rank for distributed
ddp_backend=None, # "nccl", "gloo", "mpi", "ccl"
ddp_find_unused_parameters=None, # Find unused parameters
ddp_bucket_cap_mb=None, # DDP bucket size
fsdp="", # FSDP configuration
fsdp_config=None, # FSDP config dict
deepspeed=None, # DeepSpeed config
# ===== Hub Integration =====
push_to_hub=False, # Push to Hugging Face Hub
hub_model_id=None, # Hub model ID
hub_strategy="every_save", # "every_save", "checkpoint", "end"
hub_token=None, # Hub authentication token
hub_private_repo=False, # Make repo private
# ===== Data Handling =====
dataloader_num_workers=0, # DataLoader workers
dataloader_pin_memory=True, # Pin memory
dataloader_drop_last=False, # Drop last incomplete batch
dataloader_prefetch_factor=None, # Prefetch factor
remove_unused_columns=True, # Remove unused dataset columns
label_names=None, # Label column names
# ===== Other =====
seed=42, # Random seed
data_seed=None, # Data sampling seed
jit_mode_eval=False, # Use PyTorch JIT for eval
use_ipex=False, # Use Intel Extension for PyTorch
torch_compile=False, # Use torch.compile()
torch_compile_backend=None, # Compile backend
torch_compile_mode=None, # Compile mode
include_inputs_for_metrics=False, # Pass inputs to compute_metrics
skip_memory_metrics=True, # Skip memory profiling
)
```
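As a hedged starting point, a common fine-tuning recipe looks like the sketch below. Note that `load_best_model_at_end=True` requires the eval and save strategies (and intervals) to match:
```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    eval_strategy="epoch",             # must match save_strategy for best-model loading
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # assumes compute_metrics returns "accuracy"
    greater_is_better=True,
    logging_steps=50,
)
```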
### Trainer
Main training class with full training loop.
**Key Methods:**
```python
from transformers import Trainer
trainer = Trainer(
model=None, # Model to train
args=None, # TrainingArguments
data_collator=None, # Data collator
train_dataset=None, # Training dataset
eval_dataset=None, # Evaluation dataset
tokenizer=None, # Tokenizer (renamed processing_class in newer versions)
model_init=None, # Function to instantiate model
compute_metrics=None, # Function to compute metrics
callbacks=None, # List of callbacks
optimizers=(None, None), # (optimizer, scheduler) tuple
preprocess_logits_for_metrics=None, # Preprocess logits before metrics
)
# Train model
train_result = trainer.train(
resume_from_checkpoint=None, # Resume from checkpoint
trial=None, # Optuna/Ray trial
ignore_keys_for_eval=None, # Keys to ignore in eval
)
# Evaluate model
eval_result = trainer.evaluate(
eval_dataset=None, # Eval dataset (default: self.eval_dataset)
ignore_keys=None, # Keys to ignore
metric_key_prefix="eval", # Prefix for metric names
)
# Make predictions
predictions = trainer.predict(
test_dataset, # Test dataset
ignore_keys=None, # Keys to ignore
metric_key_prefix="test", # Metric prefix
)
# Returns: PredictionOutput(predictions, label_ids, metrics)
# Save model
trainer.save_model(output_dir=None)
# Push to Hub
trainer.push_to_hub(
commit_message="End of training",
blocking=True,
**kwargs
)
# Hyperparameter search
best_trial = trainer.hyperparameter_search(
hp_space=None, # Hyperparameter search space
compute_objective=None, # Objective function
n_trials=20, # Number of trials
direction="minimize", # "minimize" or "maximize"
backend=None, # "optuna", "ray", "sigopt"
**kwargs
)
# Create optimizer
optimizer = trainer.create_optimizer()
# Create scheduler
scheduler = trainer.create_scheduler(
num_training_steps,
optimizer=None
)
# Log metrics
trainer.log_metrics(split, metrics)
trainer.save_metrics(split, metrics)
# Save checkpoint
trainer.save_state()
# Access current step/epoch
current_step = trainer.state.global_step
current_epoch = trainer.state.epoch
# Access training logs
logs = trainer.state.log_history
```
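The `compute_metrics` callback receives an `EvalPrediction`; a minimal accuracy implementation for classification might look like this sketch:
```python
import numpy as np
from transformers import EvalPrediction

def compute_metrics(eval_pred: EvalPrediction) -> dict:
    # predictions are logits of shape (batch, num_labels); label_ids are ints
    preds = np.argmax(eval_pred.predictions, axis=-1)
    accuracy = float((preds == eval_pred.label_ids).mean())
    return {"accuracy": accuracy}
```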
### Seq2SeqTrainer
Specialized trainer for sequence-to-sequence models.
```python
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# Use Seq2SeqTrainingArguments with additional parameters
training_args = Seq2SeqTrainingArguments(
output_dir="./results",
predict_with_generate=True, # Use generate() for evaluation
generation_max_length=None, # Max length for generation
generation_num_beams=None, # Num beams for generation
**other_training_arguments
)
# Usage is otherwise identical to Trainer
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)
```
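With `predict_with_generate=True`, `compute_metrics` receives generated token IDs rather than logits, so both predictions and labels need decoding (labels pad with -100). A sketch, assuming a `tokenizer` in scope:
```python
import numpy as np

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # -100 marks ignored label positions; swap in pad_token_id before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Plug a text metric in here, e.g. evaluate.load("sacrebleu")
    return {"mean_pred_words": float(np.mean([len(p.split()) for p in decoded_preds]))}
```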
## Pipeline Classes
### pipeline()
Unified inference API for all tasks.
```python
from transformers import pipeline
pipe = pipeline(
task=None, # Task name (required)
model=None, # Model name/path or model object
config=None, # Model config
tokenizer=None, # Tokenizer
feature_extractor=None, # Feature extractor
image_processor=None, # Image processor
framework=None, # "pt" or "tf"
revision=None, # Model revision
use_fast=True, # Use fast tokenizer
token=None, # HF token
device=None, # Device (-1 for CPU, 0+ for GPU)
device_map=None, # Device map for multi-GPU
torch_dtype=None, # Model dtype
trust_remote_code=False, # Allow custom code
model_kwargs=None, # Additional model kwargs
pipeline_class=None, # Custom pipeline class
**kwargs
)
# Use pipeline
results = pipe(
inputs, # Input data
**task_specific_parameters
)
```
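Concretely, a text-classification pipeline with an explicit checkpoint (the SST-2 DistilBERT model is the usual default for this task):
```python
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
print(classifier("This library is great!"))
# e.g. [{'label': 'POSITIVE', 'score': 0.9998...}]
print(classifier(["I love it.", "I hate it."]))  # batched inputs also work
```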
## Data Collators
Batch and pad data for training.
```python
from transformers import (
DataCollatorWithPadding, # Dynamic padding for classification
DataCollatorForTokenClassification, # Padding for token classification
DataCollatorForSeq2Seq, # Padding for seq2seq
DataCollatorForLanguageModeling, # MLM/CLM data collation
default_data_collator, # Simple collator (no padding)
)
# Text classification
data_collator = DataCollatorWithPadding(
tokenizer=tokenizer,
padding=True,
max_length=None,
pad_to_multiple_of=None,
)
# Token classification
data_collator = DataCollatorForTokenClassification(
tokenizer=tokenizer,
padding=True,
max_length=None,
pad_to_multiple_of=None,
label_pad_token_id=-100,
)
# Seq2Seq
data_collator = DataCollatorForSeq2Seq(
tokenizer=tokenizer,
model=None,
padding=True,
max_length=None,
pad_to_multiple_of=None,
label_pad_token_id=-100,
)
# Language modeling
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=True, # Masked LM (False for causal LM)
mlm_probability=0.15, # Mask probability
pad_to_multiple_of=None,
)
```
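Collators are plain callables, so they slot directly into a PyTorch `DataLoader` as `collate_fn`. A small dynamic-padding sketch:
```python
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Each example is an un-padded encoding; the collator pads per batch
features = [tokenizer(t) for t in ["short", "a somewhat longer example sentence"]]
loader = DataLoader(features, batch_size=2, collate_fn=collator)
batch = next(iter(loader))
print(batch["input_ids"].shape)  # padded to the longest sequence in this batch
```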
## Optimization & Scheduling
```python
from transformers import (
AdamW, # AdamW optimizer (deprecated in recent versions; prefer torch.optim.AdamW)
Adafactor, # Adafactor optimizer
get_scheduler, # Get LR scheduler
get_linear_schedule_with_warmup,
get_cosine_schedule_with_warmup,
get_polynomial_decay_schedule_with_warmup,
)
# Create optimizer
optimizer = AdamW(
model.parameters(),
lr=5e-5,
betas=(0.9, 0.999),
eps=1e-8,
weight_decay=0.01,
)
# Create scheduler
scheduler = get_scheduler(
name="linear", # "linear", "cosine", "polynomial", "constant"
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=total_steps,
)
# Or use specific schedulers
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=warmup_steps,
num_training_steps=total_steps,
)
scheduler = get_cosine_schedule_with_warmup(
optimizer,
num_warmup_steps=warmup_steps,
num_training_steps=total_steps,
num_cycles=0.5,
)
```
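In a manual training loop, the canonical step order is sketched below (assumes `model`, a `dataloader` yielding model inputs, and the `optimizer`/`scheduler` from above); note that Transformers schedulers step once per batch, not per epoch:
```python
import torch

model.train()
for batch in dataloader:
    outputs = model(**batch)          # batch must include labels to get a loss
    outputs.loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()                  # per-step, not per-epoch
    optimizer.zero_grad()
```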
## Configuration Classes
```python
from transformers import AutoConfig
# Load configuration
config = AutoConfig.from_pretrained(
pretrained_model_name_or_path,
**kwargs
)
# Common configuration attributes
config.vocab_size # Vocabulary size
config.hidden_size # Hidden layer size
config.num_hidden_layers # Number of layers
config.num_attention_heads # Attention heads
config.intermediate_size # FFN intermediate size
config.hidden_dropout_prob # Dropout probability
config.attention_probs_dropout_prob # Attention dropout
config.max_position_embeddings # Max sequence length
# Save configuration
config.save_pretrained(save_directory)
# Create model from config (weights are randomly initialized, not pretrained)
from transformers import AutoModel
model = AutoModel.from_config(config)
```
## Utility Functions
```python
from transformers import (
set_seed, # Set random seed
logging, # Logging utilities
)
# Set seed for reproducibility
set_seed(42)
# Configure logging
logging.set_verbosity_info()
logging.set_verbosity_warning()
logging.set_verbosity_error()
logging.set_verbosity_debug()
# Get logger
logger = logging.get_logger(__name__)
```
## Model Outputs
All models return model-specific output classes (subclasses of `ModelOutput`):
```python
# Common output attributes
outputs.loss # Loss (if labels provided)
outputs.logits # Model logits
outputs.hidden_states # All hidden states (if output_hidden_states=True)
outputs.attentions # Attention weights (if output_attentions=True)
# Seq2Seq specific
outputs.encoder_last_hidden_state
outputs.encoder_hidden_states
outputs.encoder_attentions
outputs.decoder_hidden_states
outputs.decoder_attentions
outputs.cross_attentions
# Access as dict or tuple
logits = outputs.logits
logits = outputs["logits"]
loss, logits = outputs.to_tuple()[:2]
```
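Hidden states and attentions are only populated on request; a sketch with a BERT encoder:
```python
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello world", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True, output_attentions=True)

print(outputs.last_hidden_state.shape)  # (1, seq_len, hidden_size)
print(len(outputs.hidden_states))       # num_layers + 1 (embedding output first)
print(outputs.attentions[0].shape)      # (1, num_heads, seq_len, seq_len)
```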
This reference covers the most commonly used API components. For complete documentation, refer to https://huggingface.co/docs/transformers.