# Transformers API Reference

This document provides a comprehensive API reference for the most commonly used classes and methods in the Transformers library.

## Core Model Classes

### PreTrainedModel

Base class for all models. Handles loading, saving, and common model operations.

**Key Methods:**

```python
from transformers import PreTrainedModel

# Load pretrained model
model = ModelClass.from_pretrained(
    pretrained_model_name_or_path,
    config=None,                  # Custom config
    cache_dir=None,               # Custom cache location
    force_download=False,         # Force re-download
    resume_download=False,        # Resume interrupted download
    proxies=None,                 # HTTP proxies
    local_files_only=False,       # Only use cached files
    token=None,                   # HF auth token
    revision="main",              # Git branch/tag
    trust_remote_code=False,      # Allow custom model code
    device_map=None,              # Device allocation ("auto", "cpu", "cuda:0", etc.)
    torch_dtype=None,             # Model dtype (torch.float16, "auto", etc.)
    low_cpu_mem_usage=False,      # Reduce CPU memory during loading
    **model_kwargs
)

# Save model
model.save_pretrained(
    save_directory,
    save_config=True,             # Save config.json
    state_dict=None,              # Custom state dict
    save_function=torch.save,     # Custom save function
    push_to_hub=False,            # Upload to Hub
    max_shard_size="5GB",         # Max checkpoint shard size
    safe_serialization=True,      # Use SafeTensors format
    variant=None,                 # Model variant name
)

# Generate text (for generative models)
outputs = model.generate(
    inputs=None,                  # Input token IDs
    max_length=20,                # Max total length
    max_new_tokens=None,          # Max new tokens to generate
    min_length=0,                 # Minimum length
    do_sample=False,              # Enable sampling
    early_stopping=False,         # Stop beam search once num_beams finished candidates exist
    num_beams=1,                  # Beam search width
    temperature=1.0,              # Sampling temperature
    top_k=50,                     # Top-k sampling
    top_p=1.0,                    # Nucleus sampling
    repetition_penalty=1.0,       # Penalize repetition
    length_penalty=1.0,           # Beam search length penalty
    no_repeat_ngram_size=0,       # Block repeated n-grams
    num_return_sequences=1,       # Number of sequences to return
    **model_kwargs
)

# Resize token embeddings (after adding tokens)
new_embeddings = model.resize_token_embeddings(
    new_num_tokens,
    pad_to_multiple_of=None
)

# Utility methods
num_params = model.num_parameters(only_trainable=False)
model.gradient_checkpointing_enable()  # Enable gradient checkpointing
model.enable_input_require_grads()     # For PEFT with frozen models
```
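
As a concrete illustration, the following minimal sketch loads a small causal LM, generates a continuation, and saves the result. The checkpoint name `gpt2`, the prompt, and the output directory are arbitrary choices for the example, not recommendations.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Any causal LM checkpoint works the same way; gpt2 is just small and convenient
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)

# Encode a prompt and generate a short greedy continuation
inputs = tokenizer("The Transformers library", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

# Save model and tokenizer side by side so the directory is reloadable
model.save_pretrained("./my-model")
tokenizer.save_pretrained("./my-model")
```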

### AutoModel Classes

Automatically instantiate the correct model architecture.

**Available Classes:**

- `AutoModel`: Base model (returns hidden states)
- `AutoModelForCausalLM`: Causal language modeling (GPT-style)
- `AutoModelForMaskedLM`: Masked language modeling (BERT-style)
- `AutoModelForSeq2SeqLM`: Sequence-to-sequence (T5, BART)
- `AutoModelForSequenceClassification`: Text classification
- `AutoModelForTokenClassification`: Token classification (NER)
- `AutoModelForQuestionAnswering`: Extractive QA
- `AutoModelForImageClassification`: Image classification
- `AutoModelForObjectDetection`: Object detection
- `AutoModelForSemanticSegmentation`: Semantic segmentation
- `AutoModelForAudioClassification`: Audio classification
- `AutoModelForSpeechSeq2Seq`: Speech-to-text
- `AutoModelForVision2Seq`: Image captioning, VQA

**Usage:**

```python
from transformers import AutoModel, AutoConfig

# Load with default configuration
model = AutoModel.from_pretrained("bert-base-uncased")

# Load with custom configuration
config = AutoConfig.from_pretrained("bert-base-uncased")
config.hidden_dropout_prob = 0.2
model = AutoModel.from_pretrained("bert-base-uncased", config=config)

# Register custom models
AutoConfig.register("my-model", MyModelConfig)
AutoModel.register(MyModelConfig, MyModel)
```
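
For a task-specific head the pattern is the same; the sketch below loads a sequence-classification head with a fresh three-label output layer. The checkpoint name and label set are illustrative.

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# A new classification head with 3 labels is initialized on top of the encoder
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    label2id={"negative": 0, "neutral": 1, "positive": 2},
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

inputs = tokenizer("A surprisingly pleasant read.", return_tensors="pt")
logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])  # untrained head, so the label is arbitrary
```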

## Tokenizer Classes

### PreTrainedTokenizer / PreTrainedTokenizerFast

Convert text to token IDs and vice versa.

**Key Methods:**

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path,
    use_fast=True,                # Use fast (Rust) tokenizer if available
    revision="main",
    **kwargs
)

# Encoding (text → token IDs)
encoded = tokenizer(
    text,                              # String or List[str]
    text_pair=None,                    # Second sequence for pairs
    add_special_tokens=True,           # Add [CLS], [SEP], etc.
    padding=False,                     # True, False, "longest", "max_length"
    truncation=False,                  # True, False, "longest_first", "only_first", "only_second"
    max_length=None,                   # Max sequence length
    stride=0,                          # Overlap for split sequences
    return_tensors=None,               # "pt" (PyTorch), "tf" (TensorFlow), "np" (NumPy)
    return_token_type_ids=None,        # Return token type IDs
    return_attention_mask=None,        # Return attention mask
    return_overflowing_tokens=False,   # Return overflowing tokens
    return_special_tokens_mask=False,  # Return special token mask
    return_offsets_mapping=False,      # Return char-level offsets (fast tokenizers only)
    return_length=False,               # Return sequence lengths
    **kwargs
)

# Decoding (token IDs → text)
text = tokenizer.decode(
    token_ids,
    skip_special_tokens=False,            # Set True to strip special tokens
    clean_up_tokenization_spaces=True,    # Clean up spacing
)

# Batch decoding
texts = tokenizer.batch_decode(
    sequences,
    skip_special_tokens=False,
    clean_up_tokenization_spaces=True,
)

# Tokenization (text → tokens)
tokens = tokenizer.tokenize(text, **kwargs)

# Convert tokens to IDs
ids = tokenizer.convert_tokens_to_ids(tokens)

# Convert IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(ids)

# Add new tokens
num_added = tokenizer.add_tokens(["[NEW_TOKEN1]", "[NEW_TOKEN2]"])

# Add special tokens
tokenizer.add_special_tokens({
    "bos_token": "[BOS]",
    "eos_token": "[EOS]",
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    "additional_special_tokens": ["[SPECIAL1]", "[SPECIAL2]"],
})

# Chat template formatting
formatted = tokenizer.apply_chat_template(
    conversation,                   # List[Dict[str, str]] with "role" and "content"
    chat_template=None,             # Custom template
    add_generation_prompt=False,    # Add prompt for the model to continue
    tokenize=True,                  # Return token IDs
    padding=False,
    truncation=False,
    max_length=None,
    return_tensors=None,
    return_dict=True,
)

# Save tokenizer
tokenizer.save_pretrained(save_directory)

# Get vocab size
vocab_size = len(tokenizer)

# Get special tokens
pad_token = tokenizer.pad_token
pad_token_id = tokenizer.pad_token_id
# Similar for: bos, eos, unk, sep, cls, mask
```

**Special Token Attributes:**

```python
tokenizer.bos_token   # Beginning of sequence
tokenizer.eos_token   # End of sequence
tokenizer.unk_token   # Unknown token
tokenizer.sep_token   # Separator token
tokenizer.pad_token   # Padding token
tokenizer.cls_token   # Classification token
tokenizer.mask_token  # Mask token

# Corresponding IDs
tokenizer.bos_token_id
tokenizer.eos_token_id
# ... etc.
```
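
The sketch below ties these pieces together: it adds a domain-specific token, resizes the model's embeddings to match, and round-trips a sentence through encode and decode. The `[GENE]` token string and the checkpoint are placeholders for the example.

```python
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

# Add a new token and grow the embedding matrix accordingly
num_added = tokenizer.add_tokens(["[GENE]"])
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))

# Round-trip: text -> IDs -> text
encoded = tokenizer("BRCA1 is a [GENE] symbol.", return_tensors="pt")
decoded = tokenizer.decode(encoded["input_ids"][0], skip_special_tokens=True)
print(encoded["input_ids"].shape, decoded)
```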

## Image Processors

### AutoImageProcessor

Preprocess images for vision models.

**Key Methods:**

```python
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

# Process images
inputs = processor(
    images,                   # PIL Image, np.array, torch.Tensor, or List
    return_tensors="pt",      # "pt", "tf", "np", None
    do_resize=True,           # Resize to model size
    size=None,                # Target size dict
    resample=None,            # Resampling method
    do_rescale=True,          # Rescale pixel values
    do_normalize=True,        # Normalize with mean/std
    image_mean=None,          # Custom mean
    image_std=None,           # Custom std
    do_center_crop=False,     # Center crop
    crop_size=None,           # Crop size
    **kwargs
)

# Returns: BatchFeature with 'pixel_values' key
```
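
A minimal classification sketch built on the processor output; the checkpoint is illustrative and a synthetic random image stands in for real data, so the predicted label is meaningless beyond demonstrating the call pattern.

```python
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification

checkpoint = "google/vit-base-patch16-224"
processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModelForImageClassification.from_pretrained(checkpoint)

# Random RGB image in place of a real photo
image = Image.fromarray(np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8))

inputs = processor(images=image, return_tensors="pt")   # -> {'pixel_values': ...}
logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])
```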

## Training Components

### TrainingArguments

Configuration for the Trainer class.

**Essential Arguments:**

```python
from transformers import TrainingArguments

args = TrainingArguments(
    # ===== Output & Logging =====
    output_dir="./results",               # REQUIRED: Output directory
    overwrite_output_dir=False,           # Overwrite output directory

    # ===== Training Parameters =====
    num_train_epochs=3.0,                 # Number of epochs
    max_steps=-1,                         # Max training steps (overrides epochs)
    per_device_train_batch_size=8,        # Train batch size per device
    per_device_eval_batch_size=8,         # Eval batch size per device
    gradient_accumulation_steps=1,        # Gradient accumulation steps

    # ===== Learning Rate & Optimization =====
    learning_rate=5e-5,                   # Initial learning rate
    weight_decay=0.0,                     # Weight decay
    adam_beta1=0.9,                       # Adam beta1
    adam_beta2=0.999,                     # Adam beta2
    adam_epsilon=1e-8,                    # Adam epsilon
    max_grad_norm=1.0,                    # Gradient clipping
    optim="adamw_torch",                  # Optimizer ("adamw_torch", "adafactor", "adamw_8bit")

    # ===== Learning Rate Scheduler =====
    lr_scheduler_type="linear",           # Scheduler type
    warmup_steps=0,                       # Warmup steps
    warmup_ratio=0.0,                     # Warmup ratio (alternative to steps)

    # ===== Evaluation =====
    eval_strategy="no",                   # "no", "steps", "epoch"
    eval_steps=None,                      # Eval every N steps
    eval_delay=0,                         # Delay first eval
    eval_accumulation_steps=None,         # Accumulate eval outputs

    # ===== Checkpointing =====
    save_strategy="steps",                # "no", "steps", "epoch"
    save_steps=500,                       # Save every N steps
    save_total_limit=None,                # Max checkpoints to keep
    save_safetensors=True,                # Save as SafeTensors
    save_on_each_node=False,              # Save on each node (distributed)

    # ===== Best Model Selection =====
    load_best_model_at_end=False,         # Load best checkpoint at end
    metric_for_best_model=None,           # Metric to use
    greater_is_better=None,               # True if higher is better

    # ===== Logging =====
    logging_dir=None,                     # TensorBoard log directory
    logging_strategy="steps",             # "no", "steps", "epoch"
    logging_steps=500,                    # Log every N steps
    logging_first_step=False,             # Log first step
    logging_nan_inf_filter=True,          # Filter NaN/Inf

    # ===== Mixed Precision =====
    fp16=False,                           # Use fp16 training
    fp16_opt_level="O1",                  # Apex AMP optimization level
    fp16_backend="auto",                  # "auto", "apex", "cpu_amp"
    bf16=False,                           # Use bfloat16 training
    bf16_full_eval=False,                 # Use bf16 for evaluation
    tf32=None,                            # Use TF32 (Ampere+ GPUs)

    # ===== Memory Optimization =====
    gradient_checkpointing=False,         # Enable gradient checkpointing
    gradient_checkpointing_kwargs=None,   # Kwargs for gradient checkpointing
    torch_empty_cache_steps=None,         # Clear cache every N steps

    # ===== Distributed Training =====
    local_rank=-1,                        # Local rank for distributed training
    ddp_backend=None,                     # "nccl", "gloo", "mpi", "ccl"
    ddp_find_unused_parameters=None,      # Find unused parameters
    ddp_bucket_cap_mb=None,               # DDP bucket size
    fsdp="",                              # FSDP configuration
    fsdp_config=None,                     # FSDP config dict
    deepspeed=None,                       # DeepSpeed config

    # ===== Hub Integration =====
    push_to_hub=False,                    # Push to Hugging Face Hub
    hub_model_id=None,                    # Hub model ID
    hub_strategy="every_save",            # "every_save", "checkpoint", "end"
    hub_token=None,                       # Hub authentication token
    hub_private_repo=False,               # Make repo private

    # ===== Data Handling =====
    dataloader_num_workers=0,             # DataLoader workers
    dataloader_pin_memory=True,           # Pin memory
    dataloader_drop_last=False,           # Drop last incomplete batch
    dataloader_prefetch_factor=None,      # Prefetch factor
    remove_unused_columns=True,           # Remove unused dataset columns
    label_names=None,                     # Label column names

    # ===== Other =====
    seed=42,                              # Random seed
    data_seed=None,                       # Data sampling seed
    jit_mode_eval=False,                  # Use PyTorch JIT for eval
    use_ipex=False,                       # Use Intel Extension for PyTorch
    torch_compile=False,                  # Use torch.compile()
    torch_compile_backend=None,           # Compile backend
    torch_compile_mode=None,              # Compile mode
    include_inputs_for_metrics=False,     # Pass inputs to compute_metrics
    skip_memory_metrics=True,             # Skip memory profiling
)
```
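
In practice only a handful of these are set explicitly. A minimal sketch of a typical fine-tuning configuration follows; the specific values are illustrative, not recommendations.

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./bert-finetuned",        # only required argument
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",                # evaluate at the end of every epoch
    save_strategy="epoch",
    load_best_model_at_end=True,          # requires matching eval/save strategies
    metric_for_best_model="accuracy",
    logging_steps=50,
    report_to="none",                     # disable experiment trackers for this example
)
```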

### Trainer

Main training class with a full training loop.

**Key Methods:**

```python
from transformers import Trainer

trainer = Trainer(
    model=None,                           # Model to train
    args=None,                            # TrainingArguments
    data_collator=None,                   # Data collator
    train_dataset=None,                   # Training dataset
    eval_dataset=None,                    # Evaluation dataset
    tokenizer=None,                       # Tokenizer
    model_init=None,                      # Function to instantiate the model
    compute_metrics=None,                 # Function to compute metrics
    callbacks=None,                       # List of callbacks
    optimizers=(None, None),              # (optimizer, scheduler) tuple
    preprocess_logits_for_metrics=None,   # Preprocess logits before metrics
)

# Train model
train_result = trainer.train(
    resume_from_checkpoint=None,   # Resume from checkpoint
    trial=None,                    # Optuna/Ray trial
    ignore_keys_for_eval=None,     # Keys to ignore in eval
)

# Evaluate model
eval_result = trainer.evaluate(
    eval_dataset=None,             # Eval dataset (default: self.eval_dataset)
    ignore_keys=None,              # Keys to ignore
    metric_key_prefix="eval",      # Prefix for metric names
)

# Make predictions
predictions = trainer.predict(
    test_dataset,                  # Test dataset
    ignore_keys=None,              # Keys to ignore
    metric_key_prefix="test",      # Metric prefix
)
# Returns: PredictionOutput(predictions, label_ids, metrics)

# Save model
trainer.save_model(output_dir=None)

# Push to Hub
trainer.push_to_hub(
    commit_message="End of training",
    blocking=True,
    **kwargs
)

# Hyperparameter search
best_trial = trainer.hyperparameter_search(
    hp_space=None,                 # Hyperparameter search space
    compute_objective=None,        # Objective function
    n_trials=20,                   # Number of trials
    direction="minimize",          # "minimize" or "maximize"
    backend=None,                  # "optuna", "ray", "sigopt"
    **kwargs
)

# Create optimizer
optimizer = trainer.create_optimizer()

# Create scheduler
scheduler = trainer.create_scheduler(
    num_training_steps,
    optimizer=None
)

# Log and save metrics
trainer.log_metrics(split, metrics)
trainer.save_metrics(split, metrics)

# Save trainer state (checkpoint bookkeeping)
trainer.save_state()

# Access current step/epoch
current_step = trainer.state.global_step
current_epoch = trainer.state.epoch

# Access training logs
logs = trainer.state.log_history
```
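
Putting the pieces together, here is a hedged end-to-end sketch: tokenize a dataset, define an accuracy metric, and fine-tune a classifier. It assumes the `datasets` and `numpy` packages are installed, reuses `args` from the TrainingArguments sketch above, and the `imdb` dataset and `bert-base-uncased` checkpoint are illustrative choices.

```python
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
)

dataset = load_dataset("imdb")                     # any text-classification dataset works
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True)

tokenized = dataset.map(tokenize, batched=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    return {"accuracy": (np.argmax(logits, axis=-1) == labels).mean()}

trainer = Trainer(
    model=model,
    args=args,                                     # TrainingArguments from the sketch above
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),  # dynamic padding per batch
    compute_metrics=compute_metrics,
)

trainer.train()
print(trainer.evaluate())
```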

### Seq2SeqTrainer

Specialized trainer for sequence-to-sequence models.

```python
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Use Seq2SeqTrainingArguments, which adds generation-related parameters
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    predict_with_generate=True,    # Use generate() for evaluation
    generation_max_length=None,    # Max length for generation
    generation_num_beams=None,     # Number of beams for generation
    **other_training_arguments
)

# Usage is otherwise identical to Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
```
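
When `predict_with_generate=True`, `compute_metrics` receives generated token IDs rather than logits, so they must be decoded before scoring. A hedged sketch, assuming `tokenizer` is in scope and using exact match as a stand-in for a real summarization or translation metric:

```python
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Labels padded with -100 must be restored to the pad token before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Placeholder metric: fraction of predictions that match the reference exactly
    exact = [p.strip() == l.strip() for p, l in zip(decoded_preds, decoded_labels)]
    return {"exact_match": float(np.mean(exact))}
```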

## Pipeline Classes

### pipeline()

Unified inference API for all tasks.

```python
from transformers import pipeline

pipe = pipeline(
    task=None,                  # Task name (required unless it can be inferred from the model)
    model=None,                 # Model name/path or model object
    config=None,                # Model config
    tokenizer=None,             # Tokenizer
    feature_extractor=None,     # Feature extractor
    image_processor=None,       # Image processor
    framework=None,             # "pt" or "tf"
    revision=None,              # Model revision
    use_fast=True,              # Use fast tokenizer
    token=None,                 # HF token
    device=None,                # Device (-1 for CPU, 0+ for GPU)
    device_map=None,            # Device map for multi-GPU
    torch_dtype=None,           # Model dtype
    trust_remote_code=False,    # Allow custom code
    model_kwargs=None,          # Additional model kwargs
    pipeline_class=None,        # Custom pipeline class
    **kwargs
)

# Use the pipeline
results = pipe(
    inputs,                     # Input data
    **task_specific_parameters
)
```
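
Two concrete invocations as a sketch. When `model` is omitted, the pipeline downloads a default checkpoint for the task; the prompt text and the `gpt2` checkpoint are illustrative.

```python
from transformers import pipeline

# Text classification (sentiment analysis) with the task's default model
classifier = pipeline("sentiment-analysis")
print(classifier("This library makes inference trivial."))
# -> list of dicts with 'label' and 'score' keys

# Text generation with an explicit model and task-specific parameters
generator = pipeline("text-generation", model="gpt2")
print(generator("The quick brown fox", max_new_tokens=20, num_return_sequences=1))
```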

## Data Collators

Batch and pad data for training.

```python
from transformers import (
    DataCollatorWithPadding,              # Dynamic padding for classification
    DataCollatorForTokenClassification,   # Padding for token classification
    DataCollatorForSeq2Seq,               # Padding for seq2seq
    DataCollatorForLanguageModeling,      # MLM/CLM data collation
    default_data_collator,                # Simple collator (no padding)
)

# Text classification
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
    max_length=None,
    pad_to_multiple_of=None,
)

# Token classification
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    max_length=None,
    pad_to_multiple_of=None,
    label_pad_token_id=-100,
)

# Seq2Seq
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=None,
    padding=True,
    max_length=None,
    pad_to_multiple_of=None,
    label_pad_token_id=-100,
)

# Language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,               # Masked LM (False for causal LM)
    mlm_probability=0.15,   # Mask probability
    pad_to_multiple_of=None,
)
```
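
To see the dynamic padding behaviour directly, the sketch below collates two sequences of different lengths; the shorter one is padded up to the batch maximum. The checkpoint name is illustrative.

```python
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Two examples with different token counts
features = [
    tokenizer("short"),
    tokenizer("a noticeably longer example sentence"),
]

batch = collator(features)
print(batch["input_ids"].shape)        # both rows padded to the same length
print(batch["attention_mask"][0])      # trailing zeros mark the padding positions
```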

## Optimization & Scheduling

```python
from transformers import (
    AdamW,              # AdamW optimizer (deprecated in recent versions; prefer torch.optim.AdamW)
    Adafactor,          # Adafactor optimizer
    get_scheduler,      # Get LR scheduler by name
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    get_polynomial_decay_schedule_with_warmup,
)

# Create optimizer
optimizer = AdamW(
    model.parameters(),
    lr=5e-5,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0.01,
)

# Create scheduler
scheduler = get_scheduler(
    name="linear",      # "linear", "cosine", "polynomial", "constant"
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Or use specific schedulers
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
    num_cycles=0.5,
)
```
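
These pieces slot into a manual PyTorch training loop when `Trainer` is not used. A minimal sketch, assuming `model` and a `train_dataloader` are already set up and using `torch.optim.AdamW` as the optimizer:

```python
import torch
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        loss = model(**batch).loss          # models return a loss when labels are present

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()                    # advance the LR schedule once per optimizer step
        optimizer.zero_grad()
```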

## Configuration Classes

```python
from transformers import AutoConfig

# Load configuration
config = AutoConfig.from_pretrained(
    pretrained_model_name_or_path,
    **kwargs
)

# Common configuration attributes
config.vocab_size                      # Vocabulary size
config.hidden_size                     # Hidden layer size
config.num_hidden_layers               # Number of layers
config.num_attention_heads             # Attention heads
config.intermediate_size               # FFN intermediate size
config.hidden_dropout_prob             # Dropout probability
config.attention_probs_dropout_prob    # Attention dropout
config.max_position_embeddings         # Max sequence length

# Save configuration
config.save_pretrained(save_directory)

# Create a randomly initialized model from a config (no pretrained weights)
from transformers import AutoModel
model = AutoModel.from_config(config)
```
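
For instance, a scaled-down BERT for quick experiments can be built by editing the config before instantiating the untrained model; the sizes below are arbitrary choices for the sketch.

```python
from transformers import AutoConfig, AutoModelForMaskedLM

config = AutoConfig.from_pretrained("bert-base-uncased")

# Shrink the architecture; weights will be randomly initialized
config.hidden_size = 256
config.num_hidden_layers = 4
config.num_attention_heads = 4
config.intermediate_size = 1024

model = AutoModelForMaskedLM.from_config(config)
print(f"{model.num_parameters() / 1e6:.1f}M parameters")
```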

## Utility Functions

```python
from transformers import (
    set_seed,    # Set random seed
    logging,     # Logging utilities
)

# Set seed for reproducibility
set_seed(42)

# Configure logging verbosity
logging.set_verbosity_info()
logging.set_verbosity_warning()
logging.set_verbosity_error()
logging.set_verbosity_debug()

# Get logger
logger = logging.get_logger(__name__)
```

## Model Outputs

All models return model-specific output classes (subclasses of `ModelOutput`):

```python
# Common output attributes
outputs.loss             # Loss (if labels were provided)
outputs.logits           # Model logits
outputs.hidden_states    # All hidden states (if output_hidden_states=True)
outputs.attentions       # Attention weights (if output_attentions=True)

# Seq2Seq-specific attributes
outputs.encoder_last_hidden_state
outputs.encoder_hidden_states
outputs.encoder_attentions
outputs.decoder_hidden_states
outputs.decoder_attentions
outputs.cross_attentions

# Access as attribute, dict key, or tuple
logits = outputs.logits
logits = outputs["logits"]
loss, logits = outputs.to_tuple()[:2]
```
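
A short sketch of requesting and inspecting the optional outputs on a forward pass; the checkpoint and input sentence are illustrative.

```python
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Model outputs are dataclass-like objects.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True, output_attentions=True)

print(outputs.last_hidden_state.shape)   # (batch, seq_len, hidden_size)
print(len(outputs.hidden_states))        # embedding layer output + one tensor per layer
print(len(outputs.attentions))           # one attention tensor per layer
```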

This reference covers the most commonly used API components. For complete documentation, refer to https://huggingface.co/docs/transformers.