Add more scientific skills

This commit is contained in:
Timothy Kassis
2025-10-19 14:12:02 -07:00
parent 78d5ac2b56
commit 660c8574d0
210 changed files with 88957 additions and 1 deletions

View File

@@ -0,0 +1,699 @@
# Transformers API Reference
This document provides comprehensive API reference for the most commonly used classes and methods in the Transformers library.
## Core Model Classes
### PreTrainedModel
Base class for all models. Handles loading, saving, and common model operations.
**Key Methods:**
```python
from transformers import PreTrainedModel
# Load pretrained model
model = ModelClass.from_pretrained(
pretrained_model_name_or_path,
config=None, # Custom config
cache_dir=None, # Custom cache location
force_download=False, # Force re-download
resume_download=False, # Resume interrupted download
proxies=None, # HTTP proxies
local_files_only=False, # Only use cached files
token=None, # HF auth token
revision="main", # Git branch/tag
trust_remote_code=False, # Allow custom model code
device_map=None, # Device allocation ("auto", "cpu", "cuda:0", etc.)
torch_dtype=None, # Model dtype (torch.float16, "auto", etc.)
low_cpu_mem_usage=False, # Reduce CPU memory during loading
**model_kwargs
)
# Save model
model.save_pretrained(
save_directory,
save_config=True, # Save config.json
state_dict=None, # Custom state dict
save_function=torch.save, # Custom save function
push_to_hub=False, # Upload to Hub
max_shard_size="5GB", # Max checkpoint size
safe_serialization=True, # Use SafeTensors format
variant=None, # Model variant name
)
# Generate text (for generative models)
outputs = model.generate(
inputs=None, # Input token IDs
max_length=20, # Max total length
max_new_tokens=None, # Max new tokens to generate
min_length=0, # Minimum length
do_sample=False, # Enable sampling
early_stopping=False, # Stop when num_beams finish
num_beams=1, # Beam search width
temperature=1.0, # Sampling temperature
top_k=50, # Top-k sampling
top_p=1.0, # Nucleus sampling
repetition_penalty=1.0, # Penalize repetition
length_penalty=1.0, # Beam search length penalty
no_repeat_ngram_size=0, # Block repeated n-grams
num_return_sequences=1, # Number of sequences to return
**model_kwargs
)
# Resize token embeddings (after adding tokens)
new_embeddings = model.resize_token_embeddings(
new_num_tokens,
pad_to_multiple_of=None
)
# Utility methods
num_params = model.num_parameters(only_trainable=False)
model.gradient_checkpointing_enable() # Enable gradient checkpointing
model.enable_input_require_grads() # For PEFT with frozen models
```
### AutoModel Classes
Automatically instantiate the correct model architecture.
**Available Classes:**
- `AutoModel`: Base model (returns hidden states)
- `AutoModelForCausalLM`: Causal language modeling (GPT-style)
- `AutoModelForMaskedLM`: Masked language modeling (BERT-style)
- `AutoModelForSeq2SeqLM`: Sequence-to-sequence (T5, BART)
- `AutoModelForSequenceClassification`: Text classification
- `AutoModelForTokenClassification`: Token classification (NER)
- `AutoModelForQuestionAnswering`: Extractive QA
- `AutoModelForImageClassification`: Image classification
- `AutoModelForObjectDetection`: Object detection
- `AutoModelForSemanticSegmentation`: Semantic segmentation
- `AutoModelForAudioClassification`: Audio classification
- `AutoModelForSpeechSeq2Seq`: Speech-to-text
- `AutoModelForVision2Seq`: Image captioning, VQA
**Usage:**
```python
from transformers import AutoModel, AutoConfig
# Load with default configuration
model = AutoModel.from_pretrained("bert-base-uncased")
# Load with custom configuration
config = AutoConfig.from_pretrained("bert-base-uncased")
config.hidden_dropout_prob = 0.2
model = AutoModel.from_pretrained("bert-base-uncased", config=config)
# Register custom models
from transformers import AutoConfig, AutoModel
AutoConfig.register("my-model", MyModelConfig)
AutoModel.register(MyModelConfig, MyModel)
```
## Tokenizer Classes
### PreTrainedTokenizer / PreTrainedTokenizerFast
Convert text to token IDs and vice versa.
**Key Methods:**
```python
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path,
use_fast=True, # Use fast (Rust) tokenizer if available
revision="main",
**kwargs
)
# Encoding (text → token IDs)
encoded = tokenizer(
text, # String or List[str]
text_pair=None, # Second sequence for pairs
add_special_tokens=True, # Add [CLS], [SEP], etc.
padding=False, # True, False, "longest", "max_length"
truncation=False, # True, False, "longest_first", "only_first", "only_second"
max_length=None, # Max sequence length
stride=0, # Overlap for split sequences
return_tensors=None, # "pt" (PyTorch), "tf" (TensorFlow), "np" (NumPy)
return_token_type_ids=None, # Return token type IDs
return_attention_mask=None, # Return attention mask
return_overflowing_tokens=False, # Return overflowing tokens
return_special_tokens_mask=False, # Return special token mask
return_offsets_mapping=False, # Return char-level offsets (fast only)
return_length=False, # Return sequence lengths
**kwargs
)
# Decoding (token IDs → text)
text = tokenizer.decode(
token_ids,
skip_special_tokens=False, # Remove special tokens
clean_up_tokenization_spaces=True, # Clean up spacing
)
# Batch decoding
texts = tokenizer.batch_decode(
sequences,
skip_special_tokens=False,
clean_up_tokenization_spaces=True,
)
# Tokenization (text → tokens)
tokens = tokenizer.tokenize(text, **kwargs)
# Convert tokens to IDs
ids = tokenizer.convert_tokens_to_ids(tokens)
# Convert IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(ids)
# Add new tokens
num_added = tokenizer.add_tokens(["[NEW_TOKEN1]", "[NEW_TOKEN2]"])
# Add special tokens
tokenizer.add_special_tokens({
"bos_token": "[BOS]",
"eos_token": "[EOS]",
"unk_token": "[UNK]",
"sep_token": "[SEP]",
"pad_token": "[PAD]",
"cls_token": "[CLS]",
"mask_token": "[MASK]",
"additional_special_tokens": ["[SPECIAL1]", "[SPECIAL2]"],
})
# Chat template formatting
formatted = tokenizer.apply_chat_template(
conversation, # List[Dict[str, str]] with "role" and "content"
chat_template=None, # Custom template
add_generation_prompt=False, # Add prompt for model to continue
tokenize=True, # Return token IDs
padding=False,
truncation=False,
max_length=None,
return_tensors=None,
return_dict=True,
)
# Save tokenizer
tokenizer.save_pretrained(save_directory)
# Get full vocab size (base vocab + added tokens; may differ from tokenizer.vocab_size)
vocab_size = len(tokenizer)
# Get special tokens
pad_token = tokenizer.pad_token
pad_token_id = tokenizer.pad_token_id
# Similar for: bos, eos, unk, sep, cls, mask
```
**Special Token Attributes:**
```python
tokenizer.bos_token # Beginning of sequence
tokenizer.eos_token # End of sequence
tokenizer.unk_token # Unknown token
tokenizer.sep_token # Separator token
tokenizer.pad_token # Padding token
tokenizer.cls_token # Classification token
tokenizer.mask_token # Mask token
# Corresponding IDs
tokenizer.bos_token_id
tokenizer.eos_token_id
# ... etc
```
## Image Processors
### AutoImageProcessor
Preprocess images for vision models.
**Key Methods:**
```python
from transformers import AutoImageProcessor
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
# Process images
inputs = processor(
images, # PIL Image, np.array, torch.Tensor, or List
return_tensors="pt", # "pt", "tf", "np", None
do_resize=True, # Resize to model size
size=None, # Target size dict
resample=None, # Resampling method
do_rescale=True, # Rescale pixel values
do_normalize=True, # Normalize with mean/std
image_mean=None, # Custom mean
image_std=None, # Custom std
do_center_crop=False, # Center crop
crop_size=None, # Crop size
**kwargs
)
# Returns: BatchFeature with 'pixel_values' key
```
## Training Components
### TrainingArguments
Configuration for the Trainer class.
**Essential Arguments:**
```python
from transformers import TrainingArguments
args = TrainingArguments(
# ===== Output & Logging =====
output_dir="./results", # REQUIRED: Output directory
overwrite_output_dir=False, # Overwrite output directory
# ===== Training Parameters =====
num_train_epochs=3.0, # Number of epochs
max_steps=-1, # Max training steps (overrides epochs)
per_device_train_batch_size=8, # Train batch size per device
per_device_eval_batch_size=8, # Eval batch size per device
gradient_accumulation_steps=1, # Accumulation steps
# ===== Learning Rate & Optimization =====
learning_rate=5e-5, # Initial learning rate
weight_decay=0.0, # Weight decay
adam_beta1=0.9, # Adam beta1
adam_beta2=0.999, # Adam beta2
adam_epsilon=1e-8, # Adam epsilon
max_grad_norm=1.0, # Gradient clipping
optim="adamw_torch", # Optimizer ("adamw_torch", "adafactor", "adamw_8bit")
# ===== Learning Rate Scheduler =====
lr_scheduler_type="linear", # Scheduler type
warmup_steps=0, # Warmup steps
warmup_ratio=0.0, # Warmup ratio (alternative to steps)
# ===== Evaluation =====
eval_strategy="no", # "no", "steps", "epoch"
eval_steps=None, # Eval every N steps
eval_delay=0, # Delay first eval
eval_accumulation_steps=None, # Accumulate eval outputs
# ===== Checkpointing =====
save_strategy="steps", # "no", "steps", "epoch"
save_steps=500, # Save every N steps
save_total_limit=None, # Max checkpoints to keep
save_safetensors=True, # Save as SafeTensors
save_on_each_node=False, # Save on each node (distributed)
# ===== Best Model Selection =====
load_best_model_at_end=False, # Load best checkpoint at end
metric_for_best_model=None, # Metric to use
greater_is_better=None, # True if higher is better
# ===== Logging =====
logging_dir=None, # TensorBoard log directory
logging_strategy="steps", # "no", "steps", "epoch"
logging_steps=500, # Log every N steps
logging_first_step=False, # Log first step
logging_nan_inf_filter=True, # Filter NaN/Inf
# ===== Mixed Precision =====
fp16=False, # Use fp16 training
fp16_opt_level="O1", # Apex AMP optimization level
fp16_backend="auto", # "auto", "apex", "cpu_amp"
bf16=False, # Use bfloat16 training
bf16_full_eval=False, # Use bf16 for evaluation
tf32=None, # Use TF32 (Ampere+ GPUs)
# ===== Memory Optimization =====
gradient_checkpointing=False, # Enable gradient checkpointing
gradient_checkpointing_kwargs=None, # Kwargs for gradient checkpointing
torch_empty_cache_steps=None, # Clear cache every N steps
# ===== Distributed Training =====
local_rank=-1, # Local rank for distributed
ddp_backend=None, # "nccl", "gloo", "mpi", "ccl"
ddp_find_unused_parameters=None, # Find unused parameters
ddp_bucket_cap_mb=None, # DDP bucket size
fsdp="", # FSDP configuration
fsdp_config=None, # FSDP config dict
deepspeed=None, # DeepSpeed config
# ===== Hub Integration =====
push_to_hub=False, # Push to Hugging Face Hub
hub_model_id=None, # Hub model ID
hub_strategy="every_save", # "every_save", "checkpoint", "end"
hub_token=None, # Hub authentication token
hub_private_repo=False, # Make repo private
# ===== Data Handling =====
dataloader_num_workers=0, # DataLoader workers
dataloader_pin_memory=True, # Pin memory
dataloader_drop_last=False, # Drop last incomplete batch
dataloader_prefetch_factor=None, # Prefetch factor
remove_unused_columns=True, # Remove unused dataset columns
label_names=None, # Label column names
# ===== Other =====
seed=42, # Random seed
data_seed=None, # Data sampling seed
jit_mode_eval=False, # Use PyTorch JIT for eval
use_ipex=False, # Use Intel Extension for PyTorch
torch_compile=False, # Use torch.compile()
torch_compile_backend=None, # Compile backend
torch_compile_mode=None, # Compile mode
include_inputs_for_metrics=False, # Pass inputs to compute_metrics
skip_memory_metrics=True, # Skip memory profiling
)
```
### Trainer
Main training class with full training loop.
**Key Methods:**
```python
from transformers import Trainer
trainer = Trainer(
model=None, # Model to train
args=None, # TrainingArguments
data_collator=None, # Data collator
train_dataset=None, # Training dataset
eval_dataset=None, # Evaluation dataset
tokenizer=None, # Tokenizer
model_init=None, # Function to instantiate model
compute_metrics=None, # Function to compute metrics
callbacks=None, # List of callbacks
optimizers=(None, None), # (optimizer, scheduler) tuple
preprocess_logits_for_metrics=None, # Preprocess logits before metrics
)
# Train model
train_result = trainer.train(
resume_from_checkpoint=None, # Resume from checkpoint
trial=None, # Optuna/Ray trial
ignore_keys_for_eval=None, # Keys to ignore in eval
)
# Evaluate model
eval_result = trainer.evaluate(
eval_dataset=None, # Eval dataset (default: self.eval_dataset)
ignore_keys=None, # Keys to ignore
metric_key_prefix="eval", # Prefix for metric names
)
# Make predictions
predictions = trainer.predict(
test_dataset, # Test dataset
ignore_keys=None, # Keys to ignore
metric_key_prefix="test", # Metric prefix
)
# Returns: PredictionOutput(predictions, label_ids, metrics)
# Save model
trainer.save_model(output_dir=None)
# Push to Hub
trainer.push_to_hub(
commit_message="End of training",
blocking=True,
**kwargs
)
# Hyperparameter search
best_trial = trainer.hyperparameter_search(
hp_space=None, # Hyperparameter search space
compute_objective=None, # Objective function
n_trials=20, # Number of trials
direction="minimize", # "minimize" or "maximize"
backend=None, # "optuna", "ray", "sigopt"
**kwargs
)
# Create optimizer
optimizer = trainer.create_optimizer()
# Create scheduler
scheduler = trainer.create_scheduler(
num_training_steps,
optimizer=None
)
# Log metrics
trainer.log_metrics(split, metrics)
trainer.save_metrics(split, metrics)
# Save checkpoint
trainer.save_state()
# Access current step/epoch
current_step = trainer.state.global_step
current_epoch = trainer.state.epoch
# Access training logs
logs = trainer.state.log_history
```
### Seq2SeqTrainer
Specialized trainer for sequence-to-sequence models.
```python
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# Use Seq2SeqTrainingArguments with additional parameters
training_args = Seq2SeqTrainingArguments(
output_dir="./results",
predict_with_generate=True, # Use generate() for evaluation
generation_max_length=None, # Max length for generation
generation_num_beams=None, # Num beams for generation
**other_training_arguments
)
# Trainer usage is identical to Trainer
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)
```
## Pipeline Classes
### pipeline()
Unified inference API for all tasks.
```python
from transformers import pipeline
pipe = pipeline(
task=None, # Task name (required)
model=None, # Model name/path or model object
config=None, # Model config
tokenizer=None, # Tokenizer
feature_extractor=None, # Feature extractor
image_processor=None, # Image processor
framework=None, # "pt" or "tf"
revision=None, # Model revision
use_fast=True, # Use fast tokenizer
token=None, # HF token
device=None, # Device (-1 for CPU, 0+ for GPU)
device_map=None, # Device map for multi-GPU
torch_dtype=None, # Model dtype
trust_remote_code=False, # Allow custom code
model_kwargs=None, # Additional model kwargs
pipeline_class=None, # Custom pipeline class
**kwargs
)
# Use pipeline
results = pipe(
inputs, # Input data
**task_specific_parameters
)
```
## Data Collators
Batch and pad data for training.
```python
from transformers import (
DataCollatorWithPadding, # Dynamic padding for classification
DataCollatorForTokenClassification, # Padding for token classification
DataCollatorForSeq2Seq, # Padding for seq2seq
DataCollatorForLanguageModeling, # MLM/CLM data collation
default_data_collator, # Simple collator (no padding)
)
# Text classification
data_collator = DataCollatorWithPadding(
tokenizer=tokenizer,
padding=True,
max_length=None,
pad_to_multiple_of=None,
)
# Token classification
data_collator = DataCollatorForTokenClassification(
tokenizer=tokenizer,
padding=True,
max_length=None,
pad_to_multiple_of=None,
label_pad_token_id=-100,
)
# Seq2Seq
data_collator = DataCollatorForSeq2Seq(
tokenizer=tokenizer,
model=None,
padding=True,
max_length=None,
pad_to_multiple_of=None,
label_pad_token_id=-100,
)
# Language modeling
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=True, # Masked LM (False for causal LM)
mlm_probability=0.15, # Mask probability
pad_to_multiple_of=None,
)
```
## Optimization & Scheduling
```python
from transformers import (
    AdamW, # AdamW optimizer (deprecated in recent versions — prefer torch.optim.AdamW)
Adafactor, # Adafactor optimizer
get_scheduler, # Get LR scheduler
get_linear_schedule_with_warmup,
get_cosine_schedule_with_warmup,
get_polynomial_decay_schedule_with_warmup,
)
# Create optimizer
optimizer = AdamW(
model.parameters(),
lr=5e-5,
betas=(0.9, 0.999),
eps=1e-8,
weight_decay=0.01,
)
# Create scheduler
scheduler = get_scheduler(
name="linear", # "linear", "cosine", "polynomial", "constant"
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=total_steps,
)
# Or use specific schedulers
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=warmup_steps,
num_training_steps=total_steps,
)
scheduler = get_cosine_schedule_with_warmup(
optimizer,
num_warmup_steps=warmup_steps,
num_training_steps=total_steps,
num_cycles=0.5,
)
```
## Configuration Classes
```python
from transformers import AutoConfig
# Load configuration
config = AutoConfig.from_pretrained(
pretrained_model_name_or_path,
**kwargs
)
# Common configuration attributes
config.vocab_size # Vocabulary size
config.hidden_size # Hidden layer size
config.num_hidden_layers # Number of layers
config.num_attention_heads # Attention heads
config.intermediate_size # FFN intermediate size
config.hidden_dropout_prob # Dropout probability
config.attention_probs_dropout_prob # Attention dropout
config.max_position_embeddings # Max sequence length
# Save configuration
config.save_pretrained(save_directory)
# Create model from config
from transformers import AutoModel
model = AutoModel.from_config(config)
```
## Utility Functions
```python
from transformers import (
set_seed, # Set random seed
logging, # Logging utilities
)
# Set seed for reproducibility
set_seed(42)
# Configure logging
logging.set_verbosity_info()
logging.set_verbosity_warning()
logging.set_verbosity_error()
logging.set_verbosity_debug()
# Get logger
logger = logging.get_logger(__name__)
```
## Model Outputs
All models return model-specific output classes (subclasses of `ModelOutput`):
```python
# Common output attributes
outputs.loss # Loss (if labels provided)
outputs.logits # Model logits
outputs.hidden_states # All hidden states (if output_hidden_states=True)
outputs.attentions # Attention weights (if output_attentions=True)
# Seq2Seq specific
outputs.encoder_last_hidden_state
outputs.encoder_hidden_states
outputs.encoder_attentions
outputs.decoder_hidden_states
outputs.decoder_attentions
outputs.cross_attentions
# Access as dict or tuple
logits = outputs.logits
logits = outputs["logits"]
loss, logits = outputs.to_tuple()[:2]
```
This reference covers the most commonly used API components. For complete documentation, refer to https://huggingface.co/docs/transformers.

View File

@@ -0,0 +1,530 @@
# Text Generation Strategies
Comprehensive guide to text generation methods in Transformers for controlling output quality, creativity, and diversity.
## Overview
Text generation is the process of predicting tokens sequentially using a language model. The choice of generation strategy significantly impacts output quality, diversity, and computational cost.
**When to use each strategy:**
- **Greedy**: Fast, deterministic, good for short outputs or when consistency is critical
- **Beam Search**: Better quality for tasks with clear "correct" answers (translation, summarization)
- **Sampling**: Creative, diverse outputs for open-ended generation (stories, dialogue)
- **Top-k/Top-p**: Balanced creativity and coherence
## Basic Generation Methods
### Greedy Decoding
Selects the highest probability token at each step. Fast but prone to repetition and suboptimal sequences.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
inputs = tokenizer("The future of AI", return_tensors="pt")
# Greedy decoding (default)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0]))
```
**Characteristics:**
- Deterministic (always same output for same input)
- Fast (single forward pass per token)
- Prone to repetition in longer sequences
- Best for: Short generations, deterministic applications
**Parameters:**
```python
outputs = model.generate(
**inputs,
max_new_tokens=50, # Number of tokens to generate
min_length=10, # Minimum total length
    pad_token_id=tokenizer.pad_token_id, # NOTE: GPT-2 has no pad token — commonly set to tokenizer.eos_token_id
)
```
### Beam Search
Maintains multiple hypotheses (beams) and selects the sequence with highest overall probability.
```python
outputs = model.generate(
**inputs,
max_new_tokens=50,
num_beams=5, # Number of beams
early_stopping=True, # Stop when all beams finish
no_repeat_ngram_size=2, # Prevent 2-gram repetition
)
```
**Characteristics:**
- Higher quality than greedy for tasks with "correct" answers
- Slower than greedy (num_beams forward passes per step)
- Still can suffer from repetition
- Best for: Translation, summarization, QA generation
**Advanced Parameters:**
```python
outputs = model.generate(
**inputs,
num_beams=5,
num_beam_groups=1, # Diverse beam search groups
diversity_penalty=0.0, # Penalty for similar beams
length_penalty=1.0, # >1: longer sequences, <1: shorter
early_stopping=True, # Stop when num_beams sequences finish
no_repeat_ngram_size=2, # Block repeating n-grams
num_return_sequences=1, # Return top-k sequences (≤ num_beams)
)
```
**Length Penalty:**
- `length_penalty > 1.0`: Favor longer sequences
- `length_penalty = 1.0`: No penalty
- `length_penalty < 1.0`: Favor shorter sequences
### Sampling (Multinomial)
Randomly sample tokens according to the probability distribution.
```python
outputs = model.generate(
**inputs,
max_new_tokens=50,
do_sample=True, # Enable sampling
temperature=1.0, # Sampling temperature
    num_beams=1, # 1 = pure multinomial sampling; >1 combines beam search with sampling
)
```
**Characteristics:**
- Non-deterministic (different output each time)
- More diverse and creative than greedy/beam search
- Can produce incoherent output if not controlled
- Best for: Creative writing, dialogue, open-ended generation
**Temperature Parameter:**
```python
# Low temperature (0.1-0.7): More focused, less random
outputs = model.generate(**inputs, do_sample=True, temperature=0.5)
# Medium temperature (0.7-1.0): Balanced
outputs = model.generate(**inputs, do_sample=True, temperature=0.8)
# High temperature (1.0-2.0): More random, more creative
outputs = model.generate(**inputs, do_sample=True, temperature=1.5)
```
- `temperature → 0`: Approaches greedy decoding
- `temperature = 1.0`: Sample from original distribution
- `temperature > 1.0`: Flatter distribution, more random
- `temperature < 1.0`: Sharper distribution, more confident
## Advanced Sampling Methods
### Top-k Sampling
Sample from only the k most likely tokens.
```python
outputs = model.generate(
**inputs,
do_sample=True,
max_new_tokens=50,
top_k=50, # Consider top 50 tokens
temperature=0.8,
)
```
**How it works:**
1. Filter to top-k most probable tokens
2. Renormalize probabilities
3. Sample from filtered distribution
**Choosing k:**
- `k=1`: Equivalent to greedy decoding
- `k=10-50`: More focused, coherent output
- `k=100-500`: More diverse output
- Too high k: Includes low-probability tokens (noise)
- Too low k: Less diverse, may miss good alternatives
### Top-p (Nucleus) Sampling
Sample from the smallest set of tokens whose cumulative probability ≥ p.
```python
outputs = model.generate(
**inputs,
do_sample=True,
max_new_tokens=50,
top_p=0.95, # Nucleus probability
temperature=0.8,
)
```
**How it works:**
1. Sort tokens by probability
2. Find smallest set with cumulative probability ≥ p
3. Sample from this set
**Choosing p:**
- `p=0.9-0.95`: Good balance (recommended)
- `p=1.0`: Sample from full distribution
- Higher p: More diverse, might include unlikely tokens
- Lower p: More focused, like top-k with adaptive k
**Top-p vs Top-k:**
- Top-p adapts to probability distribution shape
- Top-k is fixed regardless of distribution
- Top-p generally better for variable-quality contexts
- Can combine: `top_k=50, top_p=0.95` (apply both filters)
### Combining Strategies
```python
# Recommended for high-quality open-ended generation
outputs = model.generate(
**inputs,
do_sample=True,
max_new_tokens=100,
temperature=0.8, # Moderate temperature
top_k=50, # Limit to top 50 tokens
top_p=0.95, # Nucleus sampling
repetition_penalty=1.2, # Discourage repetition
no_repeat_ngram_size=3, # Block 3-gram repetition
)
```
## Controlling Generation Quality
### Repetition Control
Prevent models from repeating themselves:
```python
outputs = model.generate(
**inputs,
max_new_tokens=100,
# Method 1: Repetition penalty
repetition_penalty=1.2, # Penalize repeated tokens (>1.0)
# Method 2: Block n-gram repetition
no_repeat_ngram_size=3, # Never repeat 3-grams
# Method 3: Encoder repetition penalty (for seq2seq)
    encoder_repetition_penalty=1.0, # Penalize tokens NOT in the input (discourages drifting from the source)
)
```
**Repetition Penalty Values:**
- `1.0`: No penalty
- `1.0-1.5`: Mild penalty (recommended: 1.1-1.3)
- `>1.5`: Strong penalty (may harm coherence)
### Length Control
```python
outputs = model.generate(
**inputs,
# Hard constraints
min_length=20, # Minimum total length
max_length=100, # Maximum total length
max_new_tokens=50, # Maximum new tokens (excluding input)
# Soft constraints (with beam search)
length_penalty=1.0, # Encourage longer/shorter outputs
# Early stopping
early_stopping=True, # Stop when condition met
)
```
### Bad Words and Forced Tokens
```python
# Prevent specific tokens
bad_words_ids = [
tokenizer.encode("badword1", add_special_tokens=False),
tokenizer.encode("badword2", add_special_tokens=False),
]
outputs = model.generate(
**inputs,
bad_words_ids=bad_words_ids,
)
# Force specific tokens
force_words_ids = [
tokenizer.encode("important", add_special_tokens=False),
]
outputs = model.generate(
**inputs,
force_words_ids=force_words_ids,
)
```
## Streaming Generation
Generate and process tokens as they're produced:
```python
from transformers import TextStreamer, TextIteratorStreamer
from threading import Thread
# Simple streaming (prints to stdout)
streamer = TextStreamer(tokenizer, skip_prompt=True)
outputs = model.generate(**inputs, streamer=streamer, max_new_tokens=100)
# Iterator streaming (for custom processing)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=100)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for text in streamer:
print(text, end="", flush=True)
thread.join()
```
## Advanced Techniques
### Contrastive Search
Balance coherence and diversity using contrastive objective:
```python
outputs = model.generate(
**inputs,
max_new_tokens=50,
penalty_alpha=0.6, # Contrastive penalty
top_k=4, # Consider top-4 tokens
)
```
**When to use:**
- Open-ended text generation
- Reduces repetition without sacrificing coherence
- Good alternative to sampling
### Diverse Beam Search
Generate multiple diverse outputs:
```python
outputs = model.generate(
**inputs,
max_new_tokens=50,
num_beams=10,
num_beam_groups=5, # 5 groups of 2 beams each
diversity_penalty=1.0, # Penalty for similar beams
num_return_sequences=5, # Return 5 diverse outputs
)
```
### Constrained Beam Search
Force output to include specific phrases:
```python
from transformers import PhrasalConstraint
constraints = [
PhrasalConstraint(
tokenizer("machine learning", add_special_tokens=False).input_ids
),
]
outputs = model.generate(
**inputs,
constraints=constraints,
num_beams=10, # Requires beam search
)
```
## Speculative Decoding
Accelerate generation using a smaller draft model:
```python
from transformers import AutoModelForCausalLM
# Load main and assistant models
model = AutoModelForCausalLM.from_pretrained("large-model")
assistant_model = AutoModelForCausalLM.from_pretrained("small-model")
# Generate with speculative decoding
outputs = model.generate(
**inputs,
assistant_model=assistant_model,
do_sample=True,
temperature=0.8,
)
```
**Benefits:**
- 2-3x faster generation
- Identical output distribution to regular generation
- Works with sampling and greedy decoding
## Recipe: Recommended Settings by Task
### Creative Writing / Dialogue
```python
outputs = model.generate(
**inputs,
do_sample=True,
max_new_tokens=200,
temperature=0.9,
top_p=0.95,
top_k=50,
repetition_penalty=1.2,
no_repeat_ngram_size=3,
)
```
### Translation / Summarization
```python
outputs = model.generate(
**inputs,
num_beams=5,
max_new_tokens=150,
early_stopping=True,
length_penalty=1.0,
no_repeat_ngram_size=2,
)
```
### Code Generation
```python
outputs = model.generate(
**inputs,
max_new_tokens=300,
temperature=0.2, # Low temperature for correctness
top_p=0.95,
do_sample=True,
)
```
### Chatbot / Instruction Following
```python
outputs = model.generate(
**inputs,
do_sample=True,
max_new_tokens=256,
temperature=0.7,
top_p=0.9,
repetition_penalty=1.15,
)
```
### Factual QA / Information Extraction
```python
outputs = model.generate(
**inputs,
max_new_tokens=50,
num_beams=3,
early_stopping=True,
# Or greedy for very short answers:
# (no special parameters needed)
)
```
## Debugging Generation
### Check Token Probabilities
```python
outputs = model.generate(
**inputs,
max_new_tokens=20,
output_scores=True, # Return generation scores
return_dict_in_generate=True, # Return as dict
)
# Access generation scores
scores = outputs.scores # Tuple (one entry per generated step) of tensors of shape (batch_size, vocab_size)
# Get token probabilities
import torch
probs = torch.softmax(scores[0], dim=-1)
```
### Monitor Generation Process
```python
from transformers import LogitsProcessor, LogitsProcessorList
class DebugLogitsProcessor(LogitsProcessor):
def __call__(self, input_ids, scores):
# Print top 5 tokens at each step
top_tokens = scores[0].topk(5)
print(f"Top 5 tokens: {top_tokens}")
return scores
outputs = model.generate(
**inputs,
max_new_tokens=10,
logits_processor=LogitsProcessorList([DebugLogitsProcessor()]),
)
```
## Common Issues and Solutions
**Issue: Repetitive output**
- Solution: Increase `repetition_penalty` (1.2-1.5), set `no_repeat_ngram_size=3`
- For sampling: Increase `temperature`, enable `top_p`
**Issue: Incoherent output**
- Solution: Lower `temperature` (0.5-0.8), use beam search
- Set `top_k=50` or `top_p=0.9` to filter unlikely tokens
**Issue: Too short output**
- Solution: Increase `min_length`, set `length_penalty > 1.0` (beam search)
- Check if EOS token is being generated early
**Issue: Too slow generation**
- Solution: Use greedy instead of beam search
- Reduce `num_beams`
- Try speculative decoding with assistant model
- Use smaller model variant
**Issue: Output doesn't follow format**
- Solution: Use constrained beam search
- Add format examples to prompt
- Use `bad_words_ids` to prevent format-breaking tokens
## Performance Optimization
```python
# Use half precision
model = AutoModelForCausalLM.from_pretrained(
"model-name",
torch_dtype=torch.float16,
device_map="auto"
)
# Use KV cache optimization (default, but can be disabled)
outputs = model.generate(**inputs, use_cache=True)
# Batch generation — decoder-only models need a pad token and left padding,
# e.g. tokenizer.pad_token = tokenizer.eos_token; tokenizer.padding_side = "left"
inputs = tokenizer(["Prompt 1", "Prompt 2"], return_tensors="pt", padding=True)
outputs = model.generate(**inputs, max_new_tokens=50)
# Static cache for longer sequences (if supported)
outputs = model.generate(**inputs, cache_implementation="static")
```
This guide covers the main generation strategies. For task-specific examples, see `task_patterns.md`.

---
# Model Quantization Guide
Comprehensive guide to reducing model memory footprint through quantization while maintaining accuracy.
## Overview
Quantization reduces memory requirements by storing model weights in lower precision formats (int8, int4) instead of full precision (float32). This enables:
- Running larger models on limited hardware
- Faster inference (reduced memory bandwidth)
- Lower deployment costs
- Enabling fine-tuning of models that wouldn't fit in memory
**Tradeoffs:**
- Slight accuracy loss (typically < 1-2%)
- Initial quantization overhead
- Some methods require calibration data
## Quick Comparison
| Method | Precision | Speed | Accuracy | Fine-tuning | Hardware | Setup |
|--------|-----------|-------|----------|-------------|----------|-------|
| **Bitsandbytes** | 4/8-bit | Fast | High | Yes (PEFT) | CUDA, CPU | Easy |
| **GPTQ** | 2-8-bit | Very Fast | High | Limited | CUDA, ROCm, Metal | Medium |
| **AWQ** | 4-bit | Very Fast | High | Yes (PEFT) | CUDA, ROCm | Medium |
| **GGUF** | 1-8-bit | Medium | Variable | No | CPU-optimized | Easy |
| **HQQ** | 1-8-bit | Fast | High | Yes | Multi-platform | Medium |
## Bitsandbytes (BnB)
On-the-fly quantization with excellent PEFT fine-tuning support.
### 8-bit Quantization
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
load_in_8bit=True, # Enable 8-bit quantization
device_map="auto", # Automatic device placement
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Use normally
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=50)
```
**Memory Savings:**
- 7B model: ~14GB → ~7GB (50% reduction)
- 13B model: ~26GB → ~13GB
- 70B model: ~140GB → ~70GB
**Characteristics:**
- Fast inference
- Minimal accuracy loss
- Works with PEFT (LoRA, QLoRA)
- Supports CPU and CUDA GPUs
### 4-bit Quantization (QLoRA)
```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
load_in_4bit=True, # Enable 4-bit quantization
bnb_4bit_quant_type="nf4", # Quantization type ("nf4" or "fp4")
bnb_4bit_compute_dtype=torch.float16, # Computation dtype
bnb_4bit_use_double_quant=True, # Nested quantization for more savings
)
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
quantization_config=bnb_config,
device_map="auto",
)
```
**Memory Savings:**
- 7B model: ~14GB → ~4GB (70% reduction)
- 13B model: ~26GB → ~7GB
- 70B model: ~140GB → ~35GB
**Quantization Types:**
- `nf4`: Normal Float 4 (recommended, better quality)
- `fp4`: Floating Point 4 (slightly more memory efficient)
**Compute Dtype:**
```python
# For better quality
bnb_4bit_compute_dtype=torch.float16
# For best performance on Ampere+ GPUs
bnb_4bit_compute_dtype=torch.bfloat16
```
**Double Quantization:**
```python
# Enable for additional ~0.4 bits/param savings
bnb_4bit_use_double_quant=True # Quantize the quantization constants
```
### Fine-tuning with QLoRA
```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
# Load quantized model
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
quantization_config=bnb_config,
device_map="auto",
)
# Prepare for training
model = prepare_model_for_kbit_training(model)
# Configure LoRA
lora_config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
# Train normally
trainer = Trainer(model=model, args=training_args, ...)
trainer.train()
```
## GPTQ
Post-training quantization requiring calibration, optimized for inference speed.
### Loading GPTQ Models
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
# Load pre-quantized GPTQ model
model = AutoModelForCausalLM.from_pretrained(
"TheBloke/Llama-2-7B-GPTQ", # Pre-quantized model
device_map="auto",
revision="gptq-4bit-32g-actorder_True", # Specific quantization config
)
# Or quantize yourself
gptq_config = GPTQConfig(
bits=4, # 2, 3, 4, 8 bits
dataset="c4", # Calibration dataset
tokenizer=tokenizer,
)
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
device_map="auto",
quantization_config=gptq_config,
)
# Save quantized model
model.save_pretrained("llama-2-7b-gptq")
```
**Configuration Options:**
```python
gptq_config = GPTQConfig(
bits=4, # Quantization bits
group_size=128, # Group size for quantization (128, 32, -1)
dataset="c4", # Calibration dataset
desc_act=False, # Activation order (can improve accuracy)
sym=True, # Symmetric quantization
damp_percent=0.1, # Dampening factor
)
```
**Characteristics:**
- Fastest inference among quantization methods
- Requires one-time calibration (slow)
- Best when using pre-quantized models from Hub
- Limited fine-tuning support
- Excellent for production deployment
## AWQ (Activation-aware Weight Quantization)
Activation-aware method that identifies the most salient weights (using activation statistics from calibration data) and protects them during quantization, preserving model quality.
### Loading AWQ Models
```python
from transformers import AutoModelForCausalLM, AwqConfig
# Load pre-quantized AWQ model
model = AutoModelForCausalLM.from_pretrained(
"TheBloke/Llama-2-7B-AWQ",
device_map="auto",
)
# Or quantize yourself
awq_config = AwqConfig(
bits=4, # 4-bit quantization
group_size=128, # Quantization group size
zero_point=True, # Use zero-point quantization
version="GEMM", # Quantization version
)
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
quantization_config=awq_config,
device_map="auto",
)
```
**Characteristics:**
- Better accuracy than GPTQ at same bit width
- Excellent inference speed
- Supports PEFT fine-tuning
- Requires calibration data
### Fine-tuning AWQ Models
```python
from peft import LoraConfig, get_peft_model
# AWQ models support LoRA fine-tuning
lora_config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
trainer = Trainer(model=model, ...)
trainer.train()
```
## GGUF (GGML Format)
CPU-optimized quantization format, popular in the llama.cpp ecosystem.
### Using GGUF Models
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load GGUF model
model = AutoModelForCausalLM.from_pretrained(
"TheBloke/Llama-2-7B-GGUF",
gguf_file="llama-2-7b.Q4_K_M.gguf", # Specific quantization file
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7B-GGUF")
```
**GGUF Quantization Types:**
- `Q4_0`: 4-bit, smallest, lowest quality
- `Q4_K_M`: 4-bit, medium quality (recommended)
- `Q5_K_M`: 5-bit, good quality
- `Q6_K`: 6-bit, high quality
- `Q8_0`: 8-bit, very high quality
**Characteristics:**
- Optimized for CPU inference
- Wide range of bit depths (1-8)
- Good for Apple Silicon (M1/M2)
- No fine-tuning support
- Excellent for local/edge deployment
## HQQ (Half-Quadratic Quantization)
Flexible quantization with good accuracy retention.
### Using HQQ
```python
from transformers import AutoModelForCausalLM, HqqConfig
hqq_config = HqqConfig(
nbits=4, # Quantization bits
group_size=64, # Group size
quant_zero=False, # Quantize zero point
quant_scale=False, # Quantize scale
axis=0, # Quantization axis
)
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
quantization_config=hqq_config,
device_map="auto",
)
```
**Characteristics:**
- Very fast quantization
- No calibration data needed
- Support for 1-8 bits
- Can serialize/deserialize
- Good accuracy vs size tradeoff
## Choosing a Quantization Method
### Decision Tree
**For inference only:**
1. Need fastest inference? → **GPTQ or AWQ** (use pre-quantized models)
2. CPU-only deployment? → **GGUF**
3. Want easiest setup? → **Bitsandbytes 8-bit**
4. Need extreme compression? → **GGUF Q4_0 or HQQ 2-bit**
**For fine-tuning:**
1. Limited VRAM? → **QLoRA (BnB 4-bit + LoRA)**
2. Want best accuracy? → **Bitsandbytes 8-bit + LoRA**
3. Need very large models? → **QLoRA with double quantization**
**For production:**
1. Latency-critical? → **GPTQ or AWQ**
2. Cost-optimized? → **Bitsandbytes 8-bit**
3. CPU deployment? → **GGUF**
## Memory Requirements
Approximate memory for Llama-2 7B model:
| Method | Memory | vs FP16 |
|--------|--------|---------|
| FP32 | 28GB | 2x |
| FP16 / BF16 | 14GB | 1x |
| 8-bit (BnB) | 7GB | 0.5x |
| 4-bit (QLoRA) | 3.5GB | 0.25x |
| 4-bit Double Quant | 3GB | 0.21x |
| GPTQ 4-bit | 4GB | 0.29x |
| AWQ 4-bit | 4GB | 0.29x |
**Note:** Add ~1-2GB for inference activations, KV cache, and framework overhead.
## Best Practices
### For Training
```python
# QLoRA recommended configuration
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16, # BF16 if available
bnb_4bit_use_double_quant=True,
)
# LoRA configuration
lora_config = LoraConfig(
r=16, # Rank (8, 16, 32, 64)
lora_alpha=32, # Scaling (typically 2*r)
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)
```
### For Inference
```python
# High-speed inference
model = AutoModelForCausalLM.from_pretrained(
"TheBloke/Llama-2-7B-GPTQ",
device_map="auto",
torch_dtype=torch.float16, # Use FP16 for activations
)
# Balanced quality/speed
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
load_in_8bit=True,
device_map="auto",
)
# Maximum compression
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
),
device_map="auto",
)
```
### Multi-GPU Setups
```python
# Automatically distribute across GPUs
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-70b-hf",
load_in_4bit=True,
device_map="auto", # Automatic distribution
max_memory={0: "20GB", 1: "20GB"}, # Optional: limit per GPU
)
# Manual device map
device_map = {
"model.embed_tokens": 0,
"model.layers.0": 0,
"model.layers.1": 0,
# ... distribute layers ...
"model.norm": 1,
"lm_head": 1,
}
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-70b-hf",
load_in_4bit=True,
device_map=device_map,
)
```
## Troubleshooting
**Issue: OOM during quantization**
```python
# Solution: Use low_cpu_mem_usage
model = AutoModelForCausalLM.from_pretrained(
"model-name",
quantization_config=config,
device_map="auto",
low_cpu_mem_usage=True, # Reduce CPU memory during loading
)
```
**Issue: Slow quantization**
```python
# GPTQ/AWQ take time to calibrate
# Solution: Use pre-quantized models from Hub
model = AutoModelForCausalLM.from_pretrained("TheBloke/Model-GPTQ")
# Or use BnB for instant quantization
model = AutoModelForCausalLM.from_pretrained("model-name", load_in_4bit=True)
```
**Issue: Poor quality after quantization**
```python
# Try different quantization types
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4", # Try "nf4" instead of "fp4"
bnb_4bit_compute_dtype=torch.bfloat16, # Use BF16 if available
)
# Or use 8-bit instead of 4-bit
model = AutoModelForCausalLM.from_pretrained("model-name", load_in_8bit=True)
```
**Issue: Can't fine-tune quantized model**
```python
# Ensure using compatible quantization method
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)
# Only BnB and AWQ support PEFT fine-tuning
# GPTQ has limited support, GGUF doesn't support fine-tuning
```
## Performance Benchmarks
Approximate generation speed (tokens/sec) for Llama-2 7B on A100 40GB:
| Method | Speed | Memory |
|--------|-------|--------|
| FP16 | 100 tok/s | 14GB |
| 8-bit | 90 tok/s | 7GB |
| 4-bit QLoRA | 70 tok/s | 4GB |
| GPTQ 4-bit | 95 tok/s | 4GB |
| AWQ 4-bit | 95 tok/s | 4GB |
**Note:** Actual performance varies by hardware, sequence length, and batch size.
## Resources
- **Pre-quantized models:** Search "GPTQ" or "AWQ" on Hugging Face Hub
- **BnB documentation:** https://github.com/TimDettmers/bitsandbytes
- **PEFT library:** https://github.com/huggingface/peft
- **QLoRA paper:** https://arxiv.org/abs/2305.14314
For task-specific quantization examples, see `training_guide.md`.

---
# Task-Specific Patterns
Quick reference for implementing common tasks with Transformers. Each pattern includes the complete workflow from data loading to inference.
## Text Classification
Classify text into predefined categories (sentiment, topic, intent, etc.).
```python
from transformers import (
AutoTokenizer, AutoModelForSequenceClassification,
TrainingArguments, Trainer, DataCollatorWithPadding
)
from datasets import load_dataset
# 1. Load data
dataset = load_dataset("imdb")
# 2. Preprocess
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def preprocess(examples):
return tokenizer(examples["text"], truncation=True, max_length=512)
tokenized = dataset.map(preprocess, batched=True)
# 3. Model
model = AutoModelForSequenceClassification.from_pretrained(
"bert-base-uncased",
num_labels=2,
id2label={0: "negative", 1: "positive"},
label2id={"negative": 0, "positive": 1}
)
# 4. Train
training_args = TrainingArguments(
output_dir="./results",
learning_rate=2e-5,
per_device_train_batch_size=16,
num_train_epochs=3,
eval_strategy="epoch",
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized["train"],
eval_dataset=tokenized["test"],
tokenizer=tokenizer,
data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
trainer.train()
# 5. Inference
text = "This movie was fantastic!"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
predictions = outputs.logits.argmax(-1)
print(model.config.id2label[predictions.item()]) # "positive"
```
## Token Classification (NER)
Label each token in text (named entities, POS tags, etc.).
```python
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset
# Load data (tokens and NER tags)
dataset = load_dataset("conll2003")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize_and_align_labels(examples):
tokenized_inputs = tokenizer(
examples["tokens"],
truncation=True,
is_split_into_words=True
)
labels = []
for i, label in enumerate(examples["ner_tags"]):
word_ids = tokenized_inputs.word_ids(batch_index=i)
label_ids = []
previous_word_idx = None
for word_idx in word_ids:
if word_idx is None:
label_ids.append(-100) # Special tokens
elif word_idx != previous_word_idx:
label_ids.append(label[word_idx])
else:
label_ids.append(-100) # Subword tokens
previous_word_idx = word_idx
labels.append(label_ids)
tokenized_inputs["labels"] = labels
return tokenized_inputs
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
# Model
label_list = dataset["train"].features["ner_tags"].feature.names
model = AutoModelForTokenClassification.from_pretrained(
"bert-base-cased",
num_labels=len(label_list),
id2label={i: label for i, label in enumerate(label_list)},
label2id={label: i for i, label in enumerate(label_list)}
)
# Training similar to classification
# ... (use Trainer with DataCollatorForTokenClassification)
```
## Question Answering (Extractive)
Extract answer spans from context.
```python
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
question = "What is the capital of France?"
context = "Paris is the capital and most populous city of France."
inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)
# Get answer span
answer_start = outputs.start_logits.argmax()
answer_end = outputs.end_logits.argmax() + 1
answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])
print(answer) # "Paris"
```
## Text Generation
Generate text continuations.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
prompt = "In the future, artificial intelligence will"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
**inputs,
max_new_tokens=100,
do_sample=True,
temperature=0.8,
top_p=0.95,
repetition_penalty=1.2,
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
```
## Summarization
Condense long text into summaries.
```python
from transformers import (
AutoTokenizer, AutoModelForSeq2SeqLM,
Seq2SeqTrainingArguments, Seq2SeqTrainer,
DataCollatorForSeq2Seq
)
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
def preprocess(examples):
inputs = ["summarize: " + doc for doc in examples["document"]]
model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
labels = tokenizer(
examples["summary"],
max_length=128,
truncation=True
)
model_inputs["labels"] = labels["input_ids"]
return model_inputs
tokenized_dataset = dataset.map(preprocess, batched=True)
# Training
training_args = Seq2SeqTrainingArguments(
output_dir="./results",
predict_with_generate=True, # Important for seq2seq
eval_strategy="epoch",
learning_rate=2e-5,
per_device_train_batch_size=8,
num_train_epochs=3,
)
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset["train"],
eval_dataset=tokenized_dataset["validation"],
tokenizer=tokenizer,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)
trainer.train()
# Inference
text = "Long article text here..."
inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
```
## Translation
Translate text between languages.
```python
from transformers import pipeline
translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
result = translator("Hello, how are you?")
print(result[0]["translation_text"]) # "Bonjour, comment allez-vous?"
# For fine-tuning, similar to summarization with Seq2SeqTrainer
```
## Image Classification
Classify images into categories.
```python
from transformers import (
AutoImageProcessor, AutoModelForImageClassification,
TrainingArguments, Trainer
)
from datasets import load_dataset
from PIL import Image
# Load data
dataset = load_dataset("food101", split="train[:1000]")
# Preprocess
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
def transform(examples):
examples["pixel_values"] = [
processor(img.convert("RGB"), return_tensors="pt")["pixel_values"][0]
for img in examples["image"]
]
return examples
dataset = dataset.with_transform(transform)
# Model
model = AutoModelForImageClassification.from_pretrained(
"google/vit-base-patch16-224",
num_labels=101,
ignore_mismatched_sizes=True
)
# Training
training_args = TrainingArguments(
output_dir="./results",
remove_unused_columns=False, # Keep image data
eval_strategy="epoch",
learning_rate=5e-5,
per_device_train_batch_size=32,
num_train_epochs=3,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=dataset,
tokenizer=processor,
)
trainer.train()
# Inference
image = Image.open("food.jpg")
inputs = processor(image, return_tensors="pt")
outputs = model(**inputs)
predicted_class = outputs.logits.argmax(-1).item()
print(model.config.id2label[predicted_class])
```
## Object Detection
Detect and localize objects in images.
```python
from transformers import pipeline
from PIL import Image
detector = pipeline("object-detection", model="facebook/detr-resnet-50")
image = Image.open("street.jpg")
results = detector(image)
for result in results:
print(f"{result['label']}: {result['score']:.2f} at {result['box']}")
# car: 0.98 at {'xmin': 123, 'ymin': 456, 'xmax': 789, 'ymax': 1011}
```
## Image Segmentation
Segment images into regions.
```python
from transformers import pipeline
segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")
image = "path/to/image.jpg"
segments = segmenter(image)
for segment in segments:
print(f"{segment['label']}: {segment['score']:.2f}")
# Access mask: segment['mask']
```
## Image Captioning
Generate textual descriptions of images.
```python
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
image = Image.open("photo.jpg")
inputs = processor(images=image, return_tensors="pt")
outputs = model.generate(**inputs, max_length=50)
caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]
print(caption) # "a dog sitting on grass"
```
## Speech Recognition (ASR)
Transcribe speech to text.
```python
from transformers import pipeline
transcriber = pipeline(
"automatic-speech-recognition",
model="openai/whisper-base"
)
result = transcriber("audio.mp3")
print(result["text"]) # "Hello, this is a test."
# With timestamps
result = transcriber("audio.mp3", return_timestamps=True)
for chunk in result["chunks"]:
print(f"[{chunk['timestamp'][0]:.1f}s - {chunk['timestamp'][1]:.1f}s]: {chunk['text']}")
```
## Text-to-Speech
Generate speech from text.
```python
from transformers import pipeline
synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")
result = synthesizer("Hello, how are you today?")
# result["audio"] contains the waveform
# result["sampling_rate"] contains the sample rate
# Save audio
import scipy
scipy.io.wavfile.write("output.wav", result["sampling_rate"], result["audio"][0])
```
## Visual Question Answering
Answer questions about images.
```python
from transformers import pipeline
from PIL import Image
vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
image = Image.open("photo.jpg")
question = "What color is the car?"
result = vqa(image=image, question=question)
print(result[0]["answer"]) # "red"
```
## Document Question Answering
Extract information from documents (PDFs, images with text).
```python
from transformers import pipeline
doc_qa = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
result = doc_qa(
image="invoice.png",
question="What is the total amount?"
)
print(result["answer"]) # "$1,234.56"
```
## Zero-Shot Classification
Classify without training data.
```python
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
text = "This is a delicious Italian restaurant with great pasta."
candidate_labels = ["food", "travel", "technology", "sports"]
result = classifier(text, candidate_labels)
print(result["labels"][0]) # "food"
print(result["scores"][0]) # 0.95
```
## Few-Shot Learning with LLMs
Use large language models for few-shot tasks.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Few-shot prompt
prompt = """
Classify the sentiment: positive, negative, or neutral.
Text: "I love this product!"
Sentiment: positive
Text: "This is terrible."
Sentiment: negative
Text: "It's okay, nothing special."
Sentiment: neutral
Text: "Best purchase ever!"
Sentiment:"""
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=5, temperature=0.1)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response.split("Sentiment:")[-1].strip()) # "positive"
```
## Instruction-Following / Chat
Use instruction-tuned models.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is machine learning?"},
]
formatted = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
inputs = tokenizer(formatted, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=200, temperature=0.7)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract assistant response
assistant_response = response.split("[/INST]")[-1].strip()
print(assistant_response)
```
## Embeddings / Semantic Search
Generate embeddings for semantic similarity.
```python
from transformers import AutoTokenizer, AutoModel
import torch
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
def get_embedding(text):
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
outputs = model(**inputs)
# Mean pooling
embeddings = outputs.last_hidden_state.mean(dim=1)
return embeddings
# Get embeddings
text1 = "Machine learning is a subset of AI"
text2 = "AI includes machine learning"
emb1 = get_embedding(text1)
emb2 = get_embedding(text2)
# Compute similarity
similarity = torch.nn.functional.cosine_similarity(emb1, emb2)
print(f"Similarity: {similarity.item():.4f}") # ~0.85
```
## Multimodal Understanding (CLIP)
Connect vision and language.
```python
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
image = Image.open("photo.jpg")
texts = ["a dog", "a cat", "a car", "a house"]
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
# Get similarity scores
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
for text, prob in zip(texts, probs[0]):
print(f"{text}: {prob.item():.4f}")
```
## Common Evaluation Metrics
```python
from datasets import load_metric
# Accuracy (classification)
metric = load_metric("accuracy")
predictions = [0, 1, 1, 0]
references = [0, 1, 0, 0]
result = metric.compute(predictions=predictions, references=references)
# F1 Score (classification, NER)
metric = load_metric("f1")
result = metric.compute(predictions=predictions, references=references)
# BLEU (translation)
metric = load_metric("bleu")
predictions = ["hello there general kenobi"]
references = [["hello there general kenobi", "hello there!"]]
result = metric.compute(predictions=predictions, references=references)
# ROUGE (summarization)
metric = load_metric("rouge")
predictions = ["summary text"]
references = ["reference summary"]
result = metric.compute(predictions=predictions, references=references)
```
## Common Data Collators
```python
from transformers import (
DataCollatorWithPadding,
DataCollatorForTokenClassification,
DataCollatorForSeq2Seq,
DataCollatorForLanguageModeling,
)
# Classification: dynamic padding
DataCollatorWithPadding(tokenizer=tokenizer)
# NER: pad labels too
DataCollatorForTokenClassification(tokenizer=tokenizer)
# Seq2Seq: pad inputs and labels
DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# Language modeling: create MLM masks
DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
```
This covers the most common task patterns. For detailed parameter tuning, see `api_reference.md` and `generation_strategies.md`.