Task-Specific Patterns

Quick reference for implementing common tasks with Transformers. Each pattern shows the core workflow, from data or model loading through inference.

Text Classification

Classify text into predefined categories (sentiment, topic, intent, etc.).

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from datasets import load_dataset

# 1. Load data
dataset = load_dataset("imdb")

# 2. Preprocess
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized = dataset.map(preprocess, batched=True)

# 3. Model
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label={0: "negative", 1: "positive"},
    label2id={"negative": 0, "positive": 1}
)

# 4. Train
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    eval_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

trainer.train()

# 5. Inference
text = "This movie was fantastic!"
inputs = tokenizer(text, return_tensors="pt").to(model.device)  # match the model's device after training
outputs = model(**inputs)
predictions = outputs.logits.argmax(-1)
print(model.config.id2label[predictions.item()])  # "positive"
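
The fine-tuned model and tokenizer can also be wrapped in a pipeline, which handles tokenization, device placement, and label mapping in one call:

from transformers import pipeline

clf = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(clf("This movie was fantastic!"))  # [{'label': 'positive', 'score': ...}]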

Token Classification (NER)

Label each token in text (named entities, POS tags, etc.).

from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset

# Load data (tokens and NER tags)
dataset = load_dataset("conll2003")

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Subword tokens
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Model
label_list = dataset["train"].features["ner_tags"].feature.names
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id={label: i for i, label in enumerate(label_list)}
)

# Training mirrors the classification recipe, with
# DataCollatorForTokenClassification handling label padding (sketch below)
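
A minimal training sketch under those conventions (output directory and hyperparameters are illustrative):

from transformers import (
    TrainingArguments, Trainer, DataCollatorForTokenClassification
)

training_args = TrainingArguments(
    output_dir="./ner-results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    eval_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
)

trainer.train()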

Question Answering (Extractive)

Extract answer spans from context.

from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

question = "What is the capital of France?"
context = "Paris is the capital and most populous city of France."

inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)

# Get answer span
answer_start = outputs.start_logits.argmax()
answer_end = outputs.end_logits.argmax() + 1
answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])
print(answer)  # "Paris"
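
Taking the raw argmax of start and end logits can produce invalid spans (e.g., an end before the start); the question-answering pipeline applies these constraints for you:

from transformers import pipeline

qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
result = qa(question=question, context=context)
print(result["answer"], result["score"])  # "Paris" plus a confidence score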

Text Generation

Generate text continuations.

from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

prompt = "In the future, artificial intelligence will"
inputs = tokenizer(prompt, return_tensors="pt")

outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.8,
    top_p=0.95,
    repetition_penalty=1.2,
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
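
For batched prompts, note that GPT-2 ships without a pad token; a common workaround is to reuse the EOS token and pad on the left so generation continues from the end of each prompt. A sketch:

# GPT-2 has no pad token; reuse EOS and left-pad so new tokens follow each prompt
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

prompts = ["The ocean is", "Quantum computers are"]
inputs = tokenizer(prompts, return_tensors="pt", padding=True)

outputs = model.generate(**inputs, max_new_tokens=30, pad_token_id=tokenizer.eos_token_id)
for seq in outputs:
    print(tokenizer.decode(seq, skip_special_tokens=True))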

Summarization

Condense long text into summaries.

from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments, Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset

# Load data (e.g. XSum, which has "document" and "summary" columns)
dataset = load_dataset("xsum")

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

def preprocess(examples):
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(
        examples["summary"],
        max_length=128,
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

# Training
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    predict_with_generate=True,  # Important for seq2seq
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)

trainer.train()

# Inference
text = "Long article text here..."
inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True).to(model.device)
outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

Translation

Translate text between languages.

from transformers import pipeline

translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
result = translator("Hello, how are you?")
print(result[0]["translation_text"])  # "Bonjour, comment allez-vous?"

# For fine-tuning, the workflow mirrors summarization with Seq2SeqTrainer; see the sketch below
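
A minimal preprocessing sketch for fine-tuning, assuming a dataset whose "translation" column holds {"en": ..., "fr": ...} pairs (the layout used by opus_books and the WMT datasets):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

def preprocess(examples):
    sources = [pair["en"] for pair in examples["translation"]]
    targets = [pair["fr"] for pair in examples["translation"]]
    # text_target tokenizes the labels with target-language settings
    return tokenizer(sources, text_target=targets, max_length=128, truncation=True)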

Image Classification

Classify images into categories.

from transformers import (
    AutoImageProcessor, AutoModelForImageClassification,
    TrainingArguments, Trainer
)
from datasets import load_dataset
from PIL import Image
import torch

# Load data
dataset = load_dataset("food101", split="train[:1000]")

# Preprocess
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

def transform(examples):
    examples["pixel_values"] = [
        processor(img.convert("RGB"), return_tensors="pt")["pixel_values"][0]
        for img in examples["image"]
    ]
    return examples

dataset = dataset.with_transform(transform)

# Batches mix PIL images and tensors, so collate explicitly
def collate_fn(batch):
    return {
        "pixel_values": torch.stack([item["pixel_values"] for item in batch]),
        "labels": torch.tensor([item["label"] for item in batch]),
    }

# Model
model = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=101,
    ignore_mismatched_sizes=True  # swap the 1000-class ImageNet head for 101 classes
)

# Training (no eval_strategy here since no eval_dataset is passed)
training_args = TrainingArguments(
    output_dir="./results",
    remove_unused_columns=False,  # keep the "image" column for the transform
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collate_fn,
)

trainer.train()

# Inference
image = Image.open("food.jpg")
inputs = processor(image, return_tensors="pt").to(model.device)
outputs = model(**inputs)
predicted_class = outputs.logits.argmax(-1).item()
print(model.config.id2label[predicted_class])

Object Detection

Detect and localize objects in images.

from transformers import pipeline
from PIL import Image

detector = pipeline("object-detection", model="facebook/detr-resnet-50")

image = Image.open("street.jpg")
results = detector(image)

for result in results:
    print(f"{result['label']}: {result['score']:.2f} at {result['box']}")
    # car: 0.98 at {'xmin': 123, 'ymin': 456, 'xmax': 789, 'ymax': 1011}

Image Segmentation

Segment images into regions.

from transformers import pipeline

segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")

image = "path/to/image.jpg"
segments = segmenter(image)

for segment in segments:
    print(f"{segment['label']}: {segment['score']:.2f}")
    # Access mask: segment['mask']

Image Captioning

Generate textual descriptions of images.

from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image

processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")

image = Image.open("photo.jpg")
inputs = processor(images=image, return_tensors="pt")

outputs = model.generate(**inputs, max_length=50)
caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]
print(caption)  # "a dog sitting on grass"

Speech Recognition (ASR)

Transcribe speech to text.

from transformers import pipeline

transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base"
)

result = transcriber("audio.mp3")
print(result["text"])  # "Hello, this is a test."

# With timestamps
result = transcriber("audio.mp3", return_timestamps=True)
for chunk in result["chunks"]:
    print(f"[{chunk['timestamp'][0]:.1f}s - {chunk['timestamp'][1]:.1f}s]: {chunk['text']}")

Text-to-Speech

Generate speech from text.

from transformers import pipeline
from datasets import load_dataset
import torch

synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")

# SpeechT5 conditions on a speaker embedding (x-vector)
embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)

result = synthesizer(
    "Hello, how are you today?",
    forward_params={"speaker_embeddings": speaker_embedding},
)
# result["audio"] contains the waveform
# result["sampling_rate"] contains the sample rate

# Save audio
import scipy.io.wavfile
scipy.io.wavfile.write("output.wav", result["sampling_rate"], result["audio"])

Visual Question Answering

Answer questions about images.

from transformers import pipeline
from PIL import Image

vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")

image = Image.open("photo.jpg")
question = "What color is the car?"

result = vqa(image=image, question=question)
print(result[0]["answer"])  # "red"

Document Question Answering

Extract information from documents (PDFs, images with text).

from transformers import pipeline

doc_qa = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
# OCR on image inputs requires pytesseract and the Tesseract binary

result = doc_qa(
    image="invoice.png",
    question="What is the total amount?"
)

print(result["answer"])  # "$1,234.56"

Zero-Shot Classification

Classify without training data.

from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

text = "This is a delicious Italian restaurant with great pasta."
candidate_labels = ["food", "travel", "technology", "sports"]

result = classifier(text, candidate_labels)
print(result["labels"][0])  # "food"
print(result["scores"][0])  # 0.95

Few-Shot Learning with LLMs

Use large language models for few-shot tasks.

from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Few-shot prompt
prompt = """
Classify the sentiment: positive, negative, or neutral.

Text: "I love this product!"
Sentiment: positive

Text: "This is terrible."
Sentiment: negative

Text: "It's okay, nothing special."
Sentiment: neutral

Text: "Best purchase ever!"
Sentiment:"""

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=5, do_sample=True, temperature=0.1)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response.split("Sentiment:")[-1].strip())  # "positive"

Instruction-Following / Chat

Use instruction-tuned models.

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is machine learning?"},
]

formatted = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

inputs = tokenizer(formatted, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract assistant response
assistant_response = response.split("[/INST]")[-1].strip()
print(assistant_response)
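
The "[/INST]" split is specific to the Llama-2 chat template; a model-agnostic alternative is to decode only the tokens generated after the prompt:

# Slice off the prompt tokens and decode just the completion
new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))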

Sentence Embeddings

Generate embeddings for semantic similarity.

from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Mean pooling
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Get embeddings
text1 = "Machine learning is a subset of AI"
text2 = "AI includes machine learning"

emb1 = get_embedding(text1)
emb2 = get_embedding(text2)

# Compute similarity
similarity = torch.nn.functional.cosine_similarity(emb1, emb2)
print(f"Similarity: {similarity.item():.4f}")  # ~0.85

Multimodal Understanding (CLIP)

Connect vision and language.

from transformers import CLIPProcessor, CLIPModel
from PIL import Image

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("photo.jpg")
texts = ["a dog", "a cat", "a car", "a house"]

inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# Get similarity scores
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)

for text, prob in zip(texts, probs[0]):
    print(f"{text}: {prob.item():.4f}")

Common Evaluation Metrics

import evaluate  # load_metric was removed from datasets; use the evaluate library

# Accuracy (classification)
metric = evaluate.load("accuracy")
predictions = [0, 1, 1, 0]
references = [0, 1, 0, 0]
result = metric.compute(predictions=predictions, references=references)

# F1 Score (classification, NER)
metric = evaluate.load("f1")
result = metric.compute(predictions=predictions, references=references)

# BLEU (translation)
metric = evaluate.load("bleu")
predictions = ["hello there general kenobi"]
references = [["hello there general kenobi", "hello there!"]]
result = metric.compute(predictions=predictions, references=references)

# ROUGE (summarization)
metric = evaluate.load("rouge")
predictions = ["summary text"]
references = ["reference summary"]
result = metric.compute(predictions=predictions, references=references)
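
Metrics usually plug into training through Trainer's compute_metrics hook; a minimal sketch for classification:

import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

# Pass compute_metrics=compute_metrics to Trainer to report accuracy at each eval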

Common Data Collators

from transformers import (
    DataCollatorWithPadding,
    DataCollatorForTokenClassification,
    DataCollatorForSeq2Seq,
    DataCollatorForLanguageModeling,
)

# Classification: dynamic padding
DataCollatorWithPadding(tokenizer=tokenizer)

# NER: pad labels too
DataCollatorForTokenClassification(tokenizer=tokenizer)

# Seq2Seq: pad inputs and labels
DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Language modeling: create MLM masks
DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
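
Collators are plain callables, so their padding behavior is easy to inspect directly; for example, with a tokenizer loaded as in the sections above:

# Two examples of different lengths are padded to a common shape
features = [tokenizer("short text"), tokenizer("a noticeably longer piece of text")]
batch = DataCollatorWithPadding(tokenizer=tokenizer)(features)
print(batch["input_ids"].shape)  # (2, length of the longest example)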

This covers the most common task patterns. For detailed parameter tuning, see api_reference.md and generation_strategies.md.