# Task-Specific Patterns
Quick reference for implementing common tasks with Transformers. Each pattern includes the complete workflow from data loading to inference.
Text Classification
Classify text into predefined categories (sentiment, topic, intent, etc.).
# --- Text classification: fine-tune BERT on IMDB sentiment ---
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from datasets import load_dataset

# 1. Load data (IMDB: binary sentiment with "train"/"test" splits)
dataset = load_dataset("imdb")

# 2. Preprocess: tokenize without padding here; the collator pads per batch
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess(examples):
    # Truncate to BERT's 512-token maximum; padding is deferred to the collator
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized = dataset.map(preprocess, batched=True)

# 3. Model: classification head with explicit label <-> id mappings so
#    predictions can be decoded by name at inference time
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label={0: "negative", 1: "positive"},
    label2id={"negative": 0, "positive": 1}
)

# 4. Train
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    eval_strategy="epoch",  # run evaluation once per epoch
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),  # dynamic per-batch padding
)
trainer.train()

# 5. Inference: argmax over logits, then map the class id back to its name
text = "This movie was fantastic!"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
predictions = outputs.logits.argmax(-1)
print(model.config.id2label[predictions.item()])  # "positive"
Token Classification (NER)
Label each token in text (named entities, POS tags, etc.).
# --- Token classification (NER): align word-level labels to subword tokens ---
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset

# Load data (pre-split word tokens plus per-word NER tag ids)
dataset = load_dataset("conll2003")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align NER labels to subword tokens.

    Special tokens and non-first subwords receive label -100 so that
    the loss function ignores those positions.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True  # inputs are already word lists, not raw text
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # word_ids maps each subword token back to its source word index
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens ([CLS], [SEP], padding)
            elif word_idx != previous_word_idx:
                # First subword of each word carries that word's label
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Subword tokens: ignored by the loss
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Model: one output class per tag name in the dataset's label list
label_list = dataset["train"].features["ner_tags"].feature.names
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id={label: i for i, label in enumerate(label_list)}
)
# Training similar to classification
# ... (use Trainer with DataCollatorForTokenClassification)
Question Answering (Extractive)
Extract answer spans from context.
# --- Extractive QA: predict start/end token positions of the answer span ---
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

question = "What is the capital of France?"
context = "Paris is the capital and most populous city of France."
# Question and context are encoded together as a single sequence pair
inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)

# Get answer span
# NOTE(review): start and end are argmax'd independently, so on hard inputs
# end can land before start; production code should search valid (start, end) pairs.
answer_start = outputs.start_logits.argmax()
answer_end = outputs.end_logits.argmax() + 1  # +1 because the slice end is exclusive
answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])
print(answer)  # "Paris"
Text Generation
Generate text continuations.
# --- Open-ended text generation with a causal LM (GPT-2) ---
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

prompt = "In the future, artificial intelligence will"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=100,      # generate up to 100 tokens beyond the prompt
    do_sample=True,          # sample instead of greedy decoding
    temperature=0.8,         # <1.0 sharpens the next-token distribution
    top_p=0.95,              # nucleus sampling cutoff
    repetition_penalty=1.2,  # discourage verbatim repetition loops
)
# The output tensor includes the prompt tokens; decode drops special tokens
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
Summarization
Condense long text into summaries.
# --- Summarization: fine-tune T5 with the seq2seq Trainer ---
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments, Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Load data: XSum provides the "document" / "summary" columns used below.
# (FIX: the original snippet referenced `dataset` without ever defining it.)
dataset = load_dataset("xsum")

def preprocess(examples):
    # T5 is a multi-task model: the "summarize: " prefix selects the task
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(
        examples["summary"],
        max_length=128,
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

# Training
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    predict_with_generate=True,  # Important for seq2seq: eval uses generate()
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    # Passing model= lets the collator build decoder_input_ids from the labels
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
)
trainer.train()

# Inference
text = "Long article text here..."
inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
Translation
Translate text between languages.
# --- Translation via the high-level pipeline API ---
from transformers import pipeline

# The task string encodes the language pair; the model is a MarianMT checkpoint
translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
result = translator("Hello, how are you?")
print(result[0]["translation_text"])  # "Bonjour, comment allez-vous?"
# For fine-tuning, similar to summarization with Seq2SeqTrainer
Image Classification
Classify images into categories.
# --- Image classification: fine-tune ViT on a Food-101 subset ---
from transformers import (
    AutoImageProcessor, AutoModelForImageClassification,
    TrainingArguments, Trainer
)
from datasets import load_dataset
from PIL import Image
import torch

# Load a small subset and carve out an eval split.
# (FIX: the original set eval_strategy="epoch" but supplied no eval_dataset,
# so Trainer fails at the first evaluation.)
dataset = load_dataset("food101", split="train[:1000]")
splits = dataset.train_test_split(test_size=0.1)

# Preprocess lazily at access time via with_transform
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

def transform(examples):
    # Convert each PIL image to the model's normalized pixel tensor
    examples["pixel_values"] = [
        processor(img.convert("RGB"), return_tensors="pt")["pixel_values"][0]
        for img in examples["image"]
    ]
    return examples

train_ds = splits["train"].with_transform(transform)
eval_ds = splits["test"].with_transform(transform)

def collate_fn(batch):
    # Stack tensors ourselves: the default collator cannot batch the raw
    # PIL "image" column kept alive by remove_unused_columns=False
    return {
        "pixel_values": torch.stack([ex["pixel_values"] for ex in batch]),
        "labels": torch.tensor([ex["label"] for ex in batch]),
    }

# Model: swap the 1000-class ImageNet head for a 101-class one
model = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=101,
    ignore_mismatched_sizes=True  # head size differs from the checkpoint
)

# Training
training_args = TrainingArguments(
    output_dir="./results",
    remove_unused_columns=False,  # Keep image data for the on-the-fly transform
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    num_train_epochs=3,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=collate_fn,
    tokenizer=processor,
)
trainer.train()

# Inference
image = Image.open("food.jpg")
inputs = processor(image, return_tensors="pt")
outputs = model(**inputs)
predicted_class = outputs.logits.argmax(-1).item()
print(model.config.id2label[predicted_class])
Object Detection
Detect and localize objects in images.
# --- Object detection: DETR via pipeline; returns labeled bounding boxes ---
from transformers import pipeline
from PIL import Image

detector = pipeline("object-detection", model="facebook/detr-resnet-50")
image = Image.open("street.jpg")
results = detector(image)
for result in results:
    # each result has a label, a confidence score, and pixel box coordinates
    print(f"{result['label']}: {result['score']:.2f} at {result['box']}")
# car: 0.98 at {'xmin': 123, 'ymin': 456, 'xmax': 789, 'ymax': 1011}
Image Segmentation
Segment images into regions.
# --- Image segmentation: panoptic DETR via pipeline ---
from transformers import pipeline

segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")
image = "path/to/image.jpg"  # pipelines accept file paths as well as PIL images
segments = segmenter(image)
for segment in segments:
    print(f"{segment['label']}: {segment['score']:.2f}")
# Access mask: segment['mask']
Image Captioning
Generate textual descriptions of images.
# --- Image captioning with GIT (image -> text via a causal LM) ---
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image

processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")

image = Image.open("photo.jpg")
inputs = processor(images=image, return_tensors="pt")
outputs = model.generate(**inputs, max_length=50)
# batch_decode returns one caption per input; take the first
caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]
print(caption)  # "a dog sitting on grass"
Speech Recognition (ASR)
Transcribe speech to text.
# --- Speech recognition (ASR) with Whisper ---
from transformers import pipeline

transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base"
)
result = transcriber("audio.mp3")
print(result["text"])  # "Hello, this is a test."

# With timestamps: each chunk carries a (start, end) pair in seconds
result = transcriber("audio.mp3", return_timestamps=True)
for chunk in result["chunks"]:
    print(f"[{chunk['timestamp'][0]:.1f}s - {chunk['timestamp'][1]:.1f}s]: {chunk['text']}")
Text-to-Speech
Generate speech from text.
# --- Text-to-speech with SpeechT5 ---
from transformers import pipeline

synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")
result = synthesizer("Hello, how are you today?")
# result["audio"] contains the waveform
# result["sampling_rate"] contains the sample rate

# Save audio
# FIX: import the submodule explicitly — a bare `import scipy` does not
# guarantee that the `scipy.io.wavfile` submodule is loaded.
import scipy.io.wavfile
scipy.io.wavfile.write("output.wav", result["sampling_rate"], result["audio"][0])
Visual Question Answering
Answer questions about images.
# --- Visual question answering with ViLT ---
from transformers import pipeline
from PIL import Image

vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
image = Image.open("photo.jpg")
question = "What color is the car?"
# Returns a list of candidate answers with scores, best first
result = vqa(image=image, question=question)
print(result[0]["answer"])  # "red"
Document Question Answering
Extract information from documents (PDFs, images with text).
# --- Document QA: LayoutLM reads text + layout from a document image ---
from transformers import pipeline

doc_qa = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
result = doc_qa(
    image="invoice.png",
    question="What is the total amount?"
)
# FIX: the pipeline returns a list of answer dicts (same shape as the VQA
# pipeline above), so index the top candidate before reading "answer".
print(result[0]["answer"])  # "$1,234.56"
Zero-Shot Classification
Classify without training data.
# --- Zero-shot classification: an NLI model scores arbitrary labels ---
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
text = "This is a delicious Italian restaurant with great pasta."
candidate_labels = ["food", "travel", "technology", "sports"]
result = classifier(text, candidate_labels)
# labels/scores come back sorted by score, highest first
print(result["labels"][0])  # "food"
print(result["scores"][0])  # 0.95
Few-Shot Learning with LLMs
Use large language models for few-shot tasks.
# --- Few-shot classification by prompting a causal LLM ---
from transformers import AutoModelForCausalLM, AutoTokenizer

# NOTE: Llama-2 checkpoints are gated on the Hub; accept the license first
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Few-shot prompt: a handful of labeled examples, then the query to complete
prompt = """
Classify the sentiment: positive, negative, or neutral.
Text: "I love this product!"
Sentiment: positive
Text: "This is terrible."
Sentiment: negative
Text: "It's okay, nothing special."
Sentiment: neutral
Text: "Best purchase ever!"
Sentiment:"""

inputs = tokenizer(prompt, return_tensors="pt")
# FIX: temperature only takes effect when sampling is enabled; without
# do_sample=True, generate() decodes greedily and ignores it (with a warning).
outputs = model.generate(**inputs, max_new_tokens=5, do_sample=True, temperature=0.1)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Take only the completion after the final "Sentiment:" cue
print(response.split("Sentiment:")[-1].strip())  # "positive"
Instruction-Following / Chat
Use instruction-tuned models.
# --- Chat with an instruction-tuned model via the chat template ---
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is machine learning?"},
]
# apply_chat_template renders the model-specific prompt format;
# add_generation_prompt appends the cue that starts the assistant turn
formatted = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
inputs = tokenizer(formatted, return_tensors="pt")
# FIX: enable sampling so temperature takes effect (ignored under greedy decoding)
outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
# Extract assistant response by slicing off the prompt tokens.
# (FIX: more robust than splitting on the Llama-specific "[/INST]" marker.)
prompt_length = inputs["input_ids"].shape[-1]
assistant_response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
print(assistant_response)
Embeddings / Semantic Search
Generate embeddings for semantic similarity.
# --- Sentence embeddings + cosine similarity ---
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def get_embedding(text):
    """Return a (batch, hidden) tensor of mean-pooled sentence embeddings."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Attention-mask-weighted mean pooling.
    # (FIX: a plain .mean(dim=1) also averages over padding positions, which
    # skews embeddings when a batch mixes texts of different lengths.)
    mask = inputs["attention_mask"].unsqueeze(-1).to(outputs.last_hidden_state.dtype)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
    return summed / counts

# Get embeddings
text1 = "Machine learning is a subset of AI"
text2 = "AI includes machine learning"
emb1 = get_embedding(text1)
emb2 = get_embedding(text2)

# Compute similarity
similarity = torch.nn.functional.cosine_similarity(emb1, emb2)
print(f"Similarity: {similarity.item():.4f}")  # ~0.85
Multimodal Understanding (CLIP)
Connect vision and language.
# --- CLIP: score image-text similarity in a shared embedding space ---
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("photo.jpg")
texts = ["a dog", "a cat", "a car", "a house"]
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# Get similarity scores: one logit per (image, text) pair; softmax over the texts
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
for text, prob in zip(texts, probs[0]):
    print(f"{text}: {prob.item():.4f}")
Common Evaluation Metrics
# --- Common evaluation metrics ---
# NOTE: datasets.load_metric is deprecated (removed in recent `datasets`
# releases); the same metrics live in the `evaluate` library
# (evaluate.load("accuracy"), ...) with an identical compute() API.
from datasets import load_metric

# Accuracy (classification)
metric = load_metric("accuracy")
predictions = [0, 1, 1, 0]
references = [0, 1, 0, 0]
result = metric.compute(predictions=predictions, references=references)

# F1 Score (classification, NER)
metric = load_metric("f1")
result = metric.compute(predictions=predictions, references=references)

# BLEU (translation)
# FIX: the "bleu" metric expects tokenized input — each prediction is a list
# of tokens and each entry in references is a list of token lists (one per
# acceptable reference translation), not raw strings.
metric = load_metric("bleu")
predictions = [["hello", "there", "general", "kenobi"]]
references = [[["hello", "there", "general", "kenobi"], ["hello", "there", "!"]]]
result = metric.compute(predictions=predictions, references=references)

# ROUGE (summarization)
metric = load_metric("rouge")
predictions = ["summary text"]
references = ["reference summary"]
result = metric.compute(predictions=predictions, references=references)
Common Data Collators
# --- Choosing a data collator per task ---
from transformers import (
    DataCollatorWithPadding,
    DataCollatorForTokenClassification,
    DataCollatorForSeq2Seq,
    DataCollatorForLanguageModeling,
)

# Classification: dynamic padding to the longest sequence in each batch
DataCollatorWithPadding(tokenizer=tokenizer)
# NER: pad labels too
DataCollatorForTokenClassification(tokenizer=tokenizer)
# Seq2Seq: pad inputs and labels; model= lets it prepare decoder inputs
DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# Language modeling: create MLM masks (15% of tokens masked)
DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
This covers the most common task patterns. For detailed parameter tuning, see api_reference.md and generation_strategies.md.