# Common Task Patterns

This document provides common patterns and workflows for typical tasks using Transformers.

## Text Classification

### Binary or Multi-class Classification

```python
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import evaluate
import numpy as np

# Load dataset
dataset = load_dataset("imdb")

# Tokenize
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Load model
id2label = {0: "negative", 1: "positive"}
label2id = {"negative": 0, "positive": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

# Metrics
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Train
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy when load_best_model_at_end=True
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

# Inference (move inputs to the model's device in case training ran on GPU)
text = "This movie was fantastic!"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model(**inputs)
predictions = outputs.logits.argmax(-1)
print(id2label[predictions.item()])
```
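
Once training finishes, the fine-tuned model can also be served through the `pipeline` API, which bundles tokenization, device placement, and label mapping. A minimal sketch; the save directory `./results/best` is illustrative:

```python
from transformers import pipeline

# Save the fine-tuned model and tokenizer (directory name is illustrative)
trainer.save_model("./results/best")
tokenizer.save_pretrained("./results/best")

# pipeline handles tokenization, batching, and label mapping
classifier = pipeline("text-classification", model="./results/best")
print(classifier("This movie was fantastic!"))
# e.g. [{'label': 'positive', 'score': 0.99}]
```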

## Named Entity Recognition (Token Classification)

```python
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import load_dataset
import evaluate
import numpy as np

# Load dataset
dataset = load_dataset("conll2003")

# Tokenize (align labels with tokenized words)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Model (pass id2label/label2id so predictions decode to readable entity names)
label_list = dataset["train"].features["ner_tags"].feature.names
id2label = {i: name for i, name in enumerate(label_list)}
label2id = {name: i for i, name in enumerate(label_list)}

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Metrics
metric = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return metric.compute(predictions=true_predictions, references=true_labels)

# Train
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
```
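
This section stops at training; a minimal token-level inference sketch might look like the following, reusing the `tokenizer`, `model`, and `label_list` defined above (the example sentence is illustrative):

```python
import torch

text = "Hugging Face is based in New York City."
inputs = tokenizer(text, return_tensors="pt").to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

# One predicted label id per token; skip special tokens when printing
predicted_ids = logits.argmax(-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
for token, label_id in zip(tokens, predicted_ids):
    if token not in tokenizer.all_special_tokens:
        print(f"{token}\t{label_list[label_id]}")
```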

## Question Answering

```python
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    DefaultDataCollator
)
from datasets import load_dataset

# Load dataset
dataset = load_dataset("squad")

# Tokenize
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Find start and end token positions of the context
        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # If the answer is not fully inside the truncated context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

# Model
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Train
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=DefaultDataCollator(),
)

trainer.train()

# Inference (move inputs to the model's device in case training ran on GPU)
question = "What is the capital of France?"
context = "Paris is the capital and most populous city of France."
inputs = tokenizer(question, context, return_tensors="pt").to(model.device)
outputs = model(**inputs)

start_pos = outputs.start_logits.argmax()
end_pos = outputs.end_logits.argmax()
answer_tokens = inputs.input_ids[0][start_pos:end_pos+1]
answer = tokenizer.decode(answer_tokens)
```
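
The manual `argmax` decoding above can occasionally select an end position before the start; the question-answering `pipeline` handles that span post-processing. A minimal sketch reusing the in-memory `model` and `tokenizer`:

```python
from transformers import pipeline

qa = pipeline("question-answering", model=model, tokenizer=tokenizer)
result = qa(
    question="What is the capital of France?",
    context="Paris is the capital and most populous city of France.",
)
print(result)  # e.g. {'score': 0.98, 'start': 0, 'end': 5, 'answer': 'Paris'}
```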

## Text Summarization

```python
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset
import evaluate
import numpy as np

# Load dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Tokenize
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=128,
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Metrics
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 (ignored positions) with the pad token id before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    return {k: round(v, 4) for k, v in result.items()}

# Train (predict_with_generate requires the Seq2Seq variants of the
# training arguments and trainer)
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Inference (move inputs to the model's device in case training ran on GPU)
text = "Long article text..."
inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True).to(model.device)
outputs = model.generate(**inputs, max_length=128, num_beams=4)
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
```

## Translation

```python
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset

# Load dataset
dataset = load_dataset("wmt16", "de-en")

# Tokenize
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    # wmt16 stores each pair in a "translation" dict keyed by language code
    sources = [pair["de"] for pair in examples["translation"]]
    targets = [pair["en"] for pair in examples["translation"]]

    inputs = [f"translate German to English: {de}" for de in sources]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    labels = tokenizer(
        text_target=targets,
        max_length=128,
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Model and training (similar to summarization; see the sketch after this code block)
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Inference
text = "Guten Tag, wie geht es Ihnen?"
inputs = tokenizer(f"translate German to English: {text}", return_tensors="pt")
outputs = model.generate(**inputs, max_length=128)
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
```
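
The training step deferred to "similar to summarization" would look roughly like this, reusing the Seq2Seq classes from that section; the hyperparameters are illustrative:

```python
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Dynamic padding for encoder-decoder batches
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

trainer.train()
```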

## Causal Language Modeling (Training from Scratch or Fine-tuning)

```python
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# Load dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Tokenize
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Group texts into chunks
block_size = 128

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True)

# Model
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Train
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
)

trainer.train()
```
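
This section ends at training; a short, illustrative generation example for the fine-tuned GPT-2 might look like this (the prompt and sampling settings are arbitrary):

```python
# Generate text from the fine-tuned model (move inputs to its device)
prompt = "The history of natural language processing"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```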

## Image Classification

```python
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
import numpy as np
import evaluate

# Load dataset and carve out a held-out split ("train[:5000]" is a single split)
dataset = load_dataset("food101", split="train[:5000]")
dataset = dataset.train_test_split(test_size=0.2)

# Prepare image transforms
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = image_processor.size["height"]

transforms = Compose([
    Resize((size, size)),
    ToTensor(),
    normalize,
])

def preprocess_function(examples):
    examples["pixel_values"] = [transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]  # drop raw PIL images so the default collator only sees tensors
    return examples

dataset = dataset.with_transform(preprocess_function)

# Model
labels = dataset["train"].features["label"].names
model = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=len(labels),
    ignore_mismatched_sizes=True
)

# Metrics
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

# Train
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    remove_unused_columns=False,  # keep the "image" column so the on-the-fly transform can read it
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

trainer.train()
```
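
An inference pass with the fine-tuned classifier could look like the following sketch; the image path is illustrative, and the predicted index is mapped through the `labels` list defined above:

```python
from PIL import Image
import torch

image = Image.open("food.jpg").convert("RGB")  # illustrative path
inputs = image_processor(image, return_tensors="pt").to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

predicted_idx = logits.argmax(-1).item()
print(labels[predicted_idx])
```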

## Vision-Language Tasks (Image Captioning)

```python
from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
from PIL import Image

# Load dataset
dataset = load_dataset("ybelkada/football-dataset")

# Processor
processor = AutoProcessor.from_pretrained("microsoft/git-base")

def preprocess_function(examples):
    # Handle both already-decoded PIL images and file paths
    images = [
        img.convert("RGB") if isinstance(img, Image.Image) else Image.open(img).convert("RGB")
        for img in examples["image"]
    ]
    texts = examples["caption"]

    inputs = processor(images=images, text=texts, padding="max_length", truncation=True)
    inputs["labels"] = inputs["input_ids"]
    return inputs

dataset = dataset.map(preprocess_function, batched=True)

# Model
model = AutoModelForVision2Seq.from_pretrained("microsoft/git-base")

# Train
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

trainer.train()

# Inference (move inputs to the model's device in case training ran on GPU)
image = Image.open("image.jpg")
inputs = processor(images=image, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)
caption = processor.decode(outputs[0], skip_special_tokens=True)
```

## Best Practices Summary

1. **Use the appropriate Auto* classes**: AutoTokenizer, AutoModelFor*, AutoProcessor, etc. resolve the right architecture from a checkpoint name.
2. **Preprocess carefully**: tokenize inputs, align labels with sub-word tokens, and handle task-specific cases such as offset mapping for QA.
3. **Data collators**: use the task-appropriate collator for dynamic padding (e.g. DataCollatorForTokenClassification, DataCollatorForSeq2Seq).
4. **Metrics**: load and compute task-relevant metrics (accuracy, seqeval, ROUGE) during evaluation.
5. **Training arguments**: configure TrainingArguments for the task and available hardware (batch sizes, learning rate, eval/save strategy).
6. **Inference**: use pipeline() for quick inference (see the sketch below), or manual tokenization and decoding for custom needs.
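
As a concrete illustration of point 6, `pipeline()` needs only a task name and, optionally, a checkpoint; the checkpoint below is one public example:

```python
from transformers import pipeline

# Quick inference without writing any preprocessing code
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",  # example public checkpoint
)
print(classifier("Transformers makes fine-tuning straightforward."))
# e.g. [{'label': 'POSITIVE', 'score': 0.999}]
```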