mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-28 07:33:45 +08:00
611 lines
16 KiB
Markdown
# Task-Specific Patterns
|
|
|
|
Quick reference for implementing common tasks with Transformers. Each pattern includes the complete workflow from data loading to inference.
|
|
|
|
## Text Classification
|
|
|
|
Classify text into predefined categories (sentiment, topic, intent, etc.).
|
|
|
|
```python
|
|
from transformers import (
|
|
AutoTokenizer, AutoModelForSequenceClassification,
|
|
TrainingArguments, Trainer, DataCollatorWithPadding
|
|
)
|
|
from datasets import load_dataset
|
|
|
|
# 1. Load data
|
|
dataset = load_dataset("imdb")
|
|
|
|
# 2. Preprocess
|
|
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
|
|
|
def preprocess(examples):
|
|
return tokenizer(examples["text"], truncation=True, max_length=512)
|
|
|
|
tokenized = dataset.map(preprocess, batched=True)
|
|
|
|
# 3. Model
|
|
model = AutoModelForSequenceClassification.from_pretrained(
|
|
"bert-base-uncased",
|
|
num_labels=2,
|
|
id2label={0: "negative", 1: "positive"},
|
|
label2id={"negative": 0, "positive": 1}
|
|
)
|
|
|
|
# 4. Train
|
|
training_args = TrainingArguments(
|
|
output_dir="./results",
|
|
learning_rate=2e-5,
|
|
per_device_train_batch_size=16,
|
|
num_train_epochs=3,
|
|
eval_strategy="epoch",
|
|
)
|
|
|
|
trainer = Trainer(
|
|
model=model,
|
|
args=training_args,
|
|
train_dataset=tokenized["train"],
|
|
eval_dataset=tokenized["test"],
|
|
tokenizer=tokenizer,
|
|
data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
|
|
)
|
|
|
|
trainer.train()
|
|
|
|
# 5. Inference
|
|
text = "This movie was fantastic!"
|
|
inputs = tokenizer(text, return_tensors="pt")
|
|
outputs = model(**inputs)
|
|
predictions = outputs.logits.argmax(-1)
|
|
print(model.config.id2label[predictions.item()]) # "positive"
|
|
```
|
|
|
|
## Token Classification (NER)
|
|
|
|
Label each token in text (named entities, POS tags, etc.).
|
|
|
|
```python
|
|
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
|
from datasets import load_dataset
|
|
|
|
# Load data (tokens and NER tags)
|
|
dataset = load_dataset("conll2003")
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
|
|
|
def tokenize_and_align_labels(examples):
|
|
tokenized_inputs = tokenizer(
|
|
examples["tokens"],
|
|
truncation=True,
|
|
is_split_into_words=True
|
|
)
|
|
|
|
labels = []
|
|
for i, label in enumerate(examples["ner_tags"]):
|
|
word_ids = tokenized_inputs.word_ids(batch_index=i)
|
|
label_ids = []
|
|
previous_word_idx = None
|
|
for word_idx in word_ids:
|
|
if word_idx is None:
|
|
label_ids.append(-100) # Special tokens
|
|
elif word_idx != previous_word_idx:
|
|
label_ids.append(label[word_idx])
|
|
else:
|
|
label_ids.append(-100) # Subword tokens
|
|
previous_word_idx = word_idx
|
|
labels.append(label_ids)
|
|
|
|
tokenized_inputs["labels"] = labels
|
|
return tokenized_inputs
|
|
|
|
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
|
|
|
|
# Model
|
|
label_list = dataset["train"].features["ner_tags"].feature.names
|
|
model = AutoModelForTokenClassification.from_pretrained(
|
|
"bert-base-cased",
|
|
num_labels=len(label_list),
|
|
id2label={i: label for i, label in enumerate(label_list)},
|
|
label2id={label: i for i, label in enumerate(label_list)}
|
|
)
|
|
|
|
# Training similar to classification
|
|
# ... (use Trainer with DataCollatorForTokenClassification)
|
|
```
|
|
|
|
## Question Answering (Extractive)
|
|
|
|
Extract answer spans from context.
|
|
|
|
```python
|
|
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
|
|
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
|
|
|
|
question = "What is the capital of France?"
|
|
context = "Paris is the capital and most populous city of France."
|
|
|
|
inputs = tokenizer(question, context, return_tensors="pt")
|
|
outputs = model(**inputs)
|
|
|
|
# Get answer span
|
|
answer_start = outputs.start_logits.argmax()
|
|
answer_end = outputs.end_logits.argmax() + 1
|
|
answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])
|
|
print(answer) # "Paris"
|
|
```
|
|
|
|
## Text Generation
|
|
|
|
Generate text continuations.
|
|
|
|
```python
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
|
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
|
|
|
prompt = "In the future, artificial intelligence will"
|
|
inputs = tokenizer(prompt, return_tensors="pt")
|
|
|
|
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.8,
    top_p=0.95,
    repetition_penalty=1.2,
    # GPT-2 defines no pad token; set it explicitly to EOS to avoid the
    # "Setting `pad_token_id` to `eos_token_id`" warning at generation time.
    pad_token_id=tokenizer.eos_token_id,
)
|
|
|
|
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
|
print(generated_text)
|
|
```
|
|
|
|
## Summarization
|
|
|
|
Condense long text into summaries.
|
|
|
|
```python
|
|
from transformers import (
|
|
AutoTokenizer, AutoModelForSeq2SeqLM,
|
|
Seq2SeqTrainingArguments, Seq2SeqTrainer,
|
|
DataCollatorForSeq2Seq
|
|
)
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("t5-small")
|
|
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
|
|
|
|
def preprocess(examples):
    """Tokenize documents (with the T5 task prefix) and summaries into model inputs."""
    # T5 is a text-to-text model and expects a task prefix on the input.
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenize targets via text_target= so models whose target-side
    # tokenization differs from the source side (e.g. multilingual
    # seq2seq models) are handled correctly; equivalent for T5.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
|
|
|
|
tokenized_dataset = dataset.map(preprocess, batched=True)
|
|
|
|
# Training
|
|
training_args = Seq2SeqTrainingArguments(
|
|
output_dir="./results",
|
|
predict_with_generate=True, # Important for seq2seq
|
|
eval_strategy="epoch",
|
|
learning_rate=2e-5,
|
|
per_device_train_batch_size=8,
|
|
num_train_epochs=3,
|
|
)
|
|
|
|
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    # Pass model= so the collator can prepare decoder_input_ids from the
    # labels (required for correct seq2seq training).
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
)
|
|
|
|
trainer.train()
|
|
|
|
# Inference
|
|
text = "Long article text here..."
|
|
inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
|
|
outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
|
|
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
|
```
|
|
|
|
## Translation
|
|
|
|
Translate text between languages.
|
|
|
|
```python
|
|
from transformers import pipeline
|
|
|
|
translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
|
|
result = translator("Hello, how are you?")
|
|
print(result[0]["translation_text"]) # "Bonjour, comment allez-vous?"
|
|
|
|
# For fine-tuning, similar to summarization with Seq2SeqTrainer
|
|
```
|
|
|
|
## Image Classification
|
|
|
|
Classify images into categories.
|
|
|
|
```python
|
|
from transformers import (
|
|
AutoImageProcessor, AutoModelForImageClassification,
|
|
TrainingArguments, Trainer
|
|
)
|
|
from datasets import load_dataset
|
|
from PIL import Image
|
|
|
|
# Load data
|
|
dataset = load_dataset("food101", split="train[:1000]")
|
|
|
|
# Preprocess
|
|
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
|
|
|
|
def transform(examples):
|
|
examples["pixel_values"] = [
|
|
processor(img.convert("RGB"), return_tensors="pt")["pixel_values"][0]
|
|
for img in examples["image"]
|
|
]
|
|
return examples
|
|
|
|
dataset = dataset.with_transform(transform)
|
|
|
|
# Model
|
|
model = AutoModelForImageClassification.from_pretrained(
|
|
"google/vit-base-patch16-224",
|
|
num_labels=101,
|
|
ignore_mismatched_sizes=True
|
|
)
|
|
|
|
# Training
|
|
training_args = TrainingArguments(
|
|
output_dir="./results",
|
|
remove_unused_columns=False, # Keep image data
|
|
eval_strategy="epoch",
|
|
learning_rate=5e-5,
|
|
per_device_train_batch_size=32,
|
|
num_train_epochs=3,
|
|
)
|
|
|
|
trainer = Trainer(
|
|
model=model,
|
|
args=training_args,
|
|
train_dataset=dataset,
|
|
tokenizer=processor,
|
|
)
|
|
|
|
trainer.train()
|
|
|
|
# Inference
|
|
image = Image.open("food.jpg")
|
|
inputs = processor(image, return_tensors="pt")
|
|
outputs = model(**inputs)
|
|
predicted_class = outputs.logits.argmax(-1).item()
|
|
print(model.config.id2label[predicted_class])
|
|
```
|
|
|
|
## Object Detection
|
|
|
|
Detect and localize objects in images.
|
|
|
|
```python
|
|
from transformers import pipeline
|
|
from PIL import Image
|
|
|
|
detector = pipeline("object-detection", model="facebook/detr-resnet-50")
|
|
|
|
image = Image.open("street.jpg")
|
|
results = detector(image)
|
|
|
|
for result in results:
|
|
print(f"{result['label']}: {result['score']:.2f} at {result['box']}")
|
|
# car: 0.98 at {'xmin': 123, 'ymin': 456, 'xmax': 789, 'ymax': 1011}
|
|
```
|
|
|
|
## Image Segmentation
|
|
|
|
Segment images into regions.
|
|
|
|
```python
|
|
from transformers import pipeline
|
|
|
|
segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")
|
|
|
|
image = "path/to/image.jpg"
|
|
segments = segmenter(image)
|
|
|
|
for segment in segments:
|
|
print(f"{segment['label']}: {segment['score']:.2f}")
|
|
# Access mask: segment['mask']
|
|
```
|
|
|
|
## Image Captioning
|
|
|
|
Generate textual descriptions of images.
|
|
|
|
```python
|
|
from transformers import AutoProcessor, AutoModelForCausalLM
|
|
from PIL import Image
|
|
|
|
processor = AutoProcessor.from_pretrained("microsoft/git-base")
|
|
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
|
|
|
|
image = Image.open("photo.jpg")
|
|
inputs = processor(images=image, return_tensors="pt")
|
|
|
|
outputs = model.generate(**inputs, max_length=50)
|
|
caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]
|
|
print(caption) # "a dog sitting on grass"
|
|
```
|
|
|
|
## Speech Recognition (ASR)
|
|
|
|
Transcribe speech to text.
|
|
|
|
```python
|
|
from transformers import pipeline
|
|
|
|
transcriber = pipeline(
|
|
"automatic-speech-recognition",
|
|
model="openai/whisper-base"
|
|
)
|
|
|
|
result = transcriber("audio.mp3")
|
|
print(result["text"]) # "Hello, this is a test."
|
|
|
|
# With timestamps
|
|
result = transcriber("audio.mp3", return_timestamps=True)
|
|
for chunk in result["chunks"]:
|
|
print(f"[{chunk['timestamp'][0]:.1f}s - {chunk['timestamp'][1]:.1f}s]: {chunk['text']}")
|
|
```
|
|
|
|
## Text-to-Speech
|
|
|
|
Generate speech from text.
|
|
|
|
```python
|
|
from transformers import pipeline
|
|
|
|
synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")
|
|
|
|
result = synthesizer("Hello, how are you today?")
|
|
# result["audio"] contains the waveform
|
|
# result["sampling_rate"] contains the sample rate
|
|
|
|
# Save audio
|
|
# scipy.io.wavfile is a submodule: a bare `import scipy` only exposes it via
# lazy loading on recent SciPy versions, so import it explicitly.
from scipy.io import wavfile
wavfile.write("output.wav", result["sampling_rate"], result["audio"][0])
|
|
```
|
|
|
|
## Visual Question Answering
|
|
|
|
Answer questions about images.
|
|
|
|
```python
|
|
from transformers import pipeline
|
|
from PIL import Image
|
|
|
|
vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
|
|
|
|
image = Image.open("photo.jpg")
|
|
question = "What color is the car?"
|
|
|
|
result = vqa(image=image, question=question)
|
|
print(result[0]["answer"]) # "red"
|
|
```
|
|
|
|
## Document Question Answering
|
|
|
|
Extract information from documents (PDFs, images with text).
|
|
|
|
```python
|
|
from transformers import pipeline
|
|
|
|
doc_qa = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
|
|
|
|
result = doc_qa(
|
|
image="invoice.png",
|
|
question="What is the total amount?"
|
|
)
|
|
|
|
print(result["answer"]) # "$1,234.56"
|
|
```
|
|
|
|
## Zero-Shot Classification
|
|
|
|
Classify without training data.
|
|
|
|
```python
|
|
from transformers import pipeline
|
|
|
|
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
|
|
|
|
text = "This is a delicious Italian restaurant with great pasta."
|
|
candidate_labels = ["food", "travel", "technology", "sports"]
|
|
|
|
result = classifier(text, candidate_labels)
|
|
print(result["labels"][0]) # "food"
|
|
print(result["scores"][0]) # 0.95
|
|
```
|
|
|
|
## Few-Shot Learning with LLMs
|
|
|
|
Use large language models for few-shot tasks.
|
|
|
|
```python
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
|
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
|
|
|
# Few-shot prompt
|
|
prompt = """
|
|
Classify the sentiment: positive, negative, or neutral.
|
|
|
|
Text: "I love this product!"
|
|
Sentiment: positive
|
|
|
|
Text: "This is terrible."
|
|
Sentiment: negative
|
|
|
|
Text: "It's okay, nothing special."
|
|
Sentiment: neutral
|
|
|
|
Text: "Best purchase ever!"
|
|
Sentiment:"""
|
|
|
|
inputs = tokenizer(prompt, return_tensors="pt")
# temperature only takes effect when sampling is enabled; without
# do_sample=True, generate() decodes greedily and emits a warning.
outputs = model.generate(**inputs, max_new_tokens=5, do_sample=True, temperature=0.1)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response.split("Sentiment:")[-1].strip())  # "positive"
|
|
```
|
|
|
|
## Instruction-Following / Chat
|
|
|
|
Use instruction-tuned models.
|
|
|
|
```python
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
|
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
|
|
|
|
messages = [
|
|
{"role": "system", "content": "You are a helpful assistant."},
|
|
{"role": "user", "content": "What is machine learning?"},
|
|
]
|
|
|
|
formatted = tokenizer.apply_chat_template(
|
|
messages,
|
|
tokenize=False,
|
|
add_generation_prompt=True
|
|
)
|
|
|
|
inputs = tokenizer(formatted, return_tensors="pt")
|
|
outputs = model.generate(**inputs, max_new_tokens=200, temperature=0.7)
|
|
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
|
|
|
# Extract assistant response
|
|
assistant_response = response.split("[/INST]")[-1].strip()
|
|
print(assistant_response)
|
|
```
|
|
|
|
## Embeddings / Semantic Search
|
|
|
|
Generate embeddings for semantic similarity.
|
|
|
|
```python
|
|
from transformers import AutoTokenizer, AutoModel
|
|
import torch
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
|
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
|
|
|
def get_embedding(text):
    """Return a (batch, hidden_size) sentence embedding via masked mean pooling.

    Accepts a single string or a list of strings. Padding tokens are
    excluded from the average, so batched inputs of different lengths
    produce the same embeddings as encoding each text individually
    (plain .mean(dim=1) would dilute shorter sequences with pad vectors).
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Attention-mask-aware mean pooling over real tokens only.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid divide-by-zero
    return summed / counts
|
|
|
|
# Get embeddings
|
|
text1 = "Machine learning is a subset of AI"
|
|
text2 = "AI includes machine learning"
|
|
|
|
emb1 = get_embedding(text1)
|
|
emb2 = get_embedding(text2)
|
|
|
|
# Compute similarity
|
|
similarity = torch.nn.functional.cosine_similarity(emb1, emb2)
|
|
print(f"Similarity: {similarity.item():.4f}") # ~0.85
|
|
```
|
|
|
|
## Multimodal Understanding (CLIP)
|
|
|
|
Connect vision and language.
|
|
|
|
```python
|
|
from transformers import CLIPProcessor, CLIPModel
|
|
from PIL import Image
|
|
|
|
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
|
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
|
|
|
image = Image.open("photo.jpg")
|
|
texts = ["a dog", "a cat", "a car", "a house"]
|
|
|
|
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
|
|
outputs = model(**inputs)
|
|
|
|
# Get similarity scores
|
|
logits_per_image = outputs.logits_per_image
|
|
probs = logits_per_image.softmax(dim=1)
|
|
|
|
for text, prob in zip(texts, probs[0]):
|
|
print(f"{text}: {prob.item():.4f}")
|
|
```
|
|
|
|
## Common Evaluation Metrics
|
|
|
|
```python
|
|
# NOTE: datasets.load_metric was deprecated and removed in datasets 3.x;
# metrics now live in the separate `evaluate` package (pip install evaluate).
import evaluate

# Accuracy (classification)
metric = evaluate.load("accuracy")
predictions = [0, 1, 1, 0]
references = [0, 1, 0, 0]
result = metric.compute(predictions=predictions, references=references)

# F1 Score (classification, NER)
metric = evaluate.load("f1")
result = metric.compute(predictions=predictions, references=references)

# BLEU (translation) — evaluate's BLEU accepts raw strings and tokenizes internally
metric = evaluate.load("bleu")
predictions = ["hello there general kenobi"]
references = [["hello there general kenobi", "hello there!"]]
result = metric.compute(predictions=predictions, references=references)

# ROUGE (summarization)
metric = evaluate.load("rouge")
predictions = ["summary text"]
references = ["reference summary"]
result = metric.compute(predictions=predictions, references=references)
|
|
```
|
|
|
|
## Common Data Collators
|
|
|
|
```python
|
|
from transformers import (
|
|
DataCollatorWithPadding,
|
|
DataCollatorForTokenClassification,
|
|
DataCollatorForSeq2Seq,
|
|
DataCollatorForLanguageModeling,
|
|
)
|
|
|
|
# Classification: dynamic padding
|
|
DataCollatorWithPadding(tokenizer=tokenizer)
|
|
|
|
# NER: pad labels too
|
|
DataCollatorForTokenClassification(tokenizer=tokenizer)
|
|
|
|
# Seq2Seq: pad inputs and labels
|
|
DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
|
|
|
|
# Language modeling: create MLM masks
|
|
DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
|
|
```
|
|
|
|
This covers the most common task patterns. For detailed parameter tuning, see `api_reference.md` and `generation_strategies.md`.
|