Add more scientific skills

Timothy Kassis
2025-10-19 14:12:02 -07:00
parent 78d5ac2b56
commit 660c8574d0
210 changed files with 88957 additions and 1 deletions

View File

@@ -0,0 +1,212 @@
#!/usr/bin/env python3
"""
Complete example for fine-tuning a text classification model.

This script demonstrates the full workflow:
    1. Load dataset
    2. Preprocess with tokenizer
    3. Configure model
    4. Train with Trainer
    5. Evaluate and save

Usage:
    python fine_tune_classifier.py --model bert-base-uncased --dataset imdb --epochs 3
"""
import argparse

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy and F1 score."""
    metric_accuracy = evaluate.load("accuracy")
    metric_f1 = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = metric_accuracy.compute(predictions=predictions, references=labels)
    # Note: the "f1" metric defaults to binary averaging; pass average="weighted"
    # (or "macro") if the dataset has more than two labels.
    f1 = metric_f1.compute(predictions=predictions, references=labels)
    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}


def main():
    parser = argparse.ArgumentParser(description="Fine-tune a text classification model")
    parser.add_argument(
        "--model",
        type=str,
        default="bert-base-uncased",
        help="Pretrained model name or path",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="imdb",
        help="Dataset name from Hugging Face Hub",
    )
    parser.add_argument(
        "--max-samples",
        type=int,
        default=None,
        help="Maximum samples to use (for quick testing)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./results",
        help="Output directory for checkpoints",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=3,
        help="Number of training epochs",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=16,
        help="Batch size per device",
    )
    parser.add_argument(
        "--learning-rate",
        type=float,
        default=2e-5,
        help="Learning rate",
    )
    parser.add_argument(
        "--push-to-hub",
        action="store_true",
        help="Push model to Hugging Face Hub after training",
    )
    args = parser.parse_args()
print("=" * 60)
print("Text Classification Fine-Tuning")
print("=" * 60)
print(f"Model: {args.model}")
print(f"Dataset: {args.dataset}")
print(f"Epochs: {args.epochs}")
print(f"Batch size: {args.batch_size}")
print(f"Learning rate: {args.learning_rate}")
print("=" * 60)
# 1. Load dataset
print("\n[1/5] Loading dataset...")
dataset = load_dataset(args.dataset)
if args.max_samples:
dataset["train"] = dataset["train"].select(range(args.max_samples))
dataset["test"] = dataset["test"].select(range(args.max_samples // 5))
print(f"Train samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")
# 2. Preprocess
print("\n[2/5] Preprocessing data...")
tokenizer = AutoTokenizer.from_pretrained(args.model)
def preprocess_function(examples):
return tokenizer(examples["text"], truncation=True, max_length=512)
tokenized_dataset = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# 3. Load model
print("\n[3/5] Loading model...")
# Determine number of labels
num_labels = len(set(dataset["train"]["label"]))
model = AutoModelForSequenceClassification.from_pretrained(
args.model,
num_labels=num_labels,
)
print(f"Number of labels: {num_labels}")
print(f"Model parameters: {model.num_parameters():,}")

    # 4. Configure training
    print("\n[4/5] Configuring training...")
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        learning_rate=args.learning_rate,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        weight_decay=0.01,
        # load_best_model_at_end requires eval_strategy and save_strategy to match.
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=args.push_to_hub,
        logging_steps=100,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # 5. Train
    print("\n[5/5] Training...")
    print("-" * 60)
    trainer.train()

    # Evaluate
    print("\n" + "=" * 60)
    print("Final Evaluation")
    print("=" * 60)
    metrics = trainer.evaluate()
    print(f"Accuracy: {metrics['eval_accuracy']:.4f}")
    print(f"F1 Score: {metrics['eval_f1']:.4f}")
    print(f"Loss: {metrics['eval_loss']:.4f}")

    # Save
    print("\n" + "=" * 60)
    print(f"Saving model to {args.output_dir}")
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    if args.push_to_hub:
        print("Pushing to Hugging Face Hub...")
        trainer.push_to_hub()
    print("=" * 60)
    print("Training complete!")
    print("=" * 60)

    # Quick inference example
    print("\nQuick inference example:")
    from transformers import pipeline

    classifier = pipeline(
        "text-classification",
        model=args.output_dir,
        tokenizer=args.output_dir,
    )
    example_text = "This is a great example of how to use transformers!"
    result = classifier(example_text)
    print(f"Text: {example_text}")
    print(f"Prediction: {result[0]['label']} (score: {result[0]['score']:.4f})")


if __name__ == "__main__":
    main()
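
Note (a sketch, not part of the script above): the F1 computation in compute_metrics assumes a binary label set such as IMDB. A minimal multi-class variant, assuming the same evaluate metrics, could look like this; the helper name compute_metrics_multiclass is illustrative only.

import evaluate
import numpy as np

def compute_metrics_multiclass(eval_pred):
    """Accuracy plus weighted F1 for datasets with more than two labels."""
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        # average="weighted" weights per-class F1 by class frequency
        "f1": f1.compute(predictions=predictions, references=labels, average="weighted")["f1"],
    }

Passing this function as compute_metrics to the Trainer would be the only change; everything else in the script stays the same.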

View File

@@ -0,0 +1,232 @@
#!/usr/bin/env python3
"""
Text generation with various strategies.

This script demonstrates different generation strategies:
    - Greedy decoding
    - Beam search
    - Sampling with temperature
    - Top-k and top-p sampling

Usage:
    python generate_text.py --model gpt2 --prompt "The future of AI" --strategy sampling
"""
import argparse

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def generate_with_greedy(model, tokenizer, prompt, max_length):
    """Greedy decoding (deterministic)."""
    print("\n" + "=" * 60)
    print("GREEDY DECODING")
    print("=" * 60)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        pad_token_id=tokenizer.eos_token_id,
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nPrompt: {prompt}")
    print(f"\nGenerated:\n{text}")


def generate_with_beam_search(model, tokenizer, prompt, max_length, num_beams=5):
    """Beam search for higher quality."""
    print("\n" + "=" * 60)
    print(f"BEAM SEARCH (num_beams={num_beams})")
    print("=" * 60)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        num_beams=num_beams,
        early_stopping=True,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nPrompt: {prompt}")
    print(f"\nGenerated:\n{text}")


def generate_with_sampling(model, tokenizer, prompt, max_length, temperature=0.8):
    """Sampling with temperature."""
    print("\n" + "=" * 60)
    print(f"SAMPLING (temperature={temperature})")
    print("=" * 60)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        do_sample=True,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id,
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nPrompt: {prompt}")
    print(f"\nGenerated:\n{text}")


def generate_with_top_k_top_p(model, tokenizer, prompt, max_length, top_k=50, top_p=0.95, temperature=0.8):
    """Top-k and top-p (nucleus) sampling."""
    print("\n" + "=" * 60)
    print(f"TOP-K TOP-P SAMPLING (k={top_k}, p={top_p}, temp={temperature})")
    print("=" * 60)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.eos_token_id,
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nPrompt: {prompt}")
    print(f"\nGenerated:\n{text}")


def generate_multiple(model, tokenizer, prompt, max_length, num_sequences=3):
    """Generate multiple diverse sequences."""
    print("\n" + "=" * 60)
    print(f"MULTIPLE SEQUENCES (n={num_sequences})")
    print("=" * 60)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        do_sample=True,
        num_return_sequences=num_sequences,
        temperature=0.9,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
    print(f"\nPrompt: {prompt}\n")
    for i, output in enumerate(outputs, 1):
        text = tokenizer.decode(output, skip_special_tokens=True)
        print(f"\n--- Sequence {i} ---\n{text}\n")


def main():
    parser = argparse.ArgumentParser(description="Text generation with various strategies")
    parser.add_argument(
        "--model",
        type=str,
        default="gpt2",
        help="Model name or path",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        required=True,
        help="Input prompt for generation",
    )
    parser.add_argument(
        "--strategy",
        type=str,
        default="all",
        choices=["greedy", "beam", "sampling", "top_k_top_p", "multiple", "all"],
        help="Generation strategy to use",
    )
    parser.add_argument(
        "--max-length",
        type=int,
        default=100,
        help="Maximum number of new tokens to generate",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="auto",
        help="Device (cuda, cpu, or auto)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.8,
        help="Sampling temperature",
    )
    parser.add_argument(
        "--quantize",
        action="store_true",
        help="Use 8-bit quantization",
    )
    args = parser.parse_args()
print("=" * 60)
print("Text Generation Demo")
print("=" * 60)
print(f"Model: {args.model}")
print(f"Strategy: {args.strategy}")
print(f"Max length: {args.max_length}")
print(f"Device: {args.device}")
print("=" * 60)
# Load model and tokenizer
print("\nLoading model...")
if args.device == "auto":
device_map = "auto"
device = None
else:
device_map = None
device = args.device
model_kwargs = {"device_map": device_map} if device_map else {}
if args.quantize:
print("Using 8-bit quantization...")
model_kwargs["load_in_8bit"] = True
model = AutoModelForCausalLM.from_pretrained(args.model, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(args.model)
if device and not device_map:
model = model.to(device)
print(f"Model loaded on: {model.device if hasattr(model, 'device') else 'multiple devices'}")
# Generate based on strategy
strategies = {
"greedy": lambda: generate_with_greedy(model, tokenizer, args.prompt, args.max_length),
"beam": lambda: generate_with_beam_search(model, tokenizer, args.prompt, args.max_length),
"sampling": lambda: generate_with_sampling(model, tokenizer, args.prompt, args.max_length, args.temperature),
"top_k_top_p": lambda: generate_with_top_k_top_p(model, tokenizer, args.prompt, args.max_length),
"multiple": lambda: generate_multiple(model, tokenizer, args.prompt, args.max_length),
}
if args.strategy == "all":
for strategy_fn in strategies.values():
strategy_fn()
else:
strategies[args.strategy]()
print("\n" + "=" * 60)
print("Generation complete!")
print("=" * 60)
if __name__ == "__main__":
main()
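
Note (a sketch, not part of the script above): the per-strategy keyword arguments can also be collected once in a GenerationConfig and reused across generate() calls. A minimal example, assuming gpt2 and the same nucleus-sampling settings as generate_with_top_k_top_p:

from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Reusable sampling configuration mirroring the top-k/top-p function above.
gen_config = GenerationConfig(
    max_new_tokens=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,
)

inputs = tokenizer("The future of AI", return_tensors="pt")
outputs = model.generate(**inputs, generation_config=gen_config)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))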

View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Quick inference script using Transformers pipelines.

This script demonstrates how to use various pipeline tasks for quick inference
without manually managing models, tokenizers, or preprocessing.

Usage:
    python quick_inference.py --task text-generation --model gpt2 --input "Hello world"
    python quick_inference.py --task sentiment-analysis --input "I love this!"
"""
import argparse

from transformers import pipeline, infer_device


def main():
    parser = argparse.ArgumentParser(description="Quick inference with Transformers pipelines")
    parser.add_argument(
        "--task",
        type=str,
        required=True,
        help="Pipeline task (text-generation, sentiment-analysis, question-answering, etc.)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=None,
        help="Model name or path (default: use task default)",
    )
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="Input text for inference",
    )
    parser.add_argument(
        "--context",
        type=str,
        default=None,
        help="Context for question-answering tasks",
    )
    parser.add_argument(
        "--max-length",
        type=int,
        default=50,
        help="Maximum generation length",
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device (cuda, cpu, or auto-detect)",
    )
    args = parser.parse_args()

    # Auto-detect device if not specified
    if args.device is None:
        device = infer_device()
    else:
        device = args.device

    print(f"Using device: {device}")
    print(f"Task: {args.task}")
    print(f"Model: {args.model or 'default'}")
    print("-" * 50)

    # Create pipeline
    pipe = pipeline(
        args.task,
        model=args.model,
        device=device,
    )

    # Run inference based on task
    if args.task == "question-answering":
        if args.context is None:
            print("Error: --context required for question-answering")
            return
        result = pipe(question=args.input, context=args.context)
        print(f"Question: {args.input}")
        print(f"Context: {args.context}")
        print(f"\nAnswer: {result['answer']}")
        print(f"Score: {result['score']:.4f}")
    elif args.task == "text-generation":
        result = pipe(args.input, max_length=args.max_length)
        print(f"Prompt: {args.input}")
        print(f"\nGenerated: {result[0]['generated_text']}")
    elif args.task in ["sentiment-analysis", "text-classification"]:
        result = pipe(args.input)
        print(f"Text: {args.input}")
        print(f"\nLabel: {result[0]['label']}")
        print(f"Score: {result[0]['score']:.4f}")
    else:
        # Generic handling for other tasks
        result = pipe(args.input)
        print(f"Input: {args.input}")
        print(f"\nResult: {result}")


if __name__ == "__main__":
    main()
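
Note (a sketch, not part of the script above): pipelines also accept a list of inputs and batch them internally, which is usually faster than calling the script once per input. Assuming the default sentiment-analysis model:

from transformers import pipeline

classifier = pipeline("sentiment-analysis")
texts = [
    "I love this!",
    "This was a waste of time.",
    "Not bad at all.",
]
# batch_size controls how many inputs are run per forward pass.
for text, result in zip(texts, classifier(texts, batch_size=8)):
    print(f"{text} -> {result['label']} ({result['score']:.4f})")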