Add more scientific skills

This commit is contained in:
Timothy Kassis
2025-10-19 14:12:02 -07:00
parent 78d5ac2b56
commit 660c8574d0
210 changed files with 88957 additions and 1 deletion

View File

@@ -0,0 +1,219 @@
#!/usr/bin/env python3
"""
Complete classification pipeline with preprocessing, training, evaluation, and hyperparameter tuning.
Demonstrates best practices for scikit-learn workflows.
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
def create_preprocessing_pipeline(numeric_features, categorical_features):
    """
    Build the preprocessing stage for a dataset with mixed column types.

    Numeric columns are imputed with the median and then standardized.
    Categorical columns have missing values replaced by the constant
    'missing' and are then one-hot encoded with handle_unknown='ignore'.

    Args:
        numeric_features: List of numeric column names
        categorical_features: List of categorical column names

    Returns:
        Unfitted ColumnTransformer with a 'num' and a 'cat' branch
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
    ]
    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numeric_features),
            ('cat', Pipeline(steps=categorical_steps), categorical_features),
        ]
    )
def create_full_pipeline(preprocessor, classifier=None):
    """
    Chain a preprocessor and a classifier into a single Pipeline.

    Args:
        preprocessor: Preprocessing ColumnTransformer
        classifier: Classifier instance; when None, a RandomForestClassifier
            with 100 trees, random_state=42 and n_jobs=-1 is used

    Returns:
        Pipeline with the steps 'preprocessor' and 'classifier'
    """
    model = classifier
    if model is None:
        model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    return Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
def evaluate_model(pipeline, X_train, y_train, X_test, y_test, cv=5):
    """
    Evaluate model using cross-validation and test set.

    Args:
        pipeline: Pipeline to evaluate. It must already be fitted, because
            predict/score are called on it directly for the test set.
        X_train, y_train: Training data (used for cross-validation)
        X_test, y_test: Test data
        cv: Number of cross-validation folds

    Returns:
        Dictionary with the keys 'cv_mean', 'cv_std', 'test_score', 'auc',
        'classification_report' and 'confusion_matrix'. 'auc' is None when
        ROC-AUC cannot be computed (see below).
    """
    # Cross-validation on training set
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')

    # Test set evaluation
    y_pred = pipeline.predict(X_test)
    test_score = pipeline.score(X_test, y_test)

    # ROC-AUC is best-effort: only the expected failure modes are tolerated
    # (no predict_proba on the estimator, a single class in y_test, or a
    # probability matrix without a second column). Anything else propagates
    # instead of being hidden by a bare except.
    auc = None
    try:
        y_proba = pipeline.predict_proba(X_test)
        if len(np.unique(y_test)) == 2:
            # Binary classification: score the probability of the second class
            auc = roc_auc_score(y_test, y_proba[:, 1])
        else:
            # Multiclass: one-vs-rest averaging over the full probability matrix
            auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
    except (AttributeError, ValueError, IndexError):
        auc = None

    return {
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'test_score': test_score,
        'auc': auc,
        'classification_report': classification_report(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
    }
def tune_hyperparameters(pipeline, X_train, y_train, param_grid, cv=5,
                         scoring='f1_weighted'):
    """
    Perform hyperparameter tuning using GridSearchCV.

    Args:
        pipeline: Pipeline to tune
        X_train, y_train: Training data
        param_grid: Dictionary of parameters to search
        cv: Number of cross-validation folds
        scoring: Scoring passed to GridSearchCV. Defaults to 'f1_weighted'
            (the previously hard-coded value) so existing callers are
            unaffected; pass e.g. 'accuracy' to match evaluate_model.

    Returns:
        Fitted GridSearchCV object (best model in best_estimator_)
    """
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_:.3f}")
    return grid_search
def main():
    """
    Example usage of the classification pipeline.

    Generates a synthetic dataset, trains and evaluates the default pipeline,
    runs a small grid search, reports the tuned model on the test set and
    writes the tuned pipeline to 'best_model.pkl' in the working directory.
    """
    # Load your data here
    # X, y = load_data()

    # Example with synthetic data
    from sklearn.datasets import make_classification

    X, y = make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        random_state=42,
    )

    # Convert to DataFrame for demonstration
    feature_names = [f'feature_{i}' for i in range(X.shape[1])]
    X = pd.DataFrame(X, columns=feature_names)

    # Split features into numeric and categorical (all numeric in this example)
    numeric_features = feature_names
    categorical_features = []

    # Split data (use stratify for imbalanced classes)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Create preprocessing pipeline
    preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features)

    # Create full pipeline
    pipeline = create_full_pipeline(preprocessor)

    # Train model
    print("Training model...")
    pipeline.fit(X_train, y_train)

    # Evaluate model
    print("\nEvaluating model...")
    results = evaluate_model(pipeline, X_train, y_train, X_test, y_test)
    print(f"CV Accuracy: {results['cv_mean']:.3f} (+/- {results['cv_std']:.3f})")
    print(f"Test Accuracy: {results['test_score']:.3f}")
    # Compare against None explicitly: an AUC of 0.0 is a valid (if poor)
    # score and must still be reported.
    if results['auc'] is not None:
        print(f"ROC-AUC: {results['auc']:.3f}")
    print("\nClassification Report:")
    print(results['classification_report'])

    # Hyperparameter tuning (optional)
    print("\nTuning hyperparameters...")
    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [10, 20, None],
        'classifier__min_samples_split': [2, 5],
    }
    grid_search = tune_hyperparameters(pipeline, X_train, y_train, param_grid)

    # Evaluate best model
    print("\nEvaluating tuned model...")
    best_pipeline = grid_search.best_estimator_
    y_pred = best_pipeline.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Save model
    print("\nSaving model...")
    joblib.dump(best_pipeline, 'best_model.pkl')
    print("Model saved as 'best_model.pkl'")
# Run the example workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,291 @@
#!/usr/bin/env python3
"""
Clustering analysis script with multiple algorithms and evaluation.
Demonstrates k-means, DBSCAN, and hierarchical clustering with visualization.
"""
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
def scale_data(X):
    """
    Standardize the features with a StandardScaler.

    ALWAYS scale data before clustering!

    Args:
        X: Feature matrix

    Returns:
        Tuple of (scaled feature matrix, fitted StandardScaler)
    """
    fitted_scaler = StandardScaler().fit(X)
    return fitted_scaler.transform(X), fitted_scaler
def find_optimal_k(X_scaled, k_range=range(2, 11)):
    """
    Find optimal number of clusters using elbow method and silhouette score.

    Fits one KMeans model (random_state=42, n_init=10) per candidate k and
    records its inertia and silhouette score.

    Args:
        X_scaled: Scaled feature matrix
        k_range: Iterable of k values to try

    Returns:
        Dictionary with 'k_values', 'inertias' and 'silhouette_scores',
        three lists of equal length aligned by position
    """
    # Materialise once: a one-shot iterable (generator, iterator) would
    # otherwise be exhausted by the loop and yield an empty 'k_values' list
    # next to populated score lists.
    k_values = list(k_range)

    inertias = []
    silhouette_scores = []
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X_scaled)
        inertias.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(X_scaled, labels))

    return {
        'k_values': k_values,
        'inertias': inertias,
        'silhouette_scores': silhouette_scores,
    }
def plot_elbow_silhouette(results):
    """
    Draw the elbow curve and the silhouette curve side by side and save them.

    The figure is written to 'elbow_silhouette.png' in the working directory
    and closed afterwards.

    Args:
        results: Dictionary from find_optimal_k
    """
    _, (elbow_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(14, 5))
    k_values = results['k_values']

    # (axis, y-values, line format, y-label, title) for each panel
    panels = (
        (elbow_ax, results['inertias'], 'bo-', 'Inertia', 'Elbow Method'),
        (silhouette_ax, results['silhouette_scores'], 'ro-',
         'Silhouette Score', 'Silhouette Score vs k'),
    )
    for axis, values, line_format, y_label, title in panels:
        axis.plot(k_values, values, line_format)
        axis.set_xlabel('Number of clusters (k)')
        axis.set_ylabel(y_label)
        axis.set_title(title)
        axis.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('elbow_silhouette.png', dpi=300, bbox_inches='tight')
    print("Saved elbow and silhouette plots to 'elbow_silhouette.png'")
    plt.close()
def evaluate_clustering(X_scaled, labels, algorithm_name):
    """
    Evaluate clustering using multiple metrics.

    Points labelled -1 (DBSCAN noise) are excluded before the metrics are
    computed and are reported separately as 'n_noise'.

    Args:
        X_scaled: Scaled feature matrix (must support boolean-mask row indexing)
        labels: Cluster labels, as an array or any sequence
        algorithm_name: Name of clustering algorithm

    Returns:
        Dictionary with 'algorithm', 'n_clusters', 'n_noise', 'silhouette',
        'davies_bouldin' and 'calinski_harabasz'. The three metrics are None
        when they are undefined for the labelling (fewer than two clusters,
        or as many clusters as non-noise points).
    """
    # Coerce to an array so that list input produces an element-wise mask
    # rather than a single scalar comparison result.
    labels = np.asarray(labels)

    # Filter out noise points for DBSCAN (-1 labels)
    mask = labels != -1
    X_filtered = X_scaled[mask]
    labels_filtered = labels[mask]

    n_clusters = int(np.unique(labels_filtered).size)
    n_noise = int(np.count_nonzero(~mask))

    results = {
        'algorithm': algorithm_name,
        'n_clusters': n_clusters,
        'n_noise': n_noise,
        'silhouette': None,
        'davies_bouldin': None,
        'calinski_harabasz': None,
    }

    # The metrics need 2 <= n_clusters <= n_samples - 1; outside that range
    # scikit-learn raises, so leave them as None instead.
    if 1 < n_clusters < labels_filtered.shape[0]:
        results['silhouette'] = silhouette_score(X_filtered, labels_filtered)
        results['davies_bouldin'] = davies_bouldin_score(X_filtered, labels_filtered)
        results['calinski_harabasz'] = calinski_harabasz_score(X_filtered, labels_filtered)

    return results
def perform_kmeans(X_scaled, n_clusters=3):
    """
    Cluster the data with k-means (random_state=42, n_init=10).

    Args:
        X_scaled: Scaled feature matrix
        n_clusters: Number of clusters

    Returns:
        Tuple of (fitted KMeans model, cluster labels)
    """
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X_scaled)
    return model, model.labels_
def perform_dbscan(X_scaled, eps=0.5, min_samples=5):
    """
    Cluster the data with DBSCAN.

    Args:
        X_scaled: Scaled feature matrix
        eps: Maximum distance between neighbors
        min_samples: Minimum points to form dense region

    Returns:
        Tuple of (fitted DBSCAN model, cluster labels)
    """
    model = DBSCAN(eps=eps, min_samples=min_samples).fit(X_scaled)
    return model, model.labels_
def perform_hierarchical(X_scaled, n_clusters=3, linkage='ward'):
    """
    Cluster the data with agglomerative (hierarchical) clustering.

    Args:
        X_scaled: Scaled feature matrix
        n_clusters: Number of clusters
        linkage: Linkage criterion ('ward', 'complete', 'average', 'single')

    Returns:
        Tuple of (fitted AgglomerativeClustering model, cluster labels)
    """
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage).fit(X_scaled)
    return model, model.labels_
def visualize_clusters_2d(X_scaled, labels, algorithm_name, method='pca'):
    """
    Visualize clusters in 2D using PCA or t-SNE.

    The scatter plot is saved as '<algorithm_name>_<method>.png' (algorithm
    name lower-cased, spaces replaced by underscores) in the working
    directory, and the figure is closed afterwards.

    Args:
        X_scaled: Scaled feature matrix
        labels: Cluster labels (used to colour the points)
        algorithm_name: Name of algorithm for title
        method: 'pca' or 'tsne' (case-insensitive)

    Raises:
        ValueError: If method is neither 'pca' nor 'tsne'. Unknown values
            used to fall through to t-SNE silently.
    """
    method = str(method).lower()
    if method not in ('pca', 'tsne'):
        raise ValueError(f"method must be 'pca' or 'tsne', got {method!r}")

    # Reduce to 2D
    if method == 'pca':
        pca = PCA(n_components=2, random_state=42)
        X_2d = pca.fit_transform(X_scaled)
        variance = pca.explained_variance_ratio_
        xlabel = f'PC1 ({variance[0]:.1%} variance)'
        ylabel = f'PC2 ({variance[1]:.1%} variance)'
    else:
        from sklearn.manifold import TSNE

        n_samples, n_features = X_scaled.shape
        # Use PCA first to speed up t-SNE; n_components may not exceed
        # either dimension of the data.
        pca = PCA(n_components=min(50, n_features, n_samples), random_state=42)
        X_pca = pca.fit_transform(X_scaled)
        # t-SNE requires perplexity < n_samples; keep the usual 30 whenever
        # the dataset is large enough.
        perplexity = min(30, max(1, n_samples - 1))
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        X_2d = tsne.fit_transform(X_pca)
        xlabel = 't-SNE 1'
        ylabel = 't-SNE 2'

    # Plot
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='viridis', alpha=0.6, s=50)
    plt.colorbar(scatter, label='Cluster')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(f'{algorithm_name} Clustering ({method.upper()})')
    plt.grid(True, alpha=0.3)

    filename = f'{algorithm_name.lower().replace(" ", "_")}_{method}.png'
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    print(f"Saved visualization to '{filename}'")
    plt.close()
def main():
    """
    Example clustering analysis workflow.

    Generates synthetic blobs, scales them, plots elbow/silhouette curves,
    runs k-means, DBSCAN and hierarchical clustering, prints their metrics
    and saves a PCA scatter plot per algorithm to the working directory.
    """
    # Load your data here
    # X = load_data()

    # Example with synthetic data
    from sklearn.datasets import make_blobs

    # The true blob assignments are not used by this unsupervised workflow.
    X, _ = make_blobs(
        n_samples=500,
        n_features=10,
        centers=4,
        cluster_std=1.0,
        random_state=42,
    )
    print(f"Dataset shape: {X.shape}")

    # Scale data (ALWAYS scale for clustering!)
    print("\nScaling data...")
    X_scaled, _ = scale_data(X)

    # Find optimal k
    print("\nFinding optimal number of clusters...")
    k_search = find_optimal_k(X_scaled)
    plot_elbow_silhouette(k_search)

    # Based on elbow/silhouette, choose optimal k
    optimal_k = 4  # Adjust based on plots

    # Perform k-means
    print(f"\nPerforming k-means with k={optimal_k}...")
    _, kmeans_labels = perform_kmeans(X_scaled, n_clusters=optimal_k)
    kmeans_results = evaluate_clustering(X_scaled, kmeans_labels, 'K-Means')

    # Perform DBSCAN
    print("\nPerforming DBSCAN...")
    _, dbscan_labels = perform_dbscan(X_scaled, eps=0.5, min_samples=5)
    dbscan_results = evaluate_clustering(X_scaled, dbscan_labels, 'DBSCAN')

    # Perform hierarchical clustering
    print("\nPerforming hierarchical clustering...")
    _, hier_labels = perform_hierarchical(X_scaled, n_clusters=optimal_k)
    hier_results = evaluate_clustering(X_scaled, hier_labels, 'Hierarchical')

    # Print results
    print("\n" + "="*60)
    print("CLUSTERING RESULTS")
    print("="*60)
    for summary in (kmeans_results, dbscan_results, hier_results):
        print(f"\n{summary['algorithm']}:")
        print(f" Clusters: {summary['n_clusters']}")
        if summary['n_noise'] > 0:
            print(f" Noise points: {summary['n_noise']}")
        # Compare against None explicitly: a silhouette of exactly 0.0 is a
        # valid score and must still be reported.
        if summary['silhouette'] is not None:
            print(f" Silhouette Score: {summary['silhouette']:.3f}")
            print(f" Davies-Bouldin Index: {summary['davies_bouldin']:.3f} (lower is better)")
            print(f" Calinski-Harabasz Index: {summary['calinski_harabasz']:.1f} (higher is better)")

    # Visualize clusters
    print("\nCreating visualizations...")
    visualize_clusters_2d(X_scaled, kmeans_labels, 'K-Means', method='pca')
    visualize_clusters_2d(X_scaled, dbscan_labels, 'DBSCAN', method='pca')
    visualize_clusters_2d(X_scaled, hier_labels, 'Hierarchical', method='pca')
    print("\nClustering analysis complete!")
# Run the example workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()