mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-28 07:33:45 +08:00
Improve the scikit-learn skill
This commit is contained in:
@@ -1,219 +1,257 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Complete classification pipeline with preprocessing, training, evaluation, and hyperparameter tuning.
|
||||
Demonstrates best practices for scikit-learn workflows.
|
||||
Complete classification pipeline example with preprocessing, model training,
|
||||
hyperparameter tuning, and evaluation.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
|
||||
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
|
||||
import joblib
|
||||
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import (
|
||||
classification_report, confusion_matrix, roc_auc_score,
|
||||
accuracy_score, precision_score, recall_score, f1_score
|
||||
)
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
|
||||
def create_preprocessing_pipeline(numeric_features, categorical_features):
|
||||
"""
|
||||
Create preprocessing pipeline for mixed data types.
|
||||
Create a preprocessing pipeline for mixed data types.
|
||||
|
||||
Args:
|
||||
numeric_features: List of numeric column names
|
||||
categorical_features: List of categorical column names
|
||||
Parameters:
|
||||
-----------
|
||||
numeric_features : list
|
||||
List of numeric feature column names
|
||||
categorical_features : list
|
||||
List of categorical feature column names
|
||||
|
||||
Returns:
|
||||
ColumnTransformer with appropriate preprocessing for each data type
|
||||
--------
|
||||
ColumnTransformer
|
||||
Preprocessing pipeline
|
||||
"""
|
||||
# Numeric preprocessing
|
||||
numeric_transformer = Pipeline(steps=[
|
||||
('imputer', SimpleImputer(strategy='median')),
|
||||
('scaler', StandardScaler())
|
||||
])
|
||||
|
||||
# Categorical preprocessing
|
||||
categorical_transformer = Pipeline(steps=[
|
||||
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
|
||||
('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
|
||||
('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
|
||||
])
|
||||
|
||||
# Combine transformers
|
||||
preprocessor = ColumnTransformer(
|
||||
transformers=[
|
||||
('num', numeric_transformer, numeric_features),
|
||||
('cat', categorical_transformer, categorical_features)
|
||||
])
|
||||
]
|
||||
)
|
||||
|
||||
return preprocessor
|
||||
|
||||
|
||||
def create_full_pipeline(preprocessor, classifier=None):
|
||||
def train_and_evaluate_model(X, y, numeric_features, categorical_features,
|
||||
test_size=0.2, random_state=42):
|
||||
"""
|
||||
Create complete ML pipeline with preprocessing and classification.
|
||||
Complete pipeline: preprocess, train, tune, and evaluate a classifier.
|
||||
|
||||
Args:
|
||||
preprocessor: Preprocessing ColumnTransformer
|
||||
classifier: Classifier instance (default: RandomForestClassifier)
|
||||
Parameters:
|
||||
-----------
|
||||
X : DataFrame or array
|
||||
Feature matrix
|
||||
y : Series or array
|
||||
Target variable
|
||||
numeric_features : list
|
||||
List of numeric feature names
|
||||
categorical_features : list
|
||||
List of categorical feature names
|
||||
test_size : float
|
||||
Proportion of data for testing
|
||||
random_state : int
|
||||
Random seed
|
||||
|
||||
Returns:
|
||||
Complete Pipeline
|
||||
--------
|
||||
dict
|
||||
Dictionary containing trained model, predictions, and metrics
|
||||
"""
|
||||
if classifier is None:
|
||||
classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
|
||||
# Split data with stratification
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
X, y, test_size=test_size, stratify=y, random_state=random_state
|
||||
)
|
||||
|
||||
pipeline = Pipeline(steps=[
|
||||
('preprocessor', preprocessor),
|
||||
('classifier', classifier)
|
||||
])
|
||||
print(f"Training set size: {len(X_train)}")
|
||||
print(f"Test set size: {len(X_test)}")
|
||||
print(f"Class distribution in training: {pd.Series(y_train).value_counts().to_dict()}")
|
||||
|
||||
return pipeline
|
||||
# Create preprocessor
|
||||
preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features)
|
||||
|
||||
|
||||
def evaluate_model(pipeline, X_train, y_train, X_test, y_test, cv=5):
|
||||
"""
|
||||
Evaluate model using cross-validation and test set.
|
||||
|
||||
Args:
|
||||
pipeline: Trained pipeline
|
||||
X_train, y_train: Training data
|
||||
X_test, y_test: Test data
|
||||
cv: Number of cross-validation folds
|
||||
|
||||
Returns:
|
||||
Dictionary with evaluation results
|
||||
"""
|
||||
# Cross-validation on training set
|
||||
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')
|
||||
|
||||
# Test set evaluation
|
||||
y_pred = pipeline.predict(X_test)
|
||||
test_score = pipeline.score(X_test, y_test)
|
||||
|
||||
# Get probabilities if available
|
||||
try:
|
||||
y_proba = pipeline.predict_proba(X_test)
|
||||
if len(np.unique(y_test)) == 2:
|
||||
# Binary classification
|
||||
auc = roc_auc_score(y_test, y_proba[:, 1])
|
||||
else:
|
||||
# Multiclass
|
||||
auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
|
||||
except:
|
||||
auc = None
|
||||
|
||||
results = {
|
||||
'cv_mean': cv_scores.mean(),
|
||||
'cv_std': cv_scores.std(),
|
||||
'test_score': test_score,
|
||||
'auc': auc,
|
||||
'classification_report': classification_report(y_test, y_pred),
|
||||
'confusion_matrix': confusion_matrix(y_test, y_pred)
|
||||
# Define models to compare
|
||||
models = {
|
||||
'Logistic Regression': Pipeline([
|
||||
('preprocessor', preprocessor),
|
||||
('classifier', LogisticRegression(max_iter=1000, random_state=random_state))
|
||||
]),
|
||||
'Random Forest': Pipeline([
|
||||
('preprocessor', preprocessor),
|
||||
('classifier', RandomForestClassifier(n_estimators=100, random_state=random_state))
|
||||
]),
|
||||
'Gradient Boosting': Pipeline([
|
||||
('preprocessor', preprocessor),
|
||||
('classifier', GradientBoostingClassifier(n_estimators=100, random_state=random_state))
|
||||
])
|
||||
}
|
||||
|
||||
return results
|
||||
# Compare models using cross-validation
|
||||
print("\n" + "="*60)
|
||||
print("Model Comparison (5-Fold Cross-Validation)")
|
||||
print("="*60)
|
||||
|
||||
cv_results = {}
|
||||
for name, model in models.items():
|
||||
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
|
||||
cv_results[name] = scores.mean()
|
||||
print(f"{name:20s}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
|
||||
|
||||
def tune_hyperparameters(pipeline, X_train, y_train, param_grid, cv=5):
|
||||
"""
|
||||
Perform hyperparameter tuning using GridSearchCV.
|
||||
# Select best model based on CV
|
||||
best_model_name = max(cv_results, key=cv_results.get)
|
||||
best_model = models[best_model_name]
|
||||
|
||||
Args:
|
||||
pipeline: Pipeline to tune
|
||||
X_train, y_train: Training data
|
||||
param_grid: Dictionary of parameters to search
|
||||
cv: Number of cross-validation folds
|
||||
print(f"\nBest model: {best_model_name}")
|
||||
|
||||
# Hyperparameter tuning for best model
|
||||
if best_model_name == 'Random Forest':
|
||||
param_grid = {
|
||||
'classifier__n_estimators': [100, 200],
|
||||
'classifier__max_depth': [10, 20, None],
|
||||
'classifier__min_samples_split': [2, 5]
|
||||
}
|
||||
elif best_model_name == 'Gradient Boosting':
|
||||
param_grid = {
|
||||
'classifier__n_estimators': [100, 200],
|
||||
'classifier__learning_rate': [0.01, 0.1],
|
||||
'classifier__max_depth': [3, 5]
|
||||
}
|
||||
else: # Logistic Regression
|
||||
param_grid = {
|
||||
'classifier__C': [0.1, 1.0, 10.0],
|
||||
'classifier__penalty': ['l2']
|
||||
}
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Hyperparameter Tuning")
|
||||
print("="*60)
|
||||
|
||||
Returns:
|
||||
GridSearchCV object with best model
|
||||
"""
|
||||
grid_search = GridSearchCV(
|
||||
pipeline,
|
||||
param_grid,
|
||||
cv=cv,
|
||||
scoring='f1_weighted',
|
||||
n_jobs=-1,
|
||||
verbose=1
|
||||
best_model, param_grid, cv=5, scoring='accuracy',
|
||||
n_jobs=-1, verbose=0
|
||||
)
|
||||
|
||||
grid_search.fit(X_train, y_train)
|
||||
|
||||
print(f"Best parameters: {grid_search.best_params_}")
|
||||
print(f"Best CV score: {grid_search.best_score_:.3f}")
|
||||
print(f"Best CV score: {grid_search.best_score_:.4f}")
|
||||
|
||||
return grid_search
|
||||
# Evaluate on test set
|
||||
tuned_model = grid_search.best_estimator_
|
||||
y_pred = tuned_model.predict(X_test)
|
||||
y_pred_proba = tuned_model.predict_proba(X_test)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Test Set Evaluation")
|
||||
print("="*60)
|
||||
|
||||
def main():
|
||||
"""
|
||||
Example usage of the classification pipeline.
|
||||
"""
|
||||
# Load your data here
|
||||
# X, y = load_data()
|
||||
# Calculate metrics
|
||||
accuracy = accuracy_score(y_test, y_pred)
|
||||
precision = precision_score(y_test, y_pred, average='weighted')
|
||||
recall = recall_score(y_test, y_pred, average='weighted')
|
||||
f1 = f1_score(y_test, y_pred, average='weighted')
|
||||
|
||||
# Example with synthetic data
|
||||
from sklearn.datasets import make_classification
|
||||
X, y = make_classification(
|
||||
n_samples=1000,
|
||||
n_features=20,
|
||||
n_informative=15,
|
||||
n_redundant=5,
|
||||
random_state=42
|
||||
)
|
||||
print(f"Accuracy: {accuracy:.4f}")
|
||||
print(f"Precision: {precision:.4f}")
|
||||
print(f"Recall: {recall:.4f}")
|
||||
print(f"F1-Score: {f1:.4f}")
|
||||
|
||||
# Convert to DataFrame for demonstration
|
||||
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
|
||||
X = pd.DataFrame(X, columns=feature_names)
|
||||
# ROC AUC (if binary classification)
|
||||
if len(np.unique(y)) == 2:
|
||||
roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
|
||||
print(f"ROC AUC: {roc_auc:.4f}")
|
||||
|
||||
# Split features into numeric and categorical (all numeric in this example)
|
||||
numeric_features = feature_names
|
||||
categorical_features = []
|
||||
|
||||
# Split data (use stratify for imbalanced classes)
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
X, y, test_size=0.2, random_state=42, stratify=y
|
||||
)
|
||||
|
||||
# Create preprocessing pipeline
|
||||
preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features)
|
||||
|
||||
# Create full pipeline
|
||||
pipeline = create_full_pipeline(preprocessor)
|
||||
|
||||
# Train model
|
||||
print("Training model...")
|
||||
pipeline.fit(X_train, y_train)
|
||||
|
||||
# Evaluate model
|
||||
print("\nEvaluating model...")
|
||||
results = evaluate_model(pipeline, X_train, y_train, X_test, y_test)
|
||||
|
||||
print(f"CV Accuracy: {results['cv_mean']:.3f} (+/- {results['cv_std']:.3f})")
|
||||
print(f"Test Accuracy: {results['test_score']:.3f}")
|
||||
if results['auc']:
|
||||
print(f"ROC-AUC: {results['auc']:.3f}")
|
||||
print("\nClassification Report:")
|
||||
print(results['classification_report'])
|
||||
|
||||
# Hyperparameter tuning (optional)
|
||||
print("\nTuning hyperparameters...")
|
||||
param_grid = {
|
||||
'classifier__n_estimators': [100, 200],
|
||||
'classifier__max_depth': [10, 20, None],
|
||||
'classifier__min_samples_split': [2, 5]
|
||||
}
|
||||
|
||||
grid_search = tune_hyperparameters(pipeline, X_train, y_train, param_grid)
|
||||
|
||||
# Evaluate best model
|
||||
print("\nEvaluating tuned model...")
|
||||
best_pipeline = grid_search.best_estimator_
|
||||
y_pred = best_pipeline.predict(X_test)
|
||||
print("\n" + "="*60)
|
||||
print("Classification Report")
|
||||
print("="*60)
|
||||
print(classification_report(y_test, y_pred))
|
||||
|
||||
# Save model
|
||||
print("\nSaving model...")
|
||||
joblib.dump(best_pipeline, 'best_model.pkl')
|
||||
print("Model saved as 'best_model.pkl'")
|
||||
print("\n" + "="*60)
|
||||
print("Confusion Matrix")
|
||||
print("="*60)
|
||||
print(confusion_matrix(y_test, y_pred))
|
||||
|
||||
# Feature importance (if available)
|
||||
if hasattr(tuned_model.named_steps['classifier'], 'feature_importances_'):
|
||||
print("\n" + "="*60)
|
||||
print("Top 10 Most Important Features")
|
||||
print("="*60)
|
||||
|
||||
feature_names = tuned_model.named_steps['preprocessor'].get_feature_names_out()
|
||||
importances = tuned_model.named_steps['classifier'].feature_importances_
|
||||
|
||||
feature_importance_df = pd.DataFrame({
|
||||
'feature': feature_names,
|
||||
'importance': importances
|
||||
}).sort_values('importance', ascending=False).head(10)
|
||||
|
||||
print(feature_importance_df.to_string(index=False))
|
||||
|
||||
return {
|
||||
'model': tuned_model,
|
||||
'y_test': y_test,
|
||||
'y_pred': y_pred,
|
||||
'y_pred_proba': y_pred_proba,
|
||||
'metrics': {
|
||||
'accuracy': accuracy,
|
||||
'precision': precision,
|
||||
'recall': recall,
|
||||
'f1': f1
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# Example usage
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
# Load example dataset
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
|
||||
# Load data
|
||||
data = load_breast_cancer()
|
||||
X = pd.DataFrame(data.data, columns=data.feature_names)
|
||||
y = data.target
|
||||
|
||||
# For demonstration, treat all features as numeric
|
||||
numeric_features = X.columns.tolist()
|
||||
categorical_features = []
|
||||
|
||||
print("="*60)
|
||||
print("Classification Pipeline Example")
|
||||
print("Dataset: Breast Cancer Wisconsin")
|
||||
print("="*60)
|
||||
|
||||
# Run complete pipeline
|
||||
results = train_and_evaluate_model(
|
||||
X, y, numeric_features, categorical_features,
|
||||
test_size=0.2, random_state=42
|
||||
)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Pipeline Complete!")
|
||||
print("="*60)
|
||||
|
||||
@@ -1,291 +1,386 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Clustering analysis script with multiple algorithms and evaluation.
|
||||
Demonstrates k-means, DBSCAN, and hierarchical clustering with visualization.
|
||||
Clustering analysis example with multiple algorithms, evaluation, and visualization.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
|
||||
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
|
||||
from sklearn.decomposition import PCA
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
|
||||
from sklearn.mixture import GaussianMixture
|
||||
from sklearn.metrics import (
|
||||
silhouette_score, calinski_harabasz_score, davies_bouldin_score
|
||||
)
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
|
||||
def scale_data(X):
|
||||
def preprocess_for_clustering(X, scale=True, pca_components=None):
|
||||
"""
|
||||
Scale features using StandardScaler.
|
||||
ALWAYS scale data before clustering!
|
||||
Preprocess data for clustering.
|
||||
|
||||
Args:
|
||||
X: Feature matrix
|
||||
Parameters:
|
||||
-----------
|
||||
X : array-like
|
||||
Feature matrix
|
||||
scale : bool
|
||||
Whether to standardize features
|
||||
pca_components : int or None
|
||||
Number of PCA components (None to skip PCA)
|
||||
|
||||
Returns:
|
||||
Scaled feature matrix and fitted scaler
|
||||
--------
|
||||
array
|
||||
Preprocessed data
|
||||
"""
|
||||
scaler = StandardScaler()
|
||||
X_scaled = scaler.fit_transform(X)
|
||||
return X_scaled, scaler
|
||||
X_processed = X.copy()
|
||||
|
||||
if scale:
|
||||
scaler = StandardScaler()
|
||||
X_processed = scaler.fit_transform(X_processed)
|
||||
|
||||
if pca_components is not None:
|
||||
pca = PCA(n_components=pca_components)
|
||||
X_processed = pca.fit_transform(X_processed)
|
||||
print(f"PCA: Explained variance ratio = {pca.explained_variance_ratio_.sum():.3f}")
|
||||
|
||||
return X_processed
|
||||
|
||||
|
||||
def find_optimal_k(X_scaled, k_range=range(2, 11)):
|
||||
def find_optimal_k_kmeans(X, k_range=range(2, 11)):
|
||||
"""
|
||||
Find optimal number of clusters using elbow method and silhouette score.
|
||||
Find optimal K for K-Means using elbow method and silhouette score.
|
||||
|
||||
Args:
|
||||
X_scaled: Scaled feature matrix
|
||||
k_range: Range of k values to try
|
||||
Parameters:
|
||||
-----------
|
||||
X : array-like
|
||||
Feature matrix (should be scaled)
|
||||
k_range : range
|
||||
Range of K values to test
|
||||
|
||||
Returns:
|
||||
Dictionary with inertias and silhouette scores
|
||||
--------
|
||||
dict
|
||||
Dictionary with inertia and silhouette scores for each K
|
||||
"""
|
||||
inertias = []
|
||||
silhouette_scores = []
|
||||
|
||||
for k in k_range:
|
||||
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
|
||||
labels = kmeans.fit_predict(X_scaled)
|
||||
labels = kmeans.fit_predict(X)
|
||||
|
||||
inertias.append(kmeans.inertia_)
|
||||
silhouette_scores.append(silhouette_score(X_scaled, labels))
|
||||
silhouette_scores.append(silhouette_score(X, labels))
|
||||
|
||||
# Plot results
|
||||
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
|
||||
|
||||
# Elbow plot
|
||||
ax1.plot(k_range, inertias, 'bo-')
|
||||
ax1.set_xlabel('Number of clusters (K)')
|
||||
ax1.set_ylabel('Inertia')
|
||||
ax1.set_title('Elbow Method')
|
||||
ax1.grid(True)
|
||||
|
||||
# Silhouette plot
|
||||
ax2.plot(k_range, silhouette_scores, 'ro-')
|
||||
ax2.set_xlabel('Number of clusters (K)')
|
||||
ax2.set_ylabel('Silhouette Score')
|
||||
ax2.set_title('Silhouette Analysis')
|
||||
ax2.grid(True)
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig('clustering_optimization.png', dpi=300, bbox_inches='tight')
|
||||
print("Saved: clustering_optimization.png")
|
||||
plt.close()
|
||||
|
||||
# Find best K based on silhouette score
|
||||
best_k = k_range[np.argmax(silhouette_scores)]
|
||||
print(f"\nRecommended K based on silhouette score: {best_k}")
|
||||
|
||||
return {
|
||||
'k_values': list(k_range),
|
||||
'inertias': inertias,
|
||||
'silhouette_scores': silhouette_scores
|
||||
'silhouette_scores': silhouette_scores,
|
||||
'best_k': best_k
|
||||
}
|
||||
|
||||
|
||||
def plot_elbow_silhouette(results):
|
||||
def compare_clustering_algorithms(X, n_clusters=3):
|
||||
"""
|
||||
Plot elbow method and silhouette scores.
|
||||
Compare different clustering algorithms.
|
||||
|
||||
Args:
|
||||
results: Dictionary from find_optimal_k
|
||||
"""
|
||||
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
|
||||
|
||||
# Elbow plot
|
||||
ax1.plot(results['k_values'], results['inertias'], 'bo-')
|
||||
ax1.set_xlabel('Number of clusters (k)')
|
||||
ax1.set_ylabel('Inertia')
|
||||
ax1.set_title('Elbow Method')
|
||||
ax1.grid(True, alpha=0.3)
|
||||
|
||||
# Silhouette plot
|
||||
ax2.plot(results['k_values'], results['silhouette_scores'], 'ro-')
|
||||
ax2.set_xlabel('Number of clusters (k)')
|
||||
ax2.set_ylabel('Silhouette Score')
|
||||
ax2.set_title('Silhouette Score vs k')
|
||||
ax2.grid(True, alpha=0.3)
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig('elbow_silhouette.png', dpi=300, bbox_inches='tight')
|
||||
print("Saved elbow and silhouette plots to 'elbow_silhouette.png'")
|
||||
plt.close()
|
||||
|
||||
|
||||
def evaluate_clustering(X_scaled, labels, algorithm_name):
|
||||
"""
|
||||
Evaluate clustering using multiple metrics.
|
||||
|
||||
Args:
|
||||
X_scaled: Scaled feature matrix
|
||||
labels: Cluster labels
|
||||
algorithm_name: Name of clustering algorithm
|
||||
Parameters:
|
||||
-----------
|
||||
X : array-like
|
||||
Feature matrix (should be scaled)
|
||||
n_clusters : int
|
||||
Number of clusters
|
||||
|
||||
Returns:
|
||||
Dictionary with evaluation metrics
|
||||
--------
|
||||
dict
|
||||
Dictionary with results for each algorithm
|
||||
"""
|
||||
# Filter out noise points for DBSCAN (-1 labels)
|
||||
mask = labels != -1
|
||||
X_filtered = X_scaled[mask]
|
||||
labels_filtered = labels[mask]
|
||||
print("="*60)
|
||||
print(f"Comparing Clustering Algorithms (n_clusters={n_clusters})")
|
||||
print("="*60)
|
||||
|
||||
n_clusters = len(set(labels_filtered))
|
||||
n_noise = list(labels).count(-1)
|
||||
|
||||
results = {
|
||||
'algorithm': algorithm_name,
|
||||
'n_clusters': n_clusters,
|
||||
'n_noise': n_noise
|
||||
algorithms = {
|
||||
'K-Means': KMeans(n_clusters=n_clusters, random_state=42, n_init=10),
|
||||
'Agglomerative': AgglomerativeClustering(n_clusters=n_clusters, linkage='ward'),
|
||||
'Gaussian Mixture': GaussianMixture(n_components=n_clusters, random_state=42)
|
||||
}
|
||||
|
||||
# Calculate metrics if we have valid clusters
|
||||
if n_clusters > 1:
|
||||
results['silhouette'] = silhouette_score(X_filtered, labels_filtered)
|
||||
results['davies_bouldin'] = davies_bouldin_score(X_filtered, labels_filtered)
|
||||
results['calinski_harabasz'] = calinski_harabasz_score(X_filtered, labels_filtered)
|
||||
# DBSCAN doesn't require n_clusters
|
||||
# We'll add it separately
|
||||
dbscan = DBSCAN(eps=0.5, min_samples=5)
|
||||
dbscan_labels = dbscan.fit_predict(X)
|
||||
|
||||
results = {}
|
||||
|
||||
for name, algorithm in algorithms.items():
|
||||
labels = algorithm.fit_predict(X)
|
||||
|
||||
# Calculate metrics
|
||||
silhouette = silhouette_score(X, labels)
|
||||
calinski = calinski_harabasz_score(X, labels)
|
||||
davies = davies_bouldin_score(X, labels)
|
||||
|
||||
results[name] = {
|
||||
'labels': labels,
|
||||
'n_clusters': n_clusters,
|
||||
'silhouette': silhouette,
|
||||
'calinski_harabasz': calinski,
|
||||
'davies_bouldin': davies
|
||||
}
|
||||
|
||||
print(f"\n{name}:")
|
||||
print(f" Silhouette Score: {silhouette:.4f} (higher is better)")
|
||||
print(f" Calinski-Harabasz: {calinski:.4f} (higher is better)")
|
||||
print(f" Davies-Bouldin: {davies:.4f} (lower is better)")
|
||||
|
||||
# DBSCAN results
|
||||
n_clusters_dbscan = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
|
||||
n_noise = list(dbscan_labels).count(-1)
|
||||
|
||||
if n_clusters_dbscan > 1:
|
||||
# Only calculate metrics if we have multiple clusters
|
||||
mask = dbscan_labels != -1 # Exclude noise
|
||||
if mask.sum() > 0:
|
||||
silhouette = silhouette_score(X[mask], dbscan_labels[mask])
|
||||
calinski = calinski_harabasz_score(X[mask], dbscan_labels[mask])
|
||||
davies = davies_bouldin_score(X[mask], dbscan_labels[mask])
|
||||
|
||||
results['DBSCAN'] = {
|
||||
'labels': dbscan_labels,
|
||||
'n_clusters': n_clusters_dbscan,
|
||||
'n_noise': n_noise,
|
||||
'silhouette': silhouette,
|
||||
'calinski_harabasz': calinski,
|
||||
'davies_bouldin': davies
|
||||
}
|
||||
|
||||
print(f"\nDBSCAN:")
|
||||
print(f" Clusters found: {n_clusters_dbscan}")
|
||||
print(f" Noise points: {n_noise}")
|
||||
print(f" Silhouette Score: {silhouette:.4f} (higher is better)")
|
||||
print(f" Calinski-Harabasz: {calinski:.4f} (higher is better)")
|
||||
print(f" Davies-Bouldin: {davies:.4f} (lower is better)")
|
||||
else:
|
||||
results['silhouette'] = None
|
||||
results['davies_bouldin'] = None
|
||||
results['calinski_harabasz'] = None
|
||||
print(f"\nDBSCAN:")
|
||||
print(f" Clusters found: {n_clusters_dbscan}")
|
||||
print(f" Noise points: {n_noise}")
|
||||
print(" Note: Insufficient clusters for metric calculation")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def perform_kmeans(X_scaled, n_clusters=3):
|
||||
def visualize_clusters(X, results, true_labels=None):
|
||||
"""
|
||||
Perform k-means clustering.
|
||||
Visualize clustering results using PCA for 2D projection.
|
||||
|
||||
Args:
|
||||
X_scaled: Scaled feature matrix
|
||||
n_clusters: Number of clusters
|
||||
|
||||
Returns:
|
||||
Fitted KMeans model and labels
|
||||
Parameters:
|
||||
-----------
|
||||
X : array-like
|
||||
Feature matrix
|
||||
results : dict
|
||||
Dictionary with clustering results
|
||||
true_labels : array-like or None
|
||||
True labels (if available) for comparison
|
||||
"""
|
||||
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
|
||||
labels = kmeans.fit_predict(X_scaled)
|
||||
return kmeans, labels
|
||||
# Reduce to 2D using PCA
|
||||
pca = PCA(n_components=2)
|
||||
X_2d = pca.fit_transform(X)
|
||||
|
||||
# Determine number of subplots
|
||||
n_plots = len(results)
|
||||
if true_labels is not None:
|
||||
n_plots += 1
|
||||
|
||||
def perform_dbscan(X_scaled, eps=0.5, min_samples=5):
|
||||
"""
|
||||
Perform DBSCAN clustering.
|
||||
n_cols = min(3, n_plots)
|
||||
n_rows = (n_plots + n_cols - 1) // n_cols
|
||||
|
||||
Args:
|
||||
X_scaled: Scaled feature matrix
|
||||
eps: Maximum distance between neighbors
|
||||
min_samples: Minimum points to form dense region
|
||||
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows))
|
||||
if n_plots == 1:
|
||||
axes = np.array([axes])
|
||||
axes = axes.flatten()
|
||||
|
||||
Returns:
|
||||
Fitted DBSCAN model and labels
|
||||
"""
|
||||
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
|
||||
labels = dbscan.fit_predict(X_scaled)
|
||||
return dbscan, labels
|
||||
plot_idx = 0
|
||||
|
||||
# Plot true labels if available
|
||||
if true_labels is not None:
|
||||
ax = axes[plot_idx]
|
||||
scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=true_labels, cmap='viridis', alpha=0.6)
|
||||
ax.set_title('True Labels')
|
||||
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})')
|
||||
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
|
||||
plt.colorbar(scatter, ax=ax)
|
||||
plot_idx += 1
|
||||
|
||||
def perform_hierarchical(X_scaled, n_clusters=3, linkage='ward'):
|
||||
"""
|
||||
Perform hierarchical clustering.
|
||||
# Plot clustering results
|
||||
for name, result in results.items():
|
||||
ax = axes[plot_idx]
|
||||
labels = result['labels']
|
||||
|
||||
Args:
|
||||
X_scaled: Scaled feature matrix
|
||||
n_clusters: Number of clusters
|
||||
linkage: Linkage criterion ('ward', 'complete', 'average', 'single')
|
||||
scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='viridis', alpha=0.6)
|
||||
|
||||
Returns:
|
||||
Fitted AgglomerativeClustering model and labels
|
||||
"""
|
||||
hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
|
||||
labels = hierarchical.fit_predict(X_scaled)
|
||||
return hierarchical, labels
|
||||
# Highlight noise points for DBSCAN
|
||||
if name == 'DBSCAN' and -1 in labels:
|
||||
noise_mask = labels == -1
|
||||
ax.scatter(X_2d[noise_mask, 0], X_2d[noise_mask, 1],
|
||||
c='red', marker='x', s=100, label='Noise', alpha=0.8)
|
||||
ax.legend()
|
||||
|
||||
title = f"{name} (K={result['n_clusters']})"
|
||||
if 'silhouette' in result:
|
||||
title += f"\nSilhouette: {result['silhouette']:.3f}"
|
||||
ax.set_title(title)
|
||||
ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})')
|
||||
ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
|
||||
plt.colorbar(scatter, ax=ax)
|
||||
|
||||
def visualize_clusters_2d(X_scaled, labels, algorithm_name, method='pca'):
|
||||
"""
|
||||
Visualize clusters in 2D using PCA or t-SNE.
|
||||
plot_idx += 1
|
||||
|
||||
Args:
|
||||
X_scaled: Scaled feature matrix
|
||||
labels: Cluster labels
|
||||
algorithm_name: Name of algorithm for title
|
||||
method: 'pca' or 'tsne'
|
||||
"""
|
||||
# Reduce to 2D
|
||||
if method == 'pca':
|
||||
pca = PCA(n_components=2, random_state=42)
|
||||
X_2d = pca.fit_transform(X_scaled)
|
||||
variance = pca.explained_variance_ratio_
|
||||
xlabel = f'PC1 ({variance[0]:.1%} variance)'
|
||||
ylabel = f'PC2 ({variance[1]:.1%} variance)'
|
||||
else:
|
||||
from sklearn.manifold import TSNE
|
||||
# Use PCA first to speed up t-SNE
|
||||
pca = PCA(n_components=min(50, X_scaled.shape[1]), random_state=42)
|
||||
X_pca = pca.fit_transform(X_scaled)
|
||||
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
|
||||
X_2d = tsne.fit_transform(X_pca)
|
||||
xlabel = 't-SNE 1'
|
||||
ylabel = 't-SNE 2'
|
||||
# Hide unused subplots
|
||||
for idx in range(plot_idx, len(axes)):
|
||||
axes[idx].axis('off')
|
||||
|
||||
# Plot
|
||||
plt.figure(figsize=(10, 8))
|
||||
scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='viridis', alpha=0.6, s=50)
|
||||
plt.colorbar(scatter, label='Cluster')
|
||||
plt.xlabel(xlabel)
|
||||
plt.ylabel(ylabel)
|
||||
plt.title(f'{algorithm_name} Clustering ({method.upper()})')
|
||||
plt.grid(True, alpha=0.3)
|
||||
|
||||
filename = f'{algorithm_name.lower().replace(" ", "_")}_{method}.png'
|
||||
plt.savefig(filename, dpi=300, bbox_inches='tight')
|
||||
print(f"Saved visualization to '{filename}'")
|
||||
plt.tight_layout()
|
||||
plt.savefig('clustering_results.png', dpi=300, bbox_inches='tight')
|
||||
print("\nSaved: clustering_results.png")
|
||||
plt.close()
|
||||
|
||||
|
||||
def main():
|
||||
def complete_clustering_analysis(X, true_labels=None, scale=True,
|
||||
find_k=True, k_range=range(2, 11), n_clusters=3):
|
||||
"""
|
||||
Example clustering analysis workflow.
|
||||
Complete clustering analysis workflow.
|
||||
|
||||
Parameters:
|
||||
-----------
|
||||
X : array-like
|
||||
Feature matrix
|
||||
true_labels : array-like or None
|
||||
True labels (for comparison only, not used in clustering)
|
||||
scale : bool
|
||||
Whether to scale features
|
||||
find_k : bool
|
||||
Whether to search for optimal K
|
||||
k_range : range
|
||||
Range of K values to test
|
||||
n_clusters : int
|
||||
Number of clusters to use in comparison
|
||||
|
||||
Returns:
|
||||
--------
|
||||
dict
|
||||
Dictionary with all analysis results
|
||||
"""
|
||||
# Load your data here
|
||||
# X = load_data()
|
||||
print("="*60)
|
||||
print("Clustering Analysis")
|
||||
print("="*60)
|
||||
print(f"Data shape: {X.shape}")
|
||||
|
||||
# Example with synthetic data
|
||||
from sklearn.datasets import make_blobs
|
||||
X, y_true = make_blobs(
|
||||
n_samples=500,
|
||||
n_features=10,
|
||||
centers=4,
|
||||
cluster_std=1.0,
|
||||
random_state=42
|
||||
)
|
||||
# Preprocess data
|
||||
X_processed = preprocess_for_clustering(X, scale=scale)
|
||||
|
||||
print(f"Dataset shape: {X.shape}")
|
||||
# Find optimal K if requested
|
||||
optimization_results = None
|
||||
if find_k:
|
||||
print("\n" + "="*60)
|
||||
print("Finding Optimal Number of Clusters")
|
||||
print("="*60)
|
||||
optimization_results = find_optimal_k_kmeans(X_processed, k_range=k_range)
|
||||
|
||||
# Scale data (ALWAYS scale for clustering!)
|
||||
print("\nScaling data...")
|
||||
X_scaled, scaler = scale_data(X)
|
||||
# Use recommended K
|
||||
if optimization_results:
|
||||
n_clusters = optimization_results['best_k']
|
||||
|
||||
# Find optimal k
|
||||
print("\nFinding optimal number of clusters...")
|
||||
results = find_optimal_k(X_scaled)
|
||||
plot_elbow_silhouette(results)
|
||||
# Compare clustering algorithms
|
||||
comparison_results = compare_clustering_algorithms(X_processed, n_clusters=n_clusters)
|
||||
|
||||
# Based on elbow/silhouette, choose optimal k
|
||||
optimal_k = 4 # Adjust based on plots
|
||||
|
||||
# Perform k-means
|
||||
print(f"\nPerforming k-means with k={optimal_k}...")
|
||||
kmeans, kmeans_labels = perform_kmeans(X_scaled, n_clusters=optimal_k)
|
||||
kmeans_results = evaluate_clustering(X_scaled, kmeans_labels, 'K-Means')
|
||||
|
||||
# Perform DBSCAN
|
||||
print("\nPerforming DBSCAN...")
|
||||
dbscan, dbscan_labels = perform_dbscan(X_scaled, eps=0.5, min_samples=5)
|
||||
dbscan_results = evaluate_clustering(X_scaled, dbscan_labels, 'DBSCAN')
|
||||
|
||||
# Perform hierarchical clustering
|
||||
print("\nPerforming hierarchical clustering...")
|
||||
hierarchical, hier_labels = perform_hierarchical(X_scaled, n_clusters=optimal_k)
|
||||
hier_results = evaluate_clustering(X_scaled, hier_labels, 'Hierarchical')
|
||||
|
||||
# Print results
|
||||
# Visualize results
|
||||
print("\n" + "="*60)
|
||||
print("CLUSTERING RESULTS")
|
||||
print("Visualizing Results")
|
||||
print("="*60)
|
||||
visualize_clusters(X_processed, comparison_results, true_labels=true_labels)
|
||||
|
||||
return {
|
||||
'X_processed': X_processed,
|
||||
'optimization': optimization_results,
|
||||
'comparison': comparison_results
|
||||
}
|
||||
|
||||
|
||||
# Example usage
|
||||
if __name__ == "__main__":
|
||||
from sklearn.datasets import load_iris, make_blobs
|
||||
|
||||
print("="*60)
|
||||
print("Example 1: Iris Dataset")
|
||||
print("="*60)
|
||||
|
||||
for results in [kmeans_results, dbscan_results, hier_results]:
|
||||
print(f"\n{results['algorithm']}:")
|
||||
print(f" Clusters: {results['n_clusters']}")
|
||||
if results['n_noise'] > 0:
|
||||
print(f" Noise points: {results['n_noise']}")
|
||||
if results['silhouette']:
|
||||
print(f" Silhouette Score: {results['silhouette']:.3f}")
|
||||
print(f" Davies-Bouldin Index: {results['davies_bouldin']:.3f} (lower is better)")
|
||||
print(f" Calinski-Harabasz Index: {results['calinski_harabasz']:.1f} (higher is better)")
|
||||
# Load Iris dataset
|
||||
iris = load_iris()
|
||||
X_iris = iris.data
|
||||
y_iris = iris.target
|
||||
|
||||
# Visualize clusters
|
||||
print("\nCreating visualizations...")
|
||||
visualize_clusters_2d(X_scaled, kmeans_labels, 'K-Means', method='pca')
|
||||
visualize_clusters_2d(X_scaled, dbscan_labels, 'DBSCAN', method='pca')
|
||||
visualize_clusters_2d(X_scaled, hier_labels, 'Hierarchical', method='pca')
|
||||
results_iris = complete_clustering_analysis(
|
||||
X_iris,
|
||||
true_labels=y_iris,
|
||||
scale=True,
|
||||
find_k=True,
|
||||
k_range=range(2, 8),
|
||||
n_clusters=3
|
||||
)
|
||||
|
||||
print("\nClustering analysis complete!")
|
||||
print("\n" + "="*60)
|
||||
print("Example 2: Synthetic Dataset with Noise")
|
||||
print("="*60)
|
||||
|
||||
# Create synthetic dataset
|
||||
X_synth, y_synth = make_blobs(
|
||||
n_samples=500, n_features=2, centers=4,
|
||||
cluster_std=0.5, random_state=42
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
# Add noise points
|
||||
noise = np.random.randn(50, 2) * 3
|
||||
X_synth = np.vstack([X_synth, noise])
|
||||
y_synth_with_noise = np.concatenate([y_synth, np.full(50, -1)])
|
||||
|
||||
results_synth = complete_clustering_analysis(
|
||||
X_synth,
|
||||
true_labels=y_synth_with_noise,
|
||||
scale=True,
|
||||
find_k=True,
|
||||
k_range=range(2, 8),
|
||||
n_clusters=4
|
||||
)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Analysis Complete!")
|
||||
print("="*60)
|
||||
|
||||
Reference in New Issue
Block a user