#!/usr/bin/env python3 """ Clustering analysis script with multiple algorithms and evaluation. Demonstrates k-means, DBSCAN, and hierarchical clustering with visualization. """ import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score from sklearn.decomposition import PCA import matplotlib.pyplot as plt import seaborn as sns def scale_data(X): """ Scale features using StandardScaler. ALWAYS scale data before clustering! Args: X: Feature matrix Returns: Scaled feature matrix and fitted scaler """ scaler = StandardScaler() X_scaled = scaler.fit_transform(X) return X_scaled, scaler def find_optimal_k(X_scaled, k_range=range(2, 11)): """ Find optimal number of clusters using elbow method and silhouette score. Args: X_scaled: Scaled feature matrix k_range: Range of k values to try Returns: Dictionary with inertias and silhouette scores """ inertias = [] silhouette_scores = [] for k in k_range: kmeans = KMeans(n_clusters=k, random_state=42, n_init=10) labels = kmeans.fit_predict(X_scaled) inertias.append(kmeans.inertia_) silhouette_scores.append(silhouette_score(X_scaled, labels)) return { 'k_values': list(k_range), 'inertias': inertias, 'silhouette_scores': silhouette_scores } def plot_elbow_silhouette(results): """ Plot elbow method and silhouette scores. Args: results: Dictionary from find_optimal_k """ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5)) # Elbow plot ax1.plot(results['k_values'], results['inertias'], 'bo-') ax1.set_xlabel('Number of clusters (k)') ax1.set_ylabel('Inertia') ax1.set_title('Elbow Method') ax1.grid(True, alpha=0.3) # Silhouette plot ax2.plot(results['k_values'], results['silhouette_scores'], 'ro-') ax2.set_xlabel('Number of clusters (k)') ax2.set_ylabel('Silhouette Score') ax2.set_title('Silhouette Score vs k') ax2.grid(True, alpha=0.3) plt.tight_layout() plt.savefig('elbow_silhouette.png', dpi=300, bbox_inches='tight') print("Saved elbow and silhouette plots to 'elbow_silhouette.png'") plt.close() def evaluate_clustering(X_scaled, labels, algorithm_name): """ Evaluate clustering using multiple metrics. Args: X_scaled: Scaled feature matrix labels: Cluster labels algorithm_name: Name of clustering algorithm Returns: Dictionary with evaluation metrics """ # Filter out noise points for DBSCAN (-1 labels) mask = labels != -1 X_filtered = X_scaled[mask] labels_filtered = labels[mask] n_clusters = len(set(labels_filtered)) n_noise = list(labels).count(-1) results = { 'algorithm': algorithm_name, 'n_clusters': n_clusters, 'n_noise': n_noise } # Calculate metrics if we have valid clusters if n_clusters > 1: results['silhouette'] = silhouette_score(X_filtered, labels_filtered) results['davies_bouldin'] = davies_bouldin_score(X_filtered, labels_filtered) results['calinski_harabasz'] = calinski_harabasz_score(X_filtered, labels_filtered) else: results['silhouette'] = None results['davies_bouldin'] = None results['calinski_harabasz'] = None return results def perform_kmeans(X_scaled, n_clusters=3): """ Perform k-means clustering. Args: X_scaled: Scaled feature matrix n_clusters: Number of clusters Returns: Fitted KMeans model and labels """ kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) labels = kmeans.fit_predict(X_scaled) return kmeans, labels def perform_dbscan(X_scaled, eps=0.5, min_samples=5): """ Perform DBSCAN clustering. Args: X_scaled: Scaled feature matrix eps: Maximum distance between neighbors min_samples: Minimum points to form dense region Returns: Fitted DBSCAN model and labels """ dbscan = DBSCAN(eps=eps, min_samples=min_samples) labels = dbscan.fit_predict(X_scaled) return dbscan, labels def perform_hierarchical(X_scaled, n_clusters=3, linkage='ward'): """ Perform hierarchical clustering. Args: X_scaled: Scaled feature matrix n_clusters: Number of clusters linkage: Linkage criterion ('ward', 'complete', 'average', 'single') Returns: Fitted AgglomerativeClustering model and labels """ hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage) labels = hierarchical.fit_predict(X_scaled) return hierarchical, labels def visualize_clusters_2d(X_scaled, labels, algorithm_name, method='pca'): """ Visualize clusters in 2D using PCA or t-SNE. Args: X_scaled: Scaled feature matrix labels: Cluster labels algorithm_name: Name of algorithm for title method: 'pca' or 'tsne' """ # Reduce to 2D if method == 'pca': pca = PCA(n_components=2, random_state=42) X_2d = pca.fit_transform(X_scaled) variance = pca.explained_variance_ratio_ xlabel = f'PC1 ({variance[0]:.1%} variance)' ylabel = f'PC2 ({variance[1]:.1%} variance)' else: from sklearn.manifold import TSNE # Use PCA first to speed up t-SNE pca = PCA(n_components=min(50, X_scaled.shape[1]), random_state=42) X_pca = pca.fit_transform(X_scaled) tsne = TSNE(n_components=2, random_state=42, perplexity=30) X_2d = tsne.fit_transform(X_pca) xlabel = 't-SNE 1' ylabel = 't-SNE 2' # Plot plt.figure(figsize=(10, 8)) scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='viridis', alpha=0.6, s=50) plt.colorbar(scatter, label='Cluster') plt.xlabel(xlabel) plt.ylabel(ylabel) plt.title(f'{algorithm_name} Clustering ({method.upper()})') plt.grid(True, alpha=0.3) filename = f'{algorithm_name.lower().replace(" ", "_")}_{method}.png' plt.savefig(filename, dpi=300, bbox_inches='tight') print(f"Saved visualization to '{filename}'") plt.close() def main(): """ Example clustering analysis workflow. """ # Load your data here # X = load_data() # Example with synthetic data from sklearn.datasets import make_blobs X, y_true = make_blobs( n_samples=500, n_features=10, centers=4, cluster_std=1.0, random_state=42 ) print(f"Dataset shape: {X.shape}") # Scale data (ALWAYS scale for clustering!) print("\nScaling data...") X_scaled, scaler = scale_data(X) # Find optimal k print("\nFinding optimal number of clusters...") results = find_optimal_k(X_scaled) plot_elbow_silhouette(results) # Based on elbow/silhouette, choose optimal k optimal_k = 4 # Adjust based on plots # Perform k-means print(f"\nPerforming k-means with k={optimal_k}...") kmeans, kmeans_labels = perform_kmeans(X_scaled, n_clusters=optimal_k) kmeans_results = evaluate_clustering(X_scaled, kmeans_labels, 'K-Means') # Perform DBSCAN print("\nPerforming DBSCAN...") dbscan, dbscan_labels = perform_dbscan(X_scaled, eps=0.5, min_samples=5) dbscan_results = evaluate_clustering(X_scaled, dbscan_labels, 'DBSCAN') # Perform hierarchical clustering print("\nPerforming hierarchical clustering...") hierarchical, hier_labels = perform_hierarchical(X_scaled, n_clusters=optimal_k) hier_results = evaluate_clustering(X_scaled, hier_labels, 'Hierarchical') # Print results print("\n" + "="*60) print("CLUSTERING RESULTS") print("="*60) for results in [kmeans_results, dbscan_results, hier_results]: print(f"\n{results['algorithm']}:") print(f" Clusters: {results['n_clusters']}") if results['n_noise'] > 0: print(f" Noise points: {results['n_noise']}") if results['silhouette']: print(f" Silhouette Score: {results['silhouette']:.3f}") print(f" Davies-Bouldin Index: {results['davies_bouldin']:.3f} (lower is better)") print(f" Calinski-Harabasz Index: {results['calinski_harabasz']:.1f} (higher is better)") # Visualize clusters print("\nCreating visualizations...") visualize_clusters_2d(X_scaled, kmeans_labels, 'K-Means', method='pca') visualize_clusters_2d(X_scaled, dbscan_labels, 'DBSCAN', method='pca') visualize_clusters_2d(X_scaled, hier_labels, 'Hierarchical', method='pca') print("\nClustering analysis complete!") if __name__ == "__main__": main()