Improve the scikit-learn skill

2026-03-28 07:33:45 +08:00 · 2025-11-04 10:11:46 -08:00
parent 63a4293f1a
commit 4ad4f9970f
10 changed files with 3293 additions and 3606 deletions
--- a/scientific-packages/scikit-learn/scripts/classification_pipeline.py
+++ b/scientific-packages/scikit-learn/scripts/classification_pipeline.py
@@ -1,219 +1,257 @@
-#!/usr/bin/env python3
 """
-Complete classification pipeline with preprocessing, training, evaluation, and hyperparameter tuning.
-Demonstrates best practices for scikit-learn workflows.
+Complete classification pipeline example with preprocessing, model training,
+hyperparameter tuning, and evaluation.
 """

 import numpy as np
 import pandas as pd
-from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
+from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.impute import SimpleImputer
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
-import joblib
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import (
+    classification_report, confusion_matrix, roc_auc_score,
+    accuracy_score, precision_score, recall_score, f1_score
+)
+import warnings
+warnings.filterwarnings('ignore')


 def create_preprocessing_pipeline(numeric_features, categorical_features):
    """
-    Create preprocessing pipeline for mixed data types.
+    Create a preprocessing pipeline for mixed data types.

-    Args:
-        numeric_features: List of numeric column names
-        categorical_features: List of categorical column names
+    Parameters:
+    -----------
+    numeric_features : list
+        List of numeric feature column names
+    categorical_features : list
+        List of categorical feature column names

    Returns:
-        ColumnTransformer with appropriate preprocessing for each data type
+    --------
+    ColumnTransformer
+        Preprocessing pipeline
    """
+    # Numeric preprocessing
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

+    # Categorical preprocessing
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
-        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
+        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

+    # Combine transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
-        ])
+        ]
+    )

    return preprocessor


-def create_full_pipeline(preprocessor, classifier=None):
+def train_and_evaluate_model(X, y, numeric_features, categorical_features,
+                             test_size=0.2, random_state=42):
    """
-    Create complete ML pipeline with preprocessing and classification.
+    Complete pipeline: preprocess, train, tune, and evaluate a classifier.

-    Args:
-        preprocessor: Preprocessing ColumnTransformer
-        classifier: Classifier instance (default: RandomForestClassifier)
+    Parameters:
+    -----------
+    X : DataFrame or array
+        Feature matrix (must be a DataFrame when features are given by column name)
+    y : Series or array
+        Target variable
+    numeric_features : list
+        List of numeric feature names
+    categorical_features : list
+        List of categorical feature names
+    test_size : float
+        Proportion of data for testing
+    random_state : int
+        Random seed

    Returns:
-        Complete Pipeline
+    --------
+    dict
+        Dictionary containing trained model, predictions, and metrics
    """
-    if classifier is None:
-        classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
+    # Split data with stratification
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=test_size, stratify=y, random_state=random_state
+    )

-    pipeline = Pipeline(steps=[
-        ('preprocessor', preprocessor),
-        ('classifier', classifier)
-    ])
+    print(f"Training set size: {len(X_train)}")
+    print(f"Test set size: {len(X_test)}")
+    print(f"Class distribution in training: {pd.Series(y_train).value_counts().to_dict()}")

-    return pipeline
+    # Create preprocessor
+    preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features)

-
-def evaluate_model(pipeline, X_train, y_train, X_test, y_test, cv=5):
-    """
-    Evaluate model using cross-validation and test set.
-
-    Args:
-        pipeline: Trained pipeline
-        X_train, y_train: Training data
-        X_test, y_test: Test data
-        cv: Number of cross-validation folds
-
-    Returns:
-        Dictionary with evaluation results
-    """
-    # Cross-validation on training set
-    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')
-
-    # Test set evaluation
-    y_pred = pipeline.predict(X_test)
-    test_score = pipeline.score(X_test, y_test)
-
-    # Get probabilities if available
-    try:
-        y_proba = pipeline.predict_proba(X_test)
-        if len(np.unique(y_test)) == 2:
-            # Binary classification
-            auc = roc_auc_score(y_test, y_proba[:, 1])
-        else:
-            # Multiclass
-            auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
-    except:
-        auc = None
-
-    results = {
-        'cv_mean': cv_scores.mean(),
-        'cv_std': cv_scores.std(),
-        'test_score': test_score,
-        'auc': auc,
-        'classification_report': classification_report(y_test, y_pred),
-        'confusion_matrix': confusion_matrix(y_test, y_pred)
+    # Define models to compare
+    models = {
+        'Logistic Regression': Pipeline([
+            ('preprocessor', preprocessor),
+            ('classifier', LogisticRegression(max_iter=1000, random_state=random_state))
+        ]),
+        'Random Forest': Pipeline([
+            ('preprocessor', preprocessor),
+            ('classifier', RandomForestClassifier(n_estimators=100, random_state=random_state))
+        ]),
+        'Gradient Boosting': Pipeline([
+            ('preprocessor', preprocessor),
+            ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=random_state))
+        ])
    }

-    return results
+    # Compare models using cross-validation
+    print("\n" + "="*60)
+    print("Model Comparison (5-Fold Cross-Validation)")
+    print("="*60)

+    cv_results = {}
+    for name, model in models.items():
+        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
+        cv_results[name] = scores.mean()
+        print(f"{name:20s}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

-def tune_hyperparameters(pipeline, X_train, y_train, param_grid, cv=5):
-    """
-    Perform hyperparameter tuning using GridSearchCV.
+    # Select best model based on CV
+    best_model_name = max(cv_results, key=cv_results.get)
+    best_model = models[best_model_name]

-    Args:
-        pipeline: Pipeline to tune
-        X_train, y_train: Training data
-        param_grid: Dictionary of parameters to search
-        cv: Number of cross-validation folds
+    print(f"\nBest model: {best_model_name}")
+
+    # Hyperparameter tuning for best model
+    if best_model_name == 'Random Forest':
+        param_grid = {
+            'classifier__n_estimators': [100, 200],
+            'classifier__max_depth': [10, 20, None],
+            'classifier__min_samples_split': [2, 5]
+        }
+    elif best_model_name == 'Gradient Boosting':
+        param_grid = {
+            'classifier__n_estimators': [100, 200],
+            'classifier__learning_rate': [0.01, 0.1],
+            'classifier__max_depth': [3, 5]
+        }
+    else:  # Logistic Regression
+        param_grid = {
+            'classifier__C': [0.1, 1.0, 10.0],
+            'classifier__penalty': ['l2']
+        }
+
+    print("\n" + "="*60)
+    print("Hyperparameter Tuning")
+    print("="*60)

-    Returns:
-        GridSearchCV object with best model
-    """
    grid_search = GridSearchCV(
-        pipeline,
-        param_grid,
-        cv=cv,
-        scoring='f1_weighted',
-        n_jobs=-1,
-        verbose=1
+        best_model, param_grid, cv=5, scoring='accuracy',
+        n_jobs=-1, verbose=0
    )

    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
-    print(f"Best CV score: {grid_search.best_score_:.3f}")
+    print(f"Best CV score: {grid_search.best_score_:.4f}")

-    return grid_search
+    # Evaluate on test set
+    tuned_model = grid_search.best_estimator_
+    y_pred = tuned_model.predict(X_test)
+    y_pred_proba = tuned_model.predict_proba(X_test)

+    print("\n" + "="*60)
+    print("Test Set Evaluation")
+    print("="*60)

-def main():
-    """
-    Example usage of the classification pipeline.
-    """
-    # Load your data here
-    # X, y = load_data()
+    # Calculate metrics
+    accuracy = accuracy_score(y_test, y_pred)
+    precision = precision_score(y_test, y_pred, average='weighted')
+    recall = recall_score(y_test, y_pred, average='weighted')
+    f1 = f1_score(y_test, y_pred, average='weighted')

-    # Example with synthetic data
-    from sklearn.datasets import make_classification
-    X, y = make_classification(
-        n_samples=1000,
-        n_features=20,
-        n_informative=15,
-        n_redundant=5,
-        random_state=42
-    )
+    print(f"Accuracy:  {accuracy:.4f}")
+    print(f"Precision: {precision:.4f}")
+    print(f"Recall:    {recall:.4f}")
+    print(f"F1-Score:  {f1:.4f}")

-    # Convert to DataFrame for demonstration
-    feature_names = [f'feature_{i}' for i in range(X.shape[1])]
-    X = pd.DataFrame(X, columns=feature_names)
+    # ROC AUC (if binary classification)
+    if len(np.unique(y)) == 2:
+        roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
+        print(f"ROC AUC:   {roc_auc:.4f}")

-    # Split features into numeric and categorical (all numeric in this example)
-    numeric_features = feature_names
-    categorical_features = []
-
-    # Split data (use stratify for imbalanced classes)
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=0.2, random_state=42, stratify=y
-    )
-
-    # Create preprocessing pipeline
-    preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features)
-
-    # Create full pipeline
-    pipeline = create_full_pipeline(preprocessor)
-
-    # Train model
-    print("Training model...")
-    pipeline.fit(X_train, y_train)
-
-    # Evaluate model
-    print("\nEvaluating model...")
-    results = evaluate_model(pipeline, X_train, y_train, X_test, y_test)
-
-    print(f"CV Accuracy: {results['cv_mean']:.3f} (+/- {results['cv_std']:.3f})")
-    print(f"Test Accuracy: {results['test_score']:.3f}")
-    if results['auc']:
-        print(f"ROC-AUC: {results['auc']:.3f}")
-    print("\nClassification Report:")
-    print(results['classification_report'])
-
-    # Hyperparameter tuning (optional)
-    print("\nTuning hyperparameters...")
-    param_grid = {
-        'classifier__n_estimators': [100, 200],
-        'classifier__max_depth': [10, 20, None],
-        'classifier__min_samples_split': [2, 5]
-    }
-
-    grid_search = tune_hyperparameters(pipeline, X_train, y_train, param_grid)
-
-    # Evaluate best model
-    print("\nEvaluating tuned model...")
-    best_pipeline = grid_search.best_estimator_
-    y_pred = best_pipeline.predict(X_test)
+    print("\n" + "="*60)
+    print("Classification Report")
+    print("="*60)
    print(classification_report(y_test, y_pred))

-    # Save model
-    print("\nSaving model...")
-    joblib.dump(best_pipeline, 'best_model.pkl')
-    print("Model saved as 'best_model.pkl'")
+    print("\n" + "="*60)
+    print("Confusion Matrix")
+    print("="*60)
+    print(confusion_matrix(y_test, y_pred))
+
+    # Feature importance (if available)
+    if hasattr(tuned_model.named_steps['classifier'], 'feature_importances_'):
+        print("\n" + "="*60)
+        print("Top 10 Most Important Features")
+        print("="*60)
+
+        feature_names = tuned_model.named_steps['preprocessor'].get_feature_names_out()
+        importances = tuned_model.named_steps['classifier'].feature_importances_
+
+        feature_importance_df = pd.DataFrame({
+            'feature': feature_names,
+            'importance': importances
+        }).sort_values('importance', ascending=False).head(10)
+
+        print(feature_importance_df.to_string(index=False))
+
+    return {
+        'model': tuned_model,
+        'y_test': y_test,
+        'y_pred': y_pred,
+        'y_pred_proba': y_pred_proba,
+        'metrics': {
+            'accuracy': accuracy,
+            'precision': precision,
+            'recall': recall,
+            'f1': f1
+        }
+    }


+# Example usage
 if __name__ == "__main__":
-    main()
+    # Load example dataset
+    from sklearn.datasets import load_breast_cancer
+
+    # Load data
+    data = load_breast_cancer()
+    X = pd.DataFrame(data.data, columns=data.feature_names)
+    y = data.target
+
+    # For demonstration, treat all features as numeric
+    numeric_features = X.columns.tolist()
+    categorical_features = []
+
+    print("="*60)
+    print("Classification Pipeline Example")
+    print("Dataset: Breast Cancer Wisconsin")
+    print("="*60)
+
+    # Run complete pipeline
+    results = train_and_evaluate_model(
+        X, y, numeric_features, categorical_features,
+        test_size=0.2, random_state=42
+    )
+
+    print("\n" + "="*60)
+    print("Pipeline Complete!")
+    print("="*60)
--- a/scientific-packages/scikit-learn/scripts/clustering_analysis.py
+++ b/scientific-packages/scikit-learn/scripts/clustering_analysis.py
@@ -1,291 +1,386 @@
-#!/usr/bin/env python3
 """
-Clustering analysis script with multiple algorithms and evaluation.
-Demonstrates k-means, DBSCAN, and hierarchical clustering with visualization.
+Clustering analysis example with multiple algorithms, evaluation, and visualization.
 """

 import numpy as np
 import pandas as pd
-from sklearn.preprocessing import StandardScaler
-from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
-from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
-from sklearn.decomposition import PCA
 import matplotlib.pyplot as plt
-import seaborn as sns
+from sklearn.preprocessing import StandardScaler
+from sklearn.decomposition import PCA
+from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
+from sklearn.mixture import GaussianMixture
+from sklearn.metrics import (
+    silhouette_score, calinski_harabasz_score, davies_bouldin_score
+)
+import warnings
+warnings.filterwarnings('ignore')


-def scale_data(X):
+def preprocess_for_clustering(X, scale=True, pca_components=None):
    """
-    Scale features using StandardScaler.
-    ALWAYS scale data before clustering!
+    Preprocess data for clustering.

-    Args:
-        X: Feature matrix
+    Parameters:
+    -----------
+    X : array-like
+        Feature matrix
+    scale : bool
+        Whether to standardize features
+    pca_components : int or None
+        Number of PCA components (None to skip PCA)

    Returns:
-        Scaled feature matrix and fitted scaler
+    --------
+    array
+        Preprocessed data
    """
-    scaler = StandardScaler()
-    X_scaled = scaler.fit_transform(X)
-    return X_scaled, scaler
+    X_processed = X.copy()
+
+    if scale:
+        scaler = StandardScaler()
+        X_processed = scaler.fit_transform(X_processed)
+
+    if pca_components is not None:
+        pca = PCA(n_components=pca_components)
+        X_processed = pca.fit_transform(X_processed)
+        print(f"PCA: Explained variance ratio = {pca.explained_variance_ratio_.sum():.3f}")
+
+    return X_processed


-def find_optimal_k(X_scaled, k_range=range(2, 11)):
+def find_optimal_k_kmeans(X, k_range=range(2, 11)):
    """
-    Find optimal number of clusters using elbow method and silhouette score.
+    Find optimal K for K-Means using elbow method and silhouette score.

-    Args:
-        X_scaled: Scaled feature matrix
-        k_range: Range of k values to try
+    Parameters:
+    -----------
+    X : array-like
+        Feature matrix (should be scaled)
+    k_range : range
+        Range of K values to test

    Returns:
-        Dictionary with inertias and silhouette scores
+    --------
+    dict
+        Dictionary with inertia and silhouette scores for each K
    """
    inertias = []
    silhouette_scores = []

    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
-        labels = kmeans.fit_predict(X_scaled)
+        labels = kmeans.fit_predict(X)
+
        inertias.append(kmeans.inertia_)
-        silhouette_scores.append(silhouette_score(X_scaled, labels))
+        silhouette_scores.append(silhouette_score(X, labels))
+
+    # Plot results
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
+
+    # Elbow plot
+    ax1.plot(k_range, inertias, 'bo-')
+    ax1.set_xlabel('Number of clusters (K)')
+    ax1.set_ylabel('Inertia')
+    ax1.set_title('Elbow Method')
+    ax1.grid(True)
+
+    # Silhouette plot
+    ax2.plot(k_range, silhouette_scores, 'ro-')
+    ax2.set_xlabel('Number of clusters (K)')
+    ax2.set_ylabel('Silhouette Score')
+    ax2.set_title('Silhouette Analysis')
+    ax2.grid(True)
+
+    plt.tight_layout()
+    plt.savefig('clustering_optimization.png', dpi=300, bbox_inches='tight')
+    print("Saved: clustering_optimization.png")
+    plt.close()
+
+    # Find best K based on silhouette score
+    best_k = k_range[np.argmax(silhouette_scores)]
+    print(f"\nRecommended K based on silhouette score: {best_k}")

    return {
        'k_values': list(k_range),
        'inertias': inertias,
-        'silhouette_scores': silhouette_scores
+        'silhouette_scores': silhouette_scores,
+        'best_k': best_k
    }


-def plot_elbow_silhouette(results):
+def compare_clustering_algorithms(X, n_clusters=3):
    """
-    Plot elbow method and silhouette scores.
+    Compare different clustering algorithms.

-    Args:
-        results: Dictionary from find_optimal_k
-    """
-    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
-
-    # Elbow plot
-    ax1.plot(results['k_values'], results['inertias'], 'bo-')
-    ax1.set_xlabel('Number of clusters (k)')
-    ax1.set_ylabel('Inertia')
-    ax1.set_title('Elbow Method')
-    ax1.grid(True, alpha=0.3)
-
-    # Silhouette plot
-    ax2.plot(results['k_values'], results['silhouette_scores'], 'ro-')
-    ax2.set_xlabel('Number of clusters (k)')
-    ax2.set_ylabel('Silhouette Score')
-    ax2.set_title('Silhouette Score vs k')
-    ax2.grid(True, alpha=0.3)
-
-    plt.tight_layout()
-    plt.savefig('elbow_silhouette.png', dpi=300, bbox_inches='tight')
-    print("Saved elbow and silhouette plots to 'elbow_silhouette.png'")
-    plt.close()
-
-
-def evaluate_clustering(X_scaled, labels, algorithm_name):
-    """
-    Evaluate clustering using multiple metrics.
-
-    Args:
-        X_scaled: Scaled feature matrix
-        labels: Cluster labels
-        algorithm_name: Name of clustering algorithm
+    Parameters:
+    -----------
+    X : array-like
+        Feature matrix (should be scaled)
+    n_clusters : int
+        Number of clusters

    Returns:
-        Dictionary with evaluation metrics
+    --------
+    dict
+        Dictionary with results for each algorithm
    """
-    # Filter out noise points for DBSCAN (-1 labels)
-    mask = labels != -1
-    X_filtered = X_scaled[mask]
-    labels_filtered = labels[mask]
+    print("="*60)
+    print(f"Comparing Clustering Algorithms (n_clusters={n_clusters})")
+    print("="*60)

-    n_clusters = len(set(labels_filtered))
-    n_noise = list(labels).count(-1)
-
-    results = {
-        'algorithm': algorithm_name,
-        'n_clusters': n_clusters,
-        'n_noise': n_noise
+    algorithms = {
+        'K-Means': KMeans(n_clusters=n_clusters, random_state=42, n_init=10),
+        'Agglomerative': AgglomerativeClustering(n_clusters=n_clusters, linkage='ward'),
+        'Gaussian Mixture': GaussianMixture(n_components=n_clusters, random_state=42)
    }

-    # Calculate metrics if we have valid clusters
-    if n_clusters > 1:
-        results['silhouette'] = silhouette_score(X_filtered, labels_filtered)
-        results['davies_bouldin'] = davies_bouldin_score(X_filtered, labels_filtered)
-        results['calinski_harabasz'] = calinski_harabasz_score(X_filtered, labels_filtered)
+    # DBSCAN infers the number of clusters itself (it takes no n_clusters),
+    # so it is fitted here and scored separately after the loop below
+    dbscan = DBSCAN(eps=0.5, min_samples=5)
+    dbscan_labels = dbscan.fit_predict(X)
+
+    results = {}
+
+    for name, algorithm in algorithms.items():
+        labels = algorithm.fit_predict(X)
+
+        # Calculate metrics
+        silhouette = silhouette_score(X, labels)
+        calinski = calinski_harabasz_score(X, labels)
+        davies = davies_bouldin_score(X, labels)
+
+        results[name] = {
+            'labels': labels,
+            'n_clusters': n_clusters,
+            'silhouette': silhouette,
+            'calinski_harabasz': calinski,
+            'davies_bouldin': davies
+        }
+
+        print(f"\n{name}:")
+        print(f"  Silhouette Score:       {silhouette:.4f} (higher is better)")
+        print(f"  Calinski-Harabasz:      {calinski:.4f} (higher is better)")
+        print(f"  Davies-Bouldin:         {davies:.4f} (lower is better)")
+
+    # DBSCAN results
+    n_clusters_dbscan = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
+    n_noise = list(dbscan_labels).count(-1)
+
+    if n_clusters_dbscan > 1:
+        # Only calculate metrics if we have multiple clusters
+        mask = dbscan_labels != -1  # Exclude noise
+        if mask.sum() > 0:
+            silhouette = silhouette_score(X[mask], dbscan_labels[mask])
+            calinski = calinski_harabasz_score(X[mask], dbscan_labels[mask])
+            davies = davies_bouldin_score(X[mask], dbscan_labels[mask])
+
+            results['DBSCAN'] = {
+                'labels': dbscan_labels,
+                'n_clusters': n_clusters_dbscan,
+                'n_noise': n_noise,
+                'silhouette': silhouette,
+                'calinski_harabasz': calinski,
+                'davies_bouldin': davies
+            }
+
+            print(f"\nDBSCAN:")
+            print(f"  Clusters found:         {n_clusters_dbscan}")
+            print(f"  Noise points:           {n_noise}")
+            print(f"  Silhouette Score:       {silhouette:.4f} (higher is better)")
+            print(f"  Calinski-Harabasz:      {calinski:.4f} (higher is better)")
+            print(f"  Davies-Bouldin:         {davies:.4f} (lower is better)")
    else:
-        results['silhouette'] = None
-        results['davies_bouldin'] = None
-        results['calinski_harabasz'] = None
+        print(f"\nDBSCAN:")
+        print(f"  Clusters found:         {n_clusters_dbscan}")
+        print(f"  Noise points:           {n_noise}")
+        print("  Note: Insufficient clusters for metric calculation")

    return results


-def perform_kmeans(X_scaled, n_clusters=3):
+def visualize_clusters(X, results, true_labels=None):
    """
-    Perform k-means clustering.
+    Visualize clustering results using PCA for 2D projection.

-    Args:
-        X_scaled: Scaled feature matrix
-        n_clusters: Number of clusters
-
-    Returns:
-        Fitted KMeans model and labels
+    Parameters:
+    -----------
+    X : array-like
+        Feature matrix
+    results : dict
+        Dictionary with clustering results
+    true_labels : array-like or None
+        True labels (if available) for comparison
    """
-    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
-    labels = kmeans.fit_predict(X_scaled)
-    return kmeans, labels
+    # Reduce to 2D using PCA
+    pca = PCA(n_components=2)
+    X_2d = pca.fit_transform(X)

+    # Determine number of subplots
+    n_plots = len(results)
+    if true_labels is not None:
+        n_plots += 1

-def perform_dbscan(X_scaled, eps=0.5, min_samples=5):
-    """
-    Perform DBSCAN clustering.
+    n_cols = min(3, n_plots)
+    n_rows = (n_plots + n_cols - 1) // n_cols

-    Args:
-        X_scaled: Scaled feature matrix
-        eps: Maximum distance between neighbors
-        min_samples: Minimum points to form dense region
+    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows))
+    if n_plots == 1:
+        axes = np.array([axes])
+    axes = axes.flatten()

-    Returns:
-        Fitted DBSCAN model and labels
-    """
-    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
-    labels = dbscan.fit_predict(X_scaled)
-    return dbscan, labels
+    plot_idx = 0

+    # Plot true labels if available
+    if true_labels is not None:
+        ax = axes[plot_idx]
+        scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=true_labels, cmap='viridis', alpha=0.6)
+        ax.set_title('True Labels')
+        ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})')
+        ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
+        plt.colorbar(scatter, ax=ax)
+        plot_idx += 1

-def perform_hierarchical(X_scaled, n_clusters=3, linkage='ward'):
-    """
-    Perform hierarchical clustering.
+    # Plot clustering results
+    for name, result in results.items():
+        ax = axes[plot_idx]
+        labels = result['labels']

-    Args:
-        X_scaled: Scaled feature matrix
-        n_clusters: Number of clusters
-        linkage: Linkage criterion ('ward', 'complete', 'average', 'single')
+        scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='viridis', alpha=0.6)

-    Returns:
-        Fitted AgglomerativeClustering model and labels
-    """
-    hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
-    labels = hierarchical.fit_predict(X_scaled)
-    return hierarchical, labels
+        # Highlight noise points for DBSCAN
+        if name == 'DBSCAN' and -1 in labels:
+            noise_mask = labels == -1
+            ax.scatter(X_2d[noise_mask, 0], X_2d[noise_mask, 1],
+                      c='red', marker='x', s=100, label='Noise', alpha=0.8)
+            ax.legend()

+        title = f"{name} (K={result['n_clusters']})"
+        if 'silhouette' in result:
+            title += f"\nSilhouette: {result['silhouette']:.3f}"
+        ax.set_title(title)
+        ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})')
+        ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
+        plt.colorbar(scatter, ax=ax)

-def visualize_clusters_2d(X_scaled, labels, algorithm_name, method='pca'):
-    """
-    Visualize clusters in 2D using PCA or t-SNE.
+        plot_idx += 1

-    Args:
-        X_scaled: Scaled feature matrix
-        labels: Cluster labels
-        algorithm_name: Name of algorithm for title
-        method: 'pca' or 'tsne'
-    """
-    # Reduce to 2D
-    if method == 'pca':
-        pca = PCA(n_components=2, random_state=42)
-        X_2d = pca.fit_transform(X_scaled)
-        variance = pca.explained_variance_ratio_
-        xlabel = f'PC1 ({variance[0]:.1%} variance)'
-        ylabel = f'PC2 ({variance[1]:.1%} variance)'
-    else:
-        from sklearn.manifold import TSNE
-        # Use PCA first to speed up t-SNE
-        pca = PCA(n_components=min(50, X_scaled.shape[1]), random_state=42)
-        X_pca = pca.fit_transform(X_scaled)
-        tsne = TSNE(n_components=2, random_state=42, perplexity=30)
-        X_2d = tsne.fit_transform(X_pca)
-        xlabel = 't-SNE 1'
-        ylabel = 't-SNE 2'
+    # Hide unused subplots
+    for idx in range(plot_idx, len(axes)):
+        axes[idx].axis('off')

-    # Plot
-    plt.figure(figsize=(10, 8))
-    scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='viridis', alpha=0.6, s=50)
-    plt.colorbar(scatter, label='Cluster')
-    plt.xlabel(xlabel)
-    plt.ylabel(ylabel)
-    plt.title(f'{algorithm_name} Clustering ({method.upper()})')
-    plt.grid(True, alpha=0.3)
-
-    filename = f'{algorithm_name.lower().replace(" ", "_")}_{method}.png'
-    plt.savefig(filename, dpi=300, bbox_inches='tight')
-    print(f"Saved visualization to '{filename}'")
+    plt.tight_layout()
+    plt.savefig('clustering_results.png', dpi=300, bbox_inches='tight')
+    print("\nSaved: clustering_results.png")
    plt.close()


-def main():
+def complete_clustering_analysis(X, true_labels=None, scale=True,
+                                 find_k=True, k_range=range(2, 11), n_clusters=3):
    """
-    Example clustering analysis workflow.
+    Complete clustering analysis workflow.
+
+    Parameters:
+    -----------
+    X : array-like
+        Feature matrix
+    true_labels : array-like or None
+        True labels (for comparison only, not used in clustering)
+    scale : bool
+        Whether to scale features
+    find_k : bool
+        Whether to search for optimal K
+    k_range : range
+        Range of K values to test
+    n_clusters : int
+        Number of clusters to use in comparison (replaced by the best K when find_k=True)
+
+    Returns:
+    --------
+    dict
+        Dictionary with all analysis results
    """
-    # Load your data here
-    # X = load_data()
+    print("="*60)
+    print("Clustering Analysis")
+    print("="*60)
+    print(f"Data shape: {X.shape}")

-    # Example with synthetic data
-    from sklearn.datasets import make_blobs
-    X, y_true = make_blobs(
-        n_samples=500,
-        n_features=10,
-        centers=4,
-        cluster_std=1.0,
-        random_state=42
-    )
+    # Preprocess data
+    X_processed = preprocess_for_clustering(X, scale=scale)

-    print(f"Dataset shape: {X.shape}")
+    # Find optimal K if requested
+    optimization_results = None
+    if find_k:
+        print("\n" + "="*60)
+        print("Finding Optimal Number of Clusters")
+        print("="*60)
+        optimization_results = find_optimal_k_kmeans(X_processed, k_range=k_range)

-    # Scale data (ALWAYS scale for clustering!)
-    print("\nScaling data...")
-    X_scaled, scaler = scale_data(X)
+        # Override the n_clusters argument with the silhouette-recommended K
+        if optimization_results:
+            n_clusters = optimization_results['best_k']

-    # Find optimal k
-    print("\nFinding optimal number of clusters...")
-    results = find_optimal_k(X_scaled)
-    plot_elbow_silhouette(results)
+    # Compare clustering algorithms
+    comparison_results = compare_clustering_algorithms(X_processed, n_clusters=n_clusters)

-    # Based on elbow/silhouette, choose optimal k
-    optimal_k = 4  # Adjust based on plots
-
-    # Perform k-means
-    print(f"\nPerforming k-means with k={optimal_k}...")
-    kmeans, kmeans_labels = perform_kmeans(X_scaled, n_clusters=optimal_k)
-    kmeans_results = evaluate_clustering(X_scaled, kmeans_labels, 'K-Means')
-
-    # Perform DBSCAN
-    print("\nPerforming DBSCAN...")
-    dbscan, dbscan_labels = perform_dbscan(X_scaled, eps=0.5, min_samples=5)
-    dbscan_results = evaluate_clustering(X_scaled, dbscan_labels, 'DBSCAN')
-
-    # Perform hierarchical clustering
-    print("\nPerforming hierarchical clustering...")
-    hierarchical, hier_labels = perform_hierarchical(X_scaled, n_clusters=optimal_k)
-    hier_results = evaluate_clustering(X_scaled, hier_labels, 'Hierarchical')
-
-    # Print results
+    # Visualize results
    print("\n" + "="*60)
-    print("CLUSTERING RESULTS")
+    print("Visualizing Results")
+    print("="*60)
+    visualize_clusters(X_processed, comparison_results, true_labels=true_labels)
+
+    return {
+        'X_processed': X_processed,
+        'optimization': optimization_results,
+        'comparison': comparison_results
+    }
+
+
+# Example usage
+if __name__ == "__main__":
+    from sklearn.datasets import load_iris, make_blobs
+
+    print("="*60)
+    print("Example 1: Iris Dataset")
    print("="*60)

-    for results in [kmeans_results, dbscan_results, hier_results]:
-        print(f"\n{results['algorithm']}:")
-        print(f"  Clusters: {results['n_clusters']}")
-        if results['n_noise'] > 0:
-            print(f"  Noise points: {results['n_noise']}")
-        if results['silhouette']:
-            print(f"  Silhouette Score: {results['silhouette']:.3f}")
-            print(f"  Davies-Bouldin Index: {results['davies_bouldin']:.3f} (lower is better)")
-            print(f"  Calinski-Harabasz Index: {results['calinski_harabasz']:.1f} (higher is better)")
+    # Load Iris dataset
+    iris = load_iris()
+    X_iris = iris.data
+    y_iris = iris.target

-    # Visualize clusters
-    print("\nCreating visualizations...")
-    visualize_clusters_2d(X_scaled, kmeans_labels, 'K-Means', method='pca')
-    visualize_clusters_2d(X_scaled, dbscan_labels, 'DBSCAN', method='pca')
-    visualize_clusters_2d(X_scaled, hier_labels, 'Hierarchical', method='pca')
+    results_iris = complete_clustering_analysis(
+        X_iris,
+        true_labels=y_iris,
+        scale=True,
+        find_k=True,
+        k_range=range(2, 8),
+        n_clusters=3
+    )

-    print("\nClustering analysis complete!")
+    print("\n" + "="*60)
+    print("Example 2: Synthetic Dataset with Noise")
+    print("="*60)

+    # Create synthetic dataset
+    X_synth, y_synth = make_blobs(
+        n_samples=500, n_features=2, centers=4,
+        cluster_std=0.5, random_state=42
+    )

-if __name__ == "__main__":
-    main()
+    # Add noise points (seeded generator so the example is reproducible)
+    noise = np.random.default_rng(42).standard_normal((50, 2)) * 3
+    X_synth = np.vstack([X_synth, noise])
+    y_synth_with_noise = np.concatenate([y_synth, np.full(50, -1)])
+
+    results_synth = complete_clustering_analysis(
+        X_synth,
+        true_labels=y_synth_with_noise,
+        scale=True,
+        find_k=True,
+        k_range=range(2, 8),
+        n_clusters=4
+    )
+
+    print("\n" + "="*60)
+    print("Analysis Complete!")
+    print("="*60)