Mirror of https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-01-26 16:58:56 +08:00
Improve the scikit-learn skill
#!/usr/bin/env python3
"""
Complete classification pipeline example with preprocessing, model training,
hyperparameter tuning, and evaluation.
"""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    accuracy_score, precision_score, recall_score, f1_score
)
import warnings

# Silence library warnings to keep the example output readable
# (broad suppression; use with care outside a demo script)
warnings.filterwarnings('ignore')


def create_preprocessing_pipeline(numeric_features, categorical_features):
    """
    Create a preprocessing pipeline for mixed data types.

    Parameters:
    -----------
    numeric_features : list
        List of numeric feature column names
    categorical_features : list
        List of categorical feature column names

    Returns:
    --------
    ColumnTransformer
        Preprocessing pipeline
    """
    # Numeric preprocessing
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical preprocessing
    # (sparse_output requires scikit-learn >= 1.2; dense output keeps
    # downstream feature-name handling simple)
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    return preprocessor

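# A minimal usage sketch of the preprocessor above (illustrative only; the
# 'demo' frame and its column names are made up). With one numeric and one
# categorical column, fit_transform yields the scaled numeric column followed
# by one one-hot column per category, including the imputed 'missing' label:
#
#   demo = pd.DataFrame({'age': [25.0, None, 40.0], 'city': ['NY', 'LA', None]})
#   pre = create_preprocessing_pipeline(['age'], ['city'])
#   print(pre.fit_transform(demo).shape)  # (3, 4): 1 scaled + 3 one-hot columns
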
def train_and_evaluate_model(X, y, numeric_features, categorical_features,
                             test_size=0.2, random_state=42):
    """
    Complete pipeline: preprocess, train, tune, and evaluate a classifier.

    Parameters:
    -----------
    X : DataFrame or array
        Feature matrix
    y : Series or array
        Target variable
    numeric_features : list
        List of numeric feature names
    categorical_features : list
        List of categorical feature names
    test_size : float
        Proportion of data for testing
    random_state : int
        Random seed

    Returns:
    --------
    dict
        Dictionary containing trained model, predictions, and metrics
    """
    # Split data with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    print(f"Class distribution in training: {pd.Series(y_train).value_counts().to_dict()}")

    # Create preprocessor
    preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features)

    # Define models to compare
    models = {
        'Logistic Regression': Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(max_iter=1000, random_state=random_state))
        ]),
        'Random Forest': Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(n_estimators=100, random_state=random_state))
        ]),
        'Gradient Boosting': Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=random_state))
        ])
    }

    # Compare models using cross-validation
    print("\n" + "="*60)
    print("Model Comparison (5-Fold Cross-Validation)")
    print("="*60)

    cv_results = {}
    for name, model in models.items():
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        cv_results[name] = scores.mean()
        print(f"{name:20s}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

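    # Aside: with a classifier and an integer cv, cross_val_score uses
    # stratified folds under the hood. An explicit, shuffled equivalent
    # (illustrative sketch) would be:
    #   from sklearn.model_selection import StratifiedKFold
    #   cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    #   scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
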
    # Select best model based on CV
    best_model_name = max(cv_results, key=cv_results.get)
    best_model = models[best_model_name]

    print(f"\nBest model: {best_model_name}")

    # Hyperparameter tuning for best model
    if best_model_name == 'Random Forest':
        param_grid = {
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [10, 20, None],
            'classifier__min_samples_split': [2, 5]
        }
    elif best_model_name == 'Gradient Boosting':
        param_grid = {
            'classifier__n_estimators': [100, 200],
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__max_depth': [3, 5]
        }
    else:  # Logistic Regression
        param_grid = {
            'classifier__C': [0.1, 1.0, 10.0],
            'classifier__penalty': ['l2']
        }

    print("\n" + "="*60)
    print("Hyperparameter Tuning")
    print("="*60)

    grid_search = GridSearchCV(
        best_model, param_grid, cv=5, scoring='accuracy',
        n_jobs=-1, verbose=0
    )

    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_:.4f}")

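    # Aside: GridSearchCV refits the best configuration on the full training
    # set (refit=True by default), so best_estimator_ is ready to use below.
    # For larger grids, a cheaper alternative is to sample configurations
    # (sketch, reusing the same param_grid):
    #   from sklearn.model_selection import RandomizedSearchCV
    #   search = RandomizedSearchCV(best_model, param_distributions=param_grid,
    #                               n_iter=8, cv=5, scoring='accuracy', n_jobs=-1)
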
    # Evaluate on test set
    tuned_model = grid_search.best_estimator_
    y_pred = tuned_model.predict(X_test)
    y_pred_proba = tuned_model.predict_proba(X_test)

    print("\n" + "="*60)
    print("Test Set Evaluation")
    print("="*60)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # ROC AUC (if binary classification)
    if len(np.unique(y)) == 2:
        roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
        print(f"ROC AUC: {roc_auc:.4f}")

print("\n" + "="*60)
|
||||
print("Classification Report")
|
||||
print("="*60)
|
||||
print(classification_report(y_test, y_pred))
|
||||
|
||||
# Save model
|
||||
print("\nSaving model...")
|
||||
joblib.dump(best_pipeline, 'best_model.pkl')
|
||||
print("Model saved as 'best_model.pkl'")
|
||||
print("\n" + "="*60)
|
||||
print("Confusion Matrix")
|
||||
print("="*60)
|
||||
print(confusion_matrix(y_test, y_pred))
|
||||
|
||||
    # Feature importance (if available)
    if hasattr(tuned_model.named_steps['classifier'], 'feature_importances_'):
        print("\n" + "="*60)
        print("Top 10 Most Important Features")
        print("="*60)

        feature_names = tuned_model.named_steps['preprocessor'].get_feature_names_out()
        importances = tuned_model.named_steps['classifier'].feature_importances_

        feature_importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False).head(10)

        print(feature_importance_df.to_string(index=False))

    return {
        'model': tuned_model,
        'y_test': y_test,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
        'metrics': {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    }

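# Persisting the fitted pipeline is a common next step (sketch; joblib is the
# usual choice for scikit-learn objects, and the filename here is arbitrary):
#   import joblib
#   results = train_and_evaluate_model(X, y, numeric_features, categorical_features)
#   joblib.dump(results['model'], 'best_model.pkl')
#   model = joblib.load('best_model.pkl')
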
# Example usage
if __name__ == "__main__":
    # Load example dataset
    from sklearn.datasets import load_breast_cancer

    data = load_breast_cancer()
    X = pd.DataFrame(data.data, columns=data.feature_names)
    y = data.target

    # For demonstration, treat all features as numeric
    numeric_features = X.columns.tolist()
    categorical_features = []

    print("="*60)
    print("Classification Pipeline Example")
    print("Dataset: Breast Cancer Wisconsin")
    print("="*60)

    # Run complete pipeline
    results = train_and_evaluate_model(
        X, y, numeric_features, categorical_features,
        test_size=0.2, random_state=42
    )

    print("\n" + "="*60)
    print("Pipeline Complete!")
    print("="*60)

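# Adapting the example to a dataset with mixed column types is mostly a matter
# of listing the columns (sketch; 'data.csv' and 'target' are placeholders):
#   df = pd.read_csv('data.csv')
#   X = df.drop(columns=['target'])
#   y = df['target']
#   numeric_features = X.select_dtypes(include='number').columns.tolist()
#   categorical_features = X.select_dtypes(exclude='number').columns.tolist()
#   train_and_evaluate_model(X, y, numeric_features, categorical_features)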