From 4ad4f9970fbdf2ae9752a4a15ebc0b3ca3bb1d2c Mon Sep 17 00:00:00 2001 From: Timothy Kassis Date: Tue, 4 Nov 2025 10:11:46 -0800 Subject: [PATCH] Improve the scikit-learn skill --- docs/scientific-packages.md | 2 +- scientific-packages/scikit-learn/SKILL.md | 1095 +++++++---------- .../references/model_evaluation.md | 845 +++++++------ .../references/pipelines_and_composition.md | 955 +++++++------- .../scikit-learn/references/preprocessing.md | 841 ++++++++----- .../references/quick_reference.md | 722 ++++------- .../references/supervised_learning.md | 523 +++++--- .../references/unsupervised_learning.md | 1049 +++++++--------- .../scripts/classification_pipeline.py | 346 +++--- .../scripts/clustering_analysis.py | 521 ++++---- 10 files changed, 3293 insertions(+), 3606 deletions(-) diff --git a/docs/scientific-packages.md b/docs/scientific-packages.md index c55e1ae..d978f3d 100644 --- a/docs/scientific-packages.md +++ b/docs/scientific-packages.md @@ -47,7 +47,7 @@ - **PyMC** - Bayesian statistical modeling and probabilistic programming - **PyMOO** - Multi-objective optimization with evolutionary algorithms - **PyTorch Lightning** - Deep learning framework that organizes PyTorch code to eliminate boilerplate while maintaining full flexibility. Automates training workflows (40+ tasks including epoch/batch iteration, optimizer steps, gradient management, checkpointing), supports multi-GPU/TPU training with DDP/FSDP/DeepSpeed strategies, includes LightningModule for model organization, Trainer for automation, LightningDataModule for data pipelines, callbacks for extensibility, and integrations with TensorBoard, Wandb, MLflow for experiment tracking -- **scikit-learn** - Machine learning algorithms, preprocessing, and model selection +- **scikit-learn** - Industry-standard Python library for classical machine learning providing comprehensive supervised learning (classification: Logistic Regression, SVM, Decision Trees, Random Forests with 17+ variants, Gradient Boosting with XGBoost-compatible HistGradientBoosting, Naive Bayes, KNN, Neural Networks/MLP; regression: Linear, Ridge, Lasso, ElasticNet, SVR, ensemble methods), unsupervised learning (clustering: K-Means, DBSCAN, HDBSCAN, OPTICS, Agglomerative/Hierarchical, Spectral, Gaussian Mixture Models, BIRCH, MeanShift; dimensionality reduction: PCA, Kernel PCA, t-SNE, Isomap, LLE, NMF, TruncatedSVD, FastICA, LDA; outlier detection: IsolationForest, LocalOutlierFactor, OneClassSVM), data preprocessing (scaling: StandardScaler, MinMaxScaler, RobustScaler; encoding: OneHotEncoder, OrdinalEncoder, LabelEncoder; imputation: SimpleImputer, KNNImputer, IterativeImputer; feature engineering: PolynomialFeatures, KBinsDiscretizer, text vectorization with CountVectorizer/TfidfVectorizer), model evaluation (cross-validation: KFold, StratifiedKFold, TimeSeriesSplit, GroupKFold; hyperparameter tuning: GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV; metrics: 30+ evaluation metrics for classification/regression/clustering including accuracy, precision, recall, F1, ROC-AUC, MSE, R², silhouette score), and Pipeline/ColumnTransformer for production-ready workflows. Features consistent API (fit/predict/transform), extensive documentation, integration with NumPy/pandas/SciPy, joblib persistence, and scikit-learn-compatible ecosystem (XGBoost, LightGBM, CatBoost, imbalanced-learn). Optimized implementations using Cython/OpenMP for performance. 
Use cases: predictive modeling, customer segmentation, anomaly detection, feature engineering, model selection/validation, text classification, image classification (with feature extraction), time series forecasting (with preprocessing), medical diagnosis, fraud detection, recommendation systems, and any tabular data ML task requiring interpretable models or established algorithms - **scikit-survival** - Survival analysis and time-to-event modeling with censored data. Built on scikit-learn, provides Cox proportional hazards models (CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis with elastic net regularization), ensemble methods (Random Survival Forests, Gradient Boosting), Survival Support Vector Machines (linear and kernel), non-parametric estimators (Kaplan-Meier, Nelson-Aalen), competing risks analysis, and specialized evaluation metrics (concordance index, time-dependent AUC, Brier score). Handles right-censored data, integrates with scikit-learn pipelines, and supports feature selection and hyperparameter tuning via cross-validation - **SHAP** - Model interpretability and explainability using Shapley values from game theory. Provides unified approach to explain any ML model with TreeExplainer (fast exact explanations for XGBoost/LightGBM/Random Forest), DeepExplainer (TensorFlow/PyTorch neural networks), KernelExplainer (model-agnostic), and LinearExplainer. Includes comprehensive visualizations (waterfall plots for individual predictions, beeswarm plots for global importance, scatter plots for feature relationships, bar/force/heatmap plots), supports model debugging, fairness analysis, feature engineering guidance, and production deployment - **Stable Baselines3** - PyTorch-based reinforcement learning library providing reliable implementations of RL algorithms (PPO, SAC, DQN, TD3, DDPG, A2C, HER, RecurrentPPO). Use this skill for training RL agents on standard or custom Gymnasium environments, implementing callbacks for monitoring and control, using vectorized environments for parallel training, creating custom environments with proper Gymnasium API implementation, and integrating with deep RL workflows. Includes comprehensive training templates, evaluation utilities, algorithm selection guidance (on-policy vs off-policy, continuous vs discrete actions), support for multi-input policies (dict observations), goal-conditioned learning with HER, and integration with TensorBoard for experiment tracking diff --git a/scientific-packages/scikit-learn/SKILL.md b/scientific-packages/scikit-learn/SKILL.md index ab3ba9a..784f677 100644 --- a/scientific-packages/scikit-learn/SKILL.md +++ b/scientific-packages/scikit-learn/SKILL.md @@ -1,780 +1,515 @@ --- name: scikit-learn -description: "ML toolkit. Classification, regression, clustering, PCA, preprocessing, pipelines, GridSearch, cross-validation, RandomForest, SVM, for general machine learning workflows." +description: Machine learning in Python with scikit-learn. Use when working with supervised learning (classification, regression), unsupervised learning (clustering, dimensionality reduction), model evaluation, hyperparameter tuning, preprocessing, or building ML pipelines. Provides comprehensive reference documentation for algorithms, preprocessing techniques, pipelines, and best practices. --- -# Scikit-learn: Machine Learning in Python +# Scikit-learn ## Overview -Scikit-learn is Python's premier machine learning library, offering simple and efficient tools for predictive data analysis. 
Apply this skill for classification, regression, clustering, dimensionality reduction, model selection, preprocessing, and hyperparameter optimization. +This skill provides comprehensive guidance for machine learning tasks using scikit-learn, the industry-standard Python library for classical machine learning. Use this skill for classification, regression, clustering, dimensionality reduction, preprocessing, model evaluation, and building production-ready ML pipelines. + +## Installation + +```bash +# Install scikit-learn using uv +uv pip install scikit-learn + +# Optional: Install visualization dependencies +uv pip install matplotlib seaborn + +# Commonly used with +uv pip install pandas numpy +``` ## When to Use This Skill -This skill should be used when: -- Building classification models (spam detection, image recognition, medical diagnosis) -- Creating regression models (price prediction, forecasting, trend analysis) -- Performing clustering analysis (customer segmentation, pattern discovery) -- Reducing dimensionality (PCA, t-SNE for visualization) -- Preprocessing data (scaling, encoding, imputation) -- Evaluating model performance (cross-validation, metrics) -- Tuning hyperparameters (grid search, random search) -- Creating machine learning pipelines -- Detecting anomalies or outliers -- Implementing ensemble methods +Use the scikit-learn skill when: -## Core Machine Learning Workflow +- Building classification or regression models +- Performing clustering or dimensionality reduction +- Preprocessing and transforming data for machine learning +- Evaluating model performance with cross-validation +- Tuning hyperparameters with grid or random search +- Creating ML pipelines for production workflows +- Comparing different algorithms for a task +- Working with both structured (tabular) and text data +- Need interpretable, classical machine learning approaches -### Standard ML Pipeline +## Quick Start -Follow this general workflow for supervised learning tasks: - -1. **Data Preparation** - - Load and explore data - - Split into train/test sets - - Handle missing values - - Encode categorical features - - Scale/normalize features - -2. **Model Selection** - - Start with baseline model - - Try more complex models - - Use domain knowledge to guide selection - -3. **Model Training** - - Fit model on training data - - Use pipelines to prevent data leakage - - Apply cross-validation - -4. **Model Evaluation** - - Evaluate on test set - - Use appropriate metrics - - Analyze errors - -5. **Model Optimization** - - Tune hyperparameters - - Feature engineering - - Ensemble methods - -6. 
**Deployment** - - Save model using joblib - - Create prediction pipeline - - Monitor performance - -### Classification Quick Start +### Classification Example ```python from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import classification_report -from sklearn.pipeline import Pipeline - -# Create pipeline (prevents data leakage) -pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)) -]) - -# Split data (use stratify for imbalanced classes) -X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42, stratify=y -) - -# Train -pipeline.fit(X_train, y_train) - -# Evaluate -y_pred = pipeline.predict(X_test) -print(classification_report(y_test, y_pred)) - -# Cross-validation for robust evaluation -from sklearn.model_selection import cross_val_score -scores = cross_val_score(pipeline, X_train, y_train, cv=5) -print(f"CV Accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})") -``` - -### Regression Quick Start - -```python -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import StandardScaler -from sklearn.ensemble import RandomForestRegressor -from sklearn.metrics import mean_squared_error, r2_score -from sklearn.pipeline import Pipeline - -# Create pipeline -pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)) -]) # Split data X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42 + X, y, test_size=0.2, stratify=y, random_state=42 ) -# Train -pipeline.fit(X_train, y_train) +# Preprocess +scaler = StandardScaler() +X_train_scaled = scaler.fit_transform(X_train) +X_test_scaled = scaler.transform(X_test) + +# Train model +model = RandomForestClassifier(n_estimators=100, random_state=42) +model.fit(X_train_scaled, y_train) # Evaluate -y_pred = pipeline.predict(X_test) -rmse = mean_squared_error(y_test, y_pred, squared=False) -r2 = r2_score(y_test, y_pred) -print(f"RMSE: {rmse:.3f}, R²: {r2:.3f}") +y_pred = model.predict(X_test_scaled) +print(classification_report(y_test, y_pred)) ``` -## Algorithm Selection Guide - -### Classification Algorithms - -**Start with baseline**: LogisticRegression -- Fast, interpretable, works well for linearly separable data -- Good for high-dimensional data (text classification) - -**General-purpose**: RandomForestClassifier -- Handles non-linear relationships -- Robust to outliers -- Provides feature importance -- Good default choice - -**Best performance**: HistGradientBoostingClassifier -- State-of-the-art for tabular data -- Fast on large datasets (>10K samples) -- Often wins Kaggle competitions - -**Special cases**: -- **Small datasets (<1K)**: SVC with RBF kernel -- **Very large datasets (>100K)**: SGDClassifier or LinearSVC -- **Interpretability critical**: LogisticRegression or DecisionTreeClassifier -- **Probabilistic predictions**: GaussianNB or calibrated models -- **Text classification**: LogisticRegression with TfidfVectorizer - -### Regression Algorithms - -**Start with baseline**: LinearRegression or Ridge -- Fast, interpretable -- Works well when relationships are linear - -**General-purpose**: RandomForestRegressor -- Handles non-linear relationships -- Robust to outliers -- Good default choice - -**Best performance**: HistGradientBoostingRegressor -- State-of-the-art for 
tabular data -- Fast on large datasets - -**Special cases**: -- **Regularization needed**: Ridge (L2) or Lasso (L1 + feature selection) -- **Very large datasets**: SGDRegressor -- **Outliers present**: HuberRegressor or RANSAC - -### Clustering Algorithms - -**Known number of clusters**: KMeans -- Fast and scalable -- Assumes spherical clusters - -**Unknown number of clusters**: DBSCAN or HDBSCAN -- Handles arbitrary shapes -- Automatic outlier detection - -**Hierarchical relationships**: AgglomerativeClustering -- Creates hierarchy of clusters -- Good for visualization (dendrograms) - -**Soft clustering (probabilities)**: GaussianMixture -- Provides cluster probabilities -- Handles elliptical clusters - -### Dimensionality Reduction - -**Preprocessing/feature extraction**: PCA -- Fast and efficient -- Linear transformation -- ALWAYS standardize first - -**Visualization only**: t-SNE or UMAP -- Preserves local structure -- Non-linear -- DO NOT use for preprocessing - -**Sparse data (text)**: TruncatedSVD -- Works with sparse matrices -- Latent Semantic Analysis - -**Non-negative data**: NMF -- Interpretable components -- Topic modeling - -## Working with Different Data Types - -### Numeric Features - -**Continuous features**: -1. Check distribution -2. Handle outliers (remove, clip, or use RobustScaler) -3. Scale using StandardScaler (most algorithms) or MinMaxScaler (neural networks) - -**Count data**: -1. Consider log transformation or sqrt -2. Scale after transformation - -**Skewed data**: -1. Use PowerTransformer (Yeo-Johnson or Box-Cox) -2. Or QuantileTransformer for stronger normalization - -### Categorical Features - -**Low cardinality (<10 categories)**: -```python -from sklearn.preprocessing import OneHotEncoder -encoder = OneHotEncoder(drop='first', sparse_output=True) -``` - -**High cardinality (>10 categories)**: -```python -from sklearn.preprocessing import TargetEncoder -encoder = TargetEncoder() -# Uses target statistics, prevents leakage with cross-fitting -``` - -**Ordinal relationships**: -```python -from sklearn.preprocessing import OrdinalEncoder -encoder = OrdinalEncoder(categories=[['small', 'medium', 'large']]) -``` - -### Text Data +### Complete Pipeline with Mixed Data ```python -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.naive_bayes import MultinomialNB from sklearn.pipeline import Pipeline - -text_pipeline = Pipeline([ - ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english')), - ('classifier', MultinomialNB()) -]) - -text_pipeline.fit(X_train_text, y_train) -``` - -### Mixed Data Types - -```python from sklearn.compose import ColumnTransformer from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.impute import SimpleImputer -from sklearn.pipeline import Pipeline +from sklearn.ensemble import GradientBoostingClassifier # Define feature types -numeric_features = ['age', 'income', 'credit_score'] -categorical_features = ['country', 'occupation'] +numeric_features = ['age', 'income'] +categorical_features = ['gender', 'occupation'] -# Separate preprocessing pipelines +# Create preprocessing pipelines numeric_transformer = Pipeline([ ('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler()) ]) categorical_transformer = Pipeline([ - ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), - ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)) + ('imputer', SimpleImputer(strategy='most_frequent')), + ('onehot', OneHotEncoder(handle_unknown='ignore')) ]) -# 
Combine with ColumnTransformer -preprocessor = ColumnTransformer( - transformers=[ - ('num', numeric_transformer, numeric_features), - ('cat', categorical_transformer, categorical_features) - ]) +# Combine transformers +preprocessor = ColumnTransformer([ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features) +]) -# Complete pipeline -from sklearn.ensemble import RandomForestClassifier -pipeline = Pipeline([ +# Full pipeline +model = Pipeline([ ('preprocessor', preprocessor), - ('classifier', RandomForestClassifier()) + ('classifier', GradientBoostingClassifier(random_state=42)) ]) -pipeline.fit(X_train, y_train) -``` - -## Model Evaluation - -### Classification Metrics - -**Balanced datasets**: Use accuracy or F1-score - -**Imbalanced datasets**: Use balanced_accuracy, F1-weighted, or ROC-AUC -```python -from sklearn.metrics import balanced_accuracy_score, f1_score, roc_auc_score - -balanced_acc = balanced_accuracy_score(y_true, y_pred) -f1 = f1_score(y_true, y_pred, average='weighted') - -# ROC-AUC requires probabilities -y_proba = model.predict_proba(X_test) -auc = roc_auc_score(y_true, y_proba, multi_class='ovr') -``` - -**Cost-sensitive**: Define custom scorer or adjust decision threshold - -**Comprehensive report**: -```python -from sklearn.metrics import classification_report, confusion_matrix - -print(classification_report(y_true, y_pred)) -print(confusion_matrix(y_true, y_pred)) -``` - -### Regression Metrics - -**Standard use**: RMSE and R² -```python -from sklearn.metrics import mean_squared_error, r2_score - -rmse = mean_squared_error(y_true, y_pred, squared=False) -r2 = r2_score(y_true, y_pred) -``` - -**Outliers present**: Use MAE (robust to outliers) -```python -from sklearn.metrics import mean_absolute_error -mae = mean_absolute_error(y_true, y_pred) -``` - -**Percentage errors matter**: Use MAPE -```python -from sklearn.metrics import mean_absolute_percentage_error -mape = mean_absolute_percentage_error(y_true, y_pred) -``` - -### Cross-Validation - -**Standard approach** (5-10 folds): -```python -from sklearn.model_selection import cross_val_score - -scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') -print(f"CV Score: {scores.mean():.3f} (+/- {scores.std():.3f})") -``` - -**Imbalanced classes** (use stratification): -```python -from sklearn.model_selection import StratifiedKFold - -cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) -scores = cross_val_score(model, X, y, cv=cv) -``` - -**Time series** (respect temporal order): -```python -from sklearn.model_selection import TimeSeriesSplit - -cv = TimeSeriesSplit(n_splits=5) -scores = cross_val_score(model, X, y, cv=cv) -``` - -**Multiple metrics**: -```python -from sklearn.model_selection import cross_validate - -scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'] -results = cross_validate(model, X, y, cv=5, scoring=scoring) - -for metric in scoring: - scores = results[f'test_{metric}'] - print(f"{metric}: {scores.mean():.3f}") -``` - -## Hyperparameter Tuning - -### Grid Search (Exhaustive) - -```python -from sklearn.model_selection import GridSearchCV - -param_grid = { - 'n_estimators': [100, 200, 500], - 'max_depth': [10, 20, 30, None], - 'min_samples_split': [2, 5, 10] -} - -grid_search = GridSearchCV( - RandomForestClassifier(random_state=42), - param_grid, - cv=5, - scoring='f1_weighted', - n_jobs=-1, # Use all CPU cores - verbose=1 -) - -grid_search.fit(X_train, y_train) - -print(f"Best parameters: 
{grid_search.best_params_}") -print(f"Best CV score: {grid_search.best_score_:.3f}") - -# Use best model -best_model = grid_search.best_estimator_ -test_score = best_model.score(X_test, y_test) -``` - -### Random Search (Faster) - -```python -from sklearn.model_selection import RandomizedSearchCV -from scipy.stats import randint, uniform - -param_distributions = { - 'n_estimators': randint(100, 1000), - 'max_depth': randint(5, 50), - 'min_samples_split': randint(2, 20), - 'max_features': uniform(0.1, 0.9) -} - -random_search = RandomizedSearchCV( - RandomForestClassifier(random_state=42), - param_distributions, - n_iter=100, # Number of combinations to try - cv=5, - scoring='f1_weighted', - n_jobs=-1, - random_state=42 -) - -random_search.fit(X_train, y_train) -``` - -### Pipeline Hyperparameter Tuning - -```python -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC - -pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('svm', SVC()) -]) - -# Use double underscore for nested parameters -param_grid = { - 'svm__C': [0.1, 1, 10, 100], - 'svm__kernel': ['rbf', 'linear'], - 'svm__gamma': ['scale', 'auto', 0.001, 0.01] -} - -grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1) -grid_search.fit(X_train, y_train) -``` - -## Feature Engineering and Selection - -### Feature Importance - -```python -# Tree-based models have built-in feature importance -from sklearn.ensemble import RandomForestClassifier - -model = RandomForestClassifier(n_estimators=100) +# Fit and predict model.fit(X_train, y_train) - -importances = model.feature_importances_ -feature_importance_df = pd.DataFrame({ - 'feature': feature_names, - 'importance': importances -}).sort_values('importance', ascending=False) - -# Permutation importance (works for any model) -from sklearn.inspection import permutation_importance - -result = permutation_importance( - model, X_test, y_test, - n_repeats=10, - random_state=42, - n_jobs=-1 -) - -importance_df = pd.DataFrame({ - 'feature': feature_names, - 'importance': result.importances_mean, - 'std': result.importances_std -}).sort_values('importance', ascending=False) +y_pred = model.predict(X_test) ``` -### Feature Selection Methods +## Core Capabilities -**Univariate selection**: -```python -from sklearn.feature_selection import SelectKBest, f_classif +### 1. Supervised Learning -selector = SelectKBest(f_classif, k=10) -X_selected = selector.fit_transform(X, y) -selected_features = selector.get_support(indices=True) +Comprehensive algorithms for classification and regression tasks. + +**Key algorithms:** +- **Linear models**: Logistic Regression, Linear Regression, Ridge, Lasso, ElasticNet +- **Tree-based**: Decision Trees, Random Forest, Gradient Boosting +- **Support Vector Machines**: SVC, SVR with various kernels +- **Ensemble methods**: AdaBoost, Voting, Stacking +- **Neural Networks**: MLPClassifier, MLPRegressor +- **Others**: Naive Bayes, K-Nearest Neighbors + +**When to use:** +- Classification: Predicting discrete categories (spam detection, image classification, fraud detection) +- Regression: Predicting continuous values (price prediction, demand forecasting) + +**See:** `references/supervised_learning.md` for detailed algorithm documentation, parameters, and usage examples. + +### 2. Unsupervised Learning + +Discover patterns in unlabeled data through clustering and dimensionality reduction. 
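As a minimal sketch of how these methods are used (assuming a numeric feature matrix `X`; the estimator names are covered in the lists that follow), clustering and dimensionality reduction share the same fit/predict/transform API as supervised estimators:

```python
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# X is assumed to be a numeric feature matrix of shape (n_samples, n_features)
X_scaled = StandardScaler().fit_transform(X)  # scale first; distance-based methods are scale-sensitive

# Partition into a chosen number of clusters
labels = KMeans(n_clusters=3, random_state=42).fit_predict(X_scaled)

# Project onto 2 principal components for plotting or as a compact representation
X_2d = PCA(n_components=2).fit_transform(X_scaled)
```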
**Clustering algorithms:**
- **Partition-based**: K-Means, MiniBatchKMeans
- **Density-based**: DBSCAN, HDBSCAN, OPTICS
- **Hierarchical**: AgglomerativeClustering
- **Probabilistic**: Gaussian Mixture Models
- **Others**: MeanShift, SpectralClustering, BIRCH

**Dimensionality reduction:**
- **Linear**: PCA, TruncatedSVD, NMF
- **Manifold learning**: t-SNE, Isomap, LLE (UMAP is not part of scikit-learn; it is available via the separate umap-learn package)
- **Feature extraction**: FastICA, LatentDirichletAllocation

**When to use:**
- Customer segmentation, anomaly detection, data visualization
- Reducing feature dimensions, exploratory data analysis
- Topic modeling, image compression

**See:** `references/unsupervised_learning.md` for detailed documentation.

### 3. Model Evaluation and Selection

Tools for robust model evaluation, cross-validation, and hyperparameter tuning.

**Cross-validation strategies:**
- KFold, StratifiedKFold (classification)
- TimeSeriesSplit (temporal data)
- GroupKFold (grouped samples)

**Hyperparameter tuning:**
- GridSearchCV (exhaustive search)
- RandomizedSearchCV (random sampling)
- HalvingGridSearchCV (successive halving)

**Metrics:**
- **Classification**: accuracy, precision, recall, F1-score, ROC AUC, confusion matrix
- **Regression**: MSE, RMSE, MAE, R², MAPE
- **Clustering**: silhouette score, Calinski-Harabasz, Davies-Bouldin

**When to use:**
- Comparing model performance objectively
- Finding optimal hyperparameters
- Preventing overfitting through cross-validation
- Understanding model behavior with learning curves

**See:** `references/model_evaluation.md` for comprehensive metrics and tuning strategies.

### 4. Data Preprocessing

Transform raw data into formats suitable for machine learning.

**Scaling and normalization:**
- StandardScaler (zero mean, unit variance)
- MinMaxScaler (bounded range)
- RobustScaler (robust to outliers)
- Normalizer (sample-wise normalization)

**Encoding categorical variables:**
- OneHotEncoder (nominal categories)
- OrdinalEncoder (ordered categories)
- LabelEncoder (encoding target labels only)

**Handling missing values:**
- SimpleImputer (mean, median, most frequent)
- KNNImputer (k-nearest neighbors)
- IterativeImputer (multivariate imputation)

**Feature engineering:**
- PolynomialFeatures (interaction terms)
- KBinsDiscretizer (binning)
- Feature selection (RFE, SelectKBest, SelectFromModel)

**When to use:**
- Before training any algorithm that requires scaled features (SVM, KNN, Neural Networks)
- Converting categorical variables to numeric format
- Handling missing data systematically
- Creating non-linear features for linear models

**See:** `references/preprocessing.md` for detailed preprocessing techniques.

### 5. Pipelines and Composition

Build reproducible, production-ready ML workflows.
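As an illustrative sketch only (assuming a numeric `X_train`/`X_test` with at least five features and a positive-valued `y_train`; these names are placeholders), the components listed below compose freely, for example a `FeatureUnion` running two transformers in parallel inside a `Pipeline`, wrapped in a `TransformedTargetRegressor` to handle the target transform:

```python
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge

# Run PCA and univariate selection in parallel and concatenate their outputs
features = FeatureUnion([
    ('pca', PCA(n_components=5)),
    ('kbest', SelectKBest(f_regression, k=5)),
])

# Chain scaling, the feature union, and a regressor; log-transform the target
model = TransformedTargetRegressor(
    regressor=Pipeline([
        ('scaler', StandardScaler()),
        ('features', features),
        ('ridge', Ridge()),
    ]),
    func=np.log1p,
    inverse_func=np.expm1,
)

model.fit(X_train, y_train)   # X_train / y_train are assumed to exist
y_pred = model.predict(X_test)  # predictions are returned on the original target scale
```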
+ +**Key components:** +- **Pipeline**: Chain transformers and estimators sequentially +- **ColumnTransformer**: Apply different preprocessing to different columns +- **FeatureUnion**: Combine multiple transformers in parallel +- **TransformedTargetRegressor**: Transform target variable + +**Benefits:** +- Prevents data leakage in cross-validation +- Simplifies code and improves maintainability +- Enables joint hyperparameter tuning +- Ensures consistency between training and prediction + +**When to use:** +- Always use Pipelines for production workflows +- When mixing numerical and categorical features (use ColumnTransformer) +- When performing cross-validation with preprocessing steps +- When hyperparameter tuning includes preprocessing parameters + +**See:** `references/pipelines_and_composition.md` for comprehensive pipeline patterns. + +## Example Scripts + +### Classification Pipeline + +Run a complete classification workflow with preprocessing, model comparison, hyperparameter tuning, and evaluation: + +```bash +python scripts/classification_pipeline.py ``` -**Recursive Feature Elimination**: -```python -from sklearn.feature_selection import RFECV -from sklearn.ensemble import RandomForestClassifier +This script demonstrates: +- Handling mixed data types (numeric and categorical) +- Model comparison using cross-validation +- Hyperparameter tuning with GridSearchCV +- Comprehensive evaluation with multiple metrics +- Feature importance analysis -selector = RFECV( - RandomForestClassifier(n_estimators=100), - step=1, - cv=5, - n_jobs=-1 -) -X_selected = selector.fit_transform(X, y) -print(f"Optimal features: {selector.n_features_}") +### Clustering Analysis + +Perform clustering analysis with algorithm comparison and visualization: + +```bash +python scripts/clustering_analysis.py ``` -**Model-based selection**: -```python -from sklearn.feature_selection import SelectFromModel +This script demonstrates: +- Finding optimal number of clusters (elbow method, silhouette analysis) +- Comparing multiple clustering algorithms (K-Means, DBSCAN, Agglomerative, Gaussian Mixture) +- Evaluating clustering quality without ground truth +- Visualizing results with PCA projection -selector = SelectFromModel( - RandomForestClassifier(n_estimators=100), - threshold='median' # or '0.5*mean', or specific value -) -X_selected = selector.fit_transform(X, y) -``` +## Reference Documentation -### Polynomial Features +This skill includes comprehensive reference files for deep dives into specific topics: -```python -from sklearn.preprocessing import PolynomialFeatures -from sklearn.linear_model import Ridge -from sklearn.pipeline import Pipeline +### Quick Reference +**File:** `references/quick_reference.md` +- Common import patterns and installation instructions +- Quick workflow templates for common tasks +- Algorithm selection cheat sheets +- Common patterns and gotchas +- Performance optimization tips -pipeline = Pipeline([ - ('poly', PolynomialFeatures(degree=2, include_bias=False)), - ('scaler', StandardScaler()), - ('ridge', Ridge()) -]) +### Supervised Learning +**File:** `references/supervised_learning.md` +- Linear models (regression and classification) +- Support Vector Machines +- Decision Trees and ensemble methods +- K-Nearest Neighbors, Naive Bayes, Neural Networks +- Algorithm selection guide -pipeline.fit(X_train, y_train) -``` +### Unsupervised Learning +**File:** `references/unsupervised_learning.md` +- All clustering algorithms with parameters and use cases +- Dimensionality reduction 
techniques +- Outlier and novelty detection +- Gaussian Mixture Models +- Method selection guide -## Common Patterns and Best Practices +### Model Evaluation +**File:** `references/model_evaluation.md` +- Cross-validation strategies +- Hyperparameter tuning methods +- Classification, regression, and clustering metrics +- Learning and validation curves +- Best practices for model selection + +### Preprocessing +**File:** `references/preprocessing.md` +- Feature scaling and normalization +- Encoding categorical variables +- Missing value imputation +- Feature engineering techniques +- Custom transformers + +### Pipelines and Composition +**File:** `references/pipelines_and_composition.md` +- Pipeline construction and usage +- ColumnTransformer for mixed data types +- FeatureUnion for parallel transformations +- Complete end-to-end examples +- Best practices + +## Common Workflows + +### Building a Classification Model + +1. **Load and explore data** + ```python + import pandas as pd + df = pd.read_csv('data.csv') + X = df.drop('target', axis=1) + y = df['target'] + ``` + +2. **Split data with stratification** + ```python + from sklearn.model_selection import train_test_split + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, stratify=y, random_state=42 + ) + ``` + +3. **Create preprocessing pipeline** + ```python + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn.compose import ColumnTransformer + + # Handle numeric and categorical features separately + preprocessor = ColumnTransformer([ + ('num', StandardScaler(), numeric_features), + ('cat', OneHotEncoder(), categorical_features) + ]) + ``` + +4. **Build complete pipeline** + ```python + model = Pipeline([ + ('preprocessor', preprocessor), + ('classifier', RandomForestClassifier(random_state=42)) + ]) + ``` + +5. **Tune hyperparameters** + ```python + from sklearn.model_selection import GridSearchCV + + param_grid = { + 'classifier__n_estimators': [100, 200], + 'classifier__max_depth': [10, 20, None] + } + + grid_search = GridSearchCV(model, param_grid, cv=5) + grid_search.fit(X_train, y_train) + ``` + +6. **Evaluate on test set** + ```python + from sklearn.metrics import classification_report + + best_model = grid_search.best_estimator_ + y_pred = best_model.predict(X_test) + print(classification_report(y_test, y_pred)) + ``` + +### Performing Clustering Analysis + +1. **Preprocess data** + ```python + from sklearn.preprocessing import StandardScaler + + scaler = StandardScaler() + X_scaled = scaler.fit_transform(X) + ``` + +2. **Find optimal number of clusters** + ```python + from sklearn.cluster import KMeans + from sklearn.metrics import silhouette_score + + scores = [] + for k in range(2, 11): + kmeans = KMeans(n_clusters=k, random_state=42) + labels = kmeans.fit_predict(X_scaled) + scores.append(silhouette_score(X_scaled, labels)) + + optimal_k = range(2, 11)[np.argmax(scores)] + ``` + +3. **Apply clustering** + ```python + model = KMeans(n_clusters=optimal_k, random_state=42) + labels = model.fit_predict(X_scaled) + ``` + +4. 
**Visualize with dimensionality reduction** + ```python + from sklearn.decomposition import PCA + + pca = PCA(n_components=2) + X_2d = pca.fit_transform(X_scaled) + + plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='viridis') + ``` + +## Best Practices ### Always Use Pipelines - -Pipelines prevent data leakage and ensure proper workflow: - -✅ **Correct**: +Pipelines prevent data leakage and ensure consistency: ```python +# Good: Preprocessing in pipeline pipeline = Pipeline([ ('scaler', StandardScaler()), ('model', LogisticRegression()) ]) -pipeline.fit(X_train, y_train) -y_pred = pipeline.predict(X_test) + +# Bad: Preprocessing outside (can leak information) +X_scaled = StandardScaler().fit_transform(X) ``` -❌ **Wrong** (data leakage): +### Fit on Training Data Only +Never fit on test data: ```python -scaler = StandardScaler().fit(X) # Fit on all data! -X_train, X_test = train_test_split(scaler.transform(X)) +# Good +scaler = StandardScaler() +X_train_scaled = scaler.fit_transform(X_train) +X_test_scaled = scaler.transform(X_test) # Only transform + +# Bad +scaler = StandardScaler() +X_all_scaled = scaler.fit_transform(np.vstack([X_train, X_test])) ``` -### Stratify for Imbalanced Classes - +### Use Stratified Splitting for Classification +Preserve class distribution: ```python -# Always use stratify for classification with imbalanced classes X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, stratify=y, random_state=42 ) ``` -### Scale When Necessary - -**Scale for**: SVM, Neural Networks, KNN, Linear Models with regularization, PCA, Gradient Descent - -**Don't scale for**: Tree-based models (Random Forest, Gradient Boosting), Naive Bayes - -### Handle Missing Values - +### Set Random State for Reproducibility ```python -from sklearn.impute import SimpleImputer - -# Numeric: use median (robust to outliers) -imputer = SimpleImputer(strategy='median') - -# Categorical: use constant value or most_frequent -imputer = SimpleImputer(strategy='constant', fill_value='missing') +model = RandomForestClassifier(n_estimators=100, random_state=42) ``` -### Use Appropriate Metrics +### Choose Appropriate Metrics +- Balanced data: Accuracy, F1-score +- Imbalanced data: Precision, Recall, ROC AUC, Balanced Accuracy +- Cost-sensitive: Define custom scorer -- **Balanced classification**: accuracy, F1 -- **Imbalanced classification**: balanced_accuracy, F1-weighted, ROC-AUC -- **Regression with outliers**: MAE instead of RMSE -- **Cost-sensitive**: custom scorer +### Scale Features When Required +Algorithms requiring feature scaling: +- SVM, KNN, Neural Networks +- PCA, Linear/Logistic Regression with regularization +- K-Means clustering -### Set Random States +Algorithms not requiring scaling: +- Tree-based models (Decision Trees, Random Forest, Gradient Boosting) +- Naive Bayes +## Troubleshooting Common Issues + +### ConvergenceWarning +**Issue:** Model didn't converge +**Solution:** Increase `max_iter` or scale features ```python -# For reproducibility -model = RandomForestClassifier(random_state=42) -X_train, X_test, y_train, y_test = train_test_split( - X, y, random_state=42 -) +model = LogisticRegression(max_iter=1000) ``` -### Use Parallel Processing - +### Poor Performance on Test Set +**Issue:** Overfitting +**Solution:** Use regularization, cross-validation, or simpler model ```python -# Use all CPU cores -model = RandomForestClassifier(n_jobs=-1) -grid_search = GridSearchCV(model, param_grid, n_jobs=-1) +# Add regularization +model = Ridge(alpha=1.0) + +# Use 
cross-validation +scores = cross_val_score(model, X, y, cv=5) ``` -## Unsupervised Learning - -### Clustering Workflow - +### Memory Error with Large Datasets +**Solution:** Use algorithms designed for large data ```python -from sklearn.preprocessing import StandardScaler -from sklearn.cluster import KMeans -from sklearn.metrics import silhouette_score +# Use SGD for large datasets +from sklearn.linear_model import SGDClassifier +model = SGDClassifier() -# Always scale for clustering -scaler = StandardScaler() -X_scaled = scaler.fit_transform(X) - -# Elbow method to find optimal k -inertias = [] -silhouette_scores = [] -K_range = range(2, 11) - -for k in K_range: - kmeans = KMeans(n_clusters=k, random_state=42) - labels = kmeans.fit_predict(X_scaled) - inertias.append(kmeans.inertia_) - silhouette_scores.append(silhouette_score(X_scaled, labels)) - -# Plot and choose k -import matplotlib.pyplot as plt -fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) -ax1.plot(K_range, inertias, 'bo-') -ax1.set_xlabel('k') -ax1.set_ylabel('Inertia') -ax2.plot(K_range, silhouette_scores, 'ro-') -ax2.set_xlabel('k') -ax2.set_ylabel('Silhouette Score') -plt.show() - -# Fit final model -optimal_k = 5 # Based on elbow/silhouette -kmeans = KMeans(n_clusters=optimal_k, random_state=42) -labels = kmeans.fit_predict(X_scaled) +# Or MiniBatchKMeans for clustering +from sklearn.cluster import MiniBatchKMeans +model = MiniBatchKMeans(n_clusters=8, batch_size=100) ``` -### Dimensionality Reduction +## Additional Resources -```python -from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler - -# ALWAYS scale before PCA -scaler = StandardScaler() -X_scaled = scaler.fit_transform(X) - -# Specify variance to retain -pca = PCA(n_components=0.95) # Keep 95% of variance -X_pca = pca.fit_transform(X_scaled) - -print(f"Original features: {X.shape[1]}") -print(f"Reduced features: {pca.n_components_}") -print(f"Variance explained: {pca.explained_variance_ratio_.sum():.3f}") - -# Visualize explained variance -import matplotlib.pyplot as plt -plt.plot(np.cumsum(pca.explained_variance_ratio_)) -plt.xlabel('Number of components') -plt.ylabel('Cumulative explained variance') -plt.show() -``` - -### Visualization with t-SNE - -```python -from sklearn.manifold import TSNE -from sklearn.decomposition import PCA - -# Reduce to 50 dimensions with PCA first (faster) -pca = PCA(n_components=min(50, X.shape[1])) -X_pca = pca.fit_transform(X_scaled) - -# Apply t-SNE (only for visualization!) 
-tsne = TSNE(n_components=2, random_state=42, perplexity=30) -X_tsne = tsne.fit_transform(X_pca) - -# Visualize -plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap='viridis', alpha=0.6) -plt.colorbar() -plt.title('t-SNE Visualization') -plt.show() -``` - -## Saving and Loading Models - -```python -import joblib - -# Save model or pipeline -joblib.dump(model, 'model.pkl') -joblib.dump(pipeline, 'pipeline.pkl') - -# Load -loaded_model = joblib.load('model.pkl') -loaded_pipeline = joblib.load('pipeline.pkl') - -# Use loaded model -predictions = loaded_model.predict(X_new) -``` - -## Reference Documentation - -This skill includes comprehensive reference files: - -- **`references/supervised_learning.md`**: Detailed coverage of all classification and regression algorithms, parameters, use cases, and selection guidelines -- **`references/preprocessing.md`**: Complete guide to data preprocessing including scaling, encoding, imputation, transformations, and best practices -- **`references/model_evaluation.md`**: In-depth coverage of cross-validation strategies, metrics, hyperparameter tuning, and validation techniques -- **`references/unsupervised_learning.md`**: Comprehensive guide to clustering, dimensionality reduction, anomaly detection, and evaluation methods -- **`references/pipelines_and_composition.md`**: Complete guide to Pipeline, ColumnTransformer, FeatureUnion, custom transformers, and composition patterns -- **`references/quick_reference.md`**: Quick lookup guide with code snippets, common patterns, and decision trees for algorithm selection - -Read these files when: -- Need detailed parameter explanations for specific algorithms -- Comparing multiple algorithms for a task -- Understanding evaluation metrics in depth -- Building complex preprocessing workflows -- Troubleshooting common issues - -Example search patterns: -```python -# To find information about specific algorithms -grep -r "GradientBoosting" references/ - -# To find preprocessing techniques -grep -r "OneHotEncoder" references/preprocessing.md - -# To find evaluation metrics -grep -r "f1_score" references/model_evaluation.md -``` - -## Common Pitfalls to Avoid - -1. **Data leakage**: Always use pipelines, fit only on training data -2. **Not scaling**: Scale for distance-based algorithms (SVM, KNN, Neural Networks) -3. **Wrong metrics**: Use appropriate metrics for imbalanced data -4. **Not using cross-validation**: Single train-test split can be misleading -5. **Forgetting stratification**: Stratify for imbalanced classification -6. **Using t-SNE for preprocessing**: t-SNE is for visualization only! -7. **Not setting random_state**: Results won't be reproducible -8. **Ignoring class imbalance**: Use stratification, appropriate metrics, or resampling -9. **PCA without scaling**: Components will be dominated by high-variance features -10. 
**Testing on training data**: Always evaluate on held-out test set +- Official Documentation: https://scikit-learn.org/stable/ +- User Guide: https://scikit-learn.org/stable/user_guide.html +- API Reference: https://scikit-learn.org/stable/api/index.html +- Examples Gallery: https://scikit-learn.org/stable/auto_examples/index.html diff --git a/scientific-packages/scikit-learn/references/model_evaluation.md b/scientific-packages/scikit-learn/references/model_evaluation.md index 5543fcf..e070bd5 100644 --- a/scientific-packages/scikit-learn/references/model_evaluation.md +++ b/scientific-packages/scikit-learn/references/model_evaluation.md @@ -1,443 +1,399 @@ -# Model Evaluation and Selection in scikit-learn +# Model Selection and Evaluation Reference ## Overview -Model evaluation assesses how well models generalize to unseen data. Scikit-learn provides three main APIs for evaluation: -1. **Estimator score methods**: Built-in evaluation (accuracy for classifiers, R² for regressors) -2. **Scoring parameter**: Used in cross-validation and hyperparameter tuning -3. **Metric functions**: Specialized evaluation in `sklearn.metrics` + +Comprehensive guide for evaluating models, tuning hyperparameters, and selecting the best model using scikit-learn's model selection tools. + +## Train-Test Split + +### Basic Splitting + +```python +from sklearn.model_selection import train_test_split + +# Basic split (default 75/25) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) + +# With stratification (preserves class distribution) +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.25, stratify=y, random_state=42 +) + +# Three-way split (train/val/test) +X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42) +X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42) +``` ## Cross-Validation -Cross-validation evaluates model performance by splitting data into multiple train/test sets. This addresses overfitting: "a model that would just repeat the labels of the samples that it has just seen would have a perfect score but would fail to predict anything useful on yet-unseen data." - -### Basic Cross-Validation - -```python -from sklearn.model_selection import cross_val_score -from sklearn.linear_model import LogisticRegression - -model = LogisticRegression() -scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') -print(f"Accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})") -``` - ### Cross-Validation Strategies -#### For i.i.d. 
Data - -**KFold**: Standard k-fold cross-validation -- Splits data into k equal folds -- Each fold used once as test set -- `n_splits`: Number of folds (typically 5 or 10) - +**KFold** +- Standard k-fold cross-validation +- Splits data into k consecutive folds ```python from sklearn.model_selection import KFold -cv = KFold(n_splits=5, shuffle=True, random_state=42) + +kf = KFold(n_splits=5, shuffle=True, random_state=42) +for train_idx, val_idx in kf.split(X): + X_train, X_val = X[train_idx], X[val_idx] + y_train, y_val = y[train_idx], y[val_idx] ``` -**RepeatedKFold**: Repeats KFold with different randomization -- More robust estimation -- Computationally expensive - -**LeaveOneOut (LOO)**: Each sample is a test set -- Maximum training data usage -- Very computationally expensive -- High variance in estimates -- Use only for small datasets (<1000 samples) - -**ShuffleSplit**: Random train/test splits -- Flexible train/test sizes -- Can control number of iterations -- Good for quick evaluation - -```python -from sklearn.model_selection import ShuffleSplit -cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42) -``` - -#### For Imbalanced Classes - -**StratifiedKFold**: Preserves class proportions in each fold -- Essential for imbalanced datasets -- Default for classification in cross_val_score() - +**StratifiedKFold** +- Preserves class distribution in each fold +- Use for imbalanced classification ```python from sklearn.model_selection import StratifiedKFold -cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) + +skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) +for train_idx, val_idx in skf.split(X, y): + X_train, X_val = X[train_idx], X[val_idx] + y_train, y_val = y[train_idx], y[val_idx] ``` -**StratifiedShuffleSplit**: Stratified random splits - -#### For Grouped Data - -Use when samples are not independent (e.g., multiple measurements from same subject). 
- -**GroupKFold**: Groups don't appear in both train and test -```python -from sklearn.model_selection import GroupKFold -cv = GroupKFold(n_splits=5) -scores = cross_val_score(model, X, y, groups=groups, cv=cv) -``` - -**StratifiedGroupKFold**: Combines stratification with group separation - -**LeaveOneGroupOut**: Each group becomes a test set - -#### For Time Series - -**TimeSeriesSplit**: Expanding window approach -- Successive training sets are supersets -- Respects temporal ordering -- No data leakage from future to past - +**TimeSeriesSplit** +- For time series data +- Respects temporal order ```python from sklearn.model_selection import TimeSeriesSplit -cv = TimeSeriesSplit(n_splits=5) -for train_idx, test_idx in cv.split(X): - # Train on indices 0 to t, test on t+1 to t+k - pass + +tscv = TimeSeriesSplit(n_splits=5) +for train_idx, val_idx in tscv.split(X): + X_train, X_val = X[train_idx], X[val_idx] + y_train, y_val = y[train_idx], y[val_idx] +``` + +**GroupKFold** +- Ensures samples from same group don't appear in both train and validation +- Use when samples are not independent +```python +from sklearn.model_selection import GroupKFold + +gkf = GroupKFold(n_splits=5) +for train_idx, val_idx in gkf.split(X, y, groups=group_ids): + X_train, X_val = X[train_idx], X[val_idx] + y_train, y_val = y[train_idx], y[val_idx] +``` + +**LeaveOneOut (LOO)** +- Each sample used as validation set once +- Use for very small datasets +- Computationally expensive +```python +from sklearn.model_selection import LeaveOneOut + +loo = LeaveOneOut() +for train_idx, val_idx in loo.split(X): + X_train, X_val = X[train_idx], X[val_idx] + y_train, y_val = y[train_idx], y[val_idx] ``` ### Cross-Validation Functions -**cross_val_score**: Returns array of scores +**cross_val_score** +- Evaluate model using cross-validation +- Returns array of scores ```python -scores = cross_val_score(model, X, y, cv=5, scoring='f1_weighted') +from sklearn.model_selection import cross_val_score +from sklearn.ensemble import RandomForestClassifier + +model = RandomForestClassifier(n_estimators=100, random_state=42) +scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') + +print(f"Scores: {scores}") +print(f"Mean: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})") ``` -**cross_validate**: Returns multiple metrics and timing +**cross_validate** +- More comprehensive than cross_val_score +- Can return multiple metrics and fit times ```python -results = cross_validate( +from sklearn.model_selection import cross_validate + +model = RandomForestClassifier(n_estimators=100, random_state=42) +cv_results = cross_validate( model, X, y, cv=5, - scoring=['accuracy', 'f1_weighted', 'roc_auc'], + scoring=['accuracy', 'precision', 'recall', 'f1'], return_train_score=True, return_estimator=True # Returns fitted estimators ) -print(results['test_accuracy']) -print(results['fit_time']) + +print(f"Test accuracy: {cv_results['test_accuracy'].mean():.3f}") +print(f"Test precision: {cv_results['test_precision'].mean():.3f}") +print(f"Fit time: {cv_results['fit_time'].mean():.3f}s") ``` -**cross_val_predict**: Returns predictions for model blending/visualization +**cross_val_predict** +- Get predictions for each sample when it was in validation set +- Useful for analyzing errors ```python from sklearn.model_selection import cross_val_predict + +model = RandomForestClassifier(n_estimators=100, random_state=42) y_pred = cross_val_predict(model, X, y, cv=5) -# Use for confusion matrix, error analysis, etc. 
+ +# Now can analyze predictions vs actual +from sklearn.metrics import confusion_matrix +cm = confusion_matrix(y, y_pred) ``` ## Hyperparameter Tuning -### GridSearchCV -Exhaustively searches all parameter combinations. +### Grid Search +**GridSearchCV** +- Exhaustive search over parameter grid +- Tests all combinations ```python from sklearn.model_selection import GridSearchCV from sklearn.ensemble import RandomForestClassifier param_grid = { - 'n_estimators': [100, 200, 500], - 'max_depth': [10, 20, 30, None], + 'n_estimators': [50, 100, 200], + 'max_depth': [5, 10, 15, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4] } +model = RandomForestClassifier(random_state=42) grid_search = GridSearchCV( - RandomForestClassifier(random_state=42), - param_grid, + model, param_grid, cv=5, - scoring='f1_weighted', + scoring='accuracy', n_jobs=-1, # Use all CPU cores - verbose=2 + verbose=1 ) grid_search.fit(X_train, y_train) -print("Best parameters:", grid_search.best_params_) -print("Best score:", grid_search.best_score_) -# Use best model +print(f"Best parameters: {grid_search.best_params_}") +print(f"Best cross-validation score: {grid_search.best_score_:.3f}") +print(f"Test score: {grid_search.score(X_test, y_test):.3f}") + +# Access best model best_model = grid_search.best_estimator_ + +# View all results +import pandas as pd +results_df = pd.DataFrame(grid_search.cv_results_) ``` -**When to use**: -- Small parameter spaces -- When computational resources allow -- When exhaustive search is desired - -### RandomizedSearchCV -Samples parameter combinations from distributions. +### Randomized Search +**RandomizedSearchCV** +- Samples random combinations from parameter distributions +- More efficient for large search spaces ```python from sklearn.model_selection import RandomizedSearchCV from scipy.stats import randint, uniform param_distributions = { - 'n_estimators': randint(100, 1000), - 'max_depth': randint(5, 50), + 'n_estimators': randint(50, 300), + 'max_depth': [5, 10, 15, 20, None], 'min_samples_split': randint(2, 20), 'min_samples_leaf': randint(1, 10), - 'max_features': uniform(0.1, 0.9) + 'max_features': uniform(0.1, 0.9) # Continuous distribution } +model = RandomForestClassifier(random_state=42) random_search = RandomizedSearchCV( - RandomForestClassifier(random_state=42), - param_distributions, + model, param_distributions, n_iter=100, # Number of parameter settings sampled cv=5, - scoring='f1_weighted', + scoring='accuracy', n_jobs=-1, + verbose=1, random_state=42 ) random_search.fit(X_train, y_train) + +print(f"Best parameters: {random_search.best_params_}") +print(f"Best score: {random_search.best_score_:.3f}") ``` -**When to use**: -- Large parameter spaces -- When budget is limited -- Often finds good parameters faster than GridSearchCV - -**Advantage**: "Budget can be chosen independent of the number of parameters and possible values" - ### Successive Halving -**HalvingGridSearchCV** and **HalvingRandomSearchCV**: Tournament-style selection - -**How it works**: -1. Start with many candidates, minimal resources -2. Eliminate poor performers -3. Increase resources for remaining candidates -4. 
Repeat until best candidates found - -**When to use**: -- Large parameter spaces -- Expensive model training -- When many parameter combinations are clearly inferior - +**HalvingGridSearchCV / HalvingRandomSearchCV** +- Iteratively selects best candidates using successive halving +- More efficient than exhaustive search ```python from sklearn.experimental import enable_halving_search_cv from sklearn.model_selection import HalvingGridSearchCV +param_grid = { + 'n_estimators': [50, 100, 200, 300], + 'max_depth': [5, 10, 15, 20, None], + 'min_samples_split': [2, 5, 10, 20] +} + +model = RandomForestClassifier(random_state=42) halving_search = HalvingGridSearchCV( - estimator, - param_grid, - factor=3, # Proportion of candidates eliminated each round - cv=5 + model, param_grid, + cv=5, + factor=3, # Proportion of candidates eliminated in each iteration + resource='n_samples', # Can also use 'n_estimators' for ensembles + max_resources='auto', + random_state=42 ) + +halving_search.fit(X_train, y_train) +print(f"Best parameters: {halving_search.best_params_}") ``` ## Classification Metrics -### Accuracy-Based Metrics - -**Accuracy**: Proportion of correct predictions -```python -from sklearn.metrics import accuracy_score -accuracy = accuracy_score(y_true, y_pred) -``` - -**When to use**: Balanced datasets only -**When NOT to use**: Imbalanced datasets (misleading) - -**Balanced Accuracy**: Average recall per class -```python -from sklearn.metrics import balanced_accuracy_score -bal_acc = balanced_accuracy_score(y_true, y_pred) -``` - -**When to use**: Imbalanced datasets, ensures all classes matter equally - -### Precision, Recall, F-Score - -**Precision**: Of predicted positives, how many are actually positive -- Formula: TP / (TP + FP) -- Answers: "How reliable are positive predictions?" - -**Recall** (Sensitivity): Of actual positives, how many are predicted positive -- Formula: TP / (TP + FN) -- Answers: "How complete is positive detection?" 
- -**F1-Score**: Harmonic mean of precision and recall -- Formula: 2 * (precision * recall) / (precision + recall) -- Balanced measure when both precision and recall are important +### Basic Metrics ```python -from sklearn.metrics import precision_recall_fscore_support, f1_score - -precision, recall, f1, support = precision_recall_fscore_support( - y_true, y_pred, average='weighted' +from sklearn.metrics import ( + accuracy_score, precision_score, recall_score, f1_score, + balanced_accuracy_score, matthews_corrcoef ) -# Or individually -f1 = f1_score(y_true, y_pred, average='weighted') +y_pred = model.predict(X_test) + +accuracy = accuracy_score(y_test, y_pred) +precision = precision_score(y_test, y_pred, average='weighted') # For multiclass +recall = recall_score(y_test, y_pred, average='weighted') +f1 = f1_score(y_test, y_pred, average='weighted') +balanced_acc = balanced_accuracy_score(y_test, y_pred) # Good for imbalanced data +mcc = matthews_corrcoef(y_test, y_pred) # Matthews correlation coefficient + +print(f"Accuracy: {accuracy:.3f}") +print(f"Precision: {precision:.3f}") +print(f"Recall: {recall:.3f}") +print(f"F1-score: {f1:.3f}") +print(f"Balanced Accuracy: {balanced_acc:.3f}") +print(f"MCC: {mcc:.3f}") ``` -**Averaging strategies for multiclass**: -- `binary`: Binary classification only -- `micro`: Calculate globally (total TP, FP, FN) -- `macro`: Calculate per class, unweighted mean (all classes equal) -- `weighted`: Calculate per class, weighted by support (class frequency) -- `samples`: For multilabel classification +### Classification Report -**When to use**: -- `macro`: When all classes equally important (even rare ones) -- `weighted`: When class frequency matters -- `micro`: When overall performance across all samples matters +```python +from sklearn.metrics import classification_report + +print(classification_report(y_test, y_pred, target_names=class_names)) +``` ### Confusion Matrix -Shows true positives, false positives, true negatives, false negatives. 
- ```python from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay import matplotlib.pyplot as plt -cm = confusion_matrix(y_true, y_pred) -disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1']) +cm = confusion_matrix(y_test, y_pred) +disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names) +disp.plot(cmap='Blues') +plt.show() +``` + +### ROC and AUC + +```python +from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay + +# Binary classification +y_proba = model.predict_proba(X_test)[:, 1] +auc = roc_auc_score(y_test, y_proba) +print(f"ROC AUC: {auc:.3f}") + +# Plot ROC curve +fpr, tpr, thresholds = roc_curve(y_test, y_proba) +RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc).plot() + +# Multiclass (one-vs-rest) +auc_ovr = roc_auc_score(y_test, y_proba_multi, multi_class='ovr') +``` + +### Precision-Recall Curve + +```python +from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay +from sklearn.metrics import average_precision_score + +precision, recall, thresholds = precision_recall_curve(y_test, y_proba) +ap = average_precision_score(y_test, y_proba) + +disp = PrecisionRecallDisplay(precision=precision, recall=recall, average_precision=ap) disp.plot() -plt.show() -``` - -### ROC Curve and AUC - -**ROC (Receiver Operating Characteristic)**: Plot of true positive rate vs false positive rate at different thresholds - -**AUC (Area Under Curve)**: Measures overall ability to discriminate between classes -- 1.0 = perfect classifier -- 0.5 = random classifier -- <0.5 = worse than random - -```python -from sklearn.metrics import roc_auc_score, roc_curve -import matplotlib.pyplot as plt - -# Requires probability predictions -y_proba = model.predict_proba(X_test)[:, 1] # Probabilities for positive class - -auc = roc_auc_score(y_true, y_proba) -fpr, tpr, thresholds = roc_curve(y_true, y_proba) - -plt.plot(fpr, tpr, label=f'AUC = {auc:.3f}') -plt.xlabel('False Positive Rate') -plt.ylabel('True Positive Rate') -plt.legend() -plt.show() -``` - -**Multiclass ROC**: Use `multi_class='ovr'` (one-vs-rest) or `'ovo'` (one-vs-one) - -```python -auc = roc_auc_score(y_true, y_proba, multi_class='ovr') ``` ### Log Loss -Measures probability calibration quality. - ```python from sklearn.metrics import log_loss -loss = log_loss(y_true, y_proba) -``` -**When to use**: When probability quality matters, not just class predictions -**Lower is better**: Perfect predictions have log loss of 0 - -### Classification Report - -Comprehensive summary of precision, recall, f1-score per class. - -```python -from sklearn.metrics import classification_report - -print(classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1'])) +y_proba = model.predict_proba(X_test) +logloss = log_loss(y_test, y_proba) +print(f"Log Loss: {logloss:.3f}") ``` ## Regression Metrics -### Mean Squared Error (MSE) -Average squared difference between predictions and true values. 
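One version note for the snippet below: the `squared=False` shortcut of `mean_squared_error` is deprecated in recent scikit-learn releases; from scikit-learn 1.4 onward a dedicated `root_mean_squared_error` function is available. A minimal sketch, assuming `y_test` and `y_pred` arrays as in the example below:

```python
# RMSE via the dedicated helper (scikit-learn >= 1.4); on older versions,
# mean_squared_error(y_test, y_pred, squared=False) gives the same value.
from sklearn.metrics import root_mean_squared_error

rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.3f}")
```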
- ```python -from sklearn.metrics import mean_squared_error -mse = mean_squared_error(y_true, y_pred) -rmse = mean_squared_error(y_true, y_pred, squared=False) # Root MSE +from sklearn.metrics import ( + mean_squared_error, mean_absolute_error, r2_score, + mean_absolute_percentage_error, median_absolute_error +) + +y_pred = model.predict(X_test) + +mse = mean_squared_error(y_test, y_pred) +rmse = mean_squared_error(y_test, y_pred, squared=False) +mae = mean_absolute_error(y_test, y_pred) +r2 = r2_score(y_test, y_pred) +mape = mean_absolute_percentage_error(y_test, y_pred) +median_ae = median_absolute_error(y_test, y_pred) + +print(f"MSE: {mse:.3f}") +print(f"RMSE: {rmse:.3f}") +print(f"MAE: {mae:.3f}") +print(f"R² Score: {r2:.3f}") +print(f"MAPE: {mape:.3f}") +print(f"Median AE: {median_ae:.3f}") ``` -**Characteristics**: -- Penalizes large errors heavily (squared term) -- Same units as target² (use RMSE for same units as target) -- Lower is better +## Clustering Metrics -### Mean Absolute Error (MAE) -Average absolute difference between predictions and true values. +### With Ground Truth Labels ```python -from sklearn.metrics import mean_absolute_error -mae = mean_absolute_error(y_true, y_pred) +from sklearn.metrics import ( + adjusted_rand_score, normalized_mutual_info_score, + adjusted_mutual_info_score, fowlkes_mallows_score, + homogeneity_score, completeness_score, v_measure_score +) + +ari = adjusted_rand_score(y_true, y_pred) +nmi = normalized_mutual_info_score(y_true, y_pred) +ami = adjusted_mutual_info_score(y_true, y_pred) +fmi = fowlkes_mallows_score(y_true, y_pred) +homogeneity = homogeneity_score(y_true, y_pred) +completeness = completeness_score(y_true, y_pred) +v_measure = v_measure_score(y_true, y_pred) ``` -**Characteristics**: -- More robust to outliers than MSE -- Same units as target -- More interpretable -- Lower is better - -**MSE vs MAE**: Use MAE when outliers shouldn't dominate the metric - -### R² Score (Coefficient of Determination) -Proportion of variance explained by the model. +### Without Ground Truth ```python -from sklearn.metrics import r2_score -r2 = r2_score(y_true, y_pred) +from sklearn.metrics import ( + silhouette_score, calinski_harabasz_score, davies_bouldin_score +) + +silhouette = silhouette_score(X, labels) # [-1, 1], higher better +ch_score = calinski_harabasz_score(X, labels) # Higher better +db_score = davies_bouldin_score(X, labels) # Lower better ``` -**Interpretation**: -- 1.0 = perfect predictions -- 0.0 = model as good as mean -- <0.0 = model worse than mean (possible!) -- Higher is better +## Custom Scoring -**Note**: Can be negative for models that perform worse than predicting the mean. - -### Mean Absolute Percentage Error (MAPE) -Percentage-based error metric. +### Using make_scorer ```python -from sklearn.metrics import mean_absolute_percentage_error -mape = mean_absolute_percentage_error(y_true, y_pred) -``` +from sklearn.metrics import make_scorer -**When to use**: When relative errors matter more than absolute errors -**Warning**: Undefined when true values are zero - -### Median Absolute Error -Median of absolute errors (robust to outliers). - -```python -from sklearn.metrics import median_absolute_error -med_ae = median_absolute_error(y_true, y_pred) -``` - -### Max Error -Maximum residual error. 
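`make_scorer` can also bind extra arguments of an existing metric rather than wrapping a custom function as in the generic example below; for instance, an F-beta scorer with `beta=2` weights recall more heavily than precision. A minimal sketch, assuming a classifier `model` and data `X`, `y`:

```python
# F2 scorer: beta=2 favors recall over precision when scoring models.
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import cross_val_score

f2_scorer = make_scorer(fbeta_score, beta=2)
scores = cross_val_score(model, X, y, cv=5, scoring=f2_scorer)
print(f"F2: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
```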
- -```python -from sklearn.metrics import max_error -max_err = max_error(y_true, y_pred) -``` - -**When to use**: When worst-case performance matters - -## Custom Scoring Functions - -Create custom scorers for GridSearchCV and cross_val_score: - -```python -from sklearn.metrics import make_scorer, fbeta_score - -# F2 score (weights recall higher than precision) -f2_scorer = make_scorer(fbeta_score, beta=2) - -# Custom function def custom_metric(y_true, y_pred): # Your custom logic return score @@ -448,78 +404,37 @@ custom_scorer = make_scorer(custom_metric, greater_is_better=True) scores = cross_val_score(model, X, y, cv=5, scoring=custom_scorer) ``` -## Scoring Parameter Options - -Common scoring strings for `scoring` parameter: - -**Classification**: -- `'accuracy'`, `'balanced_accuracy'` -- `'precision'`, `'recall'`, `'f1'` (add `_macro`, `_micro`, `_weighted` for multiclass) -- `'roc_auc'`, `'roc_auc_ovr'`, `'roc_auc_ovo'` -- `'log_loss'` (lower is better, negate for maximization) -- `'jaccard'` (Jaccard similarity) - -**Regression**: -- `'r2'` -- `'neg_mean_squared_error'`, `'neg_root_mean_squared_error'` -- `'neg_mean_absolute_error'` -- `'neg_mean_absolute_percentage_error'` -- `'neg_median_absolute_error'` - -**Note**: Many metrics are negated (neg_*) so GridSearchCV can maximize them. - -## Validation Strategies - -### Train-Test Split -Simple single split. +### Multiple Metrics in Grid Search ```python -from sklearn.model_selection import train_test_split +from sklearn.model_selection import GridSearchCV -X_train, X_test, y_train, y_test = train_test_split( - X, y, - test_size=0.2, - random_state=42, - stratify=y # For classification with imbalanced classes +scoring = { + 'accuracy': 'accuracy', + 'precision': 'precision_weighted', + 'recall': 'recall_weighted', + 'f1': 'f1_weighted' +} + +grid_search = GridSearchCV( + model, param_grid, + cv=5, + scoring=scoring, + refit='f1', # Refit on best f1 score + return_train_score=True ) + +grid_search.fit(X_train, y_train) ``` -**When to use**: Large datasets, quick evaluation -**Parameters**: -- `test_size`: Proportion for test (typically 0.2-0.3) -- `stratify`: Preserves class proportions -- `random_state`: Reproducibility +## Validation Curves -### Train-Validation-Test Split -Three-way split for hyperparameter tuning. - -```python -# First split: train+val and test -X_trainval, X_test, y_trainval, y_test = train_test_split( - X, y, test_size=0.2, random_state=42 -) - -# Second split: train and validation -X_train, X_val, y_train, y_val = train_test_split( - X_trainval, y_trainval, test_size=0.2, random_state=42 -) - -# Or use GridSearchCV with train+val, then evaluate on test -``` - -**When to use**: Model selection and final evaluation -**Strategy**: -1. Train: Model training -2. Validation: Hyperparameter tuning -3. Test: Final, unbiased evaluation (touch only once!) - -### Learning Curves - -Diagnose bias vs variance issues. 
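If a recent scikit-learn is available, the manual plotting in the examples below can also be done with the built-in display helpers (`LearningCurveDisplay` since 1.2, `ValidationCurveDisplay` since 1.3). A minimal sketch assuming an estimator `model` with an `n_estimators` parameter and data `X`, `y`:

```python
# One-call learning and validation curve plots (scikit-learn >= 1.3 for both helpers).
import matplotlib.pyplot as plt
from sklearn.model_selection import LearningCurveDisplay, ValidationCurveDisplay

LearningCurveDisplay.from_estimator(model, X, y, cv=5)
ValidationCurveDisplay.from_estimator(
    model, X, y,
    param_name='n_estimators',
    param_range=[50, 100, 200],
    cv=5
)
plt.show()
```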
+### Learning Curve ```python from sklearn.model_selection import learning_curve import matplotlib.pyplot as plt +import numpy as np train_sizes, train_scores, val_scores = learning_curve( model, X, y, @@ -529,73 +444,149 @@ train_sizes, train_scores, val_scores = learning_curve( n_jobs=-1 ) -plt.plot(train_sizes, train_scores.mean(axis=1), label='Training score') -plt.plot(train_sizes, val_scores.mean(axis=1), label='Validation score') -plt.xlabel('Training set size') +train_mean = train_scores.mean(axis=1) +train_std = train_scores.std(axis=1) +val_mean = val_scores.mean(axis=1) +val_std = val_scores.std(axis=1) + +plt.figure(figsize=(10, 6)) +plt.plot(train_sizes, train_mean, label='Training score') +plt.plot(train_sizes, val_mean, label='Validation score') +plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1) +plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1) +plt.xlabel('Training Set Size') plt.ylabel('Score') +plt.title('Learning Curve') plt.legend() -plt.show() +plt.grid(True) ``` -**Interpretation**: -- Large gap between train and validation: **Overfitting** (high variance) -- Both scores low: **Underfitting** (high bias) -- Scores converging but low: Need better features or more complex model -- Validation score still improving: More data would help +### Validation Curve + +```python +from sklearn.model_selection import validation_curve + +param_range = [1, 10, 50, 100, 200, 500] +train_scores, val_scores = validation_curve( + model, X, y, + param_name='n_estimators', + param_range=param_range, + cv=5, + scoring='accuracy', + n_jobs=-1 +) + +train_mean = train_scores.mean(axis=1) +val_mean = val_scores.mean(axis=1) + +plt.figure(figsize=(10, 6)) +plt.plot(param_range, train_mean, label='Training score') +plt.plot(param_range, val_mean, label='Validation score') +plt.xlabel('n_estimators') +plt.ylabel('Score') +plt.title('Validation Curve') +plt.legend() +plt.grid(True) +``` + +## Model Persistence + +### Save and Load Models + +```python +import joblib + +# Save model +joblib.dump(model, 'model.pkl') + +# Load model +loaded_model = joblib.load('model.pkl') + +# Also works with pipelines +joblib.dump(pipeline, 'pipeline.pkl') +``` + +### Using pickle + +```python +import pickle + +# Save +with open('model.pkl', 'wb') as f: + pickle.dump(model, f) + +# Load +with open('model.pkl', 'rb') as f: + loaded_model = pickle.load(f) +``` + +## Imbalanced Data Strategies + +### Class Weighting + +```python +from sklearn.ensemble import RandomForestClassifier + +# Automatically balance classes +model = RandomForestClassifier(class_weight='balanced', random_state=42) +model.fit(X_train, y_train) + +# Custom weights +class_weights = {0: 1, 1: 10} # Give class 1 more weight +model = RandomForestClassifier(class_weight=class_weights, random_state=42) +``` + +### Resampling (using imbalanced-learn) + +```python +# Install: uv pip install imbalanced-learn +from imblearn.over_sampling import SMOTE +from imblearn.under_sampling import RandomUnderSampler +from imblearn.pipeline import Pipeline as ImbPipeline + +# SMOTE oversampling +smote = SMOTE(random_state=42) +X_resampled, y_resampled = smote.fit_resample(X_train, y_train) + +# Combined approach +pipeline = ImbPipeline([ + ('over', SMOTE(sampling_strategy=0.5)), + ('under', RandomUnderSampler(sampling_strategy=0.8)), + ('model', RandomForestClassifier()) +]) +``` ## Best Practices -### Metric Selection Guidelines +### Stratified Splitting +Always use stratified splitting for 
classification: +```python +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, stratify=y, random_state=42 +) +``` -**Classification - Balanced classes**: -- Accuracy or F1-score +### Appropriate Metrics +- **Balanced data**: Accuracy, F1-score +- **Imbalanced data**: Precision, Recall, F1-score, ROC AUC, Balanced Accuracy +- **Cost-sensitive**: Define custom scorer with costs +- **Ranking**: ROC AUC, Average Precision -**Classification - Imbalanced classes**: -- Balanced accuracy -- F1-score (weighted or macro) -- ROC-AUC -- Precision-Recall curve +### Cross-Validation +- Use 5 or 10-fold CV for most cases +- Use StratifiedKFold for classification +- Use TimeSeriesSplit for time series +- Use GroupKFold when samples are grouped -**Classification - Cost-sensitive**: -- Custom scorer with cost matrix -- Adjust threshold on probabilities +### Nested Cross-Validation +For unbiased performance estimates when tuning: +```python +from sklearn.model_selection import cross_val_score, GridSearchCV -**Regression - Typical use**: -- RMSE (sensitive to outliers) -- R² (proportion of variance explained) +# Inner loop: hyperparameter tuning +grid_search = GridSearchCV(model, param_grid, cv=5) -**Regression - Outliers present**: -- MAE (robust to outliers) -- Median absolute error - -**Regression - Percentage errors matter**: -- MAPE - -### Cross-Validation Guidelines - -**Number of folds**: -- 5-10 folds typical -- More folds = more computation, less variance in estimate -- LeaveOneOut only for small datasets - -**Stratification**: -- Always use for classification with imbalanced classes -- Use StratifiedKFold by default for classification - -**Grouping**: -- Always use when samples are not independent -- Time series: Always use TimeSeriesSplit - -**Nested cross-validation**: -- For unbiased performance estimate when doing hyperparameter tuning -- Outer loop: Performance estimation -- Inner loop: Hyperparameter selection - -### Avoiding Common Pitfalls - -1. **Data leakage**: Fit preprocessors only on training data within each CV fold (use Pipeline!) -2. **Test set leakage**: Never use test set for model selection -3. **Improper metric**: Use metrics appropriate for problem (balanced_accuracy for imbalanced data) -4. **Multiple testing**: More models evaluated = higher chance of random good results -5. **Temporal leakage**: For time series, use TimeSeriesSplit, not random splits -6. **Target leakage**: Features shouldn't contain information not available at prediction time +# Outer loop: performance estimation +scores = cross_val_score(grid_search, X, y, cv=5) +print(f"Nested CV score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})") +``` diff --git a/scientific-packages/scikit-learn/references/pipelines_and_composition.md b/scientific-packages/scikit-learn/references/pipelines_and_composition.md index bcf898f..7206e4c 100644 --- a/scientific-packages/scikit-learn/references/pipelines_and_composition.md +++ b/scientific-packages/scikit-learn/references/pipelines_and_composition.md @@ -1,214 +1,183 @@ -# Pipelines and Composite Estimators in scikit-learn +# Pipelines and Composite Estimators Reference ## Overview -Pipelines chain multiple estimators into a single unit, ensuring proper workflow sequencing and preventing data leakage. As the documentation states: "Pipeline can be used to chain multiple estimators into one. This is useful as there is often a fixed sequence of steps in processing the data, for example feature selection, normalization and classification." 
+ +Pipelines chain multiple processing steps into a single estimator, preventing data leakage and simplifying code. They enable reproducible workflows and seamless integration with cross-validation and hyperparameter tuning. ## Pipeline Basics -### Creating Pipelines +### Creating a Pipeline +**Pipeline (`sklearn.pipeline.Pipeline`)** +- Chains transformers with a final estimator +- All intermediate steps must have fit_transform() +- Final step can be any estimator (transformer, classifier, regressor, clusterer) +- Example: ```python from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA from sklearn.linear_model import LogisticRegression -# Method 1: List of (name, estimator) tuples pipeline = Pipeline([ ('scaler', StandardScaler()), ('pca', PCA(n_components=10)), ('classifier', LogisticRegression()) ]) -# Method 2: Using make_pipeline (auto-generates names) +# Fit the entire pipeline +pipeline.fit(X_train, y_train) + +# Predict using the pipeline +y_pred = pipeline.predict(X_test) +y_proba = pipeline.predict_proba(X_test) +``` + +### Using make_pipeline + +**make_pipeline** +- Convenient constructor that auto-generates step names +- Example: +```python from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC + pipeline = make_pipeline( StandardScaler(), PCA(n_components=10), - LogisticRegression() + SVC(kernel='rbf') ) + +pipeline.fit(X_train, y_train) ``` -### Using Pipelines +## Accessing Pipeline Components + +### Accessing Steps ```python -# Fit and predict like any estimator -pipeline.fit(X_train, y_train) -y_pred = pipeline.predict(X_test) -score = pipeline.score(X_test, y_test) +# By index +scaler = pipeline.steps[0][1] -# Access steps -pipeline.named_steps['scaler'] -pipeline.steps[0] # Returns ('scaler', StandardScaler(...)) -pipeline[0] # Returns StandardScaler(...) object -pipeline['scaler'] # Returns StandardScaler(...) object +# By name +scaler = pipeline.named_steps['scaler'] +pca = pipeline.named_steps['pca'] -# Get final estimator -pipeline[-1] # Returns LogisticRegression(...) object +# Using indexing syntax +scaler = pipeline['scaler'] +pca = pipeline['pca'] + +# Get all step names +print(pipeline.named_steps.keys()) ``` -### Pipeline Rules +### Setting Parameters -**All steps except the last must be transformers** (have `fit()` and `transform()` methods). +```python +# Set parameters using double underscore notation +pipeline.set_params( + pca__n_components=15, + classifier__C=0.1 +) -**The final step** can be: -- Predictor (classifier/regressor) with `fit()` and `predict()` -- Transformer with `fit()` and `transform()` -- Any estimator with at least `fit()` +# Or during creation +pipeline = Pipeline([ + ('scaler', StandardScaler()), + ('pca', PCA(n_components=10)), + ('classifier', LogisticRegression(C=1.0)) +]) +``` -### Pipeline Benefits +### Accessing Attributes -1. **Convenience**: Single `fit()` and `predict()` call -2. **Prevents data leakage**: Ensures proper fit/transform on train/test -3. **Joint parameter selection**: Tune all steps together with GridSearchCV -4. 
**Reproducibility**: Encapsulates entire workflow +```python +# Access fitted attributes +pca_components = pipeline.named_steps['pca'].components_ +explained_variance = pipeline.named_steps['pca'].explained_variance_ratio_ -## Accessing and Setting Parameters +# Access intermediate transformations +X_scaled = pipeline.named_steps['scaler'].transform(X_test) +X_pca = pipeline.named_steps['pca'].transform(X_scaled) +``` -### Nested Parameters +## Hyperparameter Tuning with Pipelines -Access step parameters using `stepname__parameter` syntax: +### Grid Search with Pipeline ```python from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC pipeline = Pipeline([ ('scaler', StandardScaler()), - ('clf', LogisticRegression()) + ('classifier', SVC()) ]) -# Grid search over pipeline parameters param_grid = { - 'scaler__with_mean': [True, False], - 'clf__C': [0.1, 1.0, 10.0], - 'clf__penalty': ['l1', 'l2'] + 'classifier__C': [0.1, 1, 10, 100], + 'classifier__gamma': ['scale', 'auto', 0.001, 0.01], + 'classifier__kernel': ['rbf', 'linear'] +} + +grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1) +grid_search.fit(X_train, y_train) + +print(f"Best parameters: {grid_search.best_params_}") +print(f"Best score: {grid_search.best_score_:.3f}") +``` + +### Tuning Multiple Pipeline Steps + +```python +param_grid = { + # PCA parameters + 'pca__n_components': [5, 10, 20, 50], + + # Classifier parameters + 'classifier__C': [0.1, 1, 10], + 'classifier__kernel': ['rbf', 'linear'] } grid_search = GridSearchCV(pipeline, param_grid, cv=5) grid_search.fit(X_train, y_train) ``` -### Setting Parameters - -```python -# Set parameters -pipeline.set_params(clf__C=10.0, scaler__with_std=False) - -# Get parameters -params = pipeline.get_params() -``` - -## Caching Intermediate Results - -Cache fitted transformers to avoid recomputation: - -```python -from tempfile import mkdtemp -from shutil import rmtree - -# Create cache directory -cachedir = mkdtemp() - -pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('pca', PCA(n_components=10)), - ('clf', LogisticRegression()) -], memory=cachedir) - -# When doing grid search, scaler and PCA only fit once per fold -grid_search = GridSearchCV(pipeline, param_grid, cv=5) -grid_search.fit(X_train, y_train) - -# Clean up cache -rmtree(cachedir) - -# Or use joblib for persistent caching -from joblib import Memory -memory = Memory(location='./cache', verbose=0) -pipeline = Pipeline([...], memory=memory) -``` - -**When to use caching**: -- Expensive transformations (PCA, feature selection) -- Grid search over final estimator parameters only -- Multiple experiments with same preprocessing - ## ColumnTransformer -Apply different transformations to different columns (essential for heterogeneous data). 
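When the column lists used in the examples below are not known in advance, columns can also be picked by dtype or by name pattern with `make_column_selector` from `sklearn.compose`. A minimal sketch assuming `X` is a pandas DataFrame with mixed numeric and categorical columns:

```python
# Select columns by dtype instead of hard-coding column names.
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), make_column_selector(dtype_include='number')),
    ('cat', OneHotEncoder(handle_unknown='ignore'),
     make_column_selector(dtype_exclude='number'))
])

X_transformed = preprocessor.fit_transform(X)
```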
- ### Basic Usage +**ColumnTransformer (`sklearn.compose.ColumnTransformer`)** +- Apply different preprocessing to different columns +- Prevents data leakage in cross-validation +- Example: ```python from sklearn.compose import ColumnTransformer from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.impute import SimpleImputer -# Define which transformations for which columns +# Define column groups +numeric_features = ['age', 'income', 'hours_per_week'] +categorical_features = ['gender', 'occupation', 'native_country'] + +# Create preprocessor preprocessor = ColumnTransformer( transformers=[ - ('num', StandardScaler(), ['age', 'income', 'credit_score']), - ('cat', OneHotEncoder(), ['country', 'occupation']) + ('num', StandardScaler(), numeric_features), + ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features) ], - remainder='drop' # What to do with remaining columns + remainder='passthrough' # Keep other columns unchanged ) X_transformed = preprocessor.fit_transform(X) ``` -### Column Selection Methods +### With Pipeline Steps ```python -# Method 1: Column names (list of strings) -('num', StandardScaler(), ['age', 'income']) - -# Method 2: Column indices (list of integers) -('num', StandardScaler(), [0, 1, 2]) - -# Method 3: Boolean mask -('num', StandardScaler(), [True, True, False, True, False]) - -# Method 4: Slice -('num', StandardScaler(), slice(0, 3)) - -# Method 5: make_column_selector (by dtype or pattern) -from sklearn.compose import make_column_selector as selector - -preprocessor = ColumnTransformer([ - ('num', StandardScaler(), selector(dtype_include='number')), - ('cat', OneHotEncoder(), selector(dtype_include='object')) -]) - -# Select by pattern -selector(pattern='.*_score$') # All columns ending with '_score' -``` - -### Remainder Parameter - -Controls what happens to columns not specified: - -```python -# Drop remaining columns (default) -remainder='drop' - -# Pass through remaining columns unchanged -remainder='passthrough' - -# Apply transformer to remaining columns -remainder=StandardScaler() -``` - -### Full Pipeline with ColumnTransformer - -```python -from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline -from sklearn.impute import SimpleImputer -from sklearn.preprocessing import StandardScaler, OneHotEncoder -from sklearn.ensemble import RandomForestClassifier - -# Separate preprocessing for numeric and categorical -numeric_features = ['age', 'income', 'credit_score'] -categorical_features = ['country', 'occupation', 'education'] numeric_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')), @@ -224,456 +193,420 @@ preprocessor = ColumnTransformer( transformers=[ ('num', numeric_transformer, numeric_features), ('cat', categorical_transformer, categorical_features) - ]) + ] +) -# Complete pipeline -clf = Pipeline(steps=[ +# Full pipeline with model +full_pipeline = Pipeline([ ('preprocessor', preprocessor), - ('classifier', RandomForestClassifier()) + ('classifier', LogisticRegression()) ]) -clf.fit(X_train, y_train) -y_pred = clf.predict(X_test) +full_pipeline.fit(X_train, y_train) +``` -# Grid search over preprocessing and model parameters -param_grid = { - 'preprocessor__num__imputer__strategy': ['mean', 'median'], - 'preprocessor__cat__onehot__max_categories': [10, 20, None], - 'classifier__n_estimators': [100, 200], - 'classifier__max_depth': [10, 20, None] -} +### Using make_column_transformer -grid_search = GridSearchCV(clf, param_grid, cv=5) -grid_search.fit(X_train, 
y_train) +```python +from sklearn.compose import make_column_transformer + +preprocessor = make_column_transformer( + (StandardScaler(), numeric_features), + (OneHotEncoder(), categorical_features), + remainder='passthrough' +) +``` + +### Column Selection + +```python +# By column names (if X is DataFrame) +preprocessor = ColumnTransformer([ + ('num', StandardScaler(), ['age', 'income']), + ('cat', OneHotEncoder(), ['gender', 'occupation']) +]) + +# By column indices +preprocessor = ColumnTransformer([ + ('num', StandardScaler(), [0, 1, 2]), + ('cat', OneHotEncoder(), [3, 4]) +]) + +# By boolean mask +numeric_mask = [True, True, True, False, False] +categorical_mask = [False, False, False, True, True] + +preprocessor = ColumnTransformer([ + ('num', StandardScaler(), numeric_mask), + ('cat', OneHotEncoder(), categorical_mask) +]) + +# By callable +def is_numeric(X): + return X.select_dtypes(include=['number']).columns.tolist() + +preprocessor = ColumnTransformer([ + ('num', StandardScaler(), is_numeric) +]) +``` + +### Getting Feature Names + +```python +# Get output feature names +feature_names = preprocessor.get_feature_names_out() + +# After fitting +preprocessor.fit(X_train) +output_features = preprocessor.get_feature_names_out() +print(f"Input features: {X_train.columns.tolist()}") +print(f"Output features: {output_features}") +``` + +### Remainder Handling + +```python +# Drop unspecified columns (default) +preprocessor = ColumnTransformer([...], remainder='drop') + +# Pass through unchanged +preprocessor = ColumnTransformer([...], remainder='passthrough') + +# Apply transformer to remaining columns +preprocessor = ColumnTransformer([...], remainder=StandardScaler()) ``` ## FeatureUnion -Combine multiple transformer outputs by concatenating features side-by-side. 
+### Basic Usage +**FeatureUnion (`sklearn.pipeline.FeatureUnion`)** +- Concatenates results of multiple transformers +- Transformers are applied in parallel +- Example: ```python from sklearn.pipeline import FeatureUnion from sklearn.decomposition import PCA from sklearn.feature_selection import SelectKBest # Combine PCA and feature selection -combined_features = FeatureUnion([ +feature_union = FeatureUnion([ ('pca', PCA(n_components=10)), - ('univ_select', SelectKBest(k=5)) + ('select_best', SelectKBest(k=20)) ]) -X_features = combined_features.fit_transform(X, y) -# Result: 15 features (10 from PCA + 5 from SelectKBest) +X_combined = feature_union.fit_transform(X_train, y_train) +print(f"Combined features: {X_combined.shape[1]}") # 10 + 20 = 30 +``` -# In a pipeline +### With Pipeline + +```python +from sklearn.pipeline import Pipeline, FeatureUnion +from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA, TruncatedSVD + +# Create feature union +feature_union = FeatureUnion([ + ('pca', PCA(n_components=10)), + ('svd', TruncatedSVD(n_components=10)) +]) + +# Full pipeline pipeline = Pipeline([ - ('features', combined_features), + ('scaler', StandardScaler()), + ('features', feature_union), + ('classifier', LogisticRegression()) +]) + +pipeline.fit(X_train, y_train) +``` + +### Weighted Feature Union + +```python +# Apply weights to transformers +feature_union = FeatureUnion( + transformer_list=[ + ('pca', PCA(n_components=10)), + ('select_best', SelectKBest(k=20)) + ], + transformer_weights={ + 'pca': 2.0, # Give PCA features double weight + 'select_best': 1.0 + } +) +``` + +## Advanced Pipeline Patterns + +### Caching Pipeline Steps + +```python +from sklearn.pipeline import Pipeline +from tempfile import mkdtemp +from shutil import rmtree + +# Cache intermediate results +cachedir = mkdtemp() +pipeline = Pipeline([ + ('scaler', StandardScaler()), + ('pca', PCA(n_components=50)), + ('classifier', LogisticRegression()) +], memory=cachedir) + +pipeline.fit(X_train, y_train) + +# Clean up cache +rmtree(cachedir) +``` + +### Nested Pipelines + +```python +from sklearn.pipeline import Pipeline + +# Inner pipeline for text processing +text_pipeline = Pipeline([ + ('vect', CountVectorizer()), + ('tfidf', TfidfTransformer()) +]) + +# Outer pipeline combining text and numeric features +full_pipeline = Pipeline([ + ('features', FeatureUnion([ + ('text', text_pipeline), + ('numeric', StandardScaler()) + ])), ('classifier', LogisticRegression()) ]) ``` -### FeatureUnion with Transformers on Different Data - -```python -from sklearn.pipeline import FeatureUnion -from sklearn.preprocessing import FunctionTransformer -import numpy as np - -def get_numeric_data(X): - return X[:, :3] # First 3 columns - -def get_text_data(X): - return X[:, 3] # 4th column (text) - -from sklearn.feature_extraction.text import TfidfVectorizer - -combined = FeatureUnion([ - ('numeric_features', Pipeline([ - ('selector', FunctionTransformer(get_numeric_data)), - ('scaler', StandardScaler()) - ])), - ('text_features', Pipeline([ - ('selector', FunctionTransformer(get_text_data)), - ('tfidf', TfidfVectorizer()) - ])) -]) -``` - -**Note**: ColumnTransformer is usually more convenient than FeatureUnion for heterogeneous data. 
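The nested FeatureUnion above feeds the same input to both branches, which only works when every branch can consume it. When the text and the numeric values live in separate columns of a DataFrame, routing them through a ColumnTransformer is usually the safer variant; a minimal sketch with hypothetical columns `'description'` (text) and `'price'` (numeric) and assumed `X_train`, `y_train`:

```python
# Route a text column and a numeric column to different transformers.
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

preprocessor = ColumnTransformer([
    ('text', TfidfVectorizer(), 'description'),  # single column name -> 1-D input for the vectorizer
    ('numeric', StandardScaler(), ['price'])     # list of columns -> 2-D input for the scaler
])

text_numeric_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

text_numeric_pipeline.fit(X_train, y_train)
```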
- -## Common Pipeline Patterns - -### Classification Pipeline - -```python -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.feature_selection import SelectKBest, f_classif -from sklearn.svm import SVC - -pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('feature_selection', SelectKBest(f_classif, k=10)), - ('classifier', SVC(kernel='rbf')) -]) -``` - -### Regression Pipeline - -```python -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler, PolynomialFeatures -from sklearn.linear_model import Ridge - -pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('poly', PolynomialFeatures(degree=2)), - ('ridge', Ridge(alpha=1.0)) -]) -``` - -### Text Classification Pipeline - -```python -from sklearn.pipeline import Pipeline -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.naive_bayes import MultinomialNB - -pipeline = Pipeline([ - ('tfidf', TfidfVectorizer(max_features=1000)), - ('classifier', MultinomialNB()) -]) - -# Works directly with text -pipeline.fit(X_train_text, y_train) -y_pred = pipeline.predict(X_test_text) -``` - -### Image Processing Pipeline - -```python -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.decomposition import PCA -from sklearn.neural_network import MLPClassifier - -pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('pca', PCA(n_components=100)), - ('mlp', MLPClassifier(hidden_layer_sizes=(100, 50))) -]) -``` - -### Dimensionality Reduction + Clustering - -```python -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.decomposition import PCA -from sklearn.cluster import KMeans - -pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('pca', PCA(n_components=10)), - ('kmeans', KMeans(n_clusters=5)) -]) - -labels = pipeline.fit_predict(X) -``` - -## Custom Transformers - -### Using FunctionTransformer - -```python -from sklearn.preprocessing import FunctionTransformer -import numpy as np - -# Log transformation -log_transformer = FunctionTransformer(np.log1p) - -# Custom function -def custom_transform(X): - # Your transformation logic - return X_transformed - -custom_transformer = FunctionTransformer(custom_transform) - -# In pipeline -pipeline = Pipeline([ - ('log', log_transformer), - ('scaler', StandardScaler()), - ('model', LinearRegression()) -]) -``` - -### Creating Custom Transformer Class +### Custom Transformers in Pipelines ```python from sklearn.base import BaseEstimator, TransformerMixin -class CustomTransformer(BaseEstimator, TransformerMixin): - def __init__(self, parameter=1.0): - self.parameter = parameter - +class TextLengthExtractor(BaseEstimator, TransformerMixin): def fit(self, X, y=None): - # Learn parameters from X - self.learned_param_ = X.mean() # Example return self def transform(self, X): - # Transform X using learned parameters - return X * self.parameter - self.learned_param_ + return [[len(text)] for text in X] - # Optional: for pipelines that need inverse transform - def inverse_transform(self, X): - return (X + self.learned_param_) / self.parameter - -# Use in pipeline pipeline = Pipeline([ - ('custom', CustomTransformer(parameter=2.0)), - ('model', LinearRegression()) + ('length', TextLengthExtractor()), + ('scaler', StandardScaler()), + ('classifier', LogisticRegression()) ]) ``` -**Key requirements**: -- Inherit from `BaseEstimator` and `TransformerMixin` -- Implement `fit()` and `transform()` methods 
-- `fit()` must return `self` -- Use trailing underscore for learned attributes (`learned_param_`) -- Constructor parameters should be stored as attributes - -### Transformer for Pandas DataFrames +### Slicing Pipelines + +```python +# Get sub-pipeline +sub_pipeline = pipeline[:2] # First two steps + +# Get specific range +middle_steps = pipeline[1:3] +``` + +## TransformedTargetRegressor + +### Basic Usage + +**TransformedTargetRegressor** +- Transforms target variable before fitting +- Automatically inverse-transforms predictions +- Example: +```python +from sklearn.compose import TransformedTargetRegressor +from sklearn.preprocessing import QuantileTransformer +from sklearn.linear_model import LinearRegression + +model = TransformedTargetRegressor( + regressor=LinearRegression(), + transformer=QuantileTransformer(output_distribution='normal') +) + +model.fit(X_train, y_train) +y_pred = model.predict(X_test) # Automatically inverse-transformed +``` + +### With Functions + +```python +import numpy as np + +model = TransformedTargetRegressor( + regressor=LinearRegression(), + func=np.log1p, + inverse_func=np.expm1 +) + +model.fit(X_train, y_train) +``` + +## Complete Example: End-to-End Pipeline ```python -from sklearn.base import BaseEstimator, TransformerMixin import pandas as pd +from sklearn.compose import ColumnTransformer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.impute import SimpleImputer +from sklearn.decomposition import PCA +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import GridSearchCV -class DataFrameTransformer(BaseEstimator, TransformerMixin): - def __init__(self, columns=None): - self.columns = columns +# Define feature types +numeric_features = ['age', 'income', 'hours_per_week'] +categorical_features = ['gender', 'occupation', 'education'] - def fit(self, X, y=None): - return self +# Numeric preprocessing pipeline +numeric_transformer = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler()) +]) - def transform(self, X): - if isinstance(X, pd.DataFrame): - if self.columns: - return X[self.columns].values - return X.values - return X +# Categorical preprocessing pipeline +categorical_transformer = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), + ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) +]) + +# Combine preprocessing +preprocessor = ColumnTransformer( + transformers=[ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features) + ] +) + +# Full pipeline +pipeline = Pipeline([ + ('preprocessor', preprocessor), + ('pca', PCA(n_components=0.95)), # Keep 95% variance + ('classifier', RandomForestClassifier(random_state=42)) +]) + +# Hyperparameter tuning +param_grid = { + 'preprocessor__num__imputer__strategy': ['mean', 'median'], + 'pca__n_components': [0.90, 0.95, 0.99], + 'classifier__n_estimators': [100, 200], + 'classifier__max_depth': [10, 20, None] +} + +grid_search = GridSearchCV( + pipeline, param_grid, + cv=5, scoring='accuracy', + n_jobs=-1, verbose=1 +) + +grid_search.fit(X_train, y_train) + +print(f"Best parameters: {grid_search.best_params_}") +print(f"Best CV score: {grid_search.best_score_:.3f}") +print(f"Test score: {grid_search.score(X_test, y_test):.3f}") + +# Make predictions +best_pipeline = grid_search.best_estimator_ +y_pred = best_pipeline.predict(X_test) +y_proba = 
best_pipeline.predict_proba(X_test) ``` ## Visualization -### Display Pipeline in Jupyter +### Displaying Pipelines ```python +# In Jupyter notebooks, pipelines display as diagrams from sklearn import set_config - -# Enable HTML display set_config(display='diagram') -# Now displaying the pipeline shows interactive diagram -pipeline +pipeline # Displays visual diagram ``` -### Print Pipeline Structure +### Text Representation ```python -from sklearn.utils import estimator_html_repr - -# Get HTML representation -html = estimator_html_repr(pipeline) - -# Or just print +# Print pipeline structure print(pipeline) -``` -## Advanced Patterns - -### Conditional Transformations - -```python -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler, FunctionTransformer - -def conditional_scale(X, scale=True): - if scale: - return StandardScaler().fit_transform(X) - return X - -pipeline = Pipeline([ - ('conditional_scaler', FunctionTransformer( - conditional_scale, - kw_args={'scale': True} - )), - ('model', LogisticRegression()) -]) -``` - -### Multiple Preprocessing Paths - -```python -from sklearn.compose import ColumnTransformer -from sklearn.pipeline import Pipeline - -# Different preprocessing for different feature types -preprocessor = ColumnTransformer([ - # Numeric: impute + scale - ('num_standard', Pipeline([ - ('imputer', SimpleImputer(strategy='mean')), - ('scaler', StandardScaler()) - ]), ['age', 'income']), - - # Numeric: impute + log + scale - ('num_skewed', Pipeline([ - ('imputer', SimpleImputer(strategy='median')), - ('log', FunctionTransformer(np.log1p)), - ('scaler', StandardScaler()) - ]), ['price', 'revenue']), - - # Categorical: impute + one-hot - ('cat', Pipeline([ - ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), - ('onehot', OneHotEncoder(handle_unknown='ignore')) - ]), ['category', 'region']), - - # Text: TF-IDF - ('text', TfidfVectorizer(), 'description') -]) -``` - -### Feature Engineering Pipeline - -```python -from sklearn.base import BaseEstimator, TransformerMixin - -class FeatureEngineer(BaseEstimator, TransformerMixin): - def fit(self, X, y=None): - return self - - def transform(self, X): - X = X.copy() - # Add engineered features - X['age_income_ratio'] = X['age'] / (X['income'] + 1) - X['total_score'] = X['score1'] + X['score2'] + X['score3'] - return X - -pipeline = Pipeline([ - ('engineer', FeatureEngineer()), - ('preprocessor', preprocessor), - ('model', RandomForestClassifier()) -]) +# Get detailed parameters +print(pipeline.get_params()) ``` ## Best Practices -### Always Use Pipelines When - -1. **Preprocessing is needed**: Scaling, encoding, imputation -2. **Cross-validation**: Ensures proper fit/transform split -3. **Hyperparameter tuning**: Joint optimization of preprocessing and model -4. **Production deployment**: Single object to serialize -5. **Multiple steps**: Any workflow with >1 step - -### Pipeline Do's - -- ✅ Fit pipeline only on training data -- ✅ Use ColumnTransformer for heterogeneous data -- ✅ Cache expensive transformations during grid search -- ✅ Use make_pipeline for simple cases -- ✅ Set verbose=True to debug issues -- ✅ Use remainder='passthrough' when appropriate - -### Pipeline Don'ts - -- ❌ Fit preprocessing on full dataset before split (data leakage!) 
-- ❌ Manually transform test data (use pipeline.predict()) -- ❌ Forget to handle missing values before scaling -- ❌ Mix pandas DataFrames and arrays inconsistently -- ❌ Skip using pipelines for "just one preprocessing step" - -### Data Leakage Prevention +### Always Use Pipelines +- Prevents data leakage +- Ensures consistency between training and prediction +- Makes code more maintainable +- Enables easy hyperparameter tuning +### Proper Pipeline Construction ```python -# ❌ WRONG - Data leakage -scaler = StandardScaler().fit(X) # Fit on all data -X_train, X_test, y_train, y_test = train_test_split(X, y) -X_train_scaled = scaler.transform(X_train) -X_test_scaled = scaler.transform(X_test) - -# ✅ CORRECT - No leakage with pipeline +# Good: Preprocessing inside pipeline pipeline = Pipeline([ ('scaler', StandardScaler()), ('model', LogisticRegression()) ]) - -X_train, X_test, y_train, y_test = train_test_split(X, y) -pipeline.fit(X_train, y_train) # Scaler fits only on train -y_pred = pipeline.predict(X_test) # Scaler transforms only on test - -# ✅ CORRECT - No leakage in cross-validation -scores = cross_val_score(pipeline, X, y, cv=5) -# Each fold: scaler fits on train folds, transforms on test fold -``` - -### Debugging Pipelines - -```python -# Examine intermediate outputs -pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('pca', PCA(n_components=10)), - ('model', LogisticRegression()) -]) - -# Fit pipeline pipeline.fit(X_train, y_train) -# Get output after scaling -X_scaled = pipeline.named_steps['scaler'].transform(X_train) - -# Get output after PCA -X_pca = pipeline[:-1].transform(X_train) # All steps except last - -# Or build partial pipeline -partial_pipeline = Pipeline(pipeline.steps[:-1]) -X_transformed = partial_pipeline.transform(X_train) +# Bad: Preprocessing outside pipeline (can cause leakage) +X_train_scaled = StandardScaler().fit_transform(X_train) +model = LogisticRegression() +model.fit(X_train_scaled, y_train) ``` -### Saving and Loading Pipelines - +### Use ColumnTransformer for Mixed Data +Always use ColumnTransformer when you have both numerical and categorical features: ```python -import joblib - -# Save pipeline -joblib.dump(pipeline, 'model_pipeline.pkl') - -# Load pipeline -pipeline = joblib.load('model_pipeline.pkl') - -# Use loaded pipeline -y_pred = pipeline.predict(X_new) +preprocessor = ColumnTransformer([ + ('num', StandardScaler(), numeric_features), + ('cat', OneHotEncoder(), categorical_features) +]) ``` -## Common Errors and Solutions +### Name Your Steps Meaningfully +```python +# Good +pipeline = Pipeline([ + ('imputer', SimpleImputer()), + ('scaler', StandardScaler()), + ('pca', PCA(n_components=10)), + ('rf_classifier', RandomForestClassifier()) +]) -**Error**: `ValueError: could not convert string to float` -- **Cause**: Categorical features not encoded -- **Solution**: Add OneHotEncoder or OrdinalEncoder to pipeline +# Bad +pipeline = Pipeline([ + ('step1', SimpleImputer()), + ('step2', StandardScaler()), + ('step3', PCA(n_components=10)), + ('step4', RandomForestClassifier()) +]) +``` -**Error**: `All intermediate steps should be transformers` -- **Cause**: Non-transformer in non-final position -- **Solution**: Ensure only last step is predictor +### Cache Expensive Transformations +For repeated fitting (e.g., during grid search), cache expensive steps: +```python +from tempfile import mkdtemp -**Error**: `X has different number of features than during fitting` -- **Cause**: Different columns in train and test -- **Solution**: Ensure 
consistent column handling, use `handle_unknown='ignore'` in OneHotEncoder +cachedir = mkdtemp() +pipeline = Pipeline([ + ('expensive_preprocessing', ExpensiveTransformer()), + ('classifier', LogisticRegression()) +], memory=cachedir) +``` -**Error**: Different results in cross-validation vs train-test split -- **Cause**: Data leakage (fitting preprocessing on all data) -- **Solution**: Always use Pipeline for preprocessing - -**Error**: Pipeline too slow during grid search -- **Solution**: Use caching with `memory` parameter +### Test Pipeline Compatibility +Ensure all steps are compatible: +- All intermediate steps must have fit() and transform() +- Final step needs fit() and predict() (or transform()) +- Use set_output(transform='pandas') for DataFrame output +```python +pipeline.set_output(transform='pandas') +X_transformed = pipeline.transform(X) # Returns DataFrame +``` diff --git a/scientific-packages/scikit-learn/references/preprocessing.md b/scientific-packages/scikit-learn/references/preprocessing.md index f718e67..f84aa04 100644 --- a/scientific-packages/scikit-learn/references/preprocessing.md +++ b/scientific-packages/scikit-learn/references/preprocessing.md @@ -1,345 +1,563 @@ -# Data Preprocessing in scikit-learn +# Data Preprocessing and Feature Engineering Reference ## Overview -Preprocessing transforms raw data into a format suitable for machine learning algorithms. Many algorithms require standardized or normalized data to perform well. -## Standardization and Scaling +Data preprocessing transforms raw data into a format suitable for machine learning models. This includes scaling, encoding, handling missing values, and feature engineering. + +## Feature Scaling and Normalization ### StandardScaler -Removes mean and scales to unit variance (z-score normalization). - -**Formula**: `z = (x - μ) / σ` - -**Use cases**: -- Most ML algorithms (especially SVM, neural networks, PCA) -- When features have different units or scales -- When assuming Gaussian-like distribution - -**Important**: Fit only on training data, then transform both train and test sets. +**StandardScaler (`sklearn.preprocessing.StandardScaler`)** +- Standardizes features to zero mean and unit variance +- Formula: z = (x - mean) / std +- Use when: Features have different scales, algorithm assumes normally distributed data +- Required for: SVM, KNN, Neural Networks, PCA, Linear Regression with regularization +- Example: ```python from sklearn.preprocessing import StandardScaler + scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) -X_test_scaled = scaler.transform(X_test) # Use same parameters +X_test_scaled = scaler.transform(X_test) # Use same parameters as training + +# Access learned parameters +print(f"Mean: {scaler.mean_}") +print(f"Std: {scaler.scale_}") ``` ### MinMaxScaler -Scales features to a specified range, typically [0, 1]. - -**Formula**: `X_scaled = (X - X_min) / (X_max - X_min)` - -**Use cases**: -- When bounded range is needed -- Neural networks (often prefer [0, 1] range) -- When distribution is not Gaussian -- Image pixel values - -**Parameters**: -- `feature_range`: Tuple (min, max), default (0, 1) - -**Warning**: Sensitive to outliers since it uses min/max. - -### MaxAbsScaler -Scales to [-1, 1] by dividing by maximum absolute value. - -**Use cases**: -- Sparse data (preserves sparsity) -- Data already centered at zero -- When sign of values is meaningful - -**Advantage**: Doesn't shift/center the data, preserves zero entries. 
- -### RobustScaler -Uses median and interquartile range (IQR) instead of mean and standard deviation. - -**Formula**: `X_scaled = (X - median) / IQR` - -**Use cases**: -- When outliers are present -- When StandardScaler produces skewed results -- Robust statistics preferred - -**Parameters**: -- `quantile_range`: Tuple (q_min, q_max), default (25.0, 75.0) - -## Normalization - -### normalize() function and Normalizer -Scales individual samples (rows) to unit norm, not features (columns). - -**Use cases**: -- Text classification (TF-IDF vectors) -- When similarity metrics (dot product, cosine) are used -- When each sample should have equal weight - -**Norms**: -- `l1`: Manhattan norm (sum of absolutes = 1) -- `l2`: Euclidean norm (sum of squares = 1) - **most common** -- `max`: Maximum absolute value = 1 - -**Key difference from scalers**: Operates on rows (samples), not columns (features). +**MinMaxScaler (`sklearn.preprocessing.MinMaxScaler`)** +- Scales features to a given range (default [0, 1]) +- Formula: X_scaled = (X - X.min) / (X.max - X.min) +- Use when: Need bounded values, data not normally distributed +- Sensitive to outliers +- Example: ```python -from sklearn.preprocessing import Normalizer -normalizer = Normalizer(norm='l2') -X_normalized = normalizer.transform(X) +from sklearn.preprocessing import MinMaxScaler + +scaler = MinMaxScaler(feature_range=(0, 1)) +X_scaled = scaler.fit_transform(X_train) + +# Custom range +scaler = MinMaxScaler(feature_range=(-1, 1)) +X_scaled = scaler.fit_transform(X_train) ``` -## Encoding Categorical Features +### RobustScaler + +**RobustScaler (`sklearn.preprocessing.RobustScaler`)** +- Scales using median and interquartile range (IQR) +- Formula: X_scaled = (X - median) / IQR +- Use when: Data contains outliers +- Robust to outliers +- Example: +```python +from sklearn.preprocessing import RobustScaler + +scaler = RobustScaler() +X_scaled = scaler.fit_transform(X_train) +``` + +### Normalizer + +**Normalizer (`sklearn.preprocessing.Normalizer`)** +- Normalizes samples individually to unit norm +- Common norms: 'l1', 'l2', 'max' +- Use when: Need to normalize each sample independently (e.g., text features) +- Example: +```python +from sklearn.preprocessing import Normalizer + +normalizer = Normalizer(norm='l2') # Euclidean norm +X_normalized = normalizer.fit_transform(X) +``` + +### MaxAbsScaler + +**MaxAbsScaler (`sklearn.preprocessing.MaxAbsScaler`)** +- Scales by maximum absolute value +- Range: [-1, 1] +- Doesn't shift/center data (preserves sparsity) +- Use when: Data is already centered or sparse +- Example: +```python +from sklearn.preprocessing import MaxAbsScaler + +scaler = MaxAbsScaler() +X_scaled = scaler.fit_transform(X_sparse) +``` + +## Encoding Categorical Variables + +### OneHotEncoder + +**OneHotEncoder (`sklearn.preprocessing.OneHotEncoder`)** +- Creates binary columns for each category +- Use when: Nominal categories (no order), tree-based models or linear models +- Example: +```python +from sklearn.preprocessing import OneHotEncoder + +encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore') +X_encoded = encoder.fit_transform(X_categorical) + +# Get feature names +feature_names = encoder.get_feature_names_out(['color', 'size']) + +# Handle unknown categories during transform +X_test_encoded = encoder.transform(X_test_categorical) +``` ### OrdinalEncoder -Converts categories to integers (0 to n_categories - 1). 
- -**Use cases**: -- Ordinal relationships exist (small < medium < large) -- Preprocessing before other transformations -- Tree-based algorithms (which can handle integers) - -**Parameters**: -- `handle_unknown`: 'error' or 'use_encoded_value' -- `unknown_value`: Value for unknown categories -- `encoded_missing_value`: Value for missing data +**OrdinalEncoder (`sklearn.preprocessing.OrdinalEncoder`)** +- Encodes categories as integers +- Use when: Ordinal categories (ordered), or tree-based models +- Example: ```python from sklearn.preprocessing import OrdinalEncoder + +# Natural ordering encoder = OrdinalEncoder() X_encoded = encoder.fit_transform(X_categorical) + +# Custom ordering +encoder = OrdinalEncoder(categories=[['small', 'medium', 'large']]) +X_encoded = encoder.fit_transform(X_categorical) ``` -### OneHotEncoder -Creates binary columns for each category. - -**Use cases**: -- Nominal categories (no order) -- Linear models, neural networks -- When category relationships shouldn't be assumed - -**Parameters**: -- `drop`: 'first', 'if_binary', array-like (prevents multicollinearity) -- `sparse_output`: True (default, memory efficient) or False -- `handle_unknown`: 'error', 'ignore', 'infrequent_if_exist' -- `min_frequency`: Group infrequent categories -- `max_categories`: Limit number of categories - -**High cardinality handling**: -```python -encoder = OneHotEncoder(min_frequency=100, handle_unknown='infrequent_if_exist') -# Groups categories appearing < 100 times into 'infrequent' category -``` - -**Memory tip**: Use `sparse_output=True` (default) for high-cardinality features. - -### TargetEncoder -Uses target statistics to encode categories. - -**Use cases**: -- High-cardinality categorical features (zip codes, user IDs) -- When linear relationships with target are expected -- Often improves performance over one-hot encoding - -**How it works**: -- Replaces category with mean of target for that category -- Uses cross-fitting during fit_transform() to prevent target leakage -- Applies smoothing to handle rare categories - -**Parameters**: -- `smooth`: Smoothing parameter for rare categories -- `cv`: Cross-validation strategy - -**Warning**: Only for supervised learning. Requires target variable. - -```python -from sklearn.preprocessing import TargetEncoder -encoder = TargetEncoder() -X_encoded = encoder.fit_transform(X_categorical, y) -``` - ### LabelEncoder -Encodes target labels into integers 0 to n_classes - 1. -**Use cases**: Encoding target variable for classification (not features!) +**LabelEncoder (`sklearn.preprocessing.LabelEncoder`)** +- Encodes target labels (y) as integers +- Use for: Target variable encoding +- Example: +```python +from sklearn.preprocessing import LabelEncoder -**Important**: Use `LabelEncoder` for targets, not features. For features, use OrdinalEncoder or OneHotEncoder. +le = LabelEncoder() +y_encoded = le.fit_transform(y) -### Binarizer -Converts numeric values to binary (0 or 1) based on threshold. 
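A distinction worth keeping in mind alongside the encoders above: `LabelEncoder` expects a 1-D target array `y`, while categorical feature columns should go through `OrdinalEncoder` or `OneHotEncoder`, which expect 2-D input. A minimal self-contained sketch with toy data:

```python
# LabelEncoder is for the target; OrdinalEncoder/OneHotEncoder are for feature columns.
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

y = np.array(['spam', 'ham', 'spam'])            # 1-D target
X_cat = np.array([['red'], ['green'], ['red']])  # 2-D feature matrix

y_encoded = LabelEncoder().fit_transform(y)        # shape (3,)
X_encoded = OrdinalEncoder().fit_transform(X_cat)  # shape (3, 1)
```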
+# Decode back +y_decoded = le.inverse_transform(y_encoded) +print(f"Classes: {le.classes_}") +``` -**Use cases**: Creating binary features from continuous values +### Target Encoding (using category_encoders) + +```python +# Install: uv pip install category-encoders +from category_encoders import TargetEncoder + +encoder = TargetEncoder() +X_train_encoded = encoder.fit_transform(X_train_categorical, y_train) +X_test_encoded = encoder.transform(X_test_categorical) +``` ## Non-linear Transformations -### QuantileTransformer -Maps features to uniform or normal distribution using rank transformation. - -**Use cases**: -- Unusual distributions (bimodal, heavy tails) -- Reducing outlier impact -- When normal distribution is desired - -**Parameters**: -- `output_distribution`: 'uniform' (default) or 'normal' -- `n_quantiles`: Number of quantiles (default: min(1000, n_samples)) - -**Effect**: Strong transformation that reduces outlier influence and makes data more Gaussian-like. - -### PowerTransformer -Applies parametric monotonic transformation to make data more Gaussian. - -**Methods**: -- `yeo-johnson`: Works with positive and negative values (default) -- `box-cox`: Only positive values - -**Use cases**: -- Skewed distributions -- When Gaussian assumption is important -- Variance stabilization - -**Advantage**: Less radical than QuantileTransformer, preserves more of original relationships. - -## Discretization - -### KBinsDiscretizer -Bins continuous features into discrete intervals. - -**Strategies**: -- `uniform`: Equal-width bins -- `quantile`: Equal-frequency bins -- `kmeans`: K-means clustering to determine bins - -**Encoding**: -- `ordinal`: Integer encoding (0 to n_bins - 1) -- `onehot`: One-hot encoding -- `onehot-dense`: Dense one-hot encoding - -**Use cases**: -- Making linear models handle non-linear relationships -- Reducing noise in features -- Making features more interpretable +### Power Transforms +**PowerTransformer** +- Makes data more Gaussian-like +- Methods: 'yeo-johnson' (works with negative values), 'box-cox' (positive only) +- Use when: Data is skewed, algorithm assumes normality +- Example: ```python -from sklearn.preprocessing import KBinsDiscretizer -disc = KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile') -X_binned = disc.fit_transform(X) +from sklearn.preprocessing import PowerTransformer + +# Yeo-Johnson (handles negative values) +pt = PowerTransformer(method='yeo-johnson', standardize=True) +X_transformed = pt.fit_transform(X) + +# Box-Cox (positive values only) +pt = PowerTransformer(method='box-cox', standardize=True) +X_transformed = pt.fit_transform(X) ``` -## Feature Generation - -### PolynomialFeatures -Generates polynomial and interaction features. - -**Parameters**: -- `degree`: Polynomial degree -- `interaction_only`: Only multiplicative interactions (no x²) -- `include_bias`: Include constant feature - -**Use cases**: -- Adding non-linearity to linear models -- Feature engineering -- Polynomial regression - -**Warning**: Number of features grows rapidly: (n+d)!/d!n! for degree d. 
+### Quantile Transformation +**QuantileTransformer** +- Transforms features to follow uniform or normal distribution +- Robust to outliers +- Use when: Want to reduce outlier impact +- Example: ```python -from sklearn.preprocessing import PolynomialFeatures -poly = PolynomialFeatures(degree=2, include_bias=False) -X_poly = poly.fit_transform(X) -# [x1, x2] → [x1, x2, x1², x1·x2, x2²] +from sklearn.preprocessing import QuantileTransformer + +# Transform to uniform distribution +qt = QuantileTransformer(output_distribution='uniform', random_state=42) +X_transformed = qt.fit_transform(X) + +# Transform to normal distribution +qt = QuantileTransformer(output_distribution='normal', random_state=42) +X_transformed = qt.fit_transform(X) ``` -### SplineTransformer -Generates B-spline basis functions. +### Log Transform -**Use cases**: -- Smooth non-linear transformations -- Alternative to PolynomialFeatures (less oscillation at boundaries) -- Generalized additive models (GAMs) +```python +import numpy as np -**Parameters**: -- `n_knots`: Number of knots -- `degree`: Spline degree -- `knots`: Knot positions ('uniform', 'quantile', or array) +# Log1p (log(1 + x)) - handles zeros +X_log = np.log1p(X) -## Missing Value Handling +# Or use FunctionTransformer +from sklearn.preprocessing import FunctionTransformer + +log_transformer = FunctionTransformer(np.log1p, inverse_func=np.expm1) +X_log = log_transformer.fit_transform(X) +``` + +## Missing Value Imputation ### SimpleImputer -Imputes missing values with various strategies. - -**Strategies**: -- `mean`: Mean of column (numeric only) -- `median`: Median of column (numeric only) -- `most_frequent`: Mode (numeric or categorical) -- `constant`: Fill with constant value - -**Parameters**: -- `strategy`: Imputation strategy -- `fill_value`: Value when strategy='constant' -- `missing_values`: What represents missing (np.nan, None, specific value) +**SimpleImputer (`sklearn.impute.SimpleImputer`)** +- Basic imputation strategies +- Strategies: 'mean', 'median', 'most_frequent', 'constant' +- Example: ```python from sklearn.impute import SimpleImputer -imputer = SimpleImputer(strategy='median') + +# For numerical features +imputer = SimpleImputer(strategy='mean') +X_imputed = imputer.fit_transform(X) + +# For categorical features +imputer = SimpleImputer(strategy='most_frequent') +X_imputed = imputer.fit_transform(X_categorical) + +# Fill with constant +imputer = SimpleImputer(strategy='constant', fill_value=0) X_imputed = imputer.fit_transform(X) ``` -### KNNImputer -Imputes using k-nearest neighbors. +### Iterative Imputer -**Use cases**: When relationships between features should inform imputation +**IterativeImputer** +- Models each feature with missing values as function of other features +- More sophisticated than SimpleImputer +- Example: +```python +from sklearn.experimental import enable_iterative_imputer +from sklearn.impute import IterativeImputer -**Parameters**: -- `n_neighbors`: Number of neighbors -- `weights`: 'uniform' or 'distance' +imputer = IterativeImputer(max_iter=10, random_state=42) +X_imputed = imputer.fit_transform(X) +``` -### IterativeImputer -Models each feature with missing values as function of other features. 
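+By default, IterativeImputer models each feature with `BayesianRidge`; a different regressor can be supplied via the `estimator` parameter when feature relationships are non-linear (illustrative sketch — the choice of estimator is an assumption to tune for your data):
+
+```python
+from sklearn.experimental import enable_iterative_imputer
+from sklearn.impute import IterativeImputer
+from sklearn.ensemble import RandomForestRegressor
+
+# Slower than the default, but can capture non-linear relationships
+imputer = IterativeImputer(
+    estimator=RandomForestRegressor(n_estimators=50, random_state=42),
+    max_iter=10,
+    random_state=42
+)
+X_imputed = imputer.fit_transform(X)
+```
+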
+### KNN Imputer -**Use cases**: -- Complex relationships between features -- When multiple features have missing values -- Higher quality imputation (but slower) +**KNNImputer** +- Imputes using k-nearest neighbors +- Use when: Features are correlated +- Example: +```python +from sklearn.impute import KNNImputer -**Parameters**: -- `estimator`: Estimator for regression (default: BayesianRidge) -- `max_iter`: Maximum iterations +imputer = KNNImputer(n_neighbors=5) +X_imputed = imputer.fit_transform(X) +``` -## Function Transformers +## Feature Engineering -### FunctionTransformer -Applies custom function to data. +### Polynomial Features -**Use cases**: -- Custom transformations in pipelines -- Log transformation, square root, etc. -- Domain-specific preprocessing +**PolynomialFeatures** +- Creates polynomial and interaction features +- Use when: Need non-linear features for linear models +- Example: +```python +from sklearn.preprocessing import PolynomialFeatures + +# Degree 2: includes x1, x2, x1^2, x2^2, x1*x2 +poly = PolynomialFeatures(degree=2, include_bias=False) +X_poly = poly.fit_transform(X) + +# Get feature names +feature_names = poly.get_feature_names_out(['x1', 'x2']) + +# Only interactions (no powers) +poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False) +X_interactions = poly.fit_transform(X) +``` + +### Binning/Discretization + +**KBinsDiscretizer** +- Bins continuous features into discrete intervals +- Strategies: 'uniform', 'quantile', 'kmeans' +- Encoding: 'onehot', 'ordinal', 'onehot-dense' +- Example: +```python +from sklearn.preprocessing import KBinsDiscretizer + +# Equal-width bins +binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform') +X_binned = binner.fit_transform(X) + +# Equal-frequency bins (quantile-based) +binner = KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile') +X_binned = binner.fit_transform(X) +``` + +### Binarization + +**Binarizer** +- Converts features to binary (0 or 1) based on threshold +- Example: +```python +from sklearn.preprocessing import Binarizer + +binarizer = Binarizer(threshold=0.5) +X_binary = binarizer.fit_transform(X) +``` + +### Spline Features + +**SplineTransformer** +- Creates spline basis functions +- Useful for capturing non-linear relationships +- Example: +```python +from sklearn.preprocessing import SplineTransformer + +spline = SplineTransformer(n_knots=5, degree=3) +X_splines = spline.fit_transform(X) +``` + +## Text Feature Extraction + +### CountVectorizer + +**CountVectorizer (`sklearn.feature_extraction.text.CountVectorizer`)** +- Converts text to token count matrix +- Use for: Bag-of-words representation +- Example: +```python +from sklearn.feature_extraction.text import CountVectorizer + +vectorizer = CountVectorizer( + max_features=5000, # Keep top 5000 features + min_df=2, # Ignore terms appearing in < 2 documents + max_df=0.8, # Ignore terms appearing in > 80% documents + ngram_range=(1, 2) # Unigrams and bigrams +) + +X_counts = vectorizer.fit_transform(documents) +feature_names = vectorizer.get_feature_names_out() +``` + +### TfidfVectorizer + +**TfidfVectorizer** +- TF-IDF (Term Frequency-Inverse Document Frequency) transformation +- Better than CountVectorizer for most tasks +- Example: +```python +from sklearn.feature_extraction.text import TfidfVectorizer + +vectorizer = TfidfVectorizer( + max_features=5000, + min_df=2, + max_df=0.8, + ngram_range=(1, 2), + stop_words='english' # Remove English stop words +) + +X_tfidf = 
vectorizer.fit_transform(documents) +``` + +### HashingVectorizer + +**HashingVectorizer** +- Uses hashing trick for memory efficiency +- No fit needed, can't reverse transform +- Use when: Very large vocabulary, streaming data +- Example: +```python +from sklearn.feature_extraction.text import HashingVectorizer + +vectorizer = HashingVectorizer(n_features=2**18) +X_hashed = vectorizer.transform(documents) # No fit needed +``` + +## Feature Selection + +### Filter Methods + +**Variance Threshold** +- Removes low-variance features +- Example: +```python +from sklearn.feature_selection import VarianceThreshold + +selector = VarianceThreshold(threshold=0.01) +X_selected = selector.fit_transform(X) +``` + +**SelectKBest / SelectPercentile** +- Select features based on statistical tests +- Tests: f_classif, chi2, mutual_info_classif +- Example: +```python +from sklearn.feature_selection import SelectKBest, f_classif + +# Select top 10 features +selector = SelectKBest(score_func=f_classif, k=10) +X_selected = selector.fit_transform(X_train, y_train) + +# Get selected feature indices +selected_indices = selector.get_support(indices=True) +``` + +### Wrapper Methods + +**Recursive Feature Elimination (RFE)** +- Recursively removes features +- Uses model feature importances +- Example: +```python +from sklearn.feature_selection import RFE +from sklearn.ensemble import RandomForestClassifier + +model = RandomForestClassifier(n_estimators=100, random_state=42) +rfe = RFE(estimator=model, n_features_to_select=10, step=1) +X_selected = rfe.fit_transform(X_train, y_train) + +# Get selected features +selected_features = rfe.support_ +feature_ranking = rfe.ranking_ +``` + +**RFECV (with Cross-Validation)** +- RFE with cross-validation to find optimal number of features +- Example: +```python +from sklearn.feature_selection import RFECV + +model = RandomForestClassifier(n_estimators=100, random_state=42) +rfecv = RFECV(estimator=model, cv=5, scoring='accuracy') +X_selected = rfecv.fit_transform(X_train, y_train) + +print(f"Optimal number of features: {rfecv.n_features_}") +``` + +### Embedded Methods + +**SelectFromModel** +- Select features based on model coefficients/importances +- Works with: Linear models (L1), Tree-based models +- Example: +```python +from sklearn.feature_selection import SelectFromModel +from sklearn.ensemble import RandomForestClassifier + +model = RandomForestClassifier(n_estimators=100, random_state=42) +selector = SelectFromModel(model, threshold='median') +selector.fit(X_train, y_train) +X_selected = selector.transform(X_train) + +# Get selected features +selected_features = selector.get_support() +``` + +**L1-based Feature Selection** +```python +from sklearn.linear_model import LogisticRegression +from sklearn.feature_selection import SelectFromModel + +model = LogisticRegression(penalty='l1', solver='liblinear', C=0.1) +selector = SelectFromModel(model) +selector.fit(X_train, y_train) +X_selected = selector.transform(X_train) +``` + +## Handling Outliers + +### IQR Method + +```python +import numpy as np + +Q1 = np.percentile(X, 25, axis=0) +Q3 = np.percentile(X, 75, axis=0) +IQR = Q3 - Q1 + +# Define outlier boundaries +lower_bound = Q1 - 1.5 * IQR +upper_bound = Q3 + 1.5 * IQR + +# Remove outliers +mask = np.all((X >= lower_bound) & (X <= upper_bound), axis=1) +X_no_outliers = X[mask] +``` + +### Winsorization + +```python +from scipy.stats import mstats + +# Clip outliers at 5th and 95th percentiles +X_winsorized = mstats.winsorize(X, limits=[0.05, 0.05], axis=0) +``` + 
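+### Robust Scaling
+
+When removing or clipping rows is undesirable, an alternative is to scale with statistics that are insensitive to extremes; `RobustScaler` centers on the median and scales by the IQR (minimal sketch):
+
+```python
+from sklearn.preprocessing import RobustScaler
+
+# Median/IQR-based scaling limits the influence of outliers
+scaler = RobustScaler()
+X_robust = scaler.fit_transform(X)
+```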
+## Custom Transformers + +### Using FunctionTransformer ```python from sklearn.preprocessing import FunctionTransformer import numpy as np -log_transformer = FunctionTransformer(np.log1p, validate=True) -X_log = log_transformer.transform(X) +def log_transform(X): + return np.log1p(X) + +transformer = FunctionTransformer(log_transform, inverse_func=np.expm1) +X_transformed = transformer.fit_transform(X) +``` + +### Creating Custom Transformer + +```python +from sklearn.base import BaseEstimator, TransformerMixin + +class CustomTransformer(BaseEstimator, TransformerMixin): + def __init__(self, parameter=1): + self.parameter = parameter + + def fit(self, X, y=None): + # Learn parameters from X if needed + return self + + def transform(self, X): + # Transform X + return X * self.parameter + +transformer = CustomTransformer(parameter=2) +X_transformed = transformer.fit_transform(X) ``` ## Best Practices -### Feature Scaling Guidelines +### Fit on Training Data Only +Always fit transformers on training data only: +```python +# Correct +scaler = StandardScaler() +X_train_scaled = scaler.fit_transform(X_train) +X_test_scaled = scaler.transform(X_test) -**Always scale**: -- SVM, neural networks -- K-nearest neighbors -- Linear/Logistic regression with regularization -- PCA, LDA -- Gradient descent-based algorithms - -**Don't need to scale**: -- Tree-based algorithms (Decision Trees, Random Forests, Gradient Boosting) -- Naive Bayes - -### Pipeline Integration - -Always use preprocessing within pipelines to prevent data leakage: +# Wrong - causes data leakage +scaler = StandardScaler() +X_all_scaled = scaler.fit_transform(np.vstack([X_train, X_test])) +``` +### Use Pipelines +Combine preprocessing with models: ```python from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler @@ -350,64 +568,39 @@ pipeline = Pipeline([ ('classifier', LogisticRegression()) ]) -pipeline.fit(X_train, y_train) # Scaler fit only on train data -y_pred = pipeline.predict(X_test) # Scaler transform only on test data +pipeline.fit(X_train, y_train) ``` -### Common Transformations by Data Type - -**Numeric - Continuous**: -- StandardScaler (most common) -- MinMaxScaler (neural networks) -- RobustScaler (outliers present) -- PowerTransformer (skewed data) - -**Numeric - Count Data**: -- sqrt or log transformation -- QuantileTransformer -- StandardScaler after transformation - -**Categorical - Low Cardinality (<10 categories)**: -- OneHotEncoder - -**Categorical - High Cardinality (>10 categories)**: -- TargetEncoder (supervised) -- Frequency encoding -- OneHotEncoder with min_frequency parameter - -**Categorical - Ordinal**: -- OrdinalEncoder - -**Text**: -- CountVectorizer or TfidfVectorizer -- Normalizer after vectorization - -### Data Leakage Prevention - -1. **Fit only on training data**: Never include test data when fitting preprocessors -2. **Use pipelines**: Ensures proper fit/transform separation -3. **Cross-validation**: Use Pipeline with cross_val_score() for proper evaluation -4. 
**Target encoding**: Use cv parameter in TargetEncoder for cross-fitting - +### Handle Categorical and Numerical Separately +Use ColumnTransformer: ```python -# WRONG - data leakage -scaler = StandardScaler().fit(X_full) -X_train_scaled = scaler.transform(X_train) -X_test_scaled = scaler.transform(X_test) +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import StandardScaler, OneHotEncoder -# CORRECT - no leakage -scaler = StandardScaler().fit(X_train) -X_train_scaled = scaler.transform(X_train) -X_test_scaled = scaler.transform(X_test) +numeric_features = ['age', 'income'] +categorical_features = ['gender', 'occupation'] + +preprocessor = ColumnTransformer( + transformers=[ + ('num', StandardScaler(), numeric_features), + ('cat', OneHotEncoder(), categorical_features) + ] +) + +X_transformed = preprocessor.fit_transform(X) ``` -## Preprocessing Checklist +### Algorithm-Specific Requirements -Before modeling: -1. Handle missing values (imputation or removal) -2. Encode categorical variables appropriately -3. Scale/normalize numeric features (if needed for algorithm) -4. Handle outliers (RobustScaler, clipping, removal) -5. Create additional features if beneficial (PolynomialFeatures, domain knowledge) -6. Check for data leakage in preprocessing steps -7. Wrap everything in a Pipeline +**Require Scaling:** +- SVM, KNN, Neural Networks +- PCA, Linear/Logistic Regression with regularization +- K-Means clustering + +**Don't Require Scaling:** +- Tree-based models (Decision Trees, Random Forest, Gradient Boosting) +- Naive Bayes + +**Encoding Requirements:** +- Linear models, SVM, KNN: One-hot encoding for nominal features +- Tree-based models: Can handle ordinal encoding directly diff --git a/scientific-packages/scikit-learn/references/quick_reference.md b/scientific-packages/scikit-learn/references/quick_reference.md index 97adc71..3bcdd20 100644 --- a/scientific-packages/scikit-learn/references/quick_reference.md +++ b/scientific-packages/scikit-learn/references/quick_reference.md @@ -1,546 +1,287 @@ # Scikit-learn Quick Reference -## Essential Imports +## Common Import Patterns ```python -# Core -import numpy as np -import pandas as pd +# Core scikit-learn +import sklearn + +# Data splitting and cross-validation from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV -from sklearn.pipeline import Pipeline, make_pipeline -from sklearn.compose import ColumnTransformer # Preprocessing -from sklearn.preprocessing import ( - StandardScaler, MinMaxScaler, RobustScaler, - OneHotEncoder, OrdinalEncoder, LabelEncoder, - PolynomialFeatures -) +from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder from sklearn.impute import SimpleImputer -# Models - Classification -from sklearn.linear_model import LogisticRegression +# Feature selection +from sklearn.feature_selection import SelectKBest, RFE + +# Supervised learning +from sklearn.linear_model import LogisticRegression, Ridge, Lasso +from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor +from sklearn.svm import SVC, SVR from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import ( - RandomForestClassifier, - GradientBoostingClassifier, - HistGradientBoostingClassifier -) -from sklearn.svm import SVC -from sklearn.neighbors import KNeighborsClassifier -# Models - Regression -from sklearn.linear_model import LinearRegression, Ridge, Lasso -from sklearn.ensemble import ( - RandomForestRegressor, - GradientBoostingRegressor, - 
HistGradientBoostingRegressor -) - -# Clustering +# Unsupervised learning from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering -from sklearn.mixture import GaussianMixture - -# Dimensionality Reduction -from sklearn.decomposition import PCA, NMF, TruncatedSVD -from sklearn.manifold import TSNE +from sklearn.decomposition import PCA, NMF # Metrics from sklearn.metrics import ( accuracy_score, precision_score, recall_score, f1_score, - confusion_matrix, classification_report, - mean_squared_error, r2_score, mean_absolute_error + mean_squared_error, r2_score, confusion_matrix, classification_report ) + +# Pipeline +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.compose import ColumnTransformer, make_column_transformer + +# Utilities +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt ``` -## Basic Workflow Template +## Installation -### Classification +```bash +# Using uv (recommended) +uv pip install scikit-learn + +# Optional dependencies +uv pip install scikit-learn[plots] # For plotting utilities +uv pip install pandas numpy matplotlib seaborn # Common companions +``` + +## Quick Workflow Templates + +### Classification Pipeline ```python from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import classification_report +from sklearn.metrics import classification_report, confusion_matrix # Split data X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42, stratify=y + X, y, test_size=0.2, stratify=y, random_state=42 ) -# Scale features +# Preprocess scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) -# Train model +# Train model = RandomForestClassifier(n_estimators=100, random_state=42) model.fit(X_train_scaled, y_train) -# Predict and evaluate +# Evaluate y_pred = model.predict(X_test_scaled) print(classification_report(y_test, y_pred)) +print(confusion_matrix(y_test, y_pred)) ``` -### Regression +### Regression Pipeline ```python from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler -from sklearn.ensemble import RandomForestRegressor +from sklearn.ensemble import GradientBoostingRegressor from sklearn.metrics import mean_squared_error, r2_score -# Split data +# Split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 ) -# Scale features +# Preprocess and train scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) -# Train model -model = RandomForestRegressor(n_estimators=100, random_state=42) +model = GradientBoostingRegressor(n_estimators=100, random_state=42) model.fit(X_train_scaled, y_train) -# Predict and evaluate +# Evaluate y_pred = model.predict(X_test_scaled) print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False):.3f}") -print(f"R²: {r2_score(y_test, y_pred):.3f}") +print(f"R² Score: {r2_score(y_test, y_pred):.3f}") ``` -### With Pipeline (Recommended) +### Cross-Validation ```python -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split, cross_val_score -# Create pipeline -pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('classifier', 
RandomForestClassifier(n_estimators=100, random_state=42)) -]) - -# Split and train -X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42 -) -pipeline.fit(X_train, y_train) - -# Evaluate -score = pipeline.score(X_test, y_test) -cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5) -print(f"Test accuracy: {score:.3f}") -print(f"CV accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})") +model = RandomForestClassifier(n_estimators=100, random_state=42) +scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') +print(f"CV Accuracy: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})") ``` -## Common Preprocessing Patterns - -### Numeric Data +### Complete Pipeline with Mixed Data Types ```python -from sklearn.preprocessing import StandardScaler -from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.impute import SimpleImputer +from sklearn.ensemble import RandomForestClassifier +# Define feature types +numeric_features = ['age', 'income'] +categorical_features = ['gender', 'occupation'] + +# Create preprocessing pipelines numeric_transformer = Pipeline([ ('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler()) ]) -``` - -### Categorical Data - -```python -from sklearn.preprocessing import OneHotEncoder -from sklearn.impute import SimpleImputer -from sklearn.pipeline import Pipeline categorical_transformer = Pipeline([ - ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), + ('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore')) ]) -``` -### Mixed Data with ColumnTransformer - -```python -from sklearn.compose import ColumnTransformer - -numeric_features = ['age', 'income', 'credit_score'] -categorical_features = ['country', 'occupation'] - -preprocessor = ColumnTransformer( - transformers=[ - ('num', numeric_transformer, numeric_features), - ('cat', categorical_transformer, categorical_features) - ]) - -# Complete pipeline -from sklearn.ensemble import RandomForestClassifier -pipeline = Pipeline([ - ('preprocessor', preprocessor), - ('classifier', RandomForestClassifier()) +# Combine transformers +preprocessor = ColumnTransformer([ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features) ]) + +# Full pipeline +model = Pipeline([ + ('preprocessor', preprocessor), + ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)) +]) + +# Fit and predict +model.fit(X_train, y_train) +y_pred = model.predict(X_test) ``` -## Model Selection Cheat Sheet - -### Quick Decision Tree - -``` -Is it supervised? -├─ Yes -│ ├─ Predicting categories? → Classification -│ │ ├─ Start with: LogisticRegression (baseline) -│ │ ├─ Then try: RandomForestClassifier -│ │ └─ Best performance: HistGradientBoostingClassifier -│ └─ Predicting numbers? → Regression -│ ├─ Start with: LinearRegression/Ridge (baseline) -│ ├─ Then try: RandomForestRegressor -│ └─ Best performance: HistGradientBoostingRegressor -└─ No - ├─ Grouping similar items? → Clustering - │ ├─ Know # clusters: KMeans - │ └─ Unknown # clusters: DBSCAN or HDBSCAN - ├─ Reducing dimensions? - │ ├─ For preprocessing: PCA - │ └─ For visualization: t-SNE or UMAP - └─ Finding outliers? 
→ IsolationForest or LocalOutlierFactor -``` - -### Algorithm Selection by Data Size - -- **Small (<1K samples)**: Any algorithm -- **Medium (1K-100K)**: Random Forests, Gradient Boosting, Neural Networks -- **Large (>100K)**: SGDClassifier/Regressor, HistGradientBoosting, LinearSVC - -### When to Scale Features - -**Always scale**: -- SVM, Neural Networks -- K-Nearest Neighbors -- Linear/Logistic Regression (with regularization) -- PCA, LDA -- Any gradient descent algorithm - -**Don't need to scale**: -- Tree-based (Decision Trees, Random Forests, Gradient Boosting) -- Naive Bayes - -## Hyperparameter Tuning - -### GridSearchCV +### Hyperparameter Tuning ```python from sklearn.model_selection import GridSearchCV +from sklearn.ensemble import RandomForestClassifier param_grid = { - 'n_estimators': [100, 200, 500], + 'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5, 10] } +model = RandomForestClassifier(random_state=42) grid_search = GridSearchCV( - RandomForestClassifier(random_state=42), - param_grid, - cv=5, - scoring='f1_weighted', - n_jobs=-1 + model, param_grid, cv=5, scoring='accuracy', n_jobs=-1 ) grid_search.fit(X_train, y_train) -best_model = grid_search.best_estimator_ print(f"Best params: {grid_search.best_params_}") +print(f"Best score: {grid_search.best_score_:.3f}") + +# Use best model +best_model = grid_search.best_estimator_ ``` -### RandomizedSearchCV (Faster) +## Common Patterns + +### Loading Data ```python -from sklearn.model_selection import RandomizedSearchCV -from scipy.stats import randint, uniform +# From scikit-learn datasets +from sklearn.datasets import load_iris, load_digits, make_classification -param_distributions = { - 'n_estimators': randint(100, 1000), - 'max_depth': randint(5, 50), - 'min_samples_split': randint(2, 20) -} +# Built-in datasets +iris = load_iris() +X, y = iris.data, iris.target -random_search = RandomizedSearchCV( - RandomForestClassifier(random_state=42), - param_distributions, - n_iter=50, # Number of combinations to try - cv=5, - n_jobs=-1, - random_state=42 +# Synthetic data +X, y = make_classification( + n_samples=1000, n_features=20, n_classes=2, random_state=42 ) -random_search.fit(X_train, y_train) +# From pandas +import pandas as pd +df = pd.read_csv('data.csv') +X = df.drop('target', axis=1) +y = df['target'] ``` -### Pipeline with GridSearchCV +### Handling Imbalanced Data ```python -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC -from sklearn.model_selection import GridSearchCV - -pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('svm', SVC()) -]) - -param_grid = { - 'svm__C': [0.1, 1, 10], - 'svm__kernel': ['rbf', 'linear'], - 'svm__gamma': ['scale', 'auto'] -} - -grid = GridSearchCV(pipeline, param_grid, cv=5) -grid.fit(X_train, y_train) -``` - -## Cross-Validation - -### Basic Cross-Validation - -```python -from sklearn.model_selection import cross_val_score - -scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') -print(f"Accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})") -``` - -### Multiple Metrics - -```python -from sklearn.model_selection import cross_validate - -scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'] -results = cross_validate(model, X, y, cv=5, scoring=scoring) - -for metric in scoring: - scores = results[f'test_{metric}'] - print(f"{metric}: {scores.mean():.3f} (+/- {scores.std():.3f})") -``` - -### Custom CV Strategies - -```python -from 
sklearn.model_selection import StratifiedKFold, TimeSeriesSplit - -# For imbalanced classification -cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) - -# For time series -cv = TimeSeriesSplit(n_splits=5) - -scores = cross_val_score(model, X, y, cv=cv) -``` - -## Common Metrics - -### Classification - -```python -from sklearn.metrics import ( - accuracy_score, balanced_accuracy_score, - precision_score, recall_score, f1_score, - confusion_matrix, classification_report, - roc_auc_score -) - -# Basic metrics -accuracy = accuracy_score(y_true, y_pred) -f1 = f1_score(y_true, y_pred, average='weighted') - -# Comprehensive report -print(classification_report(y_true, y_pred)) - -# ROC AUC (requires probabilities) -y_proba = model.predict_proba(X_test)[:, 1] -auc = roc_auc_score(y_true, y_proba) -``` - -### Regression - -```python -from sklearn.metrics import ( - mean_squared_error, - mean_absolute_error, - r2_score -) - -mse = mean_squared_error(y_true, y_pred) -rmse = mean_squared_error(y_true, y_pred, squared=False) -mae = mean_absolute_error(y_true, y_pred) -r2 = r2_score(y_true, y_pred) - -print(f"RMSE: {rmse:.3f}") -print(f"MAE: {mae:.3f}") -print(f"R²: {r2:.3f}") -``` - -## Feature Engineering - -### Polynomial Features - -```python -from sklearn.preprocessing import PolynomialFeatures - -poly = PolynomialFeatures(degree=2, include_bias=False) -X_poly = poly.fit_transform(X) -# [x1, x2] → [x1, x2, x1², x1·x2, x2²] -``` - -### Feature Selection - -```python -from sklearn.feature_selection import ( - SelectKBest, f_classif, - RFE, - SelectFromModel -) - -# Univariate selection -selector = SelectKBest(f_classif, k=10) -X_selected = selector.fit_transform(X, y) - -# Recursive feature elimination from sklearn.ensemble import RandomForestClassifier -rfe = RFE(RandomForestClassifier(), n_features_to_select=10) -X_selected = rfe.fit_transform(X, y) -# Model-based selection -selector = SelectFromModel( - RandomForestClassifier(n_estimators=100), - threshold='median' -) -X_selected = selector.fit_transform(X, y) +# Use class_weight parameter +model = RandomForestClassifier(class_weight='balanced', random_state=42) +model.fit(X_train, y_train) + +# Or use appropriate metrics +from sklearn.metrics import balanced_accuracy_score, f1_score +print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred):.3f}") +print(f"F1 Score: {f1_score(y_test, y_pred):.3f}") ``` ### Feature Importance ```python -# Tree-based models -model = RandomForestClassifier() +from sklearn.ensemble import RandomForestClassifier +import pandas as pd + +model = RandomForestClassifier(n_estimators=100, random_state=42) model.fit(X_train, y_train) -importances = model.feature_importances_ -# Visualize -import matplotlib.pyplot as plt -indices = np.argsort(importances)[::-1] -plt.bar(range(X.shape[1]), importances[indices]) -plt.xticks(range(X.shape[1]), feature_names[indices], rotation=90) -plt.show() +# Get feature importances +importances = pd.DataFrame({ + 'feature': feature_names, + 'importance': model.feature_importances_ +}).sort_values('importance', ascending=False) -# Permutation importance (works for any model) -from sklearn.inspection import permutation_importance -result = permutation_importance(model, X_test, y_test, n_repeats=10) -importances = result.importances_mean +print(importances.head(10)) ``` -## Clustering - -### K-Means +### Clustering ```python from sklearn.cluster import KMeans from sklearn.preprocessing import StandardScaler -# Always scale for k-means +# Scale data first scaler = 
StandardScaler() X_scaled = scaler.fit_transform(X) -# Fit k-means +# Fit K-Means kmeans = KMeans(n_clusters=3, random_state=42) labels = kmeans.fit_predict(X_scaled) # Evaluate from sklearn.metrics import silhouette_score score = silhouette_score(X_scaled, labels) -print(f"Silhouette score: {score:.3f}") +print(f"Silhouette Score: {score:.3f}") ``` -### Elbow Method - -```python -inertias = [] -K_range = range(2, 11) - -for k in K_range: - kmeans = KMeans(n_clusters=k, random_state=42) - kmeans.fit(X_scaled) - inertias.append(kmeans.inertia_) - -plt.plot(K_range, inertias, 'bo-') -plt.xlabel('k') -plt.ylabel('Inertia') -plt.show() -``` - -### DBSCAN - -```python -from sklearn.cluster import DBSCAN - -dbscan = DBSCAN(eps=0.5, min_samples=5) -labels = dbscan.fit_predict(X_scaled) - -# -1 indicates noise/outliers -n_clusters = len(set(labels)) - (1 if -1 in labels else 0) -n_noise = list(labels).count(-1) -print(f"Clusters: {n_clusters}, Noise points: {n_noise}") -``` - -## Dimensionality Reduction - -### PCA +### Dimensionality Reduction ```python from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler +import matplotlib.pyplot as plt -# Always scale before PCA -scaler = StandardScaler() -X_scaled = scaler.fit_transform(X) - -# Specify n_components +# Fit PCA pca = PCA(n_components=2) -X_pca = pca.fit_transform(X_scaled) +X_reduced = pca.fit_transform(X) -# Or specify variance to retain -pca = PCA(n_components=0.95) # Keep 95% variance -X_pca = pca.fit_transform(X_scaled) - -print(f"Explained variance: {pca.explained_variance_ratio_}") -print(f"Components needed: {pca.n_components_}") +# Plot +plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap='viridis') +plt.xlabel('PC1') +plt.ylabel('PC2') +plt.title(f'PCA (explained variance: {pca.explained_variance_ratio_.sum():.2%})') ``` -### t-SNE (Visualization Only) - -```python -from sklearn.manifold import TSNE - -# Reduce to 50 dimensions with PCA first (recommended) -pca = PCA(n_components=50) -X_pca = pca.fit_transform(X_scaled) - -# Apply t-SNE -tsne = TSNE(n_components=2, random_state=42, perplexity=30) -X_tsne = tsne.fit_transform(X_pca) - -# Visualize -plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap='viridis') -plt.colorbar() -plt.show() -``` - -## Saving and Loading Models +### Model Persistence ```python import joblib @@ -548,78 +289,145 @@ import joblib # Save model joblib.dump(model, 'model.pkl') -# Save pipeline -joblib.dump(pipeline, 'pipeline.pkl') - -# Load -model = joblib.load('model.pkl') -pipeline = joblib.load('pipeline.pkl') - -# Use loaded model -y_pred = model.predict(X_new) +# Load model +loaded_model = joblib.load('model.pkl') +predictions = loaded_model.predict(X_new) ``` -## Common Pitfalls and Solutions +## Common Gotchas and Solutions ### Data Leakage -❌ **Wrong**: Fit on all data before split ```python -scaler = StandardScaler().fit(X) -X_train, X_test = train_test_split(scaler.transform(X)) -``` +# WRONG: Fitting scaler on all data +scaler = StandardScaler() +X_scaled = scaler.fit_transform(X) +X_train, X_test = train_test_split(X_scaled) -✅ **Correct**: Use pipeline or fit only on train -```python +# RIGHT: Fit on training data only X_train, X_test = train_test_split(X) -pipeline = Pipeline([('scaler', StandardScaler()), ('model', model)]) -pipeline.fit(X_train, y_train) +scaler = StandardScaler() +X_train_scaled = scaler.fit_transform(X_train) +X_test_scaled = scaler.transform(X_test) + +# BEST: Use Pipeline +from sklearn.pipeline import Pipeline +pipeline = Pipeline([ + 
('scaler', StandardScaler()), + ('model', LogisticRegression()) +]) +pipeline.fit(X_train, y_train) # No leakage! ``` -### Not Scaling -❌ **Wrong**: Using SVM without scaling -```python -svm = SVC() -svm.fit(X_train, y_train) -``` - -✅ **Correct**: Scale for SVM -```python -pipeline = Pipeline([('scaler', StandardScaler()), ('svm', SVC())]) -pipeline.fit(X_train, y_train) -``` - -### Wrong Metric for Imbalanced Data -❌ **Wrong**: Using accuracy for 99:1 imbalance -```python -accuracy = accuracy_score(y_true, y_pred) # Can be misleading -``` - -✅ **Correct**: Use appropriate metrics -```python -f1 = f1_score(y_true, y_pred, average='weighted') -balanced_acc = balanced_accuracy_score(y_true, y_pred) -``` - -### Not Using Stratification -❌ **Wrong**: Random split for imbalanced data -```python -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) -``` - -✅ **Correct**: Stratify for imbalanced classes +### Stratified Splitting for Classification ```python +# Always use stratify for classification X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, stratify=y + X, y, test_size=0.2, stratify=y, random_state=42 ) ``` +### Random State for Reproducibility +```python +# Set random_state for reproducibility +model = RandomForestClassifier(n_estimators=100, random_state=42) +``` + +### Handling Unknown Categories +```python +# Use handle_unknown='ignore' for OneHotEncoder +encoder = OneHotEncoder(handle_unknown='ignore') +``` + +### Feature Names with Pipelines +```python +# Get feature names after transformation +preprocessor.fit(X_train) +feature_names = preprocessor.get_feature_names_out() +``` + +## Cheat Sheet: Algorithm Selection + +### Classification + +| Problem | Algorithm | When to Use | +|---------|-----------|-------------| +| Binary/Multiclass | Logistic Regression | Fast baseline, interpretability | +| Binary/Multiclass | Random Forest | Good default, robust | +| Binary/Multiclass | Gradient Boosting | Best accuracy, willing to tune | +| Binary/Multiclass | SVM | Small data, complex boundaries | +| Binary/Multiclass | Naive Bayes | Text classification, fast | +| High dimensions | Linear SVM or Logistic | Text, many features | + +### Regression + +| Problem | Algorithm | When to Use | +|---------|-----------|-------------| +| Continuous target | Linear Regression | Fast baseline, interpretability | +| Continuous target | Ridge/Lasso | Regularization needed | +| Continuous target | Random Forest | Good default, non-linear | +| Continuous target | Gradient Boosting | Best accuracy | +| Continuous target | SVR | Small data, non-linear | + +### Clustering + +| Problem | Algorithm | When to Use | +|---------|-----------|-------------| +| Known K, spherical | K-Means | Fast, simple | +| Unknown K, arbitrary shapes | DBSCAN | Noise/outliers present | +| Hierarchical structure | Agglomerative | Need dendrogram | +| Soft clustering | Gaussian Mixture | Probability estimates | + +### Dimensionality Reduction + +| Problem | Algorithm | When to Use | +|---------|-----------|-------------| +| Linear reduction | PCA | Variance explanation | +| Visualization | t-SNE | 2D/3D plots | +| Non-negative data | NMF | Images, text | +| Sparse data | TruncatedSVD | Text, recommender systems | + ## Performance Tips -1. **Use n_jobs=-1** for parallel processing (RandomForest, GridSearchCV) -2. **Use HistGradientBoosting** for large datasets (>10K samples) -3. **Use MiniBatchKMeans** for large clustering tasks -4. 
**Use IncrementalPCA** for data that doesn't fit in memory -5. **Use sparse matrices** for high-dimensional sparse data (text) -6. **Cache transformers** in pipelines during grid search -7. **Use RandomizedSearchCV** instead of GridSearchCV for large parameter spaces -8. **Reduce dimensionality** with PCA before applying expensive algorithms +### Speed Up Training +```python +# Use n_jobs=-1 for parallel processing +model = RandomForestClassifier(n_estimators=100, n_jobs=-1) + +# Use warm_start for incremental learning +model = RandomForestClassifier(n_estimators=100, warm_start=True) +model.fit(X, y) +model.n_estimators += 50 +model.fit(X, y) # Adds 50 more trees + +# Use partial_fit for online learning +from sklearn.linear_model import SGDClassifier +model = SGDClassifier() +for X_batch, y_batch in batches: + model.partial_fit(X_batch, y_batch, classes=np.unique(y)) +``` + +### Memory Efficiency +```python +# Use sparse matrices +from scipy.sparse import csr_matrix +X_sparse = csr_matrix(X) + +# Use MiniBatchKMeans for large data +from sklearn.cluster import MiniBatchKMeans +model = MiniBatchKMeans(n_clusters=8, batch_size=100) +``` + +## Version Check + +```python +import sklearn +print(f"scikit-learn version: {sklearn.__version__}") +``` + +## Useful Resources + +- Official Documentation: https://scikit-learn.org/stable/ +- User Guide: https://scikit-learn.org/stable/user_guide.html +- API Reference: https://scikit-learn.org/stable/api/index.html +- Examples: https://scikit-learn.org/stable/auto_examples/index.html +- Tutorials: https://scikit-learn.org/stable/tutorial/index.html diff --git a/scientific-packages/scikit-learn/references/supervised_learning.md b/scientific-packages/scikit-learn/references/supervised_learning.md index a424313..24085ad 100644 --- a/scientific-packages/scikit-learn/references/supervised_learning.md +++ b/scientific-packages/scikit-learn/references/supervised_learning.md @@ -1,261 +1,378 @@ -# Supervised Learning in scikit-learn +# Supervised Learning Reference ## Overview -Supervised learning algorithms learn patterns from labeled training data to make predictions on new data. Scikit-learn organizes supervised learning into 17 major categories. + +Supervised learning algorithms learn from labeled training data to make predictions on new data. Scikit-learn provides comprehensive implementations for both classification and regression tasks. 
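+
+All estimators in this reference share the same basic interface; a minimal sketch of the pattern assumed throughout (`X_train`, `y_train`, `X_test`, `y_test` are placeholders for your own split data):
+
+```python
+from sklearn.linear_model import LogisticRegression  # any estimator follows the same pattern
+
+model = LogisticRegression(max_iter=1000)
+model.fit(X_train, y_train)              # learn from labeled training data
+y_pred = model.predict(X_test)           # predict on new data
+accuracy = model.score(X_test, y_test)   # default metric (accuracy for classifiers)
+```
+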
## Linear Models ### Regression -- **LinearRegression**: Ordinary least squares regression -- **Ridge**: L2-regularized regression, good for multicollinearity -- **Lasso**: L1-regularized regression, performs feature selection -- **ElasticNet**: Combined L1/L2 regularization -- **LassoLars**: Lasso using Least Angle Regression algorithm -- **BayesianRidge**: Bayesian approach with automatic relevance determination + +**Linear Regression (`sklearn.linear_model.LinearRegression`)** +- Ordinary least squares regression +- Fast, interpretable, no hyperparameters +- Use when: Linear relationships, interpretability matters +- Example: +```python +from sklearn.linear_model import LinearRegression + +model = LinearRegression() +model.fit(X_train, y_train) +predictions = model.predict(X_test) +``` + +**Ridge Regression (`sklearn.linear_model.Ridge`)** +- L2 regularization to prevent overfitting +- Key parameter: `alpha` (regularization strength, default=1.0) +- Use when: Multicollinearity present, need regularization +- Example: +```python +from sklearn.linear_model import Ridge + +model = Ridge(alpha=1.0) +model.fit(X_train, y_train) +``` + +**Lasso (`sklearn.linear_model.Lasso`)** +- L1 regularization with feature selection +- Key parameter: `alpha` (regularization strength) +- Use when: Want sparse models, feature selection +- Can reduce some coefficients to exactly zero +- Example: +```python +from sklearn.linear_model import Lasso + +model = Lasso(alpha=0.1) +model.fit(X_train, y_train) +# Check which features were selected +print(f"Non-zero coefficients: {sum(model.coef_ != 0)}") +``` + +**ElasticNet (`sklearn.linear_model.ElasticNet`)** +- Combines L1 and L2 regularization +- Key parameters: `alpha`, `l1_ratio` (0=Ridge, 1=Lasso) +- Use when: Need both feature selection and regularization +- Example: +```python +from sklearn.linear_model import ElasticNet + +model = ElasticNet(alpha=0.1, l1_ratio=0.5) +model.fit(X_train, y_train) +``` ### Classification -- **LogisticRegression**: Binary and multiclass classification -- **RidgeClassifier**: Ridge regression for classification -- **SGDClassifier**: Linear classifiers with SGD training -**Use cases**: Baseline models, interpretable predictions, high-dimensional data, when linear relationships are expected +**Logistic Regression (`sklearn.linear_model.LogisticRegression`)** +- Binary and multiclass classification +- Key parameters: `C` (inverse regularization), `penalty` ('l1', 'l2', 'elasticnet') +- Returns probability estimates +- Use when: Need probabilistic predictions, interpretability +- Example: +```python +from sklearn.linear_model import LogisticRegression -**Key parameters**: -- `alpha`: Regularization strength (higher = more regularization) -- `fit_intercept`: Whether to calculate intercept -- `solver`: Optimization algorithm ('lbfgs', 'saga', 'liblinear') +model = LogisticRegression(C=1.0, max_iter=1000) +model.fit(X_train, y_train) +probas = model.predict_proba(X_test) +``` -## Support Vector Machines (SVM) +**Stochastic Gradient Descent (SGD)** +- `SGDClassifier`, `SGDRegressor` +- Efficient for large-scale learning +- Key parameters: `loss`, `penalty`, `alpha`, `learning_rate` +- Use when: Very large datasets (>10^4 samples) +- Example: +```python +from sklearn.linear_model import SGDClassifier -- **SVC**: Support Vector Classification -- **SVR**: Support Vector Regression -- **LinearSVC**: Linear SVM using liblinear (faster for large datasets) -- **OneClassSVM**: Unsupervised outlier detection +model = 
SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3) +model.fit(X_train, y_train) +``` -**Use cases**: Complex non-linear decision boundaries, high-dimensional spaces, when clear margin of separation exists +## Support Vector Machines -**Key parameters**: -- `kernel`: 'linear', 'poly', 'rbf', 'sigmoid' -- `C`: Regularization parameter (lower = more regularization) -- `gamma`: Kernel coefficient ('scale', 'auto', or float) -- `degree`: Polynomial degree (for poly kernel) +**SVC (`sklearn.svm.SVC`)** +- Classification with kernel methods +- Key parameters: `C`, `kernel` ('linear', 'rbf', 'poly'), `gamma` +- Use when: Small to medium datasets, complex decision boundaries +- Note: Does not scale well to large datasets +- Example: +```python +from sklearn.svm import SVC -**Performance tip**: SVMs don't scale well beyond tens of thousands of samples. Use LinearSVC for large datasets with linear kernel. +# Linear kernel for linearly separable data +model_linear = SVC(kernel='linear', C=1.0) + +# RBF kernel for non-linear data +model_rbf = SVC(kernel='rbf', C=1.0, gamma='scale') +model_rbf.fit(X_train, y_train) +``` + +**SVR (`sklearn.svm.SVR`)** +- Regression with kernel methods +- Similar parameters to SVC +- Additional parameter: `epsilon` (tube width) +- Example: +```python +from sklearn.svm import SVR + +model = SVR(kernel='rbf', C=1.0, epsilon=0.1) +model.fit(X_train, y_train) +``` ## Decision Trees -- **DecisionTreeClassifier**: Classification tree -- **DecisionTreeRegressor**: Regression tree -- **ExtraTreeClassifier/Regressor**: Extremely randomized tree +**DecisionTreeClassifier / DecisionTreeRegressor** +- Non-parametric model learning decision rules +- Key parameters: + - `max_depth`: Maximum tree depth (prevents overfitting) + - `min_samples_split`: Minimum samples to split a node + - `min_samples_leaf`: Minimum samples in leaf + - `criterion`: 'gini', 'entropy' for classification; 'squared_error', 'absolute_error' for regression +- Use when: Need interpretable model, non-linear relationships, mixed feature types +- Prone to overfitting - use ensembles or pruning +- Example: +```python +from sklearn.tree import DecisionTreeClassifier -**Use cases**: Non-linear relationships, feature importance analysis, interpretable rules, handling mixed data types +model = DecisionTreeClassifier( + max_depth=5, + min_samples_split=20, + min_samples_leaf=10, + criterion='gini' +) +model.fit(X_train, y_train) -**Key parameters**: -- `max_depth`: Maximum tree depth (controls overfitting) -- `min_samples_split`: Minimum samples to split a node -- `min_samples_leaf`: Minimum samples in leaf node -- `max_features`: Number of features to consider for splits -- `criterion`: 'gini', 'entropy' (classification); 'squared_error', 'absolute_error' (regression) - -**Overfitting prevention**: Limit `max_depth`, increase `min_samples_split/leaf`, use pruning with `ccp_alpha` +# Visualize the tree +from sklearn.tree import plot_tree +plot_tree(model, feature_names=feature_names, class_names=class_names) +``` ## Ensemble Methods ### Random Forests -- **RandomForestClassifier**: Ensemble of decision trees -- **RandomForestRegressor**: Regression variant -**Use cases**: Robust general-purpose algorithm, reduces overfitting vs single trees, handles non-linear relationships +**RandomForestClassifier / RandomForestRegressor** +- Ensemble of decision trees with bagging +- Key parameters: + - `n_estimators`: Number of trees (default=100) + - `max_depth`: Maximum tree depth + - `max_features`: Features to consider for 
splits ('sqrt', 'log2', or int) + - `min_samples_split`, `min_samples_leaf`: Control tree growth +- Use when: High accuracy needed, can afford computation +- Provides feature importance +- Example: +```python +from sklearn.ensemble import RandomForestClassifier -**Key parameters**: -- `n_estimators`: Number of trees (higher = better but slower) -- `max_depth`: Maximum tree depth -- `max_features`: Features per split ('sqrt', 'log2', int, float) -- `bootstrap`: Whether to use bootstrap samples -- `n_jobs`: Parallel processing (-1 uses all cores) +model = RandomForestClassifier( + n_estimators=100, + max_depth=10, + max_features='sqrt', + n_jobs=-1 # Use all CPU cores +) +model.fit(X_train, y_train) + +# Feature importance +importances = model.feature_importances_ +``` ### Gradient Boosting -- **HistGradientBoostingClassifier/Regressor**: Histogram-based, fast for large datasets (>10k samples) -- **GradientBoostingClassifier/Regressor**: Traditional implementation, better for small datasets -**Use cases**: High-performance predictions, winning Kaggle competitions, structured/tabular data +**GradientBoostingClassifier / GradientBoostingRegressor** +- Sequential ensemble building trees on residuals +- Key parameters: + - `n_estimators`: Number of boosting stages + - `learning_rate`: Shrinks contribution of each tree + - `max_depth`: Depth of individual trees (typically 3-5) + - `subsample`: Fraction of samples for training each tree +- Use when: Need high accuracy, can afford training time +- Often achieves best performance +- Example: +```python +from sklearn.ensemble import GradientBoostingClassifier -**Key parameters**: -- `n_estimators`: Number of boosting stages -- `learning_rate`: Shrinks contribution of each tree -- `max_depth`: Maximum tree depth (typically 3-8) -- `subsample`: Fraction of samples per tree (enables stochastic gradient boosting) -- `early_stopping`: Stop when validation score stops improving +model = GradientBoostingClassifier( + n_estimators=100, + learning_rate=0.1, + max_depth=3, + subsample=0.8 +) +model.fit(X_train, y_train) +``` -**Performance tip**: HistGradientBoosting is orders of magnitude faster for large datasets +**HistGradientBoostingClassifier / HistGradientBoostingRegressor** +- Faster gradient boosting with histogram-based algorithm +- Native support for missing values and categorical features +- Key parameters: Similar to GradientBoosting +- Use when: Large datasets, need faster training +- Example: +```python +from sklearn.ensemble import HistGradientBoostingClassifier -### AdaBoost -- **AdaBoostClassifier/Regressor**: Adaptive boosting +model = HistGradientBoostingClassifier( + max_iter=100, + learning_rate=0.1, + max_depth=None, # No limit by default + categorical_features='from_dtype' # Auto-detect categorical +) +model.fit(X_train, y_train) +``` -**Use cases**: Boosting weak learners, less prone to overfitting than other methods +### Other Ensemble Methods -**Key parameters**: -- `estimator`: Base estimator (default: DecisionTreeClassifier with max_depth=1) -- `n_estimators`: Number of boosting iterations -- `learning_rate`: Weight applied to each classifier +**AdaBoost** +- Adaptive boosting focusing on misclassified samples +- Key parameters: `n_estimators`, `learning_rate`, `estimator` (base estimator) +- Use when: Simple boosting approach needed +- Example: +```python +from sklearn.ensemble import AdaBoostClassifier -### Bagging -- **BaggingClassifier/Regressor**: Bootstrap aggregating with any base estimator +model = 
AdaBoostClassifier(n_estimators=50, learning_rate=1.0) +model.fit(X_train, y_train) +``` -**Use cases**: Reducing variance of unstable models, parallel ensemble creation +**Voting Classifier / Regressor** +- Combines predictions from multiple models +- Types: 'hard' (majority vote) or 'soft' (average probabilities) +- Use when: Want to ensemble different model types +- Example: +```python +from sklearn.ensemble import VotingClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier +from sklearn.svm import SVC -**Key parameters**: -- `estimator`: Base estimator to fit -- `n_estimators`: Number of estimators -- `max_samples`: Samples to draw per estimator -- `bootstrap`: Whether to use replacement +model = VotingClassifier( + estimators=[ + ('lr', LogisticRegression()), + ('dt', DecisionTreeClassifier()), + ('svc', SVC(probability=True)) + ], + voting='soft' +) +model.fit(X_train, y_train) +``` -### Voting & Stacking -- **VotingClassifier/Regressor**: Combines different model types -- **StackingClassifier/Regressor**: Meta-learner trained on base predictions +**Stacking Classifier / Regressor** +- Trains a meta-model on predictions from base models +- More sophisticated than voting +- Key parameter: `final_estimator` (meta-learner) +- Example: +```python +from sklearn.ensemble import StackingClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier +from sklearn.svm import SVC -**Use cases**: Combining diverse models, leveraging different model strengths +model = StackingClassifier( + estimators=[ + ('dt', DecisionTreeClassifier()), + ('svc', SVC()) + ], + final_estimator=LogisticRegression() +) +model.fit(X_train, y_train) +``` -## Neural Networks +## K-Nearest Neighbors -- **MLPClassifier**: Multi-layer perceptron classifier -- **MLPRegressor**: Multi-layer perceptron regressor +**KNeighborsClassifier / KNeighborsRegressor** +- Non-parametric method based on distance +- Key parameters: + - `n_neighbors`: Number of neighbors (default=5) + - `weights`: 'uniform' or 'distance' + - `metric`: Distance metric ('euclidean', 'manhattan', etc.) +- Use when: Small dataset, simple baseline needed +- Slow prediction on large datasets +- Example: +```python +from sklearn.neighbors import KNeighborsClassifier -**Use cases**: Complex non-linear patterns, when gradient boosting is too slow, deep feature learning - -**Key parameters**: -- `hidden_layer_sizes`: Tuple of hidden layer sizes (e.g., (100, 50)) -- `activation`: 'relu', 'tanh', 'logistic' -- `solver`: 'adam', 'lbfgs', 'sgd' -- `alpha`: L2 regularization term -- `learning_rate`: Learning rate schedule -- `early_stopping`: Stop when validation score stops improving - -**Important**: Feature scaling is critical for neural networks. Always use StandardScaler or similar. 
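+# Distance-based model: scale features beforehand (e.g., StandardScaler)
+# for best results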
- -## Nearest Neighbors - -- **KNeighborsClassifier/Regressor**: K-nearest neighbors -- **RadiusNeighborsClassifier/Regressor**: Radius-based neighbors -- **NearestCentroid**: Classification using class centroids - -**Use cases**: Simple baseline, irregular decision boundaries, when interpretability isn't critical - -**Key parameters**: -- `n_neighbors`: Number of neighbors (typically 3-11) -- `weights`: 'uniform' or 'distance' (distance-weighted voting) -- `metric`: Distance metric ('euclidean', 'manhattan', 'minkowski') -- `algorithm`: 'auto', 'ball_tree', 'kd_tree', 'brute' +model = KNeighborsClassifier(n_neighbors=5, weights='distance') +model.fit(X_train, y_train) +``` ## Naive Bayes -- **GaussianNB**: Assumes Gaussian distribution of features -- **MultinomialNB**: For discrete counts (text classification) -- **BernoulliNB**: For binary/boolean features -- **CategoricalNB**: For categorical features -- **ComplementNB**: Adapted for imbalanced datasets +**GaussianNB, MultinomialNB, BernoulliNB** +- Probabilistic classifiers based on Bayes' theorem +- Fast training and prediction +- GaussianNB: Continuous features (assumes Gaussian distribution) +- MultinomialNB: Count features (text classification) +- BernoulliNB: Binary features +- Use when: Text classification, fast baseline, probabilistic predictions +- Example: +```python +from sklearn.naive_bayes import GaussianNB, MultinomialNB -**Use cases**: Text classification, fast baseline, when features are independent, small training sets +# For continuous features +model_gaussian = GaussianNB() -**Key parameters**: -- `alpha`: Smoothing parameter (Laplace/Lidstone smoothing) -- `fit_prior`: Whether to learn class prior probabilities +# For text/count data +model_multinomial = MultinomialNB(alpha=1.0) # alpha is smoothing parameter +model_multinomial.fit(X_train, y_train) +``` -## Linear/Quadratic Discriminant Analysis +## Neural Networks -- **LinearDiscriminantAnalysis**: Linear decision boundary with dimensionality reduction -- **QuadraticDiscriminantAnalysis**: Quadratic decision boundary +**MLPClassifier / MLPRegressor** +- Multi-layer perceptron (feedforward neural network) +- Key parameters: + - `hidden_layer_sizes`: Tuple of hidden layer sizes, e.g., (100, 50) + - `activation`: 'relu', 'tanh', 'logistic' + - `solver`: 'adam', 'sgd', 'lbfgs' + - `alpha`: L2 regularization parameter + - `learning_rate`: 'constant', 'adaptive' +- Use when: Complex non-linear patterns, large datasets +- Requires feature scaling +- Example: +```python +from sklearn.neural_network import MLPClassifier +from sklearn.preprocessing import StandardScaler -**Use cases**: When classes have Gaussian distributions, dimensionality reduction, when covariance assumptions hold +# Scale features first +scaler = StandardScaler() +X_train_scaled = scaler.fit_transform(X_train) -## Gaussian Processes +model = MLPClassifier( + hidden_layer_sizes=(100, 50), + activation='relu', + solver='adam', + alpha=0.0001, + max_iter=1000 +) +model.fit(X_train_scaled, y_train) +``` -- **GaussianProcessClassifier**: Probabilistic classification -- **GaussianProcessRegressor**: Probabilistic regression with uncertainty estimates +## Algorithm Selection Guide -**Use cases**: When uncertainty quantification is important, small datasets, smooth function approximation +### Choose based on: -**Key parameters**: -- `kernel`: Covariance function (RBF, Matern, RationalQuadratic, etc.) 
-- `alpha`: Noise level +**Dataset size:** +- Small (<1k samples): KNN, SVM, Decision Trees +- Medium (1k-100k): Random Forest, Gradient Boosting, Linear Models +- Large (>100k): SGD, Linear Models, HistGradientBoosting -**Limitation**: Doesn't scale well to large datasets (O(n³) complexity) +**Interpretability:** +- High: Linear Models, Decision Trees +- Medium: Random Forest (feature importance) +- Low: SVM with RBF kernel, Neural Networks -## Stochastic Gradient Descent +**Accuracy vs Speed:** +- Fast training: Naive Bayes, Linear Models, KNN +- High accuracy: Gradient Boosting, Random Forest, Stacking +- Fast prediction: Linear Models, Naive Bayes +- Slow prediction: KNN (on large datasets), SVM -- **SGDClassifier**: Linear classifiers with SGD -- **SGDRegressor**: Linear regressors with SGD +**Feature types:** +- Continuous: Most algorithms work well +- Categorical: Trees, HistGradientBoosting (native support) +- Mixed: Trees, Gradient Boosting +- Text: Naive Bayes, Linear Models with TF-IDF -**Use cases**: Very large datasets (>100k samples), online learning, when data doesn't fit in memory - -**Key parameters**: -- `loss`: Loss function ('hinge', 'log_loss', 'squared_error', etc.) -- `penalty`: Regularization ('l2', 'l1', 'elasticnet') -- `alpha`: Regularization strength -- `learning_rate`: Learning rate schedule - -## Semi-Supervised Learning - -- **SelfTrainingClassifier**: Self-training with any base classifier -- **LabelPropagation**: Label propagation through graph -- **LabelSpreading**: Label spreading (modified label propagation) - -**Use cases**: When labeled data is scarce but unlabeled data is abundant - -## Feature Selection - -- **VarianceThreshold**: Remove low-variance features -- **SelectKBest**: Select K highest scoring features -- **SelectPercentile**: Select top percentile of features -- **RFE**: Recursive feature elimination -- **RFECV**: RFE with cross-validation -- **SelectFromModel**: Select features based on importance -- **SequentialFeatureSelector**: Forward/backward feature selection - -**Use cases**: Reducing dimensionality, removing irrelevant features, improving interpretability, reducing overfitting - -## Probability Calibration - -- **CalibratedClassifierCV**: Calibrate classifier probabilities - -**Use cases**: When probability estimates are important (not just class predictions), especially with SVM and Naive Bayes - -**Methods**: -- `sigmoid`: Platt scaling -- `isotonic`: Isotonic regression (more flexible, needs more data) - -## Multi-Output Methods - -- **MultiOutputClassifier**: Fit one classifier per target -- **MultiOutputRegressor**: Fit one regressor per target -- **ClassifierChain**: Models dependencies between targets -- **RegressorChain**: Regression variant - -**Use cases**: Predicting multiple related targets simultaneously - -## Specialized Regression - -- **IsotonicRegression**: Monotonic regression -- **QuantileRegressor**: Quantile regression for prediction intervals - -## Algorithm Selection Guidelines - -**Start with**: -1. **Logistic Regression** (classification) or **LinearRegression/Ridge** (regression) as baseline -2. **RandomForestClassifier/Regressor** for general non-linear problems -3. 
**HistGradientBoostingClassifier/Regressor** when best performance is needed - -**Consider dataset size**: -- Small (<1k samples): SVM, Gaussian Processes, any algorithm -- Medium (1k-100k): Random Forests, Gradient Boosting, Neural Networks -- Large (>100k): SGD, HistGradientBoosting, LinearSVC - -**Consider interpretability needs**: -- High interpretability: Linear models, Decision Trees, Naive Bayes -- Medium: Random Forests (feature importance), Rule extraction -- Low (black box acceptable): Gradient Boosting, Neural Networks, SVM with RBF kernel - -**Consider training time**: -- Fast: Linear models, Naive Bayes, Decision Trees -- Medium: Random Forests (parallelizable), SVM (small data) -- Slow: Gradient Boosting, Neural Networks, SVM (large data), Gaussian Processes +**Common starting points:** +1. Logistic Regression (classification) / Linear Regression (regression) - fast baseline +2. Random Forest - good default choice +3. Gradient Boosting - optimize for best accuracy diff --git a/scientific-packages/scikit-learn/references/unsupervised_learning.md b/scientific-packages/scikit-learn/references/unsupervised_learning.md index b379c48..e18c958 100644 --- a/scientific-packages/scikit-learn/references/unsupervised_learning.md +++ b/scientific-packages/scikit-learn/references/unsupervised_learning.md @@ -1,728 +1,505 @@ -# Unsupervised Learning in scikit-learn +# Unsupervised Learning Reference ## Overview -Unsupervised learning discovers patterns in data without labeled targets. Main tasks include clustering (grouping similar samples), dimensionality reduction (reducing feature count), and anomaly detection (finding outliers). -## Clustering Algorithms +Unsupervised learning discovers patterns in unlabeled data through clustering, dimensionality reduction, and density estimation. + +## Clustering ### K-Means -Groups data into k clusters by minimizing within-cluster variance. - -**Algorithm**: -1. Initialize k centroids (k-means++ initialization recommended) -2. Assign each point to nearest centroid -3. Update centroids to mean of assigned points -4. 
Repeat until convergence - +**KMeans (`sklearn.cluster.KMeans`)** +- Partition-based clustering into K clusters +- Key parameters: + - `n_clusters`: Number of clusters to form + - `init`: Initialization method ('k-means++', 'random') + - `n_init`: Number of initializations (default=10) + - `max_iter`: Maximum iterations +- Use when: Know number of clusters, spherical cluster shapes +- Fast and scalable +- Example: ```python from sklearn.cluster import KMeans -kmeans = KMeans( - n_clusters=3, - init='k-means++', # Smart initialization - n_init=10, # Number of times to run with different seeds - max_iter=300, - random_state=42 -) -labels = kmeans.fit_predict(X) -centroids = kmeans.cluster_centers_ +model = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42) +labels = model.fit_predict(X) +centers = model.cluster_centers_ + +# Inertia (sum of squared distances to nearest center) +print(f"Inertia: {model.inertia_}") ``` -**Use cases**: -- Customer segmentation -- Image compression -- Data preprocessing (clustering as features) +**MiniBatchKMeans** +- Faster K-Means using mini-batches +- Use when: Large datasets, need faster training +- Slightly less accurate than K-Means +- Example: +```python +from sklearn.cluster import MiniBatchKMeans -**Strengths**: -- Fast and scalable -- Simple to understand -- Works well with spherical clusters +model = MiniBatchKMeans(n_clusters=3, batch_size=100, random_state=42) +labels = model.fit_predict(X) +``` -**Limitations**: -- Assumes spherical clusters of similar size -- Sensitive to initialization (mitigated by k-means++) -- Must specify k beforehand -- Sensitive to outliers - -**Choosing k**: Use elbow method, silhouette score, or domain knowledge - -**Variants**: -- **MiniBatchKMeans**: Faster for large datasets, uses mini-batches -- **KMeans with n_init='auto'**: Adaptive number of initializations - -### DBSCAN - -Density-Based Spatial Clustering of Applications with Noise. Identifies clusters as dense regions separated by sparse areas. 
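+
+Beyond inertia, the silhouette score (see Clustering Evaluation below) is a common way to choose `n_clusters`. A short sketch, assuming a scaled feature matrix `X`:
+```python
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+
+# Compare candidate cluster counts by silhouette score (higher is better)
+scores = {}
+for k in range(2, 8):
+    labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(X)
+    scores[k] = silhouette_score(X, labels)
+best_k = max(scores, key=scores.get)
+print(f"Best k by silhouette: {best_k}")
+```
+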
+### Density-Based Clustering +**DBSCAN (`sklearn.cluster.DBSCAN`)** +- Density-Based Spatial Clustering +- Key parameters: + - `eps`: Maximum distance between two samples to be neighbors + - `min_samples`: Minimum samples in neighborhood to form core point + - `metric`: Distance metric +- Use when: Arbitrary cluster shapes, presence of noise/outliers +- Automatically determines number of clusters +- Labels noise points as -1 +- Example: ```python from sklearn.cluster import DBSCAN -dbscan = DBSCAN( - eps=0.5, # Maximum distance between neighbors - min_samples=5, # Minimum points to form dense region - metric='euclidean' -) -labels = dbscan.fit_predict(X) -# -1 indicates noise/outliers +model = DBSCAN(eps=0.5, min_samples=5, metric='euclidean') +labels = model.fit_predict(X) + +# Number of clusters (excluding noise) +n_clusters = len(set(labels)) - (1 if -1 in labels else 0) +n_noise = list(labels).count(-1) +print(f"Clusters: {n_clusters}, Noise points: {n_noise}") ``` -**Use cases**: -- Arbitrary cluster shapes -- Outlier detection -- When cluster count is unknown -- Geographic/spatial data - -**Strengths**: -- Discovers arbitrary-shaped clusters -- Automatically detects outliers -- Doesn't require specifying number of clusters -- Robust to outliers - -**Limitations**: -- Struggles with varying densities -- Sensitive to eps and min_samples parameters -- Not deterministic (border points may vary) - -**Parameter tuning**: -- `eps`: Plot k-distance graph, look for elbow -- `min_samples`: Rule of thumb: 2 * dimensions - -### HDBSCAN - -Hierarchical DBSCAN that handles variable cluster densities. - +**HDBSCAN (`sklearn.cluster.HDBSCAN`)** +- Hierarchical DBSCAN with adaptive epsilon +- More robust than DBSCAN +- Key parameter: `min_cluster_size` +- Use when: Varying density clusters +- Example: ```python from sklearn.cluster import HDBSCAN -hdbscan = HDBSCAN( - min_cluster_size=5, - min_samples=None, # Defaults to min_cluster_size - metric='euclidean' -) -labels = hdbscan.fit_predict(X) +model = HDBSCAN(min_cluster_size=10, min_samples=5) +labels = model.fit_predict(X) ``` -**Advantages over DBSCAN**: -- Handles variable density clusters -- More robust parameter selection -- Provides cluster membership probabilities -- Hierarchical structure +**OPTICS (`sklearn.cluster.OPTICS`)** +- Ordering points to identify clustering structure +- Similar to DBSCAN but doesn't require eps parameter +- Key parameters: `min_samples`, `max_eps` +- Use when: Varying density, exploratory analysis +- Example: +```python +from sklearn.cluster import OPTICS -**Use cases**: When DBSCAN struggles with varying densities +model = OPTICS(min_samples=5, max_eps=0.5) +labels = model.fit_predict(X) +``` ### Hierarchical Clustering -Builds nested cluster hierarchies using agglomerative (bottom-up) approach. 
- +**AgglomerativeClustering** +- Bottom-up hierarchical clustering +- Key parameters: + - `n_clusters`: Number of clusters (or use `distance_threshold`) + - `linkage`: 'ward', 'complete', 'average', 'single' + - `metric`: Distance metric +- Use when: Need dendrogram, hierarchical structure important +- Example: ```python from sklearn.cluster import AgglomerativeClustering -agg_clust = AgglomerativeClustering( - n_clusters=3, - linkage='ward', # 'ward', 'complete', 'average', 'single' - metric='euclidean' -) -labels = agg_clust.fit_predict(X) +model = AgglomerativeClustering(n_clusters=3, linkage='ward') +labels = model.fit_predict(X) -# Visualize with dendrogram -from scipy.cluster.hierarchy import dendrogram, linkage as scipy_linkage -import matplotlib.pyplot as plt - -linkage_matrix = scipy_linkage(X, method='ward') -dendrogram(linkage_matrix) -plt.show() +# Create dendrogram using scipy +from scipy.cluster.hierarchy import dendrogram, linkage +Z = linkage(X, method='ward') +dendrogram(Z) ``` -**Linkage methods**: -- `ward`: Minimizes variance (only with Euclidean) - **most common** -- `complete`: Maximum distance between clusters -- `average`: Average distance between clusters -- `single`: Minimum distance between clusters - -**Use cases**: -- When hierarchical structure is meaningful -- Taxonomy/phylogenetic trees -- When visualization is important (dendrograms) - -**Strengths**: -- No need to specify k initially (cut dendrogram at desired level) -- Produces hierarchy of clusters -- Deterministic - -**Limitations**: -- Computationally expensive (O(n²) to O(n³)) -- Not suitable for large datasets -- Cannot undo previous merges - -### Spectral Clustering - -Performs dimensionality reduction using affinity matrix before clustering. - -```python -from sklearn.cluster import SpectralClustering - -spectral = SpectralClustering( - n_clusters=3, - affinity='rbf', # 'rbf', 'nearest_neighbors', 'precomputed' - gamma=1.0, - n_neighbors=10, - random_state=42 -) -labels = spectral.fit_predict(X) -``` - -**Use cases**: -- Non-convex clusters -- Image segmentation -- Graph clustering -- When similarity matrix is available - -**Strengths**: -- Handles non-convex clusters -- Works with similarity matrices -- Often better than k-means for complex shapes - -**Limitations**: -- Computationally expensive -- Requires specifying number of clusters -- Memory intensive - -### Mean Shift - -Discovers clusters through iterative centroid updates based on density. 
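+
+The `distance_threshold` alternative mentioned above cuts the tree at a merge distance instead of a fixed cluster count; a minimal sketch (the threshold value is an arbitrary placeholder that depends on the data scale):
+```python
+from sklearn.cluster import AgglomerativeClustering
+
+# With distance_threshold set, n_clusters must be None;
+# the number of clusters then follows from the chosen cut distance
+model = AgglomerativeClustering(n_clusters=None, distance_threshold=10.0, linkage='ward')
+labels = model.fit_predict(X)
+print(f"Clusters found: {model.n_clusters_}")
+```
+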
+### Other Clustering Methods +**MeanShift** +- Finds clusters by shifting points toward mode of density +- Automatically determines number of clusters +- Key parameter: `bandwidth` +- Use when: Don't know number of clusters, arbitrary shapes +- Example: ```python from sklearn.cluster import MeanShift, estimate_bandwidth # Estimate bandwidth bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500) - -mean_shift = MeanShift(bandwidth=bandwidth) -labels = mean_shift.fit_predict(X) -cluster_centers = mean_shift.cluster_centers_ +model = MeanShift(bandwidth=bandwidth) +labels = model.fit_predict(X) ``` -**Use cases**: -- When cluster count is unknown -- Computer vision applications -- Object tracking +**SpectralClustering** +- Uses graph-based approach with eigenvalues +- Key parameters: `n_clusters`, `affinity` ('rbf', 'nearest_neighbors') +- Use when: Non-convex clusters, graph structure +- Example: +```python +from sklearn.cluster import SpectralClustering -**Strengths**: +model = SpectralClustering(n_clusters=3, affinity='rbf', random_state=42) +labels = model.fit_predict(X) +``` + +**AffinityPropagation** +- Finds exemplars by message passing - Automatically determines number of clusters -- Handles arbitrary shapes -- No assumptions about cluster shape - -**Limitations**: -- Computationally expensive -- Very sensitive to bandwidth parameter -- Doesn't scale well - -### Affinity Propagation - -Uses message-passing between samples to identify exemplars. - +- Key parameters: `damping`, `preference` +- Use when: Don't know number of clusters +- Example: ```python from sklearn.cluster import AffinityPropagation -affinity_prop = AffinityPropagation( - damping=0.5, # Damping factor (0.5-1.0) - preference=None, # Self-preference (controls number of clusters) - random_state=42 -) -labels = affinity_prop.fit_predict(X) -exemplars = affinity_prop.cluster_centers_indices_ +model = AffinityPropagation(damping=0.9, random_state=42) +labels = model.fit_predict(X) +n_clusters = len(model.cluster_centers_indices_) ``` -**Use cases**: -- When number of clusters is unknown -- When exemplars (representative samples) are needed - -**Strengths**: -- Automatically determines number of clusters -- Identifies exemplar samples -- No initialization required - -**Limitations**: -- Very slow: O(n²t) where t is iterations -- Not suitable for large datasets -- Memory intensive - -### Gaussian Mixture Models (GMM) - -Probabilistic model assuming data comes from mixture of Gaussian distributions. 
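+
+The "non-convex clusters" point for SpectralClustering above is easiest to see on a toy dataset; a small illustrative sketch (using `make_moons` purely as an example) comparing it with K-Means:
+```python
+from sklearn.datasets import make_moons
+from sklearn.cluster import KMeans, SpectralClustering
+
+# Two interleaved half-moons: non-convex clusters
+X_moons, _ = make_moons(n_samples=300, noise=0.05, random_state=42)
+
+# K-Means tends to cut straight across the moons;
+# spectral clustering on a nearest-neighbors graph typically recovers them
+kmeans_labels = KMeans(n_clusters=2, n_init=10, random_state=42).fit_predict(X_moons)
+spectral_labels = SpectralClustering(
+    n_clusters=2, affinity='nearest_neighbors', n_neighbors=10, random_state=42
+).fit_predict(X_moons)
+```
+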
- -```python -from sklearn.mixture import GaussianMixture - -gmm = GaussianMixture( - n_components=3, - covariance_type='full', # 'full', 'tied', 'diag', 'spherical' - random_state=42 -) -labels = gmm.fit_predict(X) -probabilities = gmm.predict_proba(X) # Soft clustering -``` - -**Covariance types**: -- `full`: Each component has its own covariance matrix -- `tied`: All components share same covariance -- `diag`: Diagonal covariance (independent features) -- `spherical`: Spherical covariance (isotropic) - -**Use cases**: -- When soft clustering is needed (probabilities) -- When clusters have different shapes/sizes -- Generative modeling -- Density estimation - -**Strengths**: -- Provides probabilities (soft clustering) -- Can handle elliptical clusters -- Generative model (can sample new data) -- Model selection with BIC/AIC - -**Limitations**: -- Assumes Gaussian distributions -- Sensitive to initialization -- Can converge to local optima - -**Model selection**: -```python -from sklearn.mixture import GaussianMixture -import numpy as np - -n_components_range = range(2, 10) -bic_scores = [] - -for n in n_components_range: - gmm = GaussianMixture(n_components=n, random_state=42) - gmm.fit(X) - bic_scores.append(gmm.bic(X)) - -optimal_n = n_components_range[np.argmin(bic_scores)] -``` - -### BIRCH - -Builds Clustering Feature Tree for memory-efficient processing of large datasets. - +**BIRCH** +- Balanced Iterative Reducing and Clustering using Hierarchies +- Memory efficient for large datasets +- Key parameters: `n_clusters`, `threshold`, `branching_factor` +- Use when: Very large datasets +- Example: ```python from sklearn.cluster import Birch -birch = Birch( - n_clusters=3, - threshold=0.5, - branching_factor=50 -) -labels = birch.fit_predict(X) +model = Birch(n_clusters=3, threshold=0.5) +labels = model.fit_predict(X) ``` -**Use cases**: -- Very large datasets -- Streaming data -- Memory constraints - -**Strengths**: -- Memory efficient -- Single pass over data -- Incremental learning - -## Dimensionality Reduction - -### Principal Component Analysis (PCA) - -Finds orthogonal components that explain maximum variance. +### Clustering Evaluation +**Metrics when ground truth is known:** ```python -from sklearn.decomposition import PCA -import matplotlib.pyplot as plt +from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score +from sklearn.metrics import adjusted_mutual_info_score, fowlkes_mallows_score -# Specify number of components -pca = PCA(n_components=2, random_state=42) -X_transformed = pca.fit_transform(X) - -print("Explained variance ratio:", pca.explained_variance_ratio_) -print("Total variance explained:", pca.explained_variance_ratio_.sum()) - -# Or specify variance to retain -pca = PCA(n_components=0.95) # Keep 95% of variance -X_transformed = pca.fit_transform(X) -print(f"Components needed: {pca.n_components_}") - -# Visualize explained variance -plt.plot(np.cumsum(pca.explained_variance_ratio_)) -plt.xlabel('Number of components') -plt.ylabel('Cumulative explained variance') -plt.show() -``` - -**Use cases**: -- Visualization (reduce to 2-3 dimensions) -- Remove multicollinearity -- Noise reduction -- Speed up training -- Feature extraction - -**Strengths**: -- Fast and efficient -- Reduces multicollinearity -- Works well for linear relationships -- Interpretable components - -**Limitations**: -- Only linear transformations -- Sensitive to scaling (always standardize first!) 
-- Components may be hard to interpret - -**Variants**: -- **IncrementalPCA**: For datasets that don't fit in memory -- **KernelPCA**: Non-linear dimensionality reduction -- **SparsePCA**: Sparse loadings for interpretability - -### t-SNE - -t-Distributed Stochastic Neighbor Embedding for visualization. - -```python -from sklearn.manifold import TSNE - -tsne = TSNE( - n_components=2, - perplexity=30, # Balance local vs global structure (5-50) - learning_rate='auto', - n_iter=1000, - random_state=42 -) -X_embedded = tsne.fit_transform(X) - -# Visualize -plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y) -plt.show() -``` - -**Use cases**: -- Visualization only (do not use for preprocessing!) -- Exploring high-dimensional data -- Finding clusters visually - -**Important notes**: -- **Only for visualization**, not for preprocessing -- Each run produces different results (use random_state for reproducibility) -- Slow for large datasets -- Cannot transform new data (no transform() method) - -**Parameter tuning**: -- `perplexity`: 5-50, larger for larger datasets -- Lower perplexity = focus on local structure -- Higher perplexity = focus on global structure - -### UMAP - -Uniform Manifold Approximation and Projection (requires umap-learn package). - -**Advantages over t-SNE**: -- Preserves global structure better -- Faster -- Can transform new data -- Can be used for preprocessing (not just visualization) - -### Truncated SVD (LSA) - -Similar to PCA but works with sparse matrices (e.g., TF-IDF). - -```python -from sklearn.decomposition import TruncatedSVD - -svd = TruncatedSVD(n_components=100, random_state=42) -X_reduced = svd.fit_transform(X_sparse) -``` - -**Use cases**: -- Text data (after TF-IDF) -- Sparse matrices -- Latent Semantic Analysis (LSA) - -### Non-negative Matrix Factorization (NMF) - -Factorizes data into non-negative components. - -```python -from sklearn.decomposition import NMF - -nmf = NMF(n_components=10, init='nndsvd', random_state=42) -W = nmf.fit_transform(X) # Document-topic matrix -H = nmf.components_ # Topic-word matrix -``` - -**Use cases**: -- Topic modeling -- Audio source separation -- Image processing -- When non-negativity is important (e.g., counts) - -**Strengths**: -- Interpretable components (additive, non-negative) -- Sparse representations - -### Independent Component Analysis (ICA) - -Separates multivariate signal into independent components. - -```python -from sklearn.decomposition import FastICA - -ica = FastICA(n_components=10, random_state=42) -X_independent = ica.fit_transform(X) -``` - -**Use cases**: -- Blind source separation -- Signal processing -- Feature extraction when independence is expected - -### Factor Analysis - -Models observed variables as linear combinations of latent factors plus noise. - -```python -from sklearn.decomposition import FactorAnalysis - -fa = FactorAnalysis(n_components=5, random_state=42) -X_factors = fa.fit_transform(X) -``` - -**Use cases**: -- When noise is heteroscedastic -- Latent variable modeling -- Psychology/social science research - -**Difference from PCA**: Models noise explicitly, assumes features have independent noise - -## Anomaly Detection - -### One-Class SVM - -Learns boundary around normal data. 
- -```python -from sklearn.svm import OneClassSVM - -oc_svm = OneClassSVM( - nu=0.1, # Proportion of outliers expected - kernel='rbf', - gamma='auto' -) -oc_svm.fit(X_train) -predictions = oc_svm.predict(X_test) # 1 for inliers, -1 for outliers -``` - -**Use cases**: -- Novelty detection -- When only normal data is available for training - -### Isolation Forest - -Isolates outliers using random forests. - -```python -from sklearn.ensemble import IsolationForest - -iso_forest = IsolationForest( - contamination=0.1, # Expected proportion of outliers - random_state=42 -) -predictions = iso_forest.fit_predict(X) # 1 for inliers, -1 for outliers -scores = iso_forest.score_samples(X) # Anomaly scores -``` - -**Use cases**: -- General anomaly detection -- Works well with high-dimensional data -- Fast and scalable - -**Strengths**: -- Fast -- Effective in high dimensions -- Low memory requirements - -### Local Outlier Factor (LOF) - -Detects outliers based on local density deviation. - -```python -from sklearn.neighbors import LocalOutlierFactor - -lof = LocalOutlierFactor( - n_neighbors=20, - contamination=0.1 -) -predictions = lof.fit_predict(X) # 1 for inliers, -1 for outliers -scores = lof.negative_outlier_factor_ # Anomaly scores (negative) -``` - -**Use cases**: -- Finding local outliers -- When global methods fail - -## Clustering Evaluation - -### With Ground Truth Labels - -When true labels are available (for validation): - -**Adjusted Rand Index (ARI)**: -```python -from sklearn.metrics import adjusted_rand_score +# Compare predicted labels with true labels ari = adjusted_rand_score(y_true, y_pred) -# Range: [-1, 1], 1 = perfect, 0 = random -``` - -**Normalized Mutual Information (NMI)**: -```python -from sklearn.metrics import normalized_mutual_info_score nmi = normalized_mutual_info_score(y_true, y_pred) -# Range: [0, 1], 1 = perfect +ami = adjusted_mutual_info_score(y_true, y_pred) +fmi = fowlkes_mallows_score(y_true, y_pred) ``` -**V-Measure**: -```python -from sklearn.metrics import v_measure_score -v = v_measure_score(y_true, y_pred) -# Range: [0, 1], harmonic mean of homogeneity and completeness -``` - -### Without Ground Truth Labels - -When true labels are unavailable (unsupervised evaluation): - -**Silhouette Score**: -Measures how similar objects are to their own cluster vs other clusters. 
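+
+Note that these ground-truth metrics compare partitions rather than raw label values, so cluster IDs do not need to match the class IDs. A tiny sketch:
+```python
+from sklearn.metrics import adjusted_rand_score
+
+# The same partition with swapped cluster IDs still scores a perfect 1.0
+print(adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]))  # 1.0
+```
+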
- -```python -from sklearn.metrics import silhouette_score, silhouette_samples -import matplotlib.pyplot as plt - -score = silhouette_score(X, labels) -# Range: [-1, 1], higher is better -# >0.7: Strong structure -# 0.5-0.7: Reasonable structure -# 0.25-0.5: Weak structure -# <0.25: No substantial structure - -# Per-sample scores for detailed analysis -sample_scores = silhouette_samples(X, labels) - -# Visualize silhouette plot -for i in range(n_clusters): - cluster_scores = sample_scores[labels == i] - cluster_scores.sort() - plt.barh(range(len(cluster_scores)), cluster_scores) -plt.axvline(x=score, color='red', linestyle='--') -plt.show() -``` - -**Davies-Bouldin Index**: +**Metrics without ground truth:** ```python +from sklearn.metrics import silhouette_score, calinski_harabasz_score from sklearn.metrics import davies_bouldin_score -db = davies_bouldin_score(X, labels) -# Lower is better, 0 = perfect + +# Silhouette: [-1, 1], higher is better +silhouette = silhouette_score(X, labels) + +# Calinski-Harabasz: higher is better +ch_score = calinski_harabasz_score(X, labels) + +# Davies-Bouldin: lower is better +db_score = davies_bouldin_score(X, labels) ``` -**Calinski-Harabasz Index** (Variance Ratio Criterion): -```python -from sklearn.metrics import calinski_harabasz_score -ch = calinski_harabasz_score(X, labels) -# Higher is better -``` - -**Inertia** (K-Means specific): -```python -inertia = kmeans.inertia_ -# Sum of squared distances to nearest cluster center -# Use for elbow method -``` - -### Elbow Method (K-Means) - +**Elbow method for K-Means:** ```python from sklearn.cluster import KMeans import matplotlib.pyplot as plt inertias = [] K_range = range(2, 11) - for k in K_range: - kmeans = KMeans(n_clusters=k, random_state=42) - kmeans.fit(X) - inertias.append(kmeans.inertia_) + model = KMeans(n_clusters=k, random_state=42) + model.fit(X) + inertias.append(model.inertia_) plt.plot(K_range, inertias, 'bo-') -plt.xlabel('Number of clusters (k)') +plt.xlabel('Number of clusters') plt.ylabel('Inertia') plt.title('Elbow Method') -plt.show() -# Look for "elbow" where inertia starts decreasing more slowly ``` -## Best Practices +## Dimensionality Reduction -### Clustering Algorithm Selection +### Principal Component Analysis (PCA) -**Use K-Means when**: -- Clusters are spherical and similar size -- Speed is important -- Data is not too high-dimensional +**PCA (`sklearn.decomposition.PCA`)** +- Linear dimensionality reduction using SVD +- Key parameters: + - `n_components`: Number of components (int or float for explained variance) + - `whiten`: Whiten components to unit variance +- Use when: Linear relationships, want to explain variance +- Example: +```python +from sklearn.decomposition import PCA -**Use DBSCAN when**: -- Arbitrary cluster shapes -- Number of clusters unknown -- Outlier detection needed +# Keep components explaining 95% variance +pca = PCA(n_components=0.95) +X_reduced = pca.fit_transform(X) -**Use Hierarchical when**: -- Hierarchy is meaningful -- Small to medium datasets -- Visualization is important +print(f"Original dimensions: {X.shape[1]}") +print(f"Reduced dimensions: {X_reduced.shape[1]}") +print(f"Explained variance ratio: {pca.explained_variance_ratio_}") +print(f"Total variance explained: {pca.explained_variance_ratio_.sum()}") -**Use GMM when**: -- Soft clustering needed -- Clusters have different shapes/sizes -- Probabilistic interpretation needed +# Or specify exact number of components +pca = PCA(n_components=2) +X_2d = pca.fit_transform(X) +``` -**Use 
Spectral Clustering when**: -- Non-convex clusters -- Have similarity matrix -- Moderate dataset size +**IncrementalPCA** +- PCA for large datasets that don't fit in memory +- Processes data in batches +- Key parameter: `n_components`, `batch_size` +- Example: +```python +from sklearn.decomposition import IncrementalPCA -### Preprocessing for Clustering +pca = IncrementalPCA(n_components=50, batch_size=100) +X_reduced = pca.fit_transform(X) +``` -1. **Always scale features**: Use StandardScaler or MinMaxScaler -2. **Handle outliers**: Remove or use robust algorithms (DBSCAN, HDBSCAN) -3. **Reduce dimensionality if needed**: PCA for speed, careful with interpretation -4. **Check for categorical variables**: Encode appropriately or use specialized algorithms +**KernelPCA** +- Non-linear dimensionality reduction using kernels +- Key parameters: `n_components`, `kernel` ('linear', 'poly', 'rbf', 'sigmoid') +- Use when: Non-linear relationships +- Example: +```python +from sklearn.decomposition import KernelPCA -### Dimensionality Reduction Guidelines +pca = KernelPCA(n_components=2, kernel='rbf', gamma=0.1) +X_reduced = pca.fit_transform(X) +``` -**For preprocessing/feature extraction**: -- PCA (linear relationships) -- TruncatedSVD (sparse data) -- NMF (non-negative data) +### Manifold Learning -**For visualization only**: -- t-SNE (preserves local structure) -- UMAP (preserves both local and global structure) +**t-SNE (`sklearn.manifold.TSNE`)** +- t-distributed Stochastic Neighbor Embedding +- Excellent for 2D/3D visualization +- Key parameters: + - `n_components`: Usually 2 or 3 + - `perplexity`: Balance between local and global structure (5-50) + - `learning_rate`: Usually 10-1000 + - `n_iter`: Number of iterations (min 250) +- Use when: Visualizing high-dimensional data +- Note: Slow on large datasets, no transform() method +- Example: +```python +from sklearn.manifold import TSNE -**Always**: -- Standardize features before PCA -- Use appropriate n_components (elbow plot, explained variance) -- Don't use t-SNE for anything except visualization +tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, n_iter=1000, random_state=42) +X_embedded = tsne.fit_transform(X) -### Common Pitfalls +# Visualize +import matplotlib.pyplot as plt +plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=labels, cmap='viridis') +plt.title('t-SNE visualization') +``` -1. **Not scaling data**: Most algorithms sensitive to scale -2. **Using t-SNE for preprocessing**: Only for visualization! -3. **Overfitting cluster count**: Too many clusters = overfitting noise -4. **Ignoring outliers**: Can severely affect centroid-based methods -5. **Wrong metric**: Euclidean assumes all features equally important -6. **Not validating results**: Always check with multiple metrics and domain knowledge -7. 
**PCA without standardization**: Components dominated by high-variance features +**UMAP (not in scikit-learn, but compatible)** +- Uniform Manifold Approximation and Projection +- Faster than t-SNE, preserves global structure better +- Install: `uv pip install umap-learn` +- Example: +```python +from umap import UMAP + +reducer = UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42) +X_embedded = reducer.fit_transform(X) +``` + +**Isomap** +- Isometric Mapping +- Preserves geodesic distances +- Key parameters: `n_components`, `n_neighbors` +- Use when: Non-linear manifolds +- Example: +```python +from sklearn.manifold import Isomap + +isomap = Isomap(n_components=2, n_neighbors=5) +X_embedded = isomap.fit_transform(X) +``` + +**Locally Linear Embedding (LLE)** +- Preserves local neighborhood structure +- Key parameters: `n_components`, `n_neighbors` +- Example: +```python +from sklearn.manifold import LocallyLinearEmbedding + +lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10) +X_embedded = lle.fit_transform(X) +``` + +**MDS (Multidimensional Scaling)** +- Preserves pairwise distances +- Key parameter: `n_components`, `metric` (True/False) +- Example: +```python +from sklearn.manifold import MDS + +mds = MDS(n_components=2, metric=True, random_state=42) +X_embedded = mds.fit_transform(X) +``` + +### Matrix Factorization + +**NMF (Non-negative Matrix Factorization)** +- Factorizes into non-negative matrices +- Key parameters: `n_components`, `init` ('nndsvd', 'random') +- Use when: Data is non-negative (images, text) +- Interpretable components +- Example: +```python +from sklearn.decomposition import NMF + +nmf = NMF(n_components=10, init='nndsvd', random_state=42) +W = nmf.fit_transform(X) # Document-topic matrix +H = nmf.components_ # Topic-word matrix +``` + +**TruncatedSVD** +- SVD for sparse matrices +- Similar to PCA but works with sparse data +- Use when: Text data, sparse matrices +- Example: +```python +from sklearn.decomposition import TruncatedSVD + +svd = TruncatedSVD(n_components=100, random_state=42) +X_reduced = svd.fit_transform(X_sparse) +print(f"Explained variance: {svd.explained_variance_ratio_.sum()}") +``` + +**FastICA** +- Independent Component Analysis +- Separates multivariate signal into independent components +- Key parameter: `n_components` +- Use when: Signal separation (e.g., audio, EEG) +- Example: +```python +from sklearn.decomposition import FastICA + +ica = FastICA(n_components=10, random_state=42) +S = ica.fit_transform(X) # Independent sources +A = ica.mixing_ # Mixing matrix +``` + +**LatentDirichletAllocation (LDA)** +- Topic modeling for text data +- Key parameters: `n_components` (number of topics), `learning_method` ('batch', 'online') +- Use when: Topic modeling, document clustering +- Example: +```python +from sklearn.decomposition import LatentDirichletAllocation + +lda = LatentDirichletAllocation(n_components=10, random_state=42) +doc_topics = lda.fit_transform(X_counts) # Document-topic distribution + +# Get top words for each topic +feature_names = vectorizer.get_feature_names_out() +for topic_idx, topic in enumerate(lda.components_): + top_words = [feature_names[i] for i in topic.argsort()[-10:]] + print(f"Topic {topic_idx}: {', '.join(top_words)}") +``` + +## Outlier and Novelty Detection + +### Outlier Detection + +**IsolationForest** +- Isolates anomalies using random trees +- Key parameters: + - `contamination`: Expected proportion of outliers + - `n_estimators`: Number of trees +- Use when: High-dimensional data, 
efficiency important +- Example: +```python +from sklearn.ensemble import IsolationForest + +model = IsolationForest(contamination=0.1, random_state=42) +predictions = model.fit_predict(X) # -1 for outliers, 1 for inliers +``` + +**LocalOutlierFactor** +- Measures local density deviation +- Key parameters: `n_neighbors`, `contamination` +- Use when: Varying density regions +- Example: +```python +from sklearn.neighbors import LocalOutlierFactor + +lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1) +predictions = lof.fit_predict(X) # -1 for outliers, 1 for inliers +outlier_scores = lof.negative_outlier_factor_ +``` + +**One-Class SVM** +- Learns decision boundary around normal data +- Key parameters: `nu` (upper bound on outliers), `kernel`, `gamma` +- Use when: Small training set of normal data +- Example: +```python +from sklearn.svm import OneClassSVM + +model = OneClassSVM(nu=0.1, kernel='rbf', gamma='auto') +model.fit(X_train) +predictions = model.predict(X_test) # -1 for outliers, 1 for inliers +``` + +**EllipticEnvelope** +- Assumes Gaussian distribution +- Key parameter: `contamination` +- Use when: Data is Gaussian-distributed +- Example: +```python +from sklearn.covariance import EllipticEnvelope + +model = EllipticEnvelope(contamination=0.1, random_state=42) +predictions = model.fit_predict(X) +``` + +## Gaussian Mixture Models + +**GaussianMixture** +- Probabilistic clustering with mixture of Gaussians +- Key parameters: + - `n_components`: Number of mixture components + - `covariance_type`: 'full', 'tied', 'diag', 'spherical' +- Use when: Soft clustering, need probability estimates +- Example: +```python +from sklearn.mixture import GaussianMixture + +gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=42) +gmm.fit(X) + +# Predict cluster labels +labels = gmm.predict(X) + +# Get probability of each cluster +probabilities = gmm.predict_proba(X) + +# Information criteria for model selection +print(f"BIC: {gmm.bic(X)}") # Lower is better +print(f"AIC: {gmm.aic(X)}") # Lower is better +``` + +## Choosing the Right Method + +### Clustering: +- **Know K, spherical clusters**: K-Means +- **Arbitrary shapes, noise**: DBSCAN, HDBSCAN +- **Hierarchical structure**: AgglomerativeClustering +- **Very large data**: MiniBatchKMeans, BIRCH +- **Probabilistic**: GaussianMixture + +### Dimensionality Reduction: +- **Linear, variance explanation**: PCA +- **Non-linear, visualization**: t-SNE, UMAP +- **Non-negative data**: NMF +- **Sparse data**: TruncatedSVD +- **Topic modeling**: LatentDirichletAllocation + +### Outlier Detection: +- **High-dimensional**: IsolationForest +- **Varying density**: LocalOutlierFactor +- **Gaussian data**: EllipticEnvelope diff --git a/scientific-packages/scikit-learn/scripts/classification_pipeline.py b/scientific-packages/scikit-learn/scripts/classification_pipeline.py index 749fd6d..c770355 100644 --- a/scientific-packages/scikit-learn/scripts/classification_pipeline.py +++ b/scientific-packages/scikit-learn/scripts/classification_pipeline.py @@ -1,219 +1,257 @@ -#!/usr/bin/env python3 """ -Complete classification pipeline with preprocessing, training, evaluation, and hyperparameter tuning. -Demonstrates best practices for scikit-learn workflows. +Complete classification pipeline example with preprocessing, model training, +hyperparameter tuning, and evaluation. 
""" import numpy as np import pandas as pd -from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV +from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.impute import SimpleImputer from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score -import joblib +from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + classification_report, confusion_matrix, roc_auc_score, + accuracy_score, precision_score, recall_score, f1_score +) +import warnings +warnings.filterwarnings('ignore') def create_preprocessing_pipeline(numeric_features, categorical_features): """ - Create preprocessing pipeline for mixed data types. + Create a preprocessing pipeline for mixed data types. - Args: - numeric_features: List of numeric column names - categorical_features: List of categorical column names + Parameters: + ----------- + numeric_features : list + List of numeric feature column names + categorical_features : list + List of categorical feature column names Returns: - ColumnTransformer with appropriate preprocessing for each data type + -------- + ColumnTransformer + Preprocessing pipeline """ + # Numeric preprocessing numeric_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler()) ]) + # Categorical preprocessing categorical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), - ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)) + ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) ]) + # Combine transformers preprocessor = ColumnTransformer( transformers=[ ('num', numeric_transformer, numeric_features), ('cat', categorical_transformer, categorical_features) - ]) + ] + ) return preprocessor -def create_full_pipeline(preprocessor, classifier=None): +def train_and_evaluate_model(X, y, numeric_features, categorical_features, + test_size=0.2, random_state=42): """ - Create complete ML pipeline with preprocessing and classification. + Complete pipeline: preprocess, train, tune, and evaluate a classifier. 
- Args: - preprocessor: Preprocessing ColumnTransformer - classifier: Classifier instance (default: RandomForestClassifier) + Parameters: + ----------- + X : DataFrame or array + Feature matrix + y : Series or array + Target variable + numeric_features : list + List of numeric feature names + categorical_features : list + List of categorical feature names + test_size : float + Proportion of data for testing + random_state : int + Random seed Returns: - Complete Pipeline + -------- + dict + Dictionary containing trained model, predictions, and metrics """ - if classifier is None: - classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1) + # Split data with stratification + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, stratify=y, random_state=random_state + ) - pipeline = Pipeline(steps=[ - ('preprocessor', preprocessor), - ('classifier', classifier) - ]) + print(f"Training set size: {len(X_train)}") + print(f"Test set size: {len(X_test)}") + print(f"Class distribution in training: {pd.Series(y_train).value_counts().to_dict()}") - return pipeline + # Create preprocessor + preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features) - -def evaluate_model(pipeline, X_train, y_train, X_test, y_test, cv=5): - """ - Evaluate model using cross-validation and test set. - - Args: - pipeline: Trained pipeline - X_train, y_train: Training data - X_test, y_test: Test data - cv: Number of cross-validation folds - - Returns: - Dictionary with evaluation results - """ - # Cross-validation on training set - cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy') - - # Test set evaluation - y_pred = pipeline.predict(X_test) - test_score = pipeline.score(X_test, y_test) - - # Get probabilities if available - try: - y_proba = pipeline.predict_proba(X_test) - if len(np.unique(y_test)) == 2: - # Binary classification - auc = roc_auc_score(y_test, y_proba[:, 1]) - else: - # Multiclass - auc = roc_auc_score(y_test, y_proba, multi_class='ovr') - except: - auc = None - - results = { - 'cv_mean': cv_scores.mean(), - 'cv_std': cv_scores.std(), - 'test_score': test_score, - 'auc': auc, - 'classification_report': classification_report(y_test, y_pred), - 'confusion_matrix': confusion_matrix(y_test, y_pred) + # Define models to compare + models = { + 'Logistic Regression': Pipeline([ + ('preprocessor', preprocessor), + ('classifier', LogisticRegression(max_iter=1000, random_state=random_state)) + ]), + 'Random Forest': Pipeline([ + ('preprocessor', preprocessor), + ('classifier', RandomForestClassifier(n_estimators=100, random_state=random_state)) + ]), + 'Gradient Boosting': Pipeline([ + ('preprocessor', preprocessor), + ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=random_state)) + ]) } - return results + # Compare models using cross-validation + print("\n" + "="*60) + print("Model Comparison (5-Fold Cross-Validation)") + print("="*60) + cv_results = {} + for name, model in models.items(): + scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy') + cv_results[name] = scores.mean() + print(f"{name:20s}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})") -def tune_hyperparameters(pipeline, X_train, y_train, param_grid, cv=5): - """ - Perform hyperparameter tuning using GridSearchCV. 
+ # Select best model based on CV + best_model_name = max(cv_results, key=cv_results.get) + best_model = models[best_model_name] - Args: - pipeline: Pipeline to tune - X_train, y_train: Training data - param_grid: Dictionary of parameters to search - cv: Number of cross-validation folds + print(f"\nBest model: {best_model_name}") + + # Hyperparameter tuning for best model + if best_model_name == 'Random Forest': + param_grid = { + 'classifier__n_estimators': [100, 200], + 'classifier__max_depth': [10, 20, None], + 'classifier__min_samples_split': [2, 5] + } + elif best_model_name == 'Gradient Boosting': + param_grid = { + 'classifier__n_estimators': [100, 200], + 'classifier__learning_rate': [0.01, 0.1], + 'classifier__max_depth': [3, 5] + } + else: # Logistic Regression + param_grid = { + 'classifier__C': [0.1, 1.0, 10.0], + 'classifier__penalty': ['l2'] + } + + print("\n" + "="*60) + print("Hyperparameter Tuning") + print("="*60) - Returns: - GridSearchCV object with best model - """ grid_search = GridSearchCV( - pipeline, - param_grid, - cv=cv, - scoring='f1_weighted', - n_jobs=-1, - verbose=1 + best_model, param_grid, cv=5, scoring='accuracy', + n_jobs=-1, verbose=0 ) grid_search.fit(X_train, y_train) print(f"Best parameters: {grid_search.best_params_}") - print(f"Best CV score: {grid_search.best_score_:.3f}") + print(f"Best CV score: {grid_search.best_score_:.4f}") - return grid_search + # Evaluate on test set + tuned_model = grid_search.best_estimator_ + y_pred = tuned_model.predict(X_test) + y_pred_proba = tuned_model.predict_proba(X_test) + print("\n" + "="*60) + print("Test Set Evaluation") + print("="*60) -def main(): - """ - Example usage of the classification pipeline. - """ - # Load your data here - # X, y = load_data() + # Calculate metrics + accuracy = accuracy_score(y_test, y_pred) + precision = precision_score(y_test, y_pred, average='weighted') + recall = recall_score(y_test, y_pred, average='weighted') + f1 = f1_score(y_test, y_pred, average='weighted') - # Example with synthetic data - from sklearn.datasets import make_classification - X, y = make_classification( - n_samples=1000, - n_features=20, - n_informative=15, - n_redundant=5, - random_state=42 - ) + print(f"Accuracy: {accuracy:.4f}") + print(f"Precision: {precision:.4f}") + print(f"Recall: {recall:.4f}") + print(f"F1-Score: {f1:.4f}") - # Convert to DataFrame for demonstration - feature_names = [f'feature_{i}' for i in range(X.shape[1])] - X = pd.DataFrame(X, columns=feature_names) + # ROC AUC (if binary classification) + if len(np.unique(y)) == 2: + roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1]) + print(f"ROC AUC: {roc_auc:.4f}") - # Split features into numeric and categorical (all numeric in this example) - numeric_features = feature_names - categorical_features = [] - - # Split data (use stratify for imbalanced classes) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42, stratify=y - ) - - # Create preprocessing pipeline - preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features) - - # Create full pipeline - pipeline = create_full_pipeline(preprocessor) - - # Train model - print("Training model...") - pipeline.fit(X_train, y_train) - - # Evaluate model - print("\nEvaluating model...") - results = evaluate_model(pipeline, X_train, y_train, X_test, y_test) - - print(f"CV Accuracy: {results['cv_mean']:.3f} (+/- {results['cv_std']:.3f})") - print(f"Test Accuracy: {results['test_score']:.3f}") - if results['auc']: - print(f"ROC-AUC: 
{results['auc']:.3f}") - print("\nClassification Report:") - print(results['classification_report']) - - # Hyperparameter tuning (optional) - print("\nTuning hyperparameters...") - param_grid = { - 'classifier__n_estimators': [100, 200], - 'classifier__max_depth': [10, 20, None], - 'classifier__min_samples_split': [2, 5] - } - - grid_search = tune_hyperparameters(pipeline, X_train, y_train, param_grid) - - # Evaluate best model - print("\nEvaluating tuned model...") - best_pipeline = grid_search.best_estimator_ - y_pred = best_pipeline.predict(X_test) + print("\n" + "="*60) + print("Classification Report") + print("="*60) print(classification_report(y_test, y_pred)) - # Save model - print("\nSaving model...") - joblib.dump(best_pipeline, 'best_model.pkl') - print("Model saved as 'best_model.pkl'") + print("\n" + "="*60) + print("Confusion Matrix") + print("="*60) + print(confusion_matrix(y_test, y_pred)) + + # Feature importance (if available) + if hasattr(tuned_model.named_steps['classifier'], 'feature_importances_'): + print("\n" + "="*60) + print("Top 10 Most Important Features") + print("="*60) + + feature_names = tuned_model.named_steps['preprocessor'].get_feature_names_out() + importances = tuned_model.named_steps['classifier'].feature_importances_ + + feature_importance_df = pd.DataFrame({ + 'feature': feature_names, + 'importance': importances + }).sort_values('importance', ascending=False).head(10) + + print(feature_importance_df.to_string(index=False)) + + return { + 'model': tuned_model, + 'y_test': y_test, + 'y_pred': y_pred, + 'y_pred_proba': y_pred_proba, + 'metrics': { + 'accuracy': accuracy, + 'precision': precision, + 'recall': recall, + 'f1': f1 + } + } +# Example usage if __name__ == "__main__": - main() + # Load example dataset + from sklearn.datasets import load_breast_cancer + + # Load data + data = load_breast_cancer() + X = pd.DataFrame(data.data, columns=data.feature_names) + y = data.target + + # For demonstration, treat all features as numeric + numeric_features = X.columns.tolist() + categorical_features = [] + + print("="*60) + print("Classification Pipeline Example") + print("Dataset: Breast Cancer Wisconsin") + print("="*60) + + # Run complete pipeline + results = train_and_evaluate_model( + X, y, numeric_features, categorical_features, + test_size=0.2, random_state=42 + ) + + print("\n" + "="*60) + print("Pipeline Complete!") + print("="*60) diff --git a/scientific-packages/scikit-learn/scripts/clustering_analysis.py b/scientific-packages/scikit-learn/scripts/clustering_analysis.py index c8625f8..d4dbc31 100644 --- a/scientific-packages/scikit-learn/scripts/clustering_analysis.py +++ b/scientific-packages/scikit-learn/scripts/clustering_analysis.py @@ -1,291 +1,386 @@ -#!/usr/bin/env python3 """ -Clustering analysis script with multiple algorithms and evaluation. -Demonstrates k-means, DBSCAN, and hierarchical clustering with visualization. +Clustering analysis example with multiple algorithms, evaluation, and visualization. 
""" import numpy as np import pandas as pd -from sklearn.preprocessing import StandardScaler -from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering -from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score -from sklearn.decomposition import PCA import matplotlib.pyplot as plt -import seaborn as sns +from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA +from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering +from sklearn.mixture import GaussianMixture +from sklearn.metrics import ( + silhouette_score, calinski_harabasz_score, davies_bouldin_score +) +import warnings +warnings.filterwarnings('ignore') -def scale_data(X): +def preprocess_for_clustering(X, scale=True, pca_components=None): """ - Scale features using StandardScaler. - ALWAYS scale data before clustering! + Preprocess data for clustering. - Args: - X: Feature matrix + Parameters: + ----------- + X : array-like + Feature matrix + scale : bool + Whether to standardize features + pca_components : int or None + Number of PCA components (None to skip PCA) Returns: - Scaled feature matrix and fitted scaler + -------- + array + Preprocessed data """ - scaler = StandardScaler() - X_scaled = scaler.fit_transform(X) - return X_scaled, scaler + X_processed = X.copy() + + if scale: + scaler = StandardScaler() + X_processed = scaler.fit_transform(X_processed) + + if pca_components is not None: + pca = PCA(n_components=pca_components) + X_processed = pca.fit_transform(X_processed) + print(f"PCA: Explained variance ratio = {pca.explained_variance_ratio_.sum():.3f}") + + return X_processed -def find_optimal_k(X_scaled, k_range=range(2, 11)): +def find_optimal_k_kmeans(X, k_range=range(2, 11)): """ - Find optimal number of clusters using elbow method and silhouette score. + Find optimal K for K-Means using elbow method and silhouette score. 
- Args: - X_scaled: Scaled feature matrix - k_range: Range of k values to try + Parameters: + ----------- + X : array-like + Feature matrix (should be scaled) + k_range : range + Range of K values to test Returns: - Dictionary with inertias and silhouette scores + -------- + dict + Dictionary with inertia and silhouette scores for each K """ inertias = [] silhouette_scores = [] for k in k_range: kmeans = KMeans(n_clusters=k, random_state=42, n_init=10) - labels = kmeans.fit_predict(X_scaled) + labels = kmeans.fit_predict(X) + inertias.append(kmeans.inertia_) - silhouette_scores.append(silhouette_score(X_scaled, labels)) + silhouette_scores.append(silhouette_score(X, labels)) + + # Plot results + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) + + # Elbow plot + ax1.plot(k_range, inertias, 'bo-') + ax1.set_xlabel('Number of clusters (K)') + ax1.set_ylabel('Inertia') + ax1.set_title('Elbow Method') + ax1.grid(True) + + # Silhouette plot + ax2.plot(k_range, silhouette_scores, 'ro-') + ax2.set_xlabel('Number of clusters (K)') + ax2.set_ylabel('Silhouette Score') + ax2.set_title('Silhouette Analysis') + ax2.grid(True) + + plt.tight_layout() + plt.savefig('clustering_optimization.png', dpi=300, bbox_inches='tight') + print("Saved: clustering_optimization.png") + plt.close() + + # Find best K based on silhouette score + best_k = k_range[np.argmax(silhouette_scores)] + print(f"\nRecommended K based on silhouette score: {best_k}") return { 'k_values': list(k_range), 'inertias': inertias, - 'silhouette_scores': silhouette_scores + 'silhouette_scores': silhouette_scores, + 'best_k': best_k } -def plot_elbow_silhouette(results): +def compare_clustering_algorithms(X, n_clusters=3): """ - Plot elbow method and silhouette scores. + Compare different clustering algorithms. - Args: - results: Dictionary from find_optimal_k - """ - fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5)) - - # Elbow plot - ax1.plot(results['k_values'], results['inertias'], 'bo-') - ax1.set_xlabel('Number of clusters (k)') - ax1.set_ylabel('Inertia') - ax1.set_title('Elbow Method') - ax1.grid(True, alpha=0.3) - - # Silhouette plot - ax2.plot(results['k_values'], results['silhouette_scores'], 'ro-') - ax2.set_xlabel('Number of clusters (k)') - ax2.set_ylabel('Silhouette Score') - ax2.set_title('Silhouette Score vs k') - ax2.grid(True, alpha=0.3) - - plt.tight_layout() - plt.savefig('elbow_silhouette.png', dpi=300, bbox_inches='tight') - print("Saved elbow and silhouette plots to 'elbow_silhouette.png'") - plt.close() - - -def evaluate_clustering(X_scaled, labels, algorithm_name): - """ - Evaluate clustering using multiple metrics. 
- - Args: - X_scaled: Scaled feature matrix - labels: Cluster labels - algorithm_name: Name of clustering algorithm + Parameters: + ----------- + X : array-like + Feature matrix (should be scaled) + n_clusters : int + Number of clusters Returns: - Dictionary with evaluation metrics + -------- + dict + Dictionary with results for each algorithm """ - # Filter out noise points for DBSCAN (-1 labels) - mask = labels != -1 - X_filtered = X_scaled[mask] - labels_filtered = labels[mask] + print("="*60) + print(f"Comparing Clustering Algorithms (n_clusters={n_clusters})") + print("="*60) - n_clusters = len(set(labels_filtered)) - n_noise = list(labels).count(-1) - - results = { - 'algorithm': algorithm_name, - 'n_clusters': n_clusters, - 'n_noise': n_noise + algorithms = { + 'K-Means': KMeans(n_clusters=n_clusters, random_state=42, n_init=10), + 'Agglomerative': AgglomerativeClustering(n_clusters=n_clusters, linkage='ward'), + 'Gaussian Mixture': GaussianMixture(n_components=n_clusters, random_state=42) } - # Calculate metrics if we have valid clusters - if n_clusters > 1: - results['silhouette'] = silhouette_score(X_filtered, labels_filtered) - results['davies_bouldin'] = davies_bouldin_score(X_filtered, labels_filtered) - results['calinski_harabasz'] = calinski_harabasz_score(X_filtered, labels_filtered) + # DBSCAN doesn't require n_clusters + # We'll add it separately + dbscan = DBSCAN(eps=0.5, min_samples=5) + dbscan_labels = dbscan.fit_predict(X) + + results = {} + + for name, algorithm in algorithms.items(): + labels = algorithm.fit_predict(X) + + # Calculate metrics + silhouette = silhouette_score(X, labels) + calinski = calinski_harabasz_score(X, labels) + davies = davies_bouldin_score(X, labels) + + results[name] = { + 'labels': labels, + 'n_clusters': n_clusters, + 'silhouette': silhouette, + 'calinski_harabasz': calinski, + 'davies_bouldin': davies + } + + print(f"\n{name}:") + print(f" Silhouette Score: {silhouette:.4f} (higher is better)") + print(f" Calinski-Harabasz: {calinski:.4f} (higher is better)") + print(f" Davies-Bouldin: {davies:.4f} (lower is better)") + + # DBSCAN results + n_clusters_dbscan = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0) + n_noise = list(dbscan_labels).count(-1) + + if n_clusters_dbscan > 1: + # Only calculate metrics if we have multiple clusters + mask = dbscan_labels != -1 # Exclude noise + if mask.sum() > 0: + silhouette = silhouette_score(X[mask], dbscan_labels[mask]) + calinski = calinski_harabasz_score(X[mask], dbscan_labels[mask]) + davies = davies_bouldin_score(X[mask], dbscan_labels[mask]) + + results['DBSCAN'] = { + 'labels': dbscan_labels, + 'n_clusters': n_clusters_dbscan, + 'n_noise': n_noise, + 'silhouette': silhouette, + 'calinski_harabasz': calinski, + 'davies_bouldin': davies + } + + print(f"\nDBSCAN:") + print(f" Clusters found: {n_clusters_dbscan}") + print(f" Noise points: {n_noise}") + print(f" Silhouette Score: {silhouette:.4f} (higher is better)") + print(f" Calinski-Harabasz: {calinski:.4f} (higher is better)") + print(f" Davies-Bouldin: {davies:.4f} (lower is better)") else: - results['silhouette'] = None - results['davies_bouldin'] = None - results['calinski_harabasz'] = None + print(f"\nDBSCAN:") + print(f" Clusters found: {n_clusters_dbscan}") + print(f" Noise points: {n_noise}") + print(" Note: Insufficient clusters for metric calculation") return results -def perform_kmeans(X_scaled, n_clusters=3): +def visualize_clusters(X, results, true_labels=None): """ - Perform k-means clustering. 
+ Visualize clustering results using PCA for 2D projection. - Args: - X_scaled: Scaled feature matrix - n_clusters: Number of clusters - - Returns: - Fitted KMeans model and labels + Parameters: + ----------- + X : array-like + Feature matrix + results : dict + Dictionary with clustering results + true_labels : array-like or None + True labels (if available) for comparison """ - kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) - labels = kmeans.fit_predict(X_scaled) - return kmeans, labels + # Reduce to 2D using PCA + pca = PCA(n_components=2) + X_2d = pca.fit_transform(X) + # Determine number of subplots + n_plots = len(results) + if true_labels is not None: + n_plots += 1 -def perform_dbscan(X_scaled, eps=0.5, min_samples=5): - """ - Perform DBSCAN clustering. + n_cols = min(3, n_plots) + n_rows = (n_plots + n_cols - 1) // n_cols - Args: - X_scaled: Scaled feature matrix - eps: Maximum distance between neighbors - min_samples: Minimum points to form dense region + fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows)) + if n_plots == 1: + axes = np.array([axes]) + axes = axes.flatten() - Returns: - Fitted DBSCAN model and labels - """ - dbscan = DBSCAN(eps=eps, min_samples=min_samples) - labels = dbscan.fit_predict(X_scaled) - return dbscan, labels + plot_idx = 0 + # Plot true labels if available + if true_labels is not None: + ax = axes[plot_idx] + scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=true_labels, cmap='viridis', alpha=0.6) + ax.set_title('True Labels') + ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})') + ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})') + plt.colorbar(scatter, ax=ax) + plot_idx += 1 -def perform_hierarchical(X_scaled, n_clusters=3, linkage='ward'): - """ - Perform hierarchical clustering. + # Plot clustering results + for name, result in results.items(): + ax = axes[plot_idx] + labels = result['labels'] - Args: - X_scaled: Scaled feature matrix - n_clusters: Number of clusters - linkage: Linkage criterion ('ward', 'complete', 'average', 'single') + scatter = ax.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='viridis', alpha=0.6) - Returns: - Fitted AgglomerativeClustering model and labels - """ - hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage) - labels = hierarchical.fit_predict(X_scaled) - return hierarchical, labels + # Highlight noise points for DBSCAN + if name == 'DBSCAN' and -1 in labels: + noise_mask = labels == -1 + ax.scatter(X_2d[noise_mask, 0], X_2d[noise_mask, 1], + c='red', marker='x', s=100, label='Noise', alpha=0.8) + ax.legend() + title = f"{name} (K={result['n_clusters']})" + if 'silhouette' in result: + title += f"\nSilhouette: {result['silhouette']:.3f}" + ax.set_title(title) + ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})') + ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})') + plt.colorbar(scatter, ax=ax) -def visualize_clusters_2d(X_scaled, labels, algorithm_name, method='pca'): - """ - Visualize clusters in 2D using PCA or t-SNE. 
+ plot_idx += 1 - Args: - X_scaled: Scaled feature matrix - labels: Cluster labels - algorithm_name: Name of algorithm for title - method: 'pca' or 'tsne' - """ - # Reduce to 2D - if method == 'pca': - pca = PCA(n_components=2, random_state=42) - X_2d = pca.fit_transform(X_scaled) - variance = pca.explained_variance_ratio_ - xlabel = f'PC1 ({variance[0]:.1%} variance)' - ylabel = f'PC2 ({variance[1]:.1%} variance)' - else: - from sklearn.manifold import TSNE - # Use PCA first to speed up t-SNE - pca = PCA(n_components=min(50, X_scaled.shape[1]), random_state=42) - X_pca = pca.fit_transform(X_scaled) - tsne = TSNE(n_components=2, random_state=42, perplexity=30) - X_2d = tsne.fit_transform(X_pca) - xlabel = 't-SNE 1' - ylabel = 't-SNE 2' + # Hide unused subplots + for idx in range(plot_idx, len(axes)): + axes[idx].axis('off') - # Plot - plt.figure(figsize=(10, 8)) - scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='viridis', alpha=0.6, s=50) - plt.colorbar(scatter, label='Cluster') - plt.xlabel(xlabel) - plt.ylabel(ylabel) - plt.title(f'{algorithm_name} Clustering ({method.upper()})') - plt.grid(True, alpha=0.3) - - filename = f'{algorithm_name.lower().replace(" ", "_")}_{method}.png' - plt.savefig(filename, dpi=300, bbox_inches='tight') - print(f"Saved visualization to '{filename}'") + plt.tight_layout() + plt.savefig('clustering_results.png', dpi=300, bbox_inches='tight') + print("\nSaved: clustering_results.png") plt.close() -def main(): +def complete_clustering_analysis(X, true_labels=None, scale=True, + find_k=True, k_range=range(2, 11), n_clusters=3): """ - Example clustering analysis workflow. + Complete clustering analysis workflow. + + Parameters: + ----------- + X : array-like + Feature matrix + true_labels : array-like or None + True labels (for comparison only, not used in clustering) + scale : bool + Whether to scale features + find_k : bool + Whether to search for optimal K + k_range : range + Range of K values to test + n_clusters : int + Number of clusters to use in comparison + + Returns: + -------- + dict + Dictionary with all analysis results """ - # Load your data here - # X = load_data() + print("="*60) + print("Clustering Analysis") + print("="*60) + print(f"Data shape: {X.shape}") - # Example with synthetic data - from sklearn.datasets import make_blobs - X, y_true = make_blobs( - n_samples=500, - n_features=10, - centers=4, - cluster_std=1.0, - random_state=42 - ) + # Preprocess data + X_processed = preprocess_for_clustering(X, scale=scale) - print(f"Dataset shape: {X.shape}") + # Find optimal K if requested + optimization_results = None + if find_k: + print("\n" + "="*60) + print("Finding Optimal Number of Clusters") + print("="*60) + optimization_results = find_optimal_k_kmeans(X_processed, k_range=k_range) - # Scale data (ALWAYS scale for clustering!) 
- print("\nScaling data...") - X_scaled, scaler = scale_data(X) + # Use recommended K + if optimization_results: + n_clusters = optimization_results['best_k'] - # Find optimal k - print("\nFinding optimal number of clusters...") - results = find_optimal_k(X_scaled) - plot_elbow_silhouette(results) + # Compare clustering algorithms + comparison_results = compare_clustering_algorithms(X_processed, n_clusters=n_clusters) - # Based on elbow/silhouette, choose optimal k - optimal_k = 4 # Adjust based on plots - - # Perform k-means - print(f"\nPerforming k-means with k={optimal_k}...") - kmeans, kmeans_labels = perform_kmeans(X_scaled, n_clusters=optimal_k) - kmeans_results = evaluate_clustering(X_scaled, kmeans_labels, 'K-Means') - - # Perform DBSCAN - print("\nPerforming DBSCAN...") - dbscan, dbscan_labels = perform_dbscan(X_scaled, eps=0.5, min_samples=5) - dbscan_results = evaluate_clustering(X_scaled, dbscan_labels, 'DBSCAN') - - # Perform hierarchical clustering - print("\nPerforming hierarchical clustering...") - hierarchical, hier_labels = perform_hierarchical(X_scaled, n_clusters=optimal_k) - hier_results = evaluate_clustering(X_scaled, hier_labels, 'Hierarchical') - - # Print results + # Visualize results print("\n" + "="*60) - print("CLUSTERING RESULTS") + print("Visualizing Results") + print("="*60) + visualize_clusters(X_processed, comparison_results, true_labels=true_labels) + + return { + 'X_processed': X_processed, + 'optimization': optimization_results, + 'comparison': comparison_results + } + + +# Example usage +if __name__ == "__main__": + from sklearn.datasets import load_iris, make_blobs + + print("="*60) + print("Example 1: Iris Dataset") print("="*60) - for results in [kmeans_results, dbscan_results, hier_results]: - print(f"\n{results['algorithm']}:") - print(f" Clusters: {results['n_clusters']}") - if results['n_noise'] > 0: - print(f" Noise points: {results['n_noise']}") - if results['silhouette']: - print(f" Silhouette Score: {results['silhouette']:.3f}") - print(f" Davies-Bouldin Index: {results['davies_bouldin']:.3f} (lower is better)") - print(f" Calinski-Harabasz Index: {results['calinski_harabasz']:.1f} (higher is better)") + # Load Iris dataset + iris = load_iris() + X_iris = iris.data + y_iris = iris.target - # Visualize clusters - print("\nCreating visualizations...") - visualize_clusters_2d(X_scaled, kmeans_labels, 'K-Means', method='pca') - visualize_clusters_2d(X_scaled, dbscan_labels, 'DBSCAN', method='pca') - visualize_clusters_2d(X_scaled, hier_labels, 'Hierarchical', method='pca') + results_iris = complete_clustering_analysis( + X_iris, + true_labels=y_iris, + scale=True, + find_k=True, + k_range=range(2, 8), + n_clusters=3 + ) - print("\nClustering analysis complete!") + print("\n" + "="*60) + print("Example 2: Synthetic Dataset with Noise") + print("="*60) + # Create synthetic dataset + X_synth, y_synth = make_blobs( + n_samples=500, n_features=2, centers=4, + cluster_std=0.5, random_state=42 + ) -if __name__ == "__main__": - main() + # Add noise points + noise = np.random.randn(50, 2) * 3 + X_synth = np.vstack([X_synth, noise]) + y_synth_with_noise = np.concatenate([y_synth, np.full(50, -1)]) + + results_synth = complete_clustering_analysis( + X_synth, + true_labels=y_synth_with_noise, + scale=True, + find_k=True, + k_range=range(2, 8), + n_clusters=4 + ) + + print("\n" + "="*60) + print("Analysis Complete!") + print("="*60)