Add more scientific skills

2026-03-28 07:33:45 +08:00 · 2025-10-19 14:12:02 -07:00
parent 78d5ac2b56
commit 660c8574d0
210 changed files with 88957 additions and 1 deletions
--- a/scientific-packages/scikit-learn/scripts/classification_pipeline.py
+++ b/scientific-packages/scikit-learn/scripts/classification_pipeline.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+"""
+Complete classification pipeline with preprocessing, training, evaluation, and hyperparameter tuning.
+Demonstrates best practices for scikit-learn workflows.
+"""
+
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.impute import SimpleImputer
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
+import joblib
+
+
+def create_preprocessing_pipeline(numeric_features, categorical_features):
+    """
+    Create preprocessing pipeline for mixed data types.
+
+    Args:
+        numeric_features: List of numeric column names
+        categorical_features: List of categorical column names
+
+    Returns:
+        ColumnTransformer with appropriate preprocessing for each data type
+    """
+    numeric_transformer = Pipeline(steps=[
+        ('imputer', SimpleImputer(strategy='median')),
+        ('scaler', StandardScaler())
+    ])
+
+    categorical_transformer = Pipeline(steps=[
+        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
+        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
+    ])
+
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ('num', numeric_transformer, numeric_features),
+            ('cat', categorical_transformer, categorical_features)
+        ])
+
+    return preprocessor
+
+
+def create_full_pipeline(preprocessor, classifier=None):
+    """
+    Create complete ML pipeline with preprocessing and classification.
+
+    Args:
+        preprocessor: Preprocessing ColumnTransformer
+        classifier: Classifier instance (default: RandomForestClassifier)
+
+    Returns:
+        Complete Pipeline
+    """
+    if classifier is None:
+        classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
+
+    pipeline = Pipeline(steps=[
+        ('preprocessor', preprocessor),
+        ('classifier', classifier)
+    ])
+
+    return pipeline
+
+
+def evaluate_model(pipeline, X_train, y_train, X_test, y_test, cv=5):
+    """
+    Evaluate model using cross-validation and test set.
+
+    Args:
+        pipeline: Trained pipeline
+        X_train, y_train: Training data
+        X_test, y_test: Test data
+        cv: Number of cross-validation folds
+
+    Returns:
+        Dictionary with evaluation results
+    """
+    # Cross-validation on training set
+    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')
+
+    # Test set evaluation
+    y_pred = pipeline.predict(X_test)
+    test_score = pipeline.score(X_test, y_test)
+
+    # Get probabilities if available
+    try:
+        y_proba = pipeline.predict_proba(X_test)
+        if len(np.unique(y_test)) == 2:
+            # Binary classification
+            auc = roc_auc_score(y_test, y_proba[:, 1])
+        else:
+            # Multiclass
+            auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
+    except:
+        auc = None
+
+    results = {
+        'cv_mean': cv_scores.mean(),
+        'cv_std': cv_scores.std(),
+        'test_score': test_score,
+        'auc': auc,
+        'classification_report': classification_report(y_test, y_pred),
+        'confusion_matrix': confusion_matrix(y_test, y_pred)
+    }
+
+    return results
+
+
+def tune_hyperparameters(pipeline, X_train, y_train, param_grid, cv=5):
+    """
+    Perform hyperparameter tuning using GridSearchCV.
+
+    Args:
+        pipeline: Pipeline to tune
+        X_train, y_train: Training data
+        param_grid: Dictionary of parameters to search
+        cv: Number of cross-validation folds
+
+    Returns:
+        GridSearchCV object with best model
+    """
+    grid_search = GridSearchCV(
+        pipeline,
+        param_grid,
+        cv=cv,
+        scoring='f1_weighted',
+        n_jobs=-1,
+        verbose=1
+    )
+
+    grid_search.fit(X_train, y_train)
+
+    print(f"Best parameters: {grid_search.best_params_}")
+    print(f"Best CV score: {grid_search.best_score_:.3f}")
+
+    return grid_search
+
+
+def main():
+    """
+    Example usage of the classification pipeline.
+    """
+    # Load your data here
+    # X, y = load_data()
+
+    # Example with synthetic data
+    from sklearn.datasets import make_classification
+    X, y = make_classification(
+        n_samples=1000,
+        n_features=20,
+        n_informative=15,
+        n_redundant=5,
+        random_state=42
+    )
+
+    # Convert to DataFrame for demonstration
+    feature_names = [f'feature_{i}' for i in range(X.shape[1])]
+    X = pd.DataFrame(X, columns=feature_names)
+
+    # Split features into numeric and categorical (all numeric in this example)
+    numeric_features = feature_names
+    categorical_features = []
+
+    # Split data (use stratify for imbalanced classes)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42, stratify=y
+    )
+
+    # Create preprocessing pipeline
+    preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features)
+
+    # Create full pipeline
+    pipeline = create_full_pipeline(preprocessor)
+
+    # Train model
+    print("Training model...")
+    pipeline.fit(X_train, y_train)
+
+    # Evaluate model
+    print("\nEvaluating model...")
+    results = evaluate_model(pipeline, X_train, y_train, X_test, y_test)
+
+    print(f"CV Accuracy: {results['cv_mean']:.3f} (+/- {results['cv_std']:.3f})")
+    print(f"Test Accuracy: {results['test_score']:.3f}")
+    if results['auc']:
+        print(f"ROC-AUC: {results['auc']:.3f}")
+    print("\nClassification Report:")
+    print(results['classification_report'])
+
+    # Hyperparameter tuning (optional)
+    print("\nTuning hyperparameters...")
+    param_grid = {
+        'classifier__n_estimators': [100, 200],
+        'classifier__max_depth': [10, 20, None],
+        'classifier__min_samples_split': [2, 5]
+    }
+
+    grid_search = tune_hyperparameters(pipeline, X_train, y_train, param_grid)
+
+    # Evaluate best model
+    print("\nEvaluating tuned model...")
+    best_pipeline = grid_search.best_estimator_
+    y_pred = best_pipeline.predict(X_test)
+    print(classification_report(y_test, y_pred))
+
+    # Save model
+    print("\nSaving model...")
+    joblib.dump(best_pipeline, 'best_model.pkl')
+    print("Model saved as 'best_model.pkl'")
+
+
+if __name__ == "__main__":
+    main()
--- a/scientific-packages/scikit-learn/scripts/clustering_analysis.py
+++ b/scientific-packages/scikit-learn/scripts/clustering_analysis.py
@@ -0,0 +1,291 @@
+#!/usr/bin/env python3
+"""
+Clustering analysis script with multiple algorithms and evaluation.
+Demonstrates k-means, DBSCAN, and hierarchical clustering with visualization.
+"""
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
+from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
+from sklearn.decomposition import PCA
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+def scale_data(X):
+    """
+    Scale features using StandardScaler.
+    ALWAYS scale data before clustering!
+
+    Args:
+        X: Feature matrix
+
+    Returns:
+        Scaled feature matrix and fitted scaler
+    """
+    scaler = StandardScaler()
+    X_scaled = scaler.fit_transform(X)
+    return X_scaled, scaler
+
+
+def find_optimal_k(X_scaled, k_range=range(2, 11)):
+    """
+    Find optimal number of clusters using elbow method and silhouette score.
+
+    Args:
+        X_scaled: Scaled feature matrix
+        k_range: Range of k values to try
+
+    Returns:
+        Dictionary with inertias and silhouette scores
+    """
+    inertias = []
+    silhouette_scores = []
+
+    for k in k_range:
+        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
+        labels = kmeans.fit_predict(X_scaled)
+        inertias.append(kmeans.inertia_)
+        silhouette_scores.append(silhouette_score(X_scaled, labels))
+
+    return {
+        'k_values': list(k_range),
+        'inertias': inertias,
+        'silhouette_scores': silhouette_scores
+    }
+
+
+def plot_elbow_silhouette(results):
+    """
+    Plot elbow method and silhouette scores.
+
+    Args:
+        results: Dictionary from find_optimal_k
+    """
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
+
+    # Elbow plot
+    ax1.plot(results['k_values'], results['inertias'], 'bo-')
+    ax1.set_xlabel('Number of clusters (k)')
+    ax1.set_ylabel('Inertia')
+    ax1.set_title('Elbow Method')
+    ax1.grid(True, alpha=0.3)
+
+    # Silhouette plot
+    ax2.plot(results['k_values'], results['silhouette_scores'], 'ro-')
+    ax2.set_xlabel('Number of clusters (k)')
+    ax2.set_ylabel('Silhouette Score')
+    ax2.set_title('Silhouette Score vs k')
+    ax2.grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    plt.savefig('elbow_silhouette.png', dpi=300, bbox_inches='tight')
+    print("Saved elbow and silhouette plots to 'elbow_silhouette.png'")
+    plt.close()
+
+
+def evaluate_clustering(X_scaled, labels, algorithm_name):
+    """
+    Evaluate clustering using multiple metrics.
+
+    Args:
+        X_scaled: Scaled feature matrix
+        labels: Cluster labels
+        algorithm_name: Name of clustering algorithm
+
+    Returns:
+        Dictionary with evaluation metrics
+    """
+    # Filter out noise points for DBSCAN (-1 labels)
+    mask = labels != -1
+    X_filtered = X_scaled[mask]
+    labels_filtered = labels[mask]
+
+    n_clusters = len(set(labels_filtered))
+    n_noise = list(labels).count(-1)
+
+    results = {
+        'algorithm': algorithm_name,
+        'n_clusters': n_clusters,
+        'n_noise': n_noise
+    }
+
+    # Calculate metrics if we have valid clusters
+    if n_clusters > 1:
+        results['silhouette'] = silhouette_score(X_filtered, labels_filtered)
+        results['davies_bouldin'] = davies_bouldin_score(X_filtered, labels_filtered)
+        results['calinski_harabasz'] = calinski_harabasz_score(X_filtered, labels_filtered)
+    else:
+        results['silhouette'] = None
+        results['davies_bouldin'] = None
+        results['calinski_harabasz'] = None
+
+    return results
+
+
+def perform_kmeans(X_scaled, n_clusters=3):
+    """
+    Perform k-means clustering.
+
+    Args:
+        X_scaled: Scaled feature matrix
+        n_clusters: Number of clusters
+
+    Returns:
+        Fitted KMeans model and labels
+    """
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
+    labels = kmeans.fit_predict(X_scaled)
+    return kmeans, labels
+
+
+def perform_dbscan(X_scaled, eps=0.5, min_samples=5):
+    """
+    Perform DBSCAN clustering.
+
+    Args:
+        X_scaled: Scaled feature matrix
+        eps: Maximum distance between neighbors
+        min_samples: Minimum points to form dense region
+
+    Returns:
+        Fitted DBSCAN model and labels
+    """
+    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
+    labels = dbscan.fit_predict(X_scaled)
+    return dbscan, labels
+
+
+def perform_hierarchical(X_scaled, n_clusters=3, linkage='ward'):
+    """
+    Perform hierarchical clustering.
+
+    Args:
+        X_scaled: Scaled feature matrix
+        n_clusters: Number of clusters
+        linkage: Linkage criterion ('ward', 'complete', 'average', 'single')
+
+    Returns:
+        Fitted AgglomerativeClustering model and labels
+    """
+    hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
+    labels = hierarchical.fit_predict(X_scaled)
+    return hierarchical, labels
+
+
+def visualize_clusters_2d(X_scaled, labels, algorithm_name, method='pca'):
+    """
+    Visualize clusters in 2D using PCA or t-SNE.
+
+    Args:
+        X_scaled: Scaled feature matrix
+        labels: Cluster labels
+        algorithm_name: Name of algorithm for title
+        method: 'pca' or 'tsne'
+    """
+    # Reduce to 2D
+    if method == 'pca':
+        pca = PCA(n_components=2, random_state=42)
+        X_2d = pca.fit_transform(X_scaled)
+        variance = pca.explained_variance_ratio_
+        xlabel = f'PC1 ({variance[0]:.1%} variance)'
+        ylabel = f'PC2 ({variance[1]:.1%} variance)'
+    else:
+        from sklearn.manifold import TSNE
+        # Use PCA first to speed up t-SNE
+        pca = PCA(n_components=min(50, X_scaled.shape[1]), random_state=42)
+        X_pca = pca.fit_transform(X_scaled)
+        tsne = TSNE(n_components=2, random_state=42, perplexity=30)
+        X_2d = tsne.fit_transform(X_pca)
+        xlabel = 't-SNE 1'
+        ylabel = 't-SNE 2'
+
+    # Plot
+    plt.figure(figsize=(10, 8))
+    scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='viridis', alpha=0.6, s=50)
+    plt.colorbar(scatter, label='Cluster')
+    plt.xlabel(xlabel)
+    plt.ylabel(ylabel)
+    plt.title(f'{algorithm_name} Clustering ({method.upper()})')
+    plt.grid(True, alpha=0.3)
+
+    filename = f'{algorithm_name.lower().replace(" ", "_")}_{method}.png'
+    plt.savefig(filename, dpi=300, bbox_inches='tight')
+    print(f"Saved visualization to '{filename}'")
+    plt.close()
+
+
+def main():
+    """
+    Example clustering analysis workflow.
+    """
+    # Load your data here
+    # X = load_data()
+
+    # Example with synthetic data
+    from sklearn.datasets import make_blobs
+    X, y_true = make_blobs(
+        n_samples=500,
+        n_features=10,
+        centers=4,
+        cluster_std=1.0,
+        random_state=42
+    )
+
+    print(f"Dataset shape: {X.shape}")
+
+    # Scale data (ALWAYS scale for clustering!)
+    print("\nScaling data...")
+    X_scaled, scaler = scale_data(X)
+
+    # Find optimal k
+    print("\nFinding optimal number of clusters...")
+    results = find_optimal_k(X_scaled)
+    plot_elbow_silhouette(results)
+
+    # Based on elbow/silhouette, choose optimal k
+    optimal_k = 4  # Adjust based on plots
+
+    # Perform k-means
+    print(f"\nPerforming k-means with k={optimal_k}...")
+    kmeans, kmeans_labels = perform_kmeans(X_scaled, n_clusters=optimal_k)
+    kmeans_results = evaluate_clustering(X_scaled, kmeans_labels, 'K-Means')
+
+    # Perform DBSCAN
+    print("\nPerforming DBSCAN...")
+    dbscan, dbscan_labels = perform_dbscan(X_scaled, eps=0.5, min_samples=5)
+    dbscan_results = evaluate_clustering(X_scaled, dbscan_labels, 'DBSCAN')
+
+    # Perform hierarchical clustering
+    print("\nPerforming hierarchical clustering...")
+    hierarchical, hier_labels = perform_hierarchical(X_scaled, n_clusters=optimal_k)
+    hier_results = evaluate_clustering(X_scaled, hier_labels, 'Hierarchical')
+
+    # Print results
+    print("\n" + "="*60)
+    print("CLUSTERING RESULTS")
+    print("="*60)
+
+    for results in [kmeans_results, dbscan_results, hier_results]:
+        print(f"\n{results['algorithm']}:")
+        print(f"  Clusters: {results['n_clusters']}")
+        if results['n_noise'] > 0:
+            print(f"  Noise points: {results['n_noise']}")
+        if results['silhouette']:
+            print(f"  Silhouette Score: {results['silhouette']:.3f}")
+            print(f"  Davies-Bouldin Index: {results['davies_bouldin']:.3f} (lower is better)")
+            print(f"  Calinski-Harabasz Index: {results['calinski_harabasz']:.1f} (higher is better)")
+
+    # Visualize clusters
+    print("\nCreating visualizations...")
+    visualize_clusters_2d(X_scaled, kmeans_labels, 'K-Means', method='pca')
+    visualize_clusters_2d(X_scaled, dbscan_labels, 'DBSCAN', method='pca')
+    visualize_clusters_2d(X_scaled, hier_labels, 'Hierarchical', method='pca')
+
+    print("\nClustering analysis complete!")
+
+
+if __name__ == "__main__":
+    main()