claude-scientific-skills/scientific-skills/brenda-database/scripts/enzyme_pathway_builder.py

"""
Enzyme Pathway Builder for Retrosynthetic Analysis

This module provides tools for constructing enzymatic pathways and
retrosynthetic trees using BRENDA database information.

Key features:
- Find enzymatic pathways for target products
- Build retrosynthetic trees from products
- Suggest enzyme substitutions and alternatives
- Calculate pathway feasibility and thermodynamics
- Optimize pathway conditions (pH, temperature, cofactors)
- Generate detailed pathway reports
- Support for metabolic engineering and synthetic biology

Installation:
    uv pip install networkx matplotlib pandas

Usage:
    from scripts.enzyme_pathway_builder import find_pathway_for_product, build_retrosynthetic_tree

    pathway = find_pathway_for_product("lactate", max_steps=3)
    tree = build_retrosynthetic_tree("lactate", depth=2)
"""

import re
import json
import time
from typing import List, Dict, Any, Optional, Set, Tuple
from pathlib import Path

try:
    import networkx as nx
    NETWORKX_AVAILABLE = True
except ImportError:
    print("Warning: networkx not installed. Install with: uv pip install networkx")
    NETWORKX_AVAILABLE = False

try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except ImportError:
    print("Warning: pandas not installed. Install with: uv pip install pandas")
    PANDAS_AVAILABLE = False

try:
    import matplotlib.pyplot as plt
    MATPLOTLIB_AVAILABLE = True
except ImportError:
    print("Warning: matplotlib not installed. Install with: uv pip install matplotlib")
    MATPLOTLIB_AVAILABLE = False

try:
    from brenda_queries import (
        search_enzymes_by_product, search_enzymes_by_substrate,
        get_environmental_parameters, compare_across_organisms,
        get_substrate_specificity, get_cofactor_requirements,
        find_thermophilic_homologs, find_ph_stable_variants
    )
    BRENDA_QUERIES_AVAILABLE = True
except ImportError:
    print("Warning: brenda_queries not available")
    BRENDA_QUERIES_AVAILABLE = False


def validate_dependencies():
    """Validate that required dependencies are installed."""
    missing = []
    if not NETWORKX_AVAILABLE:
        missing.append("networkx")
    if not PANDAS_AVAILABLE:
        missing.append("pandas")
    if not BRENDA_QUERIES_AVAILABLE:
        missing.append("brenda_queries")
    if missing:
        raise ImportError(f"Missing required dependencies: {', '.join(missing)}")


# Common biochemical transformations with typical EC numbers
COMMON_TRANSFORMATIONS = {
    'oxidation': ['1.1.1'],      # Alcohol dehydrogenases
    'reduction': ['1.1.1'],      # Alcohol dehydrogenases
    'hydrolysis': ['3.1.1', '3.1.3'],  # Esterases, phosphatases
    'carboxylation': ['6.4.1'],   # Carboxylases
    'decarboxylation': ['4.1.1'], # Decarboxylases
    'transamination': ['2.6.1'],  # Aminotransferases
    'phosphorylation': ['2.7.1'], # Kinases
    'dephosphorylation': ['3.1.3'], # Phosphatases
    'isomerization': ['5.1.1', '5.3.1'], # Isomerases
    'ligation': ['6.3.1'],       # Ligases
    'transfer': ['2.1.1', '2.2.1', '2.4.1'], # Transferases
    'hydride_transfer': ['1.1.1', '1.2.1'],  # Oxidoreductases
    'group_transfer': ['2.1.1'],  # Methyltransferases
}

# Simple metabolite database (expanded for pathway building)
METABOLITE_DATABASE = {
    # Primary metabolites
    'glucose': {'formula': 'C6H12O6', 'mw': 180.16, 'class': 'sugar'},
    'fructose': {'formula': 'C6H12O6', 'mw': 180.16, 'class': 'sugar'},
    'galactose': {'formula': 'C6H12O6', 'mw': 180.16, 'class': 'sugar'},
    'pyruvate': {'formula': 'C3H4O3', 'mw': 90.08, 'class': 'carboxylic_acid'},
    'lactate': {'formula': 'C3H6O3', 'mw': 90.08, 'class': 'carboxylic_acid'},
    'acetate': {'formula': 'C2H4O2', 'mw': 60.05, 'class': 'carboxylic_acid'},
    'ethanol': {'formula': 'C2H6O', 'mw': 46.07, 'class': 'alcohol'},
    'acetaldehyde': {'formula': 'C2H4O', 'mw': 44.05, 'class': 'aldehyde'},
    'acetone': {'formula': 'C3H6O', 'mw': 58.08, 'class': 'ketone'},
    'glycerol': {'formula': 'C3H8O3', 'mw': 92.09, 'class': 'alcohol'},
    'ammonia': {'formula': 'NH3', 'mw': 17.03, 'class': 'inorganic'},
    'carbon dioxide': {'formula': 'CO2', 'mw': 44.01, 'class': 'inorganic'},
    'water': {'formula': 'H2O', 'mw': 18.02, 'class': 'inorganic'},
    'oxygen': {'formula': 'O2', 'mw': 32.00, 'class': 'inorganic'},
    'hydrogen': {'formula': 'H2', 'mw': 2.02, 'class': 'inorganic'},
    'nitrogen': {'formula': 'N2', 'mw': 28.01, 'class': 'inorganic'},
    'phosphate': {'formula': 'PO4', 'mw': 94.97, 'class': 'inorganic'},
    'sulfate': {'formula': 'SO4', 'mw': 96.06, 'class': 'inorganic'},

    # Amino acids
    'alanine': {'formula': 'C3H7NO2', 'mw': 89.09, 'class': 'amino_acid'},
    'glycine': {'formula': 'C2H5NO2', 'mw': 75.07, 'class': 'amino_acid'},
    'serine': {'formula': 'C3H7NO3', 'mw': 105.09, 'class': 'amino_acid'},
    'threonine': {'formula': 'C4H9NO3', 'mw': 119.12, 'class': 'amino_acid'},
    'aspartate': {'formula': 'C4H7NO4', 'mw': 133.10, 'class': 'amino_acid'},
    'glutamate': {'formula': 'C5H9NO4', 'mw': 147.13, 'class': 'amino_acid'},
    'asparagine': {'formula': 'C4H8N2O3', 'mw': 132.12, 'class': 'amino_acid'},
    'glutamine': {'formula': 'C5H10N2O3', 'mw': 146.15, 'class': 'amino_acid'},
    'lysine': {'formula': 'C6H14N2O2', 'mw': 146.19, 'class': 'amino_acid'},
    'arginine': {'formula': 'C6H14N4O2', 'mw': 174.20, 'class': 'amino_acid'},
    'histidine': {'formula': 'C6H9N3O2', 'mw': 155.16, 'class': 'amino_acid'},
    'phenylalanine': {'formula': 'C9H11NO2', 'mw': 165.19, 'class': 'amino_acid'},
    'tyrosine': {'formula': 'C9H11NO3', 'mw': 181.19, 'class': 'amino_acid'},
    'tryptophan': {'formula': 'C11H12N2O2', 'mw': 204.23, 'class': 'amino_acid'},
    'leucine': {'formula': 'C6H13NO2', 'mw': 131.18, 'class': 'amino_acid'},
    'isoleucine': {'formula': 'C6H13NO2', 'mw': 131.18, 'class': 'amino_acid'},
    'valine': {'formula': 'C5H11NO2', 'mw': 117.15, 'class': 'amino_acid'},
    'methionine': {'formula': 'C5H11NO2S', 'mw': 149.21, 'class': 'amino_acid'},
    'cysteine': {'formula': 'C3H7NO2S', 'mw': 121.16, 'class': 'amino_acid'},
    'proline': {'formula': 'C5H9NO2', 'mw': 115.13, 'class': 'amino_acid'},

    # Nucleotides (simplified)
    'atp': {'formula': 'C10H16N5O13P3', 'mw': 507.18, 'class': 'nucleotide'},
    'adp': {'formula': 'C10H15N5O10P2', 'mw': 427.20, 'class': 'nucleotide'},
    'amp': {'formula': 'C10H14N5O7P', 'mw': 347.22, 'class': 'nucleotide'},
    'nad': {'formula': 'C21H27N7O14P2', 'mw': 663.43, 'class': 'cofactor'},
    'nadh': {'formula': 'C21H29N7O14P2', 'mw': 665.44, 'class': 'cofactor'},
    'nadp': {'formula': 'C21H28N7O17P3', 'mw': 743.44, 'class': 'cofactor'},
    'nadph': {'formula': 'C21H30N7O17P3', 'mw': 745.45, 'class': 'cofactor'},
    'fadh2': {'formula': 'C21H30N7O14P2', 'mw': 785.55, 'class': 'cofactor'},
    'fadx': {'formula': 'C21H20N4O2', 'mw': 350.36, 'class': 'cofactor'},

    # Common organic acids
    'malate': {'formula': 'C4H6O5', 'mw': 134.09, 'class': 'carboxylic_acid'},
    'oxaloacetate': {'formula': 'C4H4O5', 'mw': 132.07, 'class': 'carboxylic_acid'},
    'succinate': {'formula': 'C4H6O4', 'mw': 118.09, 'class': 'carboxylic_acid'},
    'fumarate': {'formula': 'C4H4O4', 'mw': 116.07, 'class': 'carboxylic_acid'},
    'oxalosuccinate': {'formula': 'C6H6O7', 'mw': 190.12, 'class': 'carboxylic_acid'},
    'alpha-ketoglutarate': {'formula': 'C5H6O5', 'mw': 146.11, 'class': 'carboxylic_acid'},

    # Energy carriers
    'acetyl-coa': {'formula': 'C23H38N7O17P3S', 'mw': 809.51, 'class': 'cofactor'},
    'coenzyme-a': {'formula': 'C21H36N7O16P3S', 'mw': 767.54, 'class': 'cofactor'},
}

# Common cofactors and their roles
COFACTOR_ROLES = {
    'nad+': {'role': 'oxidation', 'oxidation_state': '+1'},
    'nadh': {'role': 'reduction', 'oxidation_state': '0'},
    'nadp+': {'role': 'oxidation', 'oxidation_state': '+1'},
    'nadph': {'role': 'reduction', 'oxidation_state': '0'},
    'fadx': {'role': 'oxidation', 'oxidation_state': '0'},
    'fadh2': {'role': 'reduction', 'oxidation_state': '-2'},
    'atp': {'role': 'phosphorylation', 'oxidation_state': '0'},
    'adp': {'role': 'energy', 'oxidation_state': '0'},
    'amp': {'role': 'energy', 'oxidation_state': '0'},
    'acetyl-coa': {'role': 'acetylation', 'oxidation_state': '0'},
    'coenzyme-a': {'role': 'thiolation', 'oxidation_state': '0'},
}


def identify_metabolite(metabolite_name: str) -> Dict[str, Any]:
    """Identify a metabolite from the database or create entry."""
    metabolite_name = metabolite_name.lower().strip()

    # Check if it's in the database
    if metabolite_name in METABOLITE_DATABASE:
        return {'name': metabolite_name, **METABOLITE_DATABASE[metabolite_name]}

    # Simple formula extraction from common patterns
    formula_patterns = {
        r'c(\d+)h(\d+)o(\d+)': lambda m: f"C{m[0]}H{m[1]}O{m[2]}",
        r'c(\d+)h(\d+)n(\d+)o(\d+)': lambda m: f"C{m[0]}H{m[1]}N{m[2]}O{m[3]}",
    }

    for pattern, formatter in formula_patterns.items():
        match = re.search(pattern, metabolite_name)
        if match:
            formula = formatter(match.groups())
            # Estimate molecular weight (C=12, H=1, N=14, O=16)
            mw = 0
            elements = re.findall(r'([A-Z])(\d*)', formula)
            for elem, count in elements:
                count = int(count) if count else 1
                if elem == 'C':
                    mw += count * 12.01
                elif elem == 'H':
                    mw += count * 1.008
                elif elem == 'N':
                    mw += count * 14.01
                elif elem == 'O':
                    mw += count * 16.00
                elif elem == 'P':
                    mw += count * 30.97
                elif elem == 'S':
                    mw += count * 32.07

            return {
                'name': metabolite_name,
                'formula': formula,
                'mw': mw,
                'class': 'unknown'
            }

    # Fallback - unknown metabolite
    return {
        'name': metabolite_name,
        'formula': 'Unknown',
        'mw': 0,
        'class': 'unknown'
    }


def infer_transformation_type(substrate: str, product: str) -> List[str]:
    """Infer the type of transformation based on substrate and product."""
    substrate_info = identify_metabolite(substrate)
    product_info = identify_metabolite(product)

    transformations = []

    # Check for oxidation/reduction patterns
    if 'alcohol' in substrate_info.get('class', '') and 'carboxylic_acid' in product_info.get('class', ''):
        transformations.append('oxidation')
    elif 'aldehyde' in substrate_info.get('class', '') and 'alcohol' in product_info.get('class', ''):
        transformations.append('reduction')
    elif 'alcohol' in substrate_info.get('class', '') and 'aldehyde' in product_info.get('class', ''):
        transformations.append('oxidation')

    # Check for phosphorylation/dephosphorylation
    if 'phosphate' in product and 'phosphate' not in substrate:
        transformations.append('phosphorylation')
    elif 'phosphate' in substrate and 'phosphate' not in product:
        transformations.append('dephosphorylation')

    # Check for carboxylation/decarboxylation
    if 'co2' in product and 'co2' not in substrate:
        transformations.append('carboxylation')
    elif 'co2' in substrate and 'co2' not in product:
        transformations.append('decarboxylation')

    # Check for hydrolysis (simple heuristic)
    if 'ester' in substrate.lower() and ('carboxylic_acid' in product_info.get('class', '') or 'alcohol' in product_info.get('class', '')):
        transformations.append('hydrolysis')

    # Check for transamination
    if 'amino_acid' in product_info.get('class', '') and 'amino_acid' not in substrate_info.get('class', ''):
        transformations.append('transamination')

    # Default to generic transformation
    if not transformations:
        transformations.append('generic')

    return transformations


def find_enzymes_for_transformation(substrate: str, product: str, limit: int = 10) -> List[Dict[str, Any]]:
    """Find enzymes that catalyze a specific transformation."""
    validate_dependencies()

    # Infer transformation types
    transformations = infer_transformation_type(substrate, product)

    all_enzymes = []

    # Try to find enzymes by product
    try:
        product_enzymes = search_enzymes_by_product(product, limit=limit)
        for enzyme in product_enzymes:
            # Check if substrate is in the reactants
            if substrate.lower() in enzyme.get('reaction', '').lower():
                enzyme['transformation'] = transformations[0] if transformations else 'generic'
                enzyme['substrate'] = substrate
                enzyme['product'] = product
                enzyme['confidence'] = 'high'
                all_enzymes.append(enzyme)
        time.sleep(0.5)  # Rate limiting
    except Exception as e:
        print(f"Error searching enzymes by product: {e}")

    # Try to find enzymes by substrate
    try:
        substrate_enzymes = search_enzymes_by_substrate(substrate, limit=limit)
        for enzyme in substrate_enzymes:
            # Check if product is mentioned in substrate data (limited approach)
            enzyme['transformation'] = transformations[0] if transformations else 'generic'
            enzyme['substrate'] = substrate
            enzyme['product'] = product
            enzyme['confidence'] = 'medium'
            all_enzymes.append(enzyme)
        time.sleep(0.5)  # Rate limiting
    except Exception as e:
        print(f"Error searching enzymes by substrate: {e}")

    # If no enzymes found, try common EC numbers for transformation types
    if not all_enzymes and transformations:
        for trans_type in transformations:
            if trans_type in COMMON_TRANSFORMATIONS:
                for ec_prefix in COMMON_TRANSFORMATIONS[trans_type]:
                    # This is a simplified approach - in practice you'd want
                    # to query the specific EC numbers with more detail
                    try:
                        generic_enzymes = search_by_pattern(trans_type, limit=5)
                        for enzyme in generic_enzymes:
                            enzyme['transformation'] = trans_type
                            enzyme['substrate'] = substrate
                            enzyme['product'] = product
                            enzyme['confidence'] = 'low'
                            all_enzymes.append(enzyme)
                        time.sleep(0.5)
                        break
                    except Exception as e:
                        print(f"Error searching for transformation type {trans_type}: {e}")

    # Remove duplicates and sort by confidence
    unique_enzymes = []
    seen = set()
    for enzyme in all_enzymes:
        key = (enzyme.get('ec_number', ''), enzyme.get('organism', ''))
        if key not in seen:
            seen.add(key)
            unique_enzymes.append(enzyme)

    # Sort by confidence (high > medium > low)
    confidence_order = {'high': 3, 'medium': 2, 'low': 1}
    unique_enzymes.sort(key=lambda x: confidence_order.get(x.get('confidence', 'low'), 0), reverse=True)

    return unique_enzymes[:limit]


def find_pathway_for_product(product: str, max_steps: int = 3, starting_materials: List[str] = None) -> Dict[str, Any]:
    """Find enzymatic pathways to synthesize a target product."""
    validate_dependencies()

    if starting_materials is None:
        # Common starting materials
        starting_materials = ['glucose', 'pyruvate', 'acetate', 'ethanol', 'glycerol']

    pathway = {
        'target': product,
        'max_steps': max_steps,
        'starting_materials': starting_materials,
        'steps': [],
        'alternative_pathways': [],
        'warnings': [],
        'confidence': 0
    }

    # Simple breadth-first search for pathway
    from collections import deque

    queue = deque([(product, 0, [product])])  # (current_metabolite, step_count, pathway)
    visited = set()

    while queue and len(pathway['steps']) == 0:
        current_metabolite, step_count, current_path = queue.popleft()

        if current_metabolite in visited or step_count >= max_steps:
            continue

        visited.add(current_metabolite)

        # Check if current metabolite is a starting material
        if current_metabolite.lower() in [sm.lower() for sm in starting_materials]:
            # Found a complete pathway
            pathway['steps'] = []
            for i in range(len(current_path) - 1):
                substrate = current_path[i + 1]
                product_step = current_path[i]
                enzymes = find_enzymes_for_transformation(substrate, product_step, limit=5)

                if enzymes:
                    pathway['steps'].append({
                        'step_number': i + 1,
                        'substrate': substrate,
                        'product': product_step,
                        'enzymes': enzymes,
                        'transformation': infer_transformation_type(substrate, product_step)
                    })
                else:
                    pathway['warnings'].append(f"No enzymes found for step: {substrate} -> {product_step}")

            pathway['confidence'] = 0.8  # High confidence for found pathway
            break

        # Try to find enzymes that produce current metabolite
        if step_count < max_steps:
            # Generate possible substrates (simplified - in practice you'd need metabolic knowledge)
            possible_substrates = []

            # Try common metabolic precursors
            common_precursors = ['glucose', 'pyruvate', 'acetate', 'ethanol', 'acetyl-CoA', 'oxaloacetate']
            for precursor in common_precursors:
                enzymes = find_enzymes_for_transformation(precursor, current_metabolite, limit=2)
                if enzymes:
                    possible_substrates.append(precursor)
                    pathway['alternative_pathways'].append({
                        'precursor': precursor,
                        'product': current_metabolite,
                        'enzymes': enzymes
                    })

            # Add found substrates to queue
            for substrate in possible_substrates:
                if substrate not in current_path:
                    new_path = [substrate] + current_path
                    queue.append((substrate, step_count + 1, new_path))

        time.sleep(0.2)  # Rate limiting

    # If no complete pathway found, create partial pathway
    if not pathway['steps'] and pathway['alternative_pathways']:
        # Create best guess pathway from alternatives
        best_alternative = max(pathway['alternative_pathways'],
                               key=lambda x: len(x.get('enzymes', [])))
        pathway['steps'] = [{
            'step_number': 1,
            'substrate': best_alternative['precursor'],
            'product': best_alternative['product'],
            'enzymes': best_alternative['enzymes'],
            'transformation': infer_transformation_type(best_alternative['precursor'], best_alternative['product'])
        }]
        pathway['confidence'] = 0.3  # Low confidence for partial pathway
        pathway['warnings'].append("Partial pathway only - complete synthesis route not found")

    elif not pathway['steps']:
        pathway['warnings'].append("No enzymatic pathway found for target product")
        pathway['confidence'] = 0.1

    return pathway


def build_retrosynthetic_tree(target: str, depth: int = 2) -> Dict[str, Any]:
    """Build a retrosynthetic tree for a target molecule."""
    validate_dependencies()

    tree = {
        'target': target,
        'depth': depth,
        'nodes': {target: {'level': 0, 'children': [], 'enzymes': []}},
        'edges': [],
        'alternative_routes': []
    }

    # Build tree recursively
    def build_node_recursive(metabolite: str, current_depth: int, parent: str = None) -> None:
        if current_depth >= depth:
            return

        # Find enzymes that can produce this metabolite
        potential_precursors = ['glucose', 'pyruvate', 'acetate', 'ethanol', 'acetyl-CoA',
                                'oxaloacetate', 'alpha-ketoglutarate', 'malate']

        for precursor in potential_precursors:
            enzymes = find_enzymes_for_transformation(precursor, metabolite, limit=3)

            if enzymes:
                # Add precursor as node if not exists
                if precursor not in tree['nodes']:
                    tree['nodes'][precursor] = {
                        'level': current_depth + 1,
                        'children': [],
                        'enzymes': enzymes
                    }
                    tree['nodes'][metabolite]['children'].append(precursor)
                    tree['edges'].append({
                        'from': precursor,
                        'to': metabolite,
                        'enzymes': enzymes,
                        'transformation': infer_transformation_type(precursor, metabolite)
                    })

                # Recursively build tree
                if current_depth + 1 < depth:
                    build_node_recursive(precursor, current_depth + 1, metabolite)

        # Try common metabolic transformations
        if current_depth < depth - 1:
            transformations = ['oxidation', 'reduction', 'hydrolysis', 'carboxylation', 'decarboxylation']
            for trans in transformations:
                try:
                    generic_enzymes = search_by_pattern(trans, limit=2)
                    if generic_enzymes:
                        # Create hypothetical precursor
                        hypothetical_precursor = f"precursor_{trans}_{metabolite}"
                        tree['nodes'][hypothetical_precursor] = {
                            'level': current_depth + 1,
                            'children': [],
                            'enzymes': generic_enzymes,
                            'hypothetical': True
                        }
                        tree['nodes'][metabolite]['children'].append(hypothetical_precursor)
                        tree['edges'].append({
                            'from': hypothetical_precursor,
                            'to': metabolite,
                            'enzymes': generic_enzymes,
                            'transformation': trans,
                            'hypothetical': True
                        })
                except Exception as e:
                    print(f"Error in retrosynthetic search for {trans}: {e}")

        time.sleep(0.3)  # Rate limiting

    # Start building from target
    build_node_recursive(target, 0)

    # Calculate tree statistics
    tree['total_nodes'] = len(tree['nodes'])
    tree['total_edges'] = len(tree['edges'])
    tree['max_depth'] = max(node['level'] for node in tree['nodes'].values()) if tree['nodes'] else 0

    return tree


def suggest_enzyme_substitutions(ec_number: str, criteria: Dict[str, Any] = None) -> List[Dict[str, Any]]:
    """Suggest alternative enzymes with improved properties."""
    validate_dependencies()

    if criteria is None:
        criteria = {
            'min_temperature': 30,
            'max_temperature': 70,
            'min_ph': 6.0,
            'max_ph': 8.0,
            'min_thermostability': 40,
            'prefer_organisms': ['Escherichia coli', 'Saccharomyces cerevisiae', 'Bacillus subtilis']
        }

    substitutions = []

    # Get organisms for the target enzyme
    try:
        organisms = compare_across_organisms(ec_number, criteria['prefer_organisms'])
        time.sleep(0.5)
    except Exception as e:
        print(f"Error comparing organisms: {e}")
        organisms = []

    # Find thermophilic homologs if temperature is a criterion
    if criteria.get('min_thermostability'):
        try:
            thermophilic = find_thermophilic_homologs(ec_number, criteria['min_thermostability'])
            time.sleep(0.5)

            for enzyme in thermophilic:
                enzyme['substitution_reason'] = f"Thermostable (optimal temp: {enzyme['optimal_temperature']}°C)"
                enzyme['score'] = 8.0 if enzyme['optimal_temperature'] >= criteria['min_thermostability'] else 6.0
                substitutions.append(enzyme)
        except Exception as e:
            print(f"Error finding thermophilic homologs: {e}")

    # Find pH-stable variants
    if criteria.get('min_ph') or criteria.get('max_ph'):
        try:
            ph_stable = find_ph_stable_variants(ec_number, criteria.get('min_ph'), criteria.get('max_ph'))
            time.sleep(0.5)

            for enzyme in ph_stable:
                enzyme['substitution_reason'] = f"pH stable ({enzyme['stability_type']} range: {enzyme['ph_range']})"
                enzyme['score'] = 7.5
                substitutions.append(enzyme)
        except Exception as e:
            print(f"Error finding pH-stable variants: {e}")

    # Add organism comparison results
    for org_data in organisms:
        if org_data.get('data_points', 0) > 0:
            org_data['substitution_reason'] = f"Well-characterized in {org_data['organism']}"
            org_data['score'] = 6.5 if org_data['organism'] in criteria['prefer_organisms'] else 5.0
            substitutions.append(org_data)

    # Sort by score
    substitutions.sort(key=lambda x: x.get('score', 0), reverse=True)

    return substitutions[:10]  # Return top 10 suggestions


def calculate_pathway_feasibility(pathway: Dict[str, Any]) -> Dict[str, Any]:
    """Calculate feasibility scores and potential issues for a pathway."""
    validate_dependencies()

    feasibility = {
        'overall_score': 0,
        'step_scores': [],
        'warnings': [],
        'recommendations': [],
        'thermodynamic_feasibility': 0,
        'enzyme_availability': 0,
        'cofactor_requirements': [],
        'optimal_conditions': {}
    }

    if not pathway.get('steps'):
        feasibility['warnings'].append("No steps in pathway")
        feasibility['overall_score'] = 0.1
        return feasibility

    total_score = 0
    step_scores = []

    for step in pathway['steps']:
        step_score = 0
        enzymes = step.get('enzymes', [])

        # Score based on number of available enzymes
        if len(enzymes) >= 3:
            step_score += 3  # Multiple enzyme options
        elif len(enzymes) >= 1:
            step_score += 2  # At least one enzyme
        else:
            step_score += 0  # No enzymes
            feasibility['warnings'].append(f"No enzymes found for step: {step['substrate']} -> {step['product']}")

        # Score based on enzyme confidence
        if enzymes:
            high_confidence = sum(1 for e in enzymes if e.get('confidence') == 'high')
            confidence_bonus = min(high_confidence, 2)  # Max 2 points for confidence
            step_score += confidence_bonus

        # Check for industrial viability
        industrial_organisms = ['Escherichia coli', 'Saccharomyces cerevisiae', 'Bacillus subtilis']
        industrial_enzymes = sum(1 for e in enzymes if e.get('organism') in industrial_organisms)
        if industrial_enzymes > 0:
            step_score += 1

        # Cap step score at 5
        step_score = min(step_score, 5)
        step_scores.append(step_score)
        total_score += step_score

        # Analyze cofactor requirements
        try:
            for enzyme in enzymes:
                ec_number = enzyme.get('ec_number', '')
                if ec_number:
                    cofactors = get_cofactor_requirements(ec_number)
                    for cofactor in cofactors:
                        if cofactor['name'] not in [c['name'] for c in feasibility['cofactor_requirements']]:
                            feasibility['cofactor_requirements'].append(cofactor)
            time.sleep(0.3)
        except Exception as e:
            print(f"Error analyzing cofactors: {e}")

    feasibility['step_scores'] = step_scores
    feasibility['enzyme_availability'] = total_score / (len(step_scores) * 5)  # Normalize to 0-1
    feasibility['overall_score'] = feasibility['enzyme_availability'] * 0.7  # Weight enzyme availability

    # Thermodynamic feasibility (simplified heuristic)
    pathway_length = len(pathway['steps'])
    if pathway_length <= 2:
        feasibility['thermodynamic_feasibility'] = 0.8  # Short pathways are often feasible
    elif pathway_length <= 4:
        feasibility['thermodynamic_feasibility'] = 0.6
    else:
        feasibility['thermodynamic_feasibility'] = 0.4  # Long pathways may have thermodynamic issues

    # Overall feasibility is weighted combination
    feasibility['overall_score'] = (
        feasibility['enzyme_availability'] * 0.6 +
        feasibility['thermodynamic_feasibility'] * 0.4
    )

    # Generate recommendations
    if feasibility['overall_score'] < 0.3:
        feasibility['warnings'].append("Low overall pathway feasibility")
        feasibility['recommendations'].append("Consider alternative starting materials or target molecules")
    elif feasibility['overall_score'] < 0.6:
        feasibility['warnings'].append("Moderate pathway feasibility")
        feasibility['recommendations'].append("Consider enzyme engineering or cofactor recycling")

    if feasibility['cofactor_requirements']:
        feasibility['recommendations'].append("Implement cofactor recycling system for: " +
                                            ", ".join([c['name'] for c in feasibility['cofactor_requirements']]))

    return feasibility


def optimize_pathway_conditions(pathway: Dict[str, Any]) -> Dict[str, Any]:
    """Suggest optimal conditions for the entire pathway."""
    validate_dependencies()

    optimization = {
        'optimal_temperature': 30.0,  # Default
        'optimal_ph': 7.0,           # Default
        'temperature_range': (20, 40),  # Default
        'ph_range': (6.5, 7.5),         # Default
        'cofactor_system': [],
        'organism_compatibility': {},
        'process_recommendations': []
    }

    temperatures = []
    phs = []
    organism_preferences = {}

    # Collect environmental data from all enzymes
    for step in pathway.get('steps', []):
        for enzyme in step.get('enzymes', []):
            ec_number = enzyme.get('ec_number', '')
            organism = enzyme.get('organism', '')

            if ec_number:
                try:
                    env_params = get_environmental_parameters(ec_number)
                    time.sleep(0.3)

                    if env_params.get('optimal_temperature'):
                        temperatures.append(env_params['optimal_temperature'])
                    if env_params.get('optimal_ph'):
                        phs.append(env_params['optimal_ph'])

                    # Track organism preferences
                    if organism not in organism_preferences:
                        organism_preferences[organism] = {
                            'temperature_optima': [],
                            'ph_optima': [],
                            'step_count': 0
                        }

                    organism_preferences[organism]['step_count'] += 1
                    if env_params.get('optimal_temperature'):
                        organism_preferences[organism]['temperature_optima'].append(env_params['optimal_temperature'])
                    if env_params.get('optimal_ph'):
                        organism_preferences[organism]['ph_optima'].append(env_params['optimal_ph'])

                except Exception as e:
                    print(f"Error getting environmental parameters for {ec_number}: {e}")

    # Calculate optimal conditions
    if temperatures:
        optimization['optimal_temperature'] = sum(temperatures) / len(temperatures)
        optimization['temperature_range'] = (min(temperatures) - 5, max(temperatures) + 5)

    if phs:
        optimization['optimal_ph'] = sum(phs) / len(phs)
        optimization['ph_range'] = (min(phs) - 0.5, max(phs) + 0.5)

    # Find best organism compatibility
    for organism, data in organism_preferences.items():
        if data['temperature_optima'] and data['ph_optima']:
            organism_preferences[organism]['avg_temp'] = sum(data['temperature_optima']) / len(data['temperature_optima'])
            organism_preferences[organism]['avg_ph'] = sum(data['ph_optima']) / len(data['ph_optima'])
            organism_preferences[organism]['compatibility_score'] = data['step_count']

    # Sort organisms by compatibility
    compatible_organisms = sorted(
        [(org, data) for org, data in organism_preferences.items() if data.get('compatibility_score', 0) > 0],
        key=lambda x: x[1]['compatibility_score'],
        reverse=True
    )

    optimization['organism_compatibility'] = dict(compatible_organisms[:5])  # Top 5 organisms

    # Generate process recommendations
    if len(optimization['organism_compatibility']) > 1:
        optimization['process_recommendations'].append("Consider multi-organism system or enzyme cocktails")

    if optimization['temperature_range'][1] - optimization['temperature_range'][0] > 30:
        optimization['process_recommendations'].append("Consider temperature gradient or staged process")

    if optimization['ph_range'][1] - optimization['ph_range'][0] > 2:
        optimization['process_recommendations'].append("Consider pH control system or buffer optimization")

    # Cofactor system optimization
    cofactor_types = {}
    for step in pathway.get('steps', []):
        for enzyme in step.get('enzymes', []):
            ec_number = enzyme.get('ec_number', '')
            if ec_number:
                try:
                    cofactors = get_cofactor_requirements(ec_number)
                    for cofactor in cofactors:
                        cofactor_type = cofactor.get('type', 'other')
                        if cofactor_type not in cofactor_types:
                            cofactor_types[cofactor_type] = []
                        if cofactor['name'] not in cofactor_types[cofactor_type]:
                            cofactor_types[cofactor_type].append(cofactor['name'])
                    time.sleep(0.3)
                except Exception as e:
                    print(f"Error getting cofactors for {ec_number}: {e}")

    optimization['cofactor_system'] = cofactor_types

    return optimization


def generate_pathway_report(pathway: Dict[str, Any], filename: str = None) -> str:
    """Generate a comprehensive pathway report."""
    validate_dependencies()

    if filename is None:
        target_name = pathway.get('target', 'pathway').replace(' ', '_').lower()
        filename = f"pathway_report_{target_name}.txt"

    # Calculate feasibility and optimization
    feasibility = calculate_pathway_feasibility(pathway)
    optimization = optimize_pathway_conditions(pathway)

    report = []
    report.append("=" * 80)
    report.append(f"ENZYMATIC PATHWAY REPORT")
    report.append("=" * 80)

    # Overview
    report.append(f"\nTARGET PRODUCT: {pathway.get('target', 'Unknown')}")
    report.append(f"PATHWAY LENGTH: {len(pathway.get('steps', []))} steps")
    report.append(f"OVERALL FEASIBILITY: {feasibility['overall_score']:.2f}/1.00")

    # Pathway steps
    if pathway.get('steps'):
        report.append("\n" + "=" * 40)
        report.append("PATHWAY STEPS")
        report.append("=" * 40)

        for i, step in enumerate(pathway['steps'], 1):
            report.append(f"\nStep {i}: {step['substrate']} -> {step['product']}")
            report.append(f"Transformation: {', '.join(step.get('transformation', ['Unknown']))}")

            if step.get('enzymes'):
                report.append(f"Available enzymes: {len(step['enzymes'])}")
                for j, enzyme in enumerate(step['enzymes'][:3], 1):  # Top 3 enzymes
                    report.append(f"  {j}. EC {enzyme.get('ec_number', 'Unknown')} - {enzyme.get('organism', 'Unknown')}")
                    report.append(f"     Confidence: {enzyme.get('confidence', 'Unknown')}")
                    if enzyme.get('reaction'):
                        report.append(f"     Reaction: {enzyme['reaction'][:100]}...")

                if len(step['enzymes']) > 3:
                    report.append(f"  ... and {len(step['enzymes']) - 3} additional enzymes")
            else:
                report.append("  No enzymes found for this step")

            if feasibility.get('step_scores') and i-1 < len(feasibility['step_scores']):
                report.append(f"Step feasibility score: {feasibility['step_scores'][i-1]}/5.0")

    # Cofactor requirements
    if feasibility.get('cofactor_requirements'):
        report.append("\n" + "=" * 40)
        report.append("COFACTOR REQUIREMENTS")
        report.append("=" * 40)

        for cofactor in feasibility['cofactor_requirements']:
            report.append(f"- {cofactor['name']} ({cofactor.get('type', 'Unknown')})")
            report.append(f"  Organism: {cofactor.get('organism', 'Unknown')}")
            report.append(f"  EC Number: {cofactor.get('ec_number', 'Unknown')}")

    # Optimal conditions
    report.append("\n" + "=" * 40)
    report.append("OPTIMAL CONDITIONS")
    report.append("=" * 40)

    report.append(f"Temperature: {optimization['optimal_temperature']:.1f}°C")
    report.append(f"pH: {optimization['optimal_ph']:.1f}")
    report.append(f"Temperature range: {optimization['temperature_range'][0]:.1f} - {optimization['temperature_range'][1]:.1f}°C")
    report.append(f"pH range: {optimization['ph_range'][0]:.1f} - {optimization['ph_range'][1]:.1f}")

    if optimization.get('organism_compatibility'):
        report.append("\nCompatible organisms (by preference):")
        for organism, data in list(optimization['organism_compatibility'].items())[:3]:
            report.append(f"- {organism} (compatibility score: {data.get('compatibility_score', 0)})")
            if data.get('avg_temp'):
                report.append(f"  Optimal temperature: {data['avg_temp']:.1f}°C")
            if data.get('avg_ph'):
                report.append(f"  Optimal pH: {data['avg_ph']:.1f}")

    # Warnings and recommendations
    if feasibility.get('warnings'):
        report.append("\n" + "=" * 40)
        report.append("WARNINGS")
        report.append("=" * 40)

        for warning in feasibility['warnings']:
            report.append(f"⚠️  {warning}")

    if feasibility.get('recommendations'):
        report.append("\n" + "=" * 40)
        report.append("RECOMMENDATIONS")
        report.append("=" * 40)

        for rec in feasibility['recommendations']:
            report.append(f"💡 {rec}")

    if optimization.get('process_recommendations'):
        for rec in optimization['process_recommendations']:
            report.append(f"🔧 {rec}")

    # Alternative pathways
    if pathway.get('alternative_pathways'):
        report.append("\n" + "=" * 40)
        report.append("ALTERNATIVE ROUTES")
        report.append("=" * 40)

        for alt in pathway['alternative_pathways'][:5]:  # Top 5 alternatives
            report.append(f"\n{alt['precursor']} -> {alt['product']}")
            report.append(f"Enzymes available: {len(alt.get('enzymes', []))}")
            for enzyme in alt.get('enzymes', [])[:2]:  # Top 2 enzymes
                report.append(f"  - {enzyme.get('ec_number', 'Unknown')} ({enzyme.get('organism', 'Unknown')})")

    # Feasibility analysis
    report.append("\n" + "=" * 40)
    report.append("FEASIBILITY ANALYSIS")
    report.append("=" * 40)

    report.append(f"Enzyme availability score: {feasibility['enzyme_availability']:.2f}/1.00")
    report.append(f"Thermodynamic feasibility: {feasibility['thermodynamic_feasibility']:.2f}/1.00")

    # Write report to file
    with open(filename, 'w') as f:
        f.write('\n'.join(report))

    print(f"Pathway report saved to {filename}")
    return filename


def visualize_pathway(pathway: Dict[str, Any], save_path: str = None) -> str:
    """Create a visual representation of the pathway."""
    validate_dependencies()

    if not NETWORKX_AVAILABLE or not MATPLOTLIB_AVAILABLE:
        print("networkx and matplotlib required for pathway visualization")
        return save_path or "pathway_visualization.png"

    try:
        # Create directed graph
        G = nx.DiGraph()

        # Add nodes and edges
        for step in pathway.get('steps', []):
            substrate = step['substrate']
            product = step['product']
            enzymes = step.get('enzymes', [])

            G.add_node(substrate, type='substrate')
            G.add_node(product, type='product')

            # Add edge with enzyme information
            edge_label = f"{len(enzymes)} enzymes"
            if enzymes:
                primary_ec = enzymes[0].get('ec_number', 'Unknown')
                edge_label += f"\nEC {primary_ec}"

            G.add_edge(substrate, product, label=edge_label)

        # Create figure
        plt.figure(figsize=(12, 8))

        # Layout
        pos = nx.spring_layout(G, k=2, iterations=50)

        # Draw nodes
        substrate_nodes = [n for n, d in G.nodes(data=True) if d.get('type') == 'substrate']
        product_nodes = [n for n, d in G.nodes(data=True) if d.get('type') == 'product']
        intermediate_nodes = [n for n in G.nodes() if n not in substrate_nodes and n not in product_nodes]

        nx.draw_networkx_nodes(G, pos, nodelist=substrate_nodes, node_color='lightblue', node_size=1500)
        nx.draw_networkx_nodes(G, pos, nodelist=product_nodes, node_color='lightgreen', node_size=1500)
        nx.draw_networkx_nodes(G, pos, nodelist=intermediate_nodes, node_color='lightyellow', node_size=1200)

        # Draw edges
        nx.draw_networkx_edges(G, pos, edge_color='gray', arrows=True, arrowsize=20)

        # Draw labels
        nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')

        # Draw edge labels
        edge_labels = nx.get_edge_attributes(G, 'label')
        nx.draw_networkx_edge_labels(G, pos, edge_labels, font_size=8)

        # Add title
        plt.title(f"Enzymatic Pathway to {pathway.get('target', 'Target')}", fontsize=14, fontweight='bold')

        # Add legend
        plt.scatter([], [], c='lightblue', s=150, label='Starting Materials')
        plt.scatter([], [], c='lightyellow', s=120, label='Intermediates')
        plt.scatter([], [], c='lightgreen', s=150, label='Products')
        plt.legend()

        plt.axis('off')
        plt.tight_layout()

        # Save or show
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Pathway visualization saved to {save_path}")
        else:
            plt.show()

        plt.close()
        return save_path or "pathway_visualization.png"

    except Exception as e:
        print(f"Error visualizing pathway: {e}")
        return save_path or "pathway_visualization.png"


if __name__ == "__main__":
    # Example usage
    print("Enzyme Pathway Builder Examples")
    print("=" * 50)

    try:
        # Example 1: Find pathway for lactate
        print("\n1. Finding pathway for lactate production:")
        pathway = find_pathway_for_product("lactate", max_steps=3)
        print(f"Found pathway with {len(pathway['steps'])} steps")
        print(f"Feasibility: {pathway['confidence']:.2f}")

        # Example 2: Build retrosynthetic tree
        print("\n2. Building retrosynthetic tree for ethanol:")
        tree = build_retrosynthetic_tree("ethanol", depth=2)
        print(f"Tree has {tree['total_nodes']} nodes and {tree['total_edges']} edges")

        # Example 3: Suggest enzyme substitutions
        print("\n3. Suggesting enzyme substitutions for alcohol dehydrogenase:")
        substitutions = suggest_enzyme_substitutions("1.1.1.1")
        for sub in substitutions[:3]:
            print(f"  - {sub.get('organism', 'Unknown')}: {sub.get('substitution_reason', 'No reason')}")

        # Example 4: Calculate feasibility
        print("\n4. Calculating pathway feasibility:")
        feasibility = calculate_pathway_feasibility(pathway)
        print(f"Overall score: {feasibility['overall_score']:.2f}")
        print(f"Warnings: {len(feasibility['warnings'])}")

        # Example 5: Generate pathway report
        print("\n5. Generating pathway report:")
        report_file = generate_pathway_report(pathway)
        print(f"Report saved to: {report_file}")

        # Example 6: Visualize pathway
        print("\n6. Visualizing pathway:")
        viz_file = visualize_pathway(pathway, "example_pathway.png")
        print(f"Visualization saved to: {viz_file}")

    except Exception as e:
        print(f"Example failed: {e}")