Improved Biomni support

2026-03-28 07:33:45 +08:00 · 2025-10-22 08:38:06 -07:00
parent 71a3c3750f
commit 77822efeed
9 changed files with 2512 additions and 3393 deletions
--- a/scientific-packages/biomni/scripts/setup_environment.py
+++ b/scientific-packages/biomni/scripts/setup_environment.py
@@ -1,230 +1,355 @@
 #!/usr/bin/env python3
 """
-Biomni Environment Setup and Validation Script
+Interactive setup script for biomni environment configuration.

-This script helps users set up and validate their Biomni environment,
-including checking dependencies, API keys, and data availability.
+This script helps users set up:
+1. Conda environment with required dependencies
+2. API keys for LLM providers
+3. Data lake directory configuration
+4. MCP server setup (optional)
+
+Usage:
+    python setup_environment.py
 """

 import os
 import sys
 import subprocess
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, Optional


-def check_python_version() -> Tuple[bool, str]:
-    """Check if Python version is compatible."""
-    version = sys.version_info
-    if version.major == 3 and version.minor >= 8:
-        return True, f"Python {version.major}.{version.minor}.{version.micro} ✓"
-    else:
-        return False, f"Python {version.major}.{version.minor} - requires Python 3.8+"
-
-
-def check_conda_env() -> Tuple[bool, str]:
-    """Check if running in biomni conda environment."""
-    conda_env = os.environ.get('CONDA_DEFAULT_ENV', None)
-    if conda_env == 'biomni_e1':
-        return True, f"Conda environment: {conda_env} ✓"
-    else:
-        return False, f"Not in biomni_e1 environment (current: {conda_env})"
-
-
-def check_package_installed(package: str) -> bool:
-    """Check if a Python package is installed."""
+def check_conda_installed() -> bool:
+    """Check if conda is available in the system."""
+    # Probe by running `conda --version`; the captured output is discarded,
+    # only whether the command could be started and exited cleanly matters.
    try:
-        __import__(package)
+        subprocess.run(
+            ['conda', '--version'],
+            capture_output=True,
+            check=True
+        )
        return True
-    except ImportError:
+    # FileNotFoundError: no `conda` executable could be launched.
+    # CalledProcessError: it ran but exited non-zero (raised by check=True).
+    except (subprocess.CalledProcessError, FileNotFoundError):
        return False


-def check_dependencies() -> Tuple[bool, List[str]]:
-    """Check for required and optional dependencies."""
-    required = ['biomni']
-    optional = ['weasyprint', 'markdown2pdf']
+def setup_conda_environment() -> bool:
+    """Guide user through conda environment setup.
+
+    Returns:
+        True when the biomni_e1 environment already exists or was created
+        successfully; False when conda is not installed, the user declines
+        the creation prompt, or `conda create` fails.
+    """
+    print("\n=== Conda Environment Setup ===")

-    missing_required = [pkg for pkg in required if not check_package_installed(pkg)]
-    missing_optional = [pkg for pkg in optional if not check_package_installed(pkg)]
+    if not check_conda_installed():
+        print("❌ Conda not found. Please install Miniconda or Anaconda:")
+        print("   https://docs.conda.io/en/latest/miniconda.html")
+        return False

-    messages = []
-    success = len(missing_required) == 0
+    print("✓ Conda is installed")

-    if missing_required:
-        messages.append(f"Missing required packages: {', '.join(missing_required)}")
-        messages.append("Install with: pip install biomni --upgrade")
+    # Check if biomni_e1 environment exists
+    result = subprocess.run(
+        ['conda', 'env', 'list'],
+        capture_output=True,
+        text=True
+    )
+
+    # Compare whole environment names (first column of each listing line).
+    # A plain substring test on the full output would also match names such
+    # as "biomni_e1_old" or any environment path containing "biomni_e1".
+    env_names = {
+        line.split()[0]
+        for line in result.stdout.splitlines()
+        if line.strip() and not line.startswith('#')
+    }
+
+    if 'biomni_e1' in env_names:
+        print("✓ biomni_e1 environment already exists")
+        return True
+
+    print("\nCreating biomni_e1 conda environment...")
+    print("This will install Python 3.10 and required dependencies.")
+
+    # Anything other than an explicit "y" is treated as a refusal
+    response = input("Proceed? [y/N]: ").strip().lower()
+    if response != 'y':
+        print("Skipping conda environment setup")
+        return False
+
+    try:
+        # Create conda environment
+        subprocess.run(
+            ['conda', 'create', '-n', 'biomni_e1', 'python=3.10', '-y'],
+            check=True
+        )
+
+        print("\n✓ Conda environment created successfully")
+        print("\nTo activate: conda activate biomni_e1")
+        print("Then install biomni: pip install biomni --upgrade")
+        return True
+
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Failed to create conda environment: {e}")
+        return False
+
+
+def setup_api_keys() -> Dict[str, str]:
+    """Prompt for each supported provider's API key and collect the answers.
+
+    Returns:
+        Mapping of environment-variable name to the key the user typed.
+        Providers answered with an empty line are left out; the mapping is
+        empty when every provider was skipped.
+    """
+    print("\n=== API Key Configuration ===")
+    print("Biomni supports multiple LLM providers.")
+    print("At minimum, configure one provider.")
+
+    # (heading, where to obtain the key, environment variable) for each
+    # provider, in the order they are presented to the user.
+    providers = (
+        (
+            "\n1. Anthropic Claude (Recommended)",
+            "   Get your API key from: https://console.anthropic.com/",
+            'ANTHROPIC_API_KEY',
+        ),
+        (
+            "\n2. OpenAI",
+            "   Get your API key from: https://platform.openai.com/api-keys",
+            'OPENAI_API_KEY',
+        ),
+        (
+            "\n3. Google Gemini",
+            "   Get your API key from: https://makersuite.google.com/app/apikey",
+            'GOOGLE_API_KEY',
+        ),
+        (
+            "\n4. Groq",
+            "   Get your API key from: https://console.groq.com/keys",
+            'GROQ_API_KEY',
+        ),
+    )
+
+    collected = {}
+    for heading, key_source, env_name in providers:
+        print(heading)
+        print(key_source)
+        entered = input(f"   Enter {env_name} (or press Enter to skip): ").strip()
+        if entered:
+            collected[env_name] = entered
+
+    if not collected:
+        print("\n⚠️  No API keys configured. You'll need at least one to use biomni.")
+
+    return collected
+
+
+def save_api_keys(api_keys: Dict[str, str], method: str = 'env_file'):
+    """Save API keys using specified method.
+
+    Args:
+        api_keys: Mapping of environment-variable name to secret value.
+        method: 'env_file' merges the keys into ``.env`` in the current
+            working directory; 'shell_export' only prints ``export`` lines
+            for the user to add to their shell configuration. Any other
+            value does nothing.
+    """
+    if method == 'env_file':
+        env_file = Path.cwd() / '.env'
+
+        # Read existing .env if present so unrelated variables survive the
+        # rewrite. Comment and blank lines are not carried over.
+        existing_vars = {}
+        if env_file.exists():
+            with open(env_file, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = line.strip()
+                    if not line or line.startswith('#') or '=' not in line:
+                        continue
+                    key, val = line.split('=', 1)
+                    existing_vars[key.strip()] = val.strip()
+
+        # Newly entered keys override any previous value of the same name
+        existing_vars.update(api_keys)
+
+        # The file holds secrets: create it readable/writable by the owner
+        # only instead of relying on the process umask, which commonly
+        # leaves new files readable by every local user.
+        fd = os.open(env_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+        with os.fdopen(fd, 'w', encoding='utf-8') as f:
+            f.write("# Biomni API Keys\n")
+            f.write("# Generated by setup_environment.py\n\n")
+            for key, value in existing_vars.items():
+                f.write(f"{key}={value}\n")
+
+        # The mode given to os.open() only applies when the file is created,
+        # so tighten a pre-existing .env explicitly as well.
+        try:
+            env_file.chmod(0o600)
+        except OSError as e:
+            print(f"⚠️  Could not restrict permissions on {env_file}: {e}")
+
+        print(f"\n✓ API keys saved to {env_file}")
+        print("  Keys will be loaded automatically when biomni runs in this directory")
+
+    elif method == 'shell_export':
+        shell_file = Path.home() / '.bashrc'  # or .zshrc for zsh users
+
+        # Nothing is written here; the user copies the lines manually
+        print("\n📋 Add these lines to your shell configuration:")
+        for key, value in api_keys.items():
+            print(f"   export {key}=\"{value}\"")
+
+        print(f"\nThen run: source {shell_file}")
+
+
+def setup_data_directory() -> Optional[Path]:
+    """Configure biomni data lake directory.
+
+    Asks whether to use ``biomni_data`` under the current working directory
+    or a custom location, creates the chosen directory if needed, and
+    returns its path.
+    """
+    print("\n=== Data Lake Configuration ===")
+    print("Biomni requires ~11GB for integrated biomedical databases.")
+
+    default_path = Path.cwd() / 'biomni_data'
+    print(f"\nDefault location: {default_path}")
+
+    response = input("Use default location? [Y/n]: ").strip().lower()
+
+    if response == 'n':
+        custom_path = input("Enter custom path: ").strip()
+        if custom_path:
+            data_path = Path(custom_path).expanduser().resolve()
+        else:
+            # Path('') resolves to the current directory itself; fall back
+            # to the default rather than silently using cwd as the data lake.
+            print("No path entered; using default location")
+            data_path = default_path
    else:
-        messages.append("Required packages: ✓")
+        data_path = default_path

-    if missing_optional:
-        messages.append(f"Missing optional packages: {', '.join(missing_optional)}")
-        messages.append("For PDF reports, install: pip install weasyprint")
+    # Create directory if it doesn't exist
+    data_path.mkdir(parents=True, exist_ok=True)

-    return success, messages
+    print(f"\n✓ Data directory configured: {data_path}")
+    print("  Data will be downloaded automatically on first use")
+
+    return data_path


-def check_api_keys() -> Tuple[bool, Dict[str, bool]]:
-    """Check which API keys are configured."""
-    api_keys = {
-        'ANTHROPIC_API_KEY': os.environ.get('ANTHROPIC_API_KEY'),
-        'OPENAI_API_KEY': os.environ.get('OPENAI_API_KEY'),
-        'GEMINI_API_KEY': os.environ.get('GEMINI_API_KEY'),
-        'GROQ_API_KEY': os.environ.get('GROQ_API_KEY'),
-    }
+def test_installation(data_path: Path):
+    """Test biomni installation with a simple query.
+
+    Writes a small script to ``test_biomni.py`` in the current directory and
+    runs it with the interpreter executing this setup script. The script is
+    deleted when it succeeds and kept for debugging when it fails.
+
+    Args:
+        data_path: Data lake directory passed to the agent in the script.
+    """
+    print("\n=== Installation Test ===")
+    print("Testing biomni installation with a simple query...")

-    configured = {key: bool(value) for key, value in api_keys.items()}
-    has_any = any(configured.values())
+    response = input("Run test? [Y/n]: ").strip().lower()
+    if response == 'n':
+        print("Skipping test")
+        return

-    return has_any, configured
+    # The path is embedded with repr() so backslashes (e.g. C:\Users\...) and
+    # quotes become a valid Python string literal in the generated script.
+    test_code = f'''
+import os
+from biomni.agent import A1

+# Use environment variables for API keys
+agent = A1(path={str(data_path)!r}, llm='claude-sonnet-4-20250514')

-def check_data_directory(data_path: str = './data') -> Tuple[bool, str]:
-    """Check if Biomni data directory exists and has content."""
-    path = Path(data_path)
+# Simple test query
+result = agent.go("What is the primary function of the TP53 gene?")
+print("Test result:", result)
+'''

-    if not path.exists():
-        return False, f"Data directory not found at {data_path}"
+    test_file = Path('test_biomni.py')
+    with open(test_file, 'w') as f:
+        f.write(test_code)

-    # Check if directory has files (data has been downloaded)
-    files = list(path.glob('*'))
-    if len(files) == 0:
-        return False, f"Data directory exists but is empty. Run agent once to download."
+    print(f"\nTest script created: {test_file}")
+    print("Running test...")

-    # Rough size check (should be ~11GB)
-    total_size = sum(f.stat().st_size for f in path.rglob('*') if f.is_file())
-    size_gb = total_size / (1024**3)
-
-    if size_gb < 1:
-        return False, f"Data directory exists but seems incomplete ({size_gb:.1f} GB)"
-
-    return True, f"Data directory: {data_path} ({size_gb:.1f} GB) ✓"
-
-
-def check_disk_space(required_gb: float = 20) -> Tuple[bool, str]:
-    """Check if sufficient disk space is available."""
    try:
-        import shutil
-        stat = shutil.disk_usage('.')
-        free_gb = stat.free / (1024**3)
-
-        if free_gb >= required_gb:
-            return True, f"Disk space: {free_gb:.1f} GB available ✓"
-        else:
-            return False, f"Low disk space: {free_gb:.1f} GB (need {required_gb} GB)"
-    except Exception as e:
-        return False, f"Could not check disk space: {e}"
+        subprocess.run([sys.executable, str(test_file)], check=True)
+        print("\n✓ Test completed successfully!")
+        test_file.unlink()  # Clean up test file
+    except subprocess.CalledProcessError:
+        print("\n❌ Test failed. Check your configuration.")
+        print(f"   Test script saved as {test_file} for debugging")


-def test_biomni_import() -> Tuple[bool, str]:
-    """Test if Biomni can be imported and initialized."""
-    try:
-        from biomni.agent import A1
-        from biomni.config import default_config
-        return True, "Biomni import successful ✓"
-    except ImportError as e:
-        return False, f"Cannot import Biomni: {e}"
-    except Exception as e:
-        return False, f"Biomni import error: {e}"
+def generate_example_script(data_path: Path):
+    """Generate example usage script.
+
+    Writes ``example_biomni_usage.py`` to the current directory. The script
+    creates an agent pointed at ``data_path``, runs two sample queries and
+    saves the conversation history.
+
+    Args:
+        data_path: Data lake directory embedded in the generated script.
+    """
+    # The path is embedded with repr() so backslashes (e.g. C:\Users\...) and
+    # quotes become a valid Python string literal in the generated script.
+    example_code = f'''#!/usr/bin/env python3
+"""
+Example biomni usage script

+This demonstrates basic biomni usage patterns.
+Modify this script for your research tasks.
+"""

-def suggest_fixes(results: Dict[str, Tuple[bool, any]]) -> List[str]:
-    """Generate suggestions for fixing issues."""
-    suggestions = []
+from biomni.agent import A1

-    if not results['python'][0]:
-        suggestions.append("➜ Upgrade Python to 3.8 or higher")
+# Initialize agent
+agent = A1(
+    path={str(data_path)!r},
+    llm='claude-sonnet-4-20250514'  # or your preferred LLM
+)

-    if not results['conda'][0]:
-        suggestions.append("➜ Activate biomni environment: conda activate biomni_e1")
+# Example 1: Simple gene query
+print("Example 1: Gene function query")
+result = agent.go("""
+What are the main functions of the BRCA1 gene?
+Include information about:
+- Molecular function
+- Associated diseases
+- Protein interactions
+""")
+print(result)
+print("-" * 80)

-    if not results['dependencies'][0]:
-        suggestions.append("➜ Install Biomni: pip install biomni --upgrade")
+# Example 2: Data analysis
+print("\\nExample 2: GWAS analysis")
+result = agent.go("""
+Explain how to analyze GWAS summary statistics for:
+1. Identifying genome-wide significant variants
+2. Mapping variants to genes
+3. Pathway enrichment analysis
+""")
+print(result)

-    if not results['api_keys'][0]:
-        suggestions.append("➜ Set API key: export ANTHROPIC_API_KEY='your-key'")
-        suggestions.append("   Or create .env file with API keys")
+# Save conversation history
+agent.save_conversation_history("example_results.pdf")
+print("\\nResults saved to example_results.pdf")
+'''

-    if not results['data'][0]:
-        suggestions.append("➜ Data will auto-download on first agent.go() call")
+    example_file = Path('example_biomni_usage.py')
+    with open(example_file, 'w') as f:
+        f.write(example_code)

-    if not results['disk_space'][0]:
-        suggestions.append("➜ Free up disk space (need ~20GB total)")
-
-    return suggestions
+    print(f"\n✓ Example script created: {example_file}")


 def main():
-    """Run all environment checks and display results."""
+    """Main setup workflow.
+
+    Runs the interactive steps in order: conda environment, API keys, data
+    directory, example script generation and an optional installation test,
+    then prints a summary and the next steps.
+    """
    print("=" * 60)
-    print("Biomni Environment Validation")
+    print("Biomni Environment Setup")
    print("=" * 60)
-    print()

-    # Run all checks
-    results = {}
+    # Step 1: Conda environment
+    conda_success = setup_conda_environment()

-    print("Checking Python version...")
-    results['python'] = check_python_version()
-    print(f"  {results['python'][1]}")
-    print()
+    if conda_success:
+        print("\n⚠️  Remember to activate the environment:")
+        print("   conda activate biomni_e1")
+        print("   pip install biomni --upgrade")

-    print("Checking conda environment...")
-    results['conda'] = check_conda_env()
-    print(f"  {results['conda'][1]}")
-    print()
+    # Step 2: API keys
+    api_keys = setup_api_keys()

-    print("Checking dependencies...")
-    results['dependencies'] = check_dependencies()
-    for msg in results['dependencies'][1]:
-        print(f"  {msg}")
-    print()
+    if api_keys:
+        print("\nHow would you like to store API keys?")
+        print("1. .env file (recommended, local to this directory)")
+        print("2. Shell export (add to .bashrc/.zshrc)")

-    print("Checking API keys...")
-    results['api_keys'] = check_api_keys()
-    has_keys, key_status = results['api_keys']
-    for key, configured in key_status.items():
-        status = "✓" if configured else "✗"
-        print(f"  {key}: {status}")
-    print()
+        choice = input("Choose [1/2]: ").strip()

-    print("Checking Biomni data directory...")
-    results['data'] = check_data_directory()
-    print(f"  {results['data'][1]}")
-    print()
+        # Any answer other than "2" falls back to the recommended .env file
+        if choice == '2':
+            save_api_keys(api_keys, method='shell_export')
+        else:
+            save_api_keys(api_keys, method='env_file')

-    print("Checking disk space...")
-    results['disk_space'] = check_disk_space()
-    print(f"  {results['disk_space'][1]}")
-    print()
+    # Step 3: Data directory
+    data_path = setup_data_directory()

-    print("Testing Biomni import...")
-    results['biomni_import'] = test_biomni_import()
-    print(f"  {results['biomni_import'][1]}")
-    print()
+    # Step 4: Generate example script
+    if data_path:
+        generate_example_script(data_path)
+
+    # Step 5: Test installation (optional)
+    if api_keys and data_path:
+        test_installation(data_path)

    # Summary
+    print("\n" + "=" * 60)
+    print("Setup Complete!")
    print("=" * 60)
-    all_passed = all(result[0] for result in results.values())

-    if all_passed:
-        print("✓ All checks passed! Environment is ready.")
-        print()
-        print("Quick start:")
-        print("  from biomni.agent import A1")
-        print("  agent = A1(path='./data', llm='claude-sonnet-4-20250514')")
-        print("  agent.go('Your biomedical task')")
+    if conda_success:
+        print("✓ Conda environment: biomni_e1")
+
+    if api_keys:
+        print(f"✓ API keys configured: {', '.join(api_keys.keys())}")
+
+    if data_path:
+        print(f"✓ Data directory: {data_path}")
+
+    print("\nNext steps:")
+    if conda_success:
+        print("1. conda activate biomni_e1")
+        print("2. pip install biomni --upgrade")
+        print("3. Run example_biomni_usage.py to test")
    else:
-        print("⚠ Some checks failed. See suggestions below:")
-        print()
-        suggestions = suggest_fixes(results)
-        for suggestion in suggestions:
-            print(suggestion)
+        # conda_success is also False when conda is installed but the user
+        # declined or `conda create` failed; only advise installing conda
+        # when it is actually missing.
+        if check_conda_installed():
+            print("1. Run this script again to create the biomni_e1 environment")
+        else:
+            print("1. Install conda/miniconda")
+            print("2. Run this script again")

-    print("=" * 60)
-
-    return 0 if all_passed else 1
+    print("\nFor documentation, see:")
+    print("  - GitHub: https://github.com/snap-stanford/biomni")
+    print("  - Paper: https://www.biorxiv.org/content/10.1101/2025.05.30.656746v1")


 if __name__ == "__main__":
-    sys.exit(main())
+    # main() no longer returns a status code, so the exit status is non-zero
+    # only when setup is interrupted or raises.
+    try:
+        main()
+    except KeyboardInterrupt:
+        # Ctrl-C during one of the interactive prompts
+        print("\n\nSetup interrupted by user")
+        sys.exit(1)
+    except Exception as e:
+        # Top-level boundary: report the error message and exit non-zero
+        print(f"\n❌ Error during setup: {e}")
+        sys.exit(1)