# claude-scientific-skills/scientific-packages/biopython/scripts/ncbi_entrez.py

#!/usr/bin/env python3
"""
NCBI Entrez database access using BioPython.

This script demonstrates:
- Searching NCBI databases
- Downloading sequences by accession
- Retrieving PubMed articles
- Batch downloading with WebEnv
- Proper error handling and rate limiting
"""

import time
from Bio import Entrez, SeqIO

# IMPORTANT: Always set your email. NCBI requires an email address for their
# E-utilities, so replace the placeholder below before calling any function
# in this module.
Entrez.email = "your.email@example.com"  # Change this!


def search_nucleotide(query, max_results=10):
    """Search the NCBI nucleotide database and return the matching IDs.

    Args:
        query: Entrez search term passed to ``esearch``.
        max_results: Maximum number of IDs to request (sent as ``retmax``).

    Returns:
        The ``IdList`` entry of the parsed ``esearch`` response.
    """

    print(f"Searching nucleotide database for: {query}")
    print("-" * 60)

    handle = Entrez.esearch(db="nucleotide", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    id_list = record["IdList"]
    print(f"Found {record['Count']} total matches")
    print(f"Returning top {len(id_list)} IDs:")
    print(id_list)
    print()

    return id_list


def fetch_sequence_by_accession(accession):
    """Download one GenBank record by accession and print a short summary.

    Args:
        accession: Identifier passed as ``id`` to ``efetch`` on the
            nucleotide database.

    Returns:
        The parsed record, or ``None`` if fetching or parsing failed (the
        error is printed rather than raised).
    """

    print(f"Fetching sequence: {accession}")

    try:
        handle = Entrez.efetch(
            db="nucleotide", id=accession, rettype="gb", retmode="text"
        )
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Close the connection even when parsing fails.
            handle.close()
    except Exception as e:
        # Deliberately broad: this helper is best-effort and reports any
        # network or parsing failure instead of propagating it.
        print(f"Error fetching {accession}: {e}")
        return None

    print(f"Successfully retrieved: {record.id}")
    print(f"Description: {record.description}")
    print(f"Length: {len(record.seq)} bp")
    print(f"Organism: {record.annotations.get('organism', 'Unknown')}")
    print()

    return record


def fetch_multiple_sequences(id_list, output_file="downloaded_sequences.fasta"):
    """Download several sequences in one request and save them as FASTA.

    Args:
        id_list: Identifiers passed as ``id`` to ``efetch`` on the
            nucleotide database.
        output_file: Path of the FASTA file to write.

    Returns:
        A list of the parsed records. An empty list is returned when
        ``id_list`` is empty or when any step fails (the error is printed
        rather than raised).
    """

    print(f"Fetching {len(id_list)} sequences...")

    if not id_list:
        # Nothing to ask for; avoid sending a request without any IDs.
        print("No IDs supplied; nothing to fetch")
        print()
        return []

    try:
        # For >200 IDs, efetch automatically uses POST
        handle = Entrez.efetch(
            db="nucleotide", id=id_list, rettype="fasta", retmode="text"
        )
        try:
            # Materialise the records before the handle is closed.
            records = list(SeqIO.parse(handle, "fasta"))
        finally:
            handle.close()

        SeqIO.write(records, output_file, "fasta")

        print(f"Successfully downloaded {len(records)} sequences to {output_file}")
        print()

        return records

    except Exception as e:
        # Deliberately broad: best-effort helper that reports network,
        # parsing and file-writing failures instead of propagating them.
        print(f"Error fetching sequences: {e}")
        return []


def search_and_download(query, output_file, max_results=100, batch_size=100):
    """Search the nucleotide database and save the hits as FASTA.

    Args:
        query: Entrez search term passed to ``esearch``.
        output_file: Path of the FASTA file to write.
        max_results: Maximum number of IDs requested from the search.
        batch_size: Number of IDs sent per ``efetch`` request.

    Returns:
        None. No file is written when the search has no hits.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    print(f"Searching and downloading: {query}")
    print("=" * 60)

    # Search
    handle = Entrez.esearch(db="nucleotide", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    id_list = record["IdList"]
    print(f"Found {len(id_list)} sequences")

    if not id_list:
        print("No results found")
        return

    # Download in batches to be polite
    total = len(id_list)
    all_records = []

    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        batch_ids = id_list[start:end]

        print(f"Downloading batch {start // batch_size + 1} ({len(batch_ids)} sequences)...")

        handle = Entrez.efetch(
            db="nucleotide", id=batch_ids, rettype="fasta", retmode="text"
        )
        try:
            all_records.extend(SeqIO.parse(handle, "fasta"))
        finally:
            # Close the connection even when parsing this batch fails.
            handle.close()

        # Be polite - wait between requests (no need after the last one)
        if end < total:
            time.sleep(0.5)

    # Save all records
    SeqIO.write(all_records, output_file, "fasta")
    print(f"Downloaded {len(all_records)} sequences to {output_file}")
    print()


def use_history_for_large_queries(query, max_results=1000, batch_size=500):
    """Download search hits in batches through the NCBI History server.

    The search is run with ``usehistory="y"`` and the returned ``WebEnv`` /
    ``QueryKey`` pair is then used to page through the results with
    ``efetch`` instead of sending explicit ID lists.

    Args:
        query: Entrez search term passed to ``esearch``.
        max_results: Upper bound on the number of records to download.
        batch_size: Number of records requested per ``efetch`` call.

    Returns:
        A list of all parsed FASTA records.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    print("Using NCBI History server for large query")
    print("-" * 60)

    # Search with history
    search_handle = Entrez.esearch(
        db="nucleotide", term=query, retmax=max_results, usehistory="y"
    )
    try:
        search_results = Entrez.read(search_handle)
    finally:
        search_handle.close()

    count = int(search_results["Count"])
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]

    print(f"Found {count} total sequences")
    print(f"WebEnv: {webenv[:20]}...")
    print(f"QueryKey: {query_key}")
    print()

    # Never ask for more than exists or more than the caller allowed.
    total = min(count, max_results)
    all_records = []

    # Fetch in batches using history
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)

        print(f"Downloading records {start + 1} to {end}...")

        fetch_handle = Entrez.efetch(
            db="nucleotide",
            rettype="fasta",
            retmode="text",
            retstart=start,
            # Shrink the final batch so the total stays within max_results.
            retmax=end - start,
            webenv=webenv,
            query_key=query_key,
        )
        try:
            all_records.extend(SeqIO.parse(fetch_handle, "fasta"))
        finally:
            # Close the connection even when parsing this batch fails.
            fetch_handle.close()

        # Be polite (no need to wait after the last request)
        if end < total:
            time.sleep(0.5)

    print(f"Downloaded {len(all_records)} sequences total")
    return all_records


def search_pubmed(query, max_results=10):
    """Search PubMed and return the matching PMIDs.

    Args:
        query: Entrez search term passed to ``esearch``.
        max_results: Maximum number of PMIDs to request (sent as ``retmax``).

    Returns:
        The ``IdList`` entry of the parsed ``esearch`` response.
    """

    print(f"Searching PubMed for: {query}")
    print("-" * 60)

    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    id_list = record["IdList"]
    print(f"Found {record['Count']} total articles")
    print(f"Returning {len(id_list)} PMIDs:")
    print(id_list)
    print()

    return id_list


def fetch_pubmed_abstracts(pmid_list):
    """Fetch PubMed abstracts as plain text and print a short preview.

    Args:
        pmid_list: PMIDs passed as ``id`` to ``efetch`` on the pubmed
            database.

    Returns:
        None. Only the first 500 characters of the response are printed.
    """

    print(f"Fetching summaries for {len(pmid_list)} articles...")

    if not pmid_list:
        # Nothing to ask for; avoid sending a request without any IDs.
        print("No PMIDs supplied; nothing to fetch")
        print()
        return

    handle = Entrez.efetch(db="pubmed", id=pmid_list, rettype="abstract", retmode="text")
    try:
        abstracts = handle.read()
    finally:
        # Release the connection even when reading the response fails.
        handle.close()

    preview_length = 500
    print(abstracts[:preview_length])  # Show first 500 characters
    if len(abstracts) > preview_length:
        # Only signal truncation when something was actually cut off.
        print("...")
    print()


def get_database_info(database="nucleotide"):
    """Print basic metadata about an NCBI database.

    Args:
        database: Database name passed as ``db`` to ``einfo``.

    Returns:
        None. The name, description, record count and last update from the
        ``DbInfo`` entry of the response are printed.
    """

    print(f"Getting info for database: {database}")
    print("-" * 60)

    handle = Entrez.einfo(db=database)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    db_info = record["DbInfo"]
    print(f"Name: {db_info['DbName']}")
    print(f"Description: {db_info['Description']}")
    print(f"Record count: {db_info['Count']}")
    print(f"Last update: {db_info['LastUpdate']}")
    print()


def link_databases(db_from, db_to, id_):
    """Print the records in ``db_to`` that are linked to one source record.

    Only the first link set of the first result entry is reported, and at
    most ten linked IDs are printed.

    Args:
        db_from: Source database, passed as ``dbfrom`` to ``elink``.
        db_to: Target database, passed as ``db`` to ``elink``.
        id_: Identifier of the source record.

    Returns:
        None.
    """

    print(f"Finding links from {db_from} ID {id_} to {db_to}")
    print("-" * 60)

    handle = Entrez.elink(dbfrom=db_from, db=db_to, id=id_)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    # An empty result is treated the same as "no links" instead of raising
    # IndexError on record[0].
    link_sets = record[0]["LinkSetDb"] if record else []

    if link_sets:
        linked_ids = [link["Id"] for link in link_sets[0]["Link"]]
        print(f"Found {len(linked_ids)} linked records")
        print(f"IDs: {linked_ids[:10]}")
    else:
        print("No linked records found")

    print()


def example_workflow():
    """Print the demo banner and a pointer to the disabled examples.

    The example calls below are kept as comments so that importing or
    running this script never contacts NCBI with the placeholder email.
    """

    rule = "=" * 60
    for banner_line in (rule, "BioPython Entrez Example Workflow", rule, ""):
        print(banner_line)

    # Note: These are examples - uncomment to run with your email set

    # # Example 1: Search and get IDs
    # ids = search_nucleotide("Homo sapiens[Organism] AND COX1[Gene]", max_results=5)
    #
    # # Example 2: Fetch a specific sequence
    # fetch_sequence_by_accession("NM_001301717")
    #
    # # Example 3: Complete search and download
    # search_and_download("Escherichia coli[Organism] AND 16S", "ecoli_16s.fasta", max_results=50)
    #
    # # Example 4: PubMed search
    # pmids = search_pubmed("CRISPR[Title] AND 2023[PDAT]", max_results=5)
    # fetch_pubmed_abstracts(pmids[:2])
    #
    # # Example 5: Get database info
    # get_database_info("nucleotide")

    closing_note = "Examples are commented out. Uncomment and set your email to run."
    print(closing_note)


if __name__ == "__main__":
    # Run the demonstration, then remind the user about the required email.
    example_workflow()

    reminder_lines = (
        "",
        "IMPORTANT: Always set Entrez.email before using these functions!",
        "NCBI requires an email address for their E-utilities.",
    )
    for reminder in reminder_lines:
        print(reminder)