Add more scientific skills

2026-01-26 16:58:56 +08:00 · 2025-10-19 14:12:02 -07:00
parent 78d5ac2b56
commit 660c8574d0
210 changed files with 88957 additions and 1 deletions
--- a/scientific-packages/biopython/scripts/ncbi_entrez.py
+++ b/scientific-packages/biopython/scripts/ncbi_entrez.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python3
+"""
+NCBI Entrez database access using BioPython.
+
+This script demonstrates:
+- Searching NCBI databases
+- Downloading sequences by accession
+- Retrieving PubMed articles
+- Batch downloading with WebEnv
+- Proper error handling and rate limiting
+"""
+
+import time
+from Bio import Entrez, SeqIO
+
+# IMPORTANT: Always set your email. NCBI requires an email address for their
+# E-utilities, so replace the placeholder below before running any function.
+Entrez.email = "your.email@example.com"  # Change this!
+
+
+def search_nucleotide(query, max_results=10):
+    """Search the NCBI nucleotide database and print a summary of the hits.
+
+    Args:
+        query: Entrez search term, passed to ``esearch`` as ``term``.
+        max_results: Upper bound on the number of IDs requested (``retmax``).
+
+    Returns:
+        The ``IdList`` entry of the parsed search result.
+
+    Raises:
+        Exceptions from ``Entrez.esearch`` / ``Entrez.read`` are not caught
+        and propagate to the caller. If parsing fails, the handle is closed
+        before the exception leaves this function.
+    """
+
+    print(f"Searching nucleotide database for: {query}")
+    print("-" * 60)
+
+    handle = Entrez.esearch(db="nucleotide", term=query, retmax=max_results)
+    try:
+        record = Entrez.read(handle)
+    finally:
+        # Close the handle even when parsing raises, so it is never leaked.
+        handle.close()
+
+    id_list = record["IdList"]
+    print(f"Found {record['Count']} total matches")
+    print(f"Returning top {len(id_list)} IDs:")
+    print(id_list)
+    print()
+
+    return id_list
+
+
+def fetch_sequence_by_accession(accession):
+    """Download one GenBank record by accession and print a short summary.
+
+    Args:
+        accession: Identifier passed to ``efetch`` as ``id``.
+
+    Returns:
+        The record parsed by ``SeqIO.read``, or ``None`` when fetching or
+        parsing raised (the error is printed instead of being re-raised).
+    """
+
+    print(f"Fetching sequence: {accession}")
+
+    try:
+        handle = Entrez.efetch(
+            db="nucleotide", id=accession, rettype="gb", retmode="text"
+        )
+        try:
+            record = SeqIO.read(handle, "genbank")
+        finally:
+            # Close the handle even when parsing raises, so it is never leaked.
+            handle.close()
+    except Exception as e:
+        # Deliberately best-effort: report the failure and signal it with None.
+        print(f"Error fetching {accession}: {e}")
+        return None
+
+    print(f"Successfully retrieved: {record.id}")
+    print(f"Description: {record.description}")
+    print(f"Length: {len(record.seq)} bp")
+    print(f"Organism: {record.annotations.get('organism', 'Unknown')}")
+    print()
+
+    return record
+
+
+def fetch_multiple_sequences(id_list, output_file="downloaded_sequences.fasta"):
+    """Download several nucleotide records as FASTA and save them to a file.
+
+    Args:
+        id_list: IDs passed to ``efetch`` as ``id``.
+        output_file: Path the parsed records are written to in FASTA format.
+
+    Returns:
+        The list of parsed records. ``[]`` is returned when ``id_list`` is
+        empty, or when fetching, parsing or writing raised (the error is
+        printed instead of being re-raised).
+    """
+
+    print(f"Fetching {len(id_list)} sequences...")
+
+    if not id_list:
+        # Nothing to request: skip the network round-trip entirely.
+        print("No IDs supplied; nothing to download")
+        print()
+        return []
+
+    try:
+        # For >200 IDs, efetch automatically uses POST
+        handle = Entrez.efetch(
+            db="nucleotide", id=id_list, rettype="fasta", retmode="text"
+        )
+        try:
+            records = list(SeqIO.parse(handle, "fasta"))
+        finally:
+            # Close the handle even when parsing raises, so it is never leaked.
+            handle.close()
+
+        SeqIO.write(records, output_file, "fasta")
+    except Exception as e:
+        # Deliberately best-effort: report the failure and return no records.
+        print(f"Error fetching sequences: {e}")
+        return []
+
+    print(f"Successfully downloaded {len(records)} sequences to {output_file}")
+    print()
+
+    return records
+
+
+def search_and_download(query, output_file, max_results=100):
+    """Search the nucleotide database and save the hits to a FASTA file.
+
+    The IDs returned by the search are fetched in batches of 100 with a
+    half-second pause between consecutive requests. All parsed records are
+    then written to ``output_file`` in a single call.
+
+    Args:
+        query: Entrez search term, passed to ``esearch`` as ``term``.
+        output_file: Path the parsed records are written to in FASTA format.
+        max_results: Upper bound on the number of IDs requested (``retmax``).
+
+    Returns:
+        None. When the search yields no IDs, nothing is fetched or written.
+
+    Raises:
+        Exceptions from ``Bio.Entrez`` / ``Bio.SeqIO`` are not caught here
+        and propagate to the caller. Any handle that was opened is closed
+        before the exception leaves this function.
+    """
+
+    print(f"Searching and downloading: {query}")
+    print("=" * 60)
+
+    # Search
+    handle = Entrez.esearch(db="nucleotide", term=query, retmax=max_results)
+    try:
+        record = Entrez.read(handle)
+    finally:
+        handle.close()
+
+    id_list = record["IdList"]
+    print(f"Found {len(id_list)} sequences")
+
+    if not id_list:
+        print("No results found")
+        return
+
+    # Download in batches to be polite
+    batch_size = 100
+    all_records = []
+
+    for start in range(0, len(id_list), batch_size):
+        # Slicing clamps at the end of the list, so the last batch may be short.
+        batch_ids = id_list[start:start + batch_size]
+
+        print(f"Downloading batch {start // batch_size + 1} ({len(batch_ids)} sequences)...")
+
+        handle = Entrez.efetch(
+            db="nucleotide", id=batch_ids, rettype="fasta", retmode="text"
+        )
+        try:
+            all_records.extend(SeqIO.parse(handle, "fasta"))
+        finally:
+            # Close the handle even when parsing raises, so it is never leaked.
+            handle.close()
+
+        # Be polite - wait between requests, but not after the final one
+        if start + batch_size < len(id_list):
+            time.sleep(0.5)
+
+    # Save all records
+    SeqIO.write(all_records, output_file, "fasta")
+    print(f"Downloaded {len(all_records)} sequences to {output_file}")
+    print()
+
+
+def use_history_for_large_queries(query, max_results=1000):
+    """Download up to ``max_results`` FASTA records via the NCBI History server.
+
+    The search is run once with ``usehistory="y"``. The ``WebEnv`` and
+    ``QueryKey`` it returns are then handed to ``efetch``, which is called
+    without an ID list and pages through the stored result set by offset
+    in batches of at most 500 records.
+
+    Args:
+        query: Entrez search term, passed to ``esearch`` as ``term``.
+        max_results: Maximum number of records to download. Fewer are
+            fetched when the search reports fewer hits.
+
+    Returns:
+        A list with every record parsed from the downloaded batches.
+
+    Raises:
+        Exceptions from ``Bio.Entrez`` / ``Bio.SeqIO`` are not caught here
+        and propagate to the caller. Any handle that was opened is closed
+        before the exception leaves this function.
+    """
+
+    print("Using NCBI History server for large query")
+    print("-" * 60)
+
+    # Search with history
+    search_handle = Entrez.esearch(
+        db="nucleotide", term=query, retmax=max_results, usehistory="y"
+    )
+    try:
+        search_results = Entrez.read(search_handle)
+    finally:
+        search_handle.close()
+
+    count = int(search_results["Count"])
+    webenv = search_results["WebEnv"]
+    query_key = search_results["QueryKey"]
+
+    print(f"Found {count} total sequences")
+    print(f"WebEnv: {webenv[:20]}...")
+    print(f"QueryKey: {query_key}")
+    print()
+
+    # Fetch in batches using history
+    batch_size = 500
+    all_records = []
+
+    # Never ask for more than the caller wanted or than the search found.
+    total_to_fetch = min(count, max_results)
+
+    for start in range(0, total_to_fetch, batch_size):
+        end = min(start + batch_size, total_to_fetch)
+
+        print(f"Downloading records {start + 1} to {end}...")
+
+        fetch_handle = Entrez.efetch(
+            db="nucleotide",
+            rettype="fasta",
+            retmode="text",
+            retstart=start,
+            # Size the final batch to the remainder so max_results is honoured.
+            retmax=end - start,
+            webenv=webenv,
+            query_key=query_key,
+        )
+        try:
+            all_records.extend(SeqIO.parse(fetch_handle, "fasta"))
+        finally:
+            # Close the handle even when parsing raises, so it is never leaked.
+            fetch_handle.close()
+
+        # Be polite - wait between requests, but not after the final one
+        if end < total_to_fetch:
+            time.sleep(0.5)
+
+    print(f"Downloaded {len(all_records)} sequences total")
+    return all_records
+
+
+def search_pubmed(query, max_results=10):
+    """Search PubMed and print a summary of the matching articles.
+
+    Args:
+        query: Entrez search term, passed to ``esearch`` as ``term``.
+        max_results: Upper bound on the number of PMIDs requested (``retmax``).
+
+    Returns:
+        The ``IdList`` entry of the parsed search result.
+
+    Raises:
+        Exceptions from ``Entrez.esearch`` / ``Entrez.read`` are not caught
+        and propagate to the caller. If parsing fails, the handle is closed
+        before the exception leaves this function.
+    """
+
+    print(f"Searching PubMed for: {query}")
+    print("-" * 60)
+
+    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
+    try:
+        record = Entrez.read(handle)
+    finally:
+        # Close the handle even when parsing raises, so it is never leaked.
+        handle.close()
+
+    id_list = record["IdList"]
+    print(f"Found {record['Count']} total articles")
+    print(f"Returning {len(id_list)} PMIDs:")
+    print(id_list)
+    print()
+
+    return id_list
+
+
+def fetch_pubmed_abstracts(pmid_list):
+    """Fetch PubMed abstracts as text and print a preview of the response.
+
+    Args:
+        pmid_list: PMIDs passed to ``efetch`` as ``id``.
+
+    Returns:
+        None. At most the first 500 characters of the response are printed,
+        followed by ``...`` when the response was longer than that.
+
+    Raises:
+        Exceptions from ``Entrez.efetch`` or from reading the handle are not
+        caught and propagate to the caller. If reading fails, the handle is
+        closed before the exception leaves this function.
+    """
+
+    print(f"Fetching summaries for {len(pmid_list)} articles...")
+
+    handle = Entrez.efetch(db="pubmed", id=pmid_list, rettype="abstract", retmode="text")
+    try:
+        abstracts = handle.read()
+    finally:
+        # Close the handle even when reading raises, so it is never leaked.
+        handle.close()
+
+    preview_chars = 500
+    print(abstracts[:preview_chars])  # Show first 500 characters
+    if len(abstracts) > preview_chars:
+        # Only signal truncation when something was actually cut off.
+        print("...")
+    print()
+
+
+def get_database_info(database="nucleotide"):
+    """Print the name, description, record count and last update of a database.
+
+    Args:
+        database: Database name passed to ``einfo`` as ``db``.
+
+    Returns:
+        None. The values are read from the ``DbInfo`` entry of the parsed
+        response and printed.
+
+    Raises:
+        Exceptions from ``Entrez.einfo`` / ``Entrez.read`` are not caught
+        and propagate to the caller. If parsing fails, the handle is closed
+        before the exception leaves this function.
+    """
+
+    print(f"Getting info for database: {database}")
+    print("-" * 60)
+
+    handle = Entrez.einfo(db=database)
+    try:
+        record = Entrez.read(handle)
+    finally:
+        # Close the handle even when parsing raises, so it is never leaked.
+        handle.close()
+
+    db_info = record["DbInfo"]
+    print(f"Name: {db_info['DbName']}")
+    print(f"Description: {db_info['Description']}")
+    print(f"Record count: {db_info['Count']}")
+    print(f"Last update: {db_info['LastUpdate']}")
+    print()
+
+
+def link_databases(db_from, db_to, id_):
+    """Print the IDs of ``db_to`` records linked to one ``db_from`` record.
+
+    Only the first ``LinkSetDb`` entry of the first result is inspected, and
+    at most ten of its linked IDs are printed.
+
+    Args:
+        db_from: Source database, passed to ``elink`` as ``dbfrom``.
+        db_to: Target database, passed to ``elink`` as ``db``.
+        id_: Identifier of the source record, passed to ``elink`` as ``id``.
+
+    Returns:
+        None. The outcome is only printed.
+
+    Raises:
+        Exceptions from ``Entrez.elink`` / ``Entrez.read`` are not caught
+        and propagate to the caller. If parsing fails, the handle is closed
+        before the exception leaves this function.
+    """
+
+    print(f"Finding links from {db_from} ID {id_} to {db_to}")
+    print("-" * 60)
+
+    handle = Entrez.elink(dbfrom=db_from, db=db_to, id=id_)
+    try:
+        record = Entrez.read(handle)
+    finally:
+        # Close the handle even when parsing raises, so it is never leaked.
+        handle.close()
+
+    # An empty result is treated like a result that carries no links, instead
+    # of failing with IndexError on record[0].
+    link_sets = record[0]["LinkSetDb"] if record else []
+
+    if link_sets:
+        linked_ids = [link["Id"] for link in link_sets[0]["Link"]]
+        print(f"Found {len(linked_ids)} linked records")
+        print(f"IDs: {linked_ids[:10]}")
+    else:
+        print("No linked records found")
+
+    print()
+
+
+def example_workflow():
+    """Print the demo banner and a note that the examples are disabled.
+
+    The sample calls below are kept as comments on purpose, so that running
+    the script performs no requests until they are uncommented.
+    """
+
+    rule = "=" * 60
+    for banner_line in (rule, "BioPython Entrez Example Workflow", rule):
+        print(banner_line)
+    print()
+
+    # Note: These are examples - uncomment to run with your email set
+
+    # # Example 1: Search and get IDs
+    # ids = search_nucleotide("Homo sapiens[Organism] AND COX1[Gene]", max_results=5)
+    #
+    # # Example 2: Fetch a specific sequence
+    # fetch_sequence_by_accession("NM_001301717")
+    #
+    # # Example 3: Complete search and download
+    # search_and_download("Escherichia coli[Organism] AND 16S", "ecoli_16s.fasta", max_results=50)
+    #
+    # # Example 4: PubMed search
+    # pmids = search_pubmed("CRISPR[Title] AND 2023[PDAT]", max_results=5)
+    # fetch_pubmed_abstracts(pmids[:2])
+    #
+    # # Example 5: Get database info
+    # get_database_info("nucleotide")
+
+    print("Examples are commented out. Uncomment and set your email to run.")
+
+
+if __name__ == "__main__":
+    # Run the demo, then close with a reminder about the required email.
+    example_workflow()
+
+    print()
+    closing_notes = (
+        "IMPORTANT: Always set Entrez.email before using these functions!",
+        "NCBI requires an email address for their E-utilities.",
+    )
+    for note in closing_notes:
+        print(note)