Add more scientific skills

This commit is contained in:
Timothy Kassis
2025-10-19 14:12:02 -07:00
parent 78d5ac2b56
commit 660c8574d0
210 changed files with 88957 additions and 1 deletions

View File

@@ -0,0 +1,293 @@
#!/usr/bin/env python3
"""
NCBI Entrez database access using BioPython.
This script demonstrates:
- Searching NCBI databases
- Downloading sequences by accession
- Retrieving PubMed articles
- Batch downloading with WebEnv
- Proper error handling and rate limiting
"""
import time
from Bio import Entrez, SeqIO
# IMPORTANT: Always set your email before calling any Entrez function.
# NCBI requires an email address for their E-utilities; replace the
# placeholder below with your own address.
Entrez.email = "your.email@example.com" # Change this!
def search_nucleotide(query, max_results=10):
    """Run an ESearch against the nucleotide database and report the hits.

    Args:
        query: Entrez search term.
        max_results: Passed to ESearch as ``retmax``.

    Returns:
        The ``IdList`` entry of the parsed ESearch result.
    """
    rule = "-" * 60
    print(f"Searching nucleotide database for: {query}")
    print(rule)

    esearch_handle = Entrez.esearch(
        db="nucleotide",
        term=query,
        retmax=max_results,
    )
    result = Entrez.read(esearch_handle)
    esearch_handle.close()

    # "Count" is the total number of matches; "IdList" holds only the IDs
    # actually returned for this request.
    print(f"Found {result['Count']} total matches")
    matched_ids = result["IdList"]
    print(f"Returning top {len(matched_ids)} IDs:")
    print(matched_ids)
    print()
    return result["IdList"]
def fetch_sequence_by_accession(accession):
    """Download one GenBank record from the nucleotide database.

    Args:
        accession: Identifier passed to EFetch as ``id``.

    Returns:
        The record parsed by ``SeqIO.read``, or ``None`` if fetching or
        parsing failed (the error is printed rather than raised).
    """
    print(f"Fetching sequence: {accession}")
    try:
        handle = Entrez.efetch(
            db="nucleotide", id=accession, rettype="gb", retmode="text"
        )
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Close the handle even when parsing raises, so a bad response
            # does not leave the connection open.
            handle.close()
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure
        # to the caller with None instead of propagating.
        print(f"Error fetching {accession}: {e}")
        return None

    print(f"Successfully retrieved: {record.id}")
    print(f"Description: {record.description}")
    print(f"Length: {len(record.seq)} bp")
    print(f"Organism: {record.annotations.get('organism', 'Unknown')}")
    print()
    return record
def fetch_multiple_sequences(id_list, output_file="downloaded_sequences.fasta"):
    """Download several sequences in one EFetch call and save them as FASTA.

    Args:
        id_list: Sequence of identifiers passed to EFetch as ``id``.
        output_file: Path the FASTA records are written to.

    Returns:
        A list of the parsed records, or an empty list if ``id_list`` is
        empty or if fetching, parsing or writing failed (the error is
        printed rather than raised).
    """
    print(f"Fetching {len(id_list)} sequences...")
    if not id_list:
        # Nothing to request: skip the network round-trip entirely.
        print("No IDs supplied; nothing to download")
        print()
        return []

    try:
        # For >200 IDs, efetch automatically uses POST
        handle = Entrez.efetch(
            db="nucleotide", id=id_list, rettype="fasta", retmode="text"
        )
        try:
            # Materialize the records before the handle is closed.
            records = list(SeqIO.parse(handle, "fasta"))
        finally:
            # Close the handle even when parsing raises.
            handle.close()
        SeqIO.write(records, output_file, "fasta")
    except Exception as e:
        # Deliberately best-effort: report and return an empty result.
        print(f"Error fetching sequences: {e}")
        return []

    print(f"Successfully downloaded {len(records)} sequences to {output_file}")
    print()
    return records
def search_and_download(query, output_file, max_results=100, batch_size=100):
    """Search the nucleotide database and save the matching sequences.

    Runs an ESearch for ``query``, downloads the returned IDs in batches via
    EFetch, and writes all records to ``output_file`` in FASTA format.

    Args:
        query: Entrez search term.
        output_file: Path the FASTA records are written to.
        max_results: Passed to ESearch as ``retmax``.
        batch_size: Number of IDs requested per EFetch call.

    Returns:
        None. If the search yields no IDs, nothing is downloaded or written.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print(f"Searching and downloading: {query}")
    print("=" * 60)

    # Search
    handle = Entrez.esearch(db="nucleotide", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when the response cannot be parsed.
        handle.close()

    id_list = record["IdList"]
    print(f"Found {len(id_list)} sequences")
    if not id_list:
        print("No results found")
        return

    # Download in batches to be polite
    all_records = []
    batch_starts = range(0, len(id_list), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        if batch_number > 1:
            # Be polite - wait between requests (no pause is needed once the
            # last batch has been fetched).
            time.sleep(0.5)

        batch_ids = id_list[start:start + batch_size]
        print(f"Downloading batch {batch_number} ({len(batch_ids)} sequences)...")
        handle = Entrez.efetch(
            db="nucleotide", id=batch_ids, rettype="fasta", retmode="text"
        )
        try:
            all_records.extend(SeqIO.parse(handle, "fasta"))
        finally:
            # Close the handle even when parsing raises.
            handle.close()

    # Save all records
    SeqIO.write(all_records, output_file, "fasta")
    print(f"Downloaded {len(all_records)} sequences to {output_file}")
    print()
def use_history_for_large_queries(query, max_results=1000, batch_size=500):
    """Download a large result set through the NCBI History server.

    Runs an ESearch with ``usehistory="y"`` and then pages through the
    stored result with EFetch using the returned ``WebEnv``/``QueryKey``.

    Args:
        query: Entrez search term.
        max_results: Upper bound on the number of records downloaded.
        batch_size: Maximum number of records requested per EFetch call.

    Returns:
        A list of all FASTA records parsed from the fetched batches.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print("Using NCBI History server for large query")
    print("-" * 60)

    # Search with history
    search_handle = Entrez.esearch(
        db="nucleotide", term=query, retmax=max_results, usehistory="y"
    )
    try:
        search_results = Entrez.read(search_handle)
    finally:
        # Close the handle even when the response cannot be parsed.
        search_handle.close()

    count = int(search_results["Count"])
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]
    print(f"Found {count} total sequences")
    print(f"WebEnv: {webenv[:20]}...")
    print(f"QueryKey: {query_key}")
    print()

    # Never ask for more than exist or more than the caller allowed.
    total = min(count, max_results)

    # Fetch in batches using history
    all_records = []
    for start in range(0, total, batch_size):
        if start:
            # Be polite - wait between requests, but not after the last one.
            time.sleep(0.5)

        end = min(start + batch_size, total)
        print(f"Downloading records {start + 1} to {end}...")
        fetch_handle = Entrez.efetch(
            db="nucleotide",
            rettype="fasta",
            retmode="text",
            retstart=start,
            # Size the final batch exactly so the overall result never
            # exceeds ``total`` records.
            retmax=end - start,
            webenv=webenv,
            query_key=query_key,
        )
        try:
            all_records.extend(SeqIO.parse(fetch_handle, "fasta"))
        finally:
            # Close the handle even when parsing raises.
            fetch_handle.close()

    print(f"Downloaded {len(all_records)} sequences total")
    return all_records
def search_pubmed(query, max_results=10):
    """Run an ESearch against PubMed and report the matching PMIDs.

    Args:
        query: Entrez search term.
        max_results: Passed to ESearch as ``retmax``.

    Returns:
        The ``IdList`` entry of the parsed ESearch result.
    """
    rule = "-" * 60
    print(f"Searching PubMed for: {query}")
    print(rule)

    esearch_handle = Entrez.esearch(
        db="pubmed",
        term=query,
        retmax=max_results,
    )
    result = Entrez.read(esearch_handle)
    esearch_handle.close()

    pmids = result["IdList"]
    # "Count" is the total number of matches; only the returned IDs are
    # listed in "IdList".
    total_matches = result["Count"]
    print(f"Found {total_matches} total articles")
    print(f"Returning {len(pmids)} PMIDs:")
    print(pmids)
    print()
    return pmids
def fetch_pubmed_abstracts(pmid_list):
    """Fetch plain-text abstracts for the given PMIDs and print a preview.

    Args:
        pmid_list: Sequence of identifiers passed to EFetch as ``id``.

    Returns:
        None. Only the first 500 characters of the response are printed.
    """
    print(f"Fetching summaries for {len(pmid_list)} articles...")

    efetch_handle = Entrez.efetch(
        db="pubmed",
        id=pmid_list,
        rettype="abstract",
        retmode="text",
    )
    response_text = efetch_handle.read()
    efetch_handle.close()

    preview = response_text[:500]  # Show first 500 characters
    print(preview)
    print("...")
    print()
def get_database_info(database="nucleotide"):
    """Print selected EInfo fields for an NCBI database.

    Args:
        database: Database name passed to EInfo as ``db``.

    Returns:
        None. The name, description, record count and last-update fields
        of the ``DbInfo`` entry are printed.
    """
    print(f"Getting info for database: {database}")
    print("-" * 60)

    einfo_handle = Entrez.einfo(db=database)
    parsed = Entrez.read(einfo_handle)
    einfo_handle.close()

    details = parsed["DbInfo"]
    # (label shown to the user, key inside DbInfo), in display order.
    fields = (
        ("Name", "DbName"),
        ("Description", "Description"),
        ("Record count", "Count"),
        ("Last update", "LastUpdate"),
    )
    for label, key in fields:
        print(f"{label}: {details[key]}")
    print()
def link_databases(db_from, db_to, id_):
    """Look up records in ``db_to`` that are linked to one ``db_from`` record.

    Args:
        db_from: Source database, passed to ELink as ``dbfrom``.
        db_to: Target database, passed to ELink as ``db``.
        id_: Identifier of the source record.

    Returns:
        None. The number of linked records and up to ten of their IDs are
        printed.
    """
    print(f"Finding links from {db_from} ID {id_} to {db_to}")
    print("-" * 60)

    elink_handle = Entrez.elink(dbfrom=db_from, db=db_to, id=id_)
    parsed = Entrez.read(elink_handle)
    elink_handle.close()

    # Only the first link set, and within it the first LinkSetDb entry,
    # is inspected.
    if not parsed[0]["LinkSetDb"]:
        print("No linked records found")
    else:
        first_link_db = parsed[0]["LinkSetDb"][0]
        linked_ids = [entry["Id"] for entry in first_link_db["Link"]]
        print(f"Found {len(linked_ids)} linked records")
        print(f"IDs: {linked_ids[:10]}")
    print()
def example_workflow():
    """Print the demo banner and a note that the sample calls are disabled.

    The example calls below are kept as comments so the script makes no
    network requests when run unmodified.
    """
    banner_rule = "=" * 60
    for banner_line in (banner_rule, "BioPython Entrez Example Workflow", banner_rule, ""):
        print(banner_line)

    # Note: These are examples - uncomment to run with your email set
    #
    # Example 1: Search and get IDs
    #     ids = search_nucleotide("Homo sapiens[Organism] AND COX1[Gene]", max_results=5)
    #
    # Example 2: Fetch a specific sequence
    #     fetch_sequence_by_accession("NM_001301717")
    #
    # Example 3: Complete search and download
    #     search_and_download("Escherichia coli[Organism] AND 16S", "ecoli_16s.fasta", max_results=50)
    #
    # Example 4: PubMed search
    #     pmids = search_pubmed("CRISPR[Title] AND 2023[PDAT]", max_results=5)
    #     fetch_pubmed_abstracts(pmids[:2])
    #
    # Example 5: Get database info
    #     get_database_info("nucleotide")

    print("Examples are commented out. Uncomment and set your email to run.")
if __name__ == "__main__":
    example_workflow()

    # Closing reminder: a blank separator line followed by the email notice.
    closing_lines = (
        "",
        "IMPORTANT: Always set Entrez.email before using these functions!",
        "NCBI requires an email address for their E-utilities.",
    )
    for closing_line in closing_lines:
        print(closing_line)