Add more scientific skills

This commit is contained in:
Timothy Kassis
2025-10-19 14:12:02 -07:00
parent 78d5ac2b56
commit 660c8574d0
210 changed files with 88957 additions and 1 deletion

@@ -0,0 +1,272 @@
#!/usr/bin/env python3
"""
BLAST searches and result parsing using BioPython.
This script demonstrates:
- Running BLAST searches via NCBI (qblast)
- Parsing BLAST XML output
- Filtering and analyzing results
- Working with alignments and HSPs
"""
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import SeqIO


def run_blast_online(sequence, program="blastn", database="nt", expect=0.001):
    """
    Run BLAST search via NCBI's qblast.

    Parameters:
    - sequence: Sequence string or Seq object
    - program: blastn, blastp, blastx, tblastn, tblastx
    - database: nt (nucleotide), nr (protein), refseq_rna, etc.
    - expect: E-value threshold
    """
    print(f"Running {program} search against {database} database...")
    print(f"E-value threshold: {expect}")
    print("-" * 60)

    # Run BLAST
    result_handle = NCBIWWW.qblast(
        program=program,
        database=database,
        sequence=sequence,
        expect=expect,
        hitlist_size=50,  # Number of sequences to show alignments for
    )

    # Save results
    output_file = "blast_results.xml"
    with open(output_file, "w") as out:
        out.write(result_handle.read())
    result_handle.close()

    print(f"BLAST search complete. Results saved to {output_file}")
    print()
    return output_file


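# A minimal sketch of running the same search with the query read from a FASTA
# file rather than a raw string. "query.fasta" is a hypothetical single-record
# file; qblast accepts a plain sequence string, so the record's sequence is
# simply passed through run_blast_online() above.
def run_blast_from_fasta(fasta_path="query.fasta", program="blastn", database="nt"):
    """Sketch: BLAST the first (only) record of a FASTA file via NCBI qblast."""
    record = SeqIO.read(fasta_path, "fasta")  # SeqIO.read expects exactly one record
    return run_blast_online(str(record.seq), program=program, database=database)

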
def parse_blast_results(xml_file, max_hits=10, evalue_threshold=0.001):
    """Parse BLAST XML results for a single query."""
    print(f"Parsing BLAST results from: {xml_file}")
    print(f"E-value threshold: {evalue_threshold}")
    print("=" * 60)

    # NCBIXML.read() expects exactly one query; use NCBIXML.parse() for several
    with open(xml_file) as result_handle:
        blast_record = NCBIXML.read(result_handle)

    print(f"Query: {blast_record.query}")
    print(f"Query length: {blast_record.query_length} letters")
    print(f"Database: {blast_record.database}")
    print(f"Number of alignments: {len(blast_record.alignments)}")
    print()

    hit_count = 0
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect <= evalue_threshold:
                hit_count += 1
                if hit_count <= max_hits:
                    print(f"Hit {hit_count}:")
                    print(f"  Sequence: {alignment.title}")
                    print(f"  Length: {alignment.length}")
                    print(f"  E-value: {hsp.expect:.2e}")
                    print(f"  Score: {hsp.score}")
                    print(f"  Identities: {hsp.identities}/{hsp.align_length} ({hsp.identities / hsp.align_length * 100:.1f}%)")
                    print(f"  Positives: {hsp.positives}/{hsp.align_length} ({hsp.positives / hsp.align_length * 100:.1f}%)")
                    print(f"  Gaps: {hsp.gaps}/{hsp.align_length}")
                    print(f"  Query range: {hsp.query_start} - {hsp.query_end}")
                    print(f"  Subject range: {hsp.sbjct_start} - {hsp.sbjct_end}")
                    print()

                    # Show alignment preview (first 100 characters)
                    print("  Alignment preview:")
                    print(f"  Query: {hsp.query[:100]}")
                    print(f"  Match: {hsp.match[:100]}")
                    print(f"  Sbjct: {hsp.sbjct[:100]}")
                    print()

    print(f"Total significant hits (E-value <= {evalue_threshold}): {hit_count}")
    print()
    return blast_record


def parse_multiple_queries(xml_file):
    """Parse BLAST results containing multiple queries."""
    print(f"Parsing multiple queries from: {xml_file}")
    print("=" * 60)

    with open(xml_file) as result_handle:
        # NCBIXML.parse() is a generator, so iterate while the file is open
        blast_records = NCBIXML.parse(result_handle)
        for i, blast_record in enumerate(blast_records, 1):
            print(f"\nQuery {i}: {blast_record.query}")
            print(f"  Number of hits: {len(blast_record.alignments)}")
            if blast_record.alignments:
                best_hit = blast_record.alignments[0]
                best_hsp = best_hit.hsps[0]
                print(f"  Best hit: {best_hit.title[:80]}...")
                print(f"  Best E-value: {best_hsp.expect:.2e}")


def filter_blast_results(blast_record, min_identity=0.7, min_coverage=0.5):
    """Filter BLAST results by identity and query coverage."""
    print("Filtering results:")
    print(f"  Minimum identity: {min_identity * 100}%")
    print(f"  Minimum coverage: {min_coverage * 100}%")
    print("-" * 60)

    filtered_hits = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            identity_fraction = hsp.identities / hsp.align_length
            # Coverage based on the query span of the HSP (align_length
            # includes gap columns and can overestimate coverage)
            coverage = (hsp.query_end - hsp.query_start + 1) / blast_record.query_length
            if identity_fraction >= min_identity and coverage >= min_coverage:
                filtered_hits.append(
                    {
                        "title": alignment.title,
                        "length": alignment.length,
                        "evalue": hsp.expect,
                        "identity": identity_fraction,
                        "coverage": coverage,
                        "alignment": alignment,
                        "hsp": hsp,
                    }
                )

    print(f"Found {len(filtered_hits)} hits matching criteria")
    print()

    # Sort by E-value (most significant first)
    filtered_hits.sort(key=lambda x: x["evalue"])

    # Display top hits
    for i, hit in enumerate(filtered_hits[:5], 1):
        print(f"{i}. {hit['title'][:80]}")
        print(f"   Identity: {hit['identity']*100:.1f}%, Coverage: {hit['coverage']*100:.1f}%, E-value: {hit['evalue']:.2e}")
    print()
    return filtered_hits


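# A minimal sketch of saving the list returned by filter_blast_results() to a
# tab-separated file for inspection in a spreadsheet. The output filename is an
# arbitrary choice; only the summary fields are written, not the HSP objects.
def save_filtered_hits(filtered_hits, output_file="filtered_hits.tsv"):
    """Sketch: write filtered hit summaries to a TSV file."""
    import csv

    with open(output_file, "w", newline="") as out:
        writer = csv.writer(out, delimiter="\t")
        writer.writerow(["title", "length", "evalue", "identity", "coverage"])
        for hit in filtered_hits:
            writer.writerow(
                [hit["title"], hit["length"], f"{hit['evalue']:.2e}",
                 f"{hit['identity']:.3f}", f"{hit['coverage']:.3f}"]
            )
    print(f"Wrote {len(filtered_hits)} hits to {output_file}")

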
def extract_hit_sequences(blast_record, output_file="blast_hits.fasta"):
    """Extract the aligned (HSP) subject sequences from BLAST results."""
    print(f"Extracting hit sequences to {output_file}...")
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    records = []
    for alignment in blast_record.alignments[:10]:  # Top 10 hits
        hsp = alignment.hsps[0]  # Best HSP for this alignment
        # Use the accession parsed by NCBIXML as the record ID
        accession = alignment.accession
        # Create a SeqRecord from the aligned subject sequence
        # (only the aligned region, with gap characters removed)
        record = SeqRecord(
            Seq(hsp.sbjct.replace("-", "")),  # Remove gaps
            id=accession,
            description=f"E-value: {hsp.expect:.2e}, Identity: {hsp.identities}/{hsp.align_length}",
        )
        records.append(record)

    # Write to FASTA
    SeqIO.write(records, output_file, "fasta")
    print(f"Extracted {len(records)} sequences")
    print()


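# A minimal sketch of fetching the full-length hit sequences from GenBank,
# since hsp.sbjct above contains only the aligned region. Assumes network
# access; NCBI requires Entrez.email to be set (the address below is a
# placeholder). The accession list would come from the parsed record, e.g.
# [a.accession for a in blast_record.alignments[:10]].
def fetch_full_hit_sequences(accessions, output_file="blast_hits_full.fasta"):
    """Sketch: download full-length hit sequences from NCBI by accession."""
    from Bio import Entrez

    Entrez.email = "your.name@example.org"  # placeholder; use your own address
    handle = Entrez.efetch(db="nucleotide", id=",".join(accessions),
                           rettype="fasta", retmode="text")
    records = list(SeqIO.parse(handle, "fasta"))
    handle.close()
    SeqIO.write(records, output_file, "fasta")
    print(f"Fetched {len(records)} full-length sequences to {output_file}")

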
def analyze_blast_statistics(blast_record):
    """Compute summary statistics from BLAST results."""
    print("BLAST Result Statistics:")
    print("-" * 60)

    if not blast_record.alignments:
        print("No hits found")
        return

    evalues = []
    identities = []
    bit_scores = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            evalues.append(hsp.expect)
            identities.append(hsp.identities / hsp.align_length)
            bit_scores.append(hsp.bits)  # bit score (hsp.score is the raw score)

    import statistics

    print(f"Total HSPs: {len(evalues)}")
    print("\nE-values:")
    print(f"  Min: {min(evalues):.2e}")
    print(f"  Max: {max(evalues):.2e}")
    print(f"  Median: {statistics.median(evalues):.2e}")
    print("\nIdentity percentages:")
    print(f"  Min: {min(identities)*100:.1f}%")
    print(f"  Max: {max(identities)*100:.1f}%")
    print(f"  Mean: {statistics.mean(identities)*100:.1f}%")
    print("\nBit scores:")
    print(f"  Min: {min(bit_scores):.1f}")
    print(f"  Max: {max(bit_scores):.1f}")
    print(f"  Mean: {statistics.mean(bit_scores):.1f}")
    print()


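# A minimal sketch of visualising the same statistics; assumes matplotlib is
# installed (it is not a BioPython dependency). Saves a histogram of per-HSP
# identity fractions to a PNG whose filename is an arbitrary choice.
def plot_identity_histogram(blast_record, output_file="identity_histogram.png"):
    """Sketch: plot a histogram of HSP identity fractions."""
    import matplotlib

    matplotlib.use("Agg")  # render without a display
    import matplotlib.pyplot as plt

    identities = [
        hsp.identities / hsp.align_length
        for alignment in blast_record.alignments
        for hsp in alignment.hsps
    ]
    plt.figure(figsize=(6, 4))
    plt.hist(identities, bins=20)
    plt.xlabel("Identity fraction")
    plt.ylabel("Number of HSPs")
    plt.title("BLAST HSP identity distribution")
    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    print(f"Saved identity histogram to {output_file}")

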
def example_workflow():
    """Demonstrate BLAST workflow."""
    print("=" * 60)
    print("BioPython BLAST Example Workflow")
    print("=" * 60)
    print()

    # Example sequence (human beta-globin)
    example_sequence = """
ATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGC
""".replace("\n", "").replace(" ", "")
print("Example: Human beta-globin sequence")
print(f"Length: {len(example_sequence)} bp")
print()
# Note: Uncomment to run actual BLAST search (takes time)
# xml_file = run_blast_online(example_sequence, program="blastn", database="nt", expect=0.001)
# For demonstration, use a pre-existing results file
print("To run a real BLAST search, uncomment the run_blast_online() line")
print("For now, demonstrating parsing with example results file")
print()
# If you have results, parse them:
# blast_record = parse_blast_results("blast_results.xml", max_hits=5)
# filtered = filter_blast_results(blast_record, min_identity=0.9)
# analyze_blast_statistics(blast_record)
# extract_hit_sequences(blast_record)
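

# A minimal sketch of running BLAST locally instead of over the network, as the
# closing note below suggests. Assumes the NCBI BLAST+ tools are installed (a
# blastn binary on PATH) and that a local database, here hypothetically named
# "local_nt", has been built with makeblastdb. Output format 5 is XML, which
# the parsing functions above already understand.
def run_blast_local(query_fasta, database="local_nt", output_file="local_blast.xml", evalue=0.001):
    """Sketch: run blastn locally and return the path to the XML output."""
    import subprocess

    cmd = [
        "blastn",
        "-query", query_fasta,
        "-db", database,
        "-evalue", str(evalue),
        "-outfmt", "5",  # XML output, compatible with NCBIXML
        "-out", output_file,
    ]
    subprocess.run(cmd, check=True)
    return output_file

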
if __name__ == "__main__":
    example_workflow()
    print()
    print("Note: BLAST searches can take several minutes.")
    print("For production use, consider running local BLAST instead.")