#!/usr/bin/env python3 """ BLAST searches and result parsing using BioPython. This script demonstrates: - Running BLAST searches via NCBI (qblast) - Parsing BLAST XML output - Filtering and analyzing results - Working with alignments and HSPs """ from Bio.Blast import NCBIWWW, NCBIXML from Bio import SeqIO def run_blast_online(sequence, program="blastn", database="nt", expect=0.001): """ Run BLAST search via NCBI's qblast. Parameters: - sequence: Sequence string or Seq object - program: blastn, blastp, blastx, tblastn, tblastx - database: nt (nucleotide), nr (protein), refseq_rna, etc. - expect: E-value threshold """ print(f"Running {program} search against {database} database...") print(f"E-value threshold: {expect}") print("-" * 60) # Run BLAST result_handle = NCBIWWW.qblast( program=program, database=database, sequence=sequence, expect=expect, hitlist_size=50, # Number of sequences to show alignments for ) # Save results output_file = "blast_results.xml" with open(output_file, "w") as out: out.write(result_handle.read()) result_handle.close() print(f"BLAST search complete. Results saved to {output_file}") print() return output_file def parse_blast_results(xml_file, max_hits=10, evalue_threshold=0.001): """Parse BLAST XML results.""" print(f"Parsing BLAST results from: {xml_file}") print(f"E-value threshold: {evalue_threshold}") print("=" * 60) with open(xml_file) as result_handle: blast_record = NCBIXML.read(result_handle) print(f"Query: {blast_record.query}") print(f"Query length: {blast_record.query_length} residues") print(f"Database: {blast_record.database}") print(f"Number of alignments: {len(blast_record.alignments)}") print() hit_count = 0 for alignment in blast_record.alignments: for hsp in alignment.hsps: if hsp.expect <= evalue_threshold: hit_count += 1 if hit_count <= max_hits: print(f"Hit {hit_count}:") print(f" Sequence: {alignment.title}") print(f" Length: {alignment.length}") print(f" E-value: {hsp.expect:.2e}") print(f" Score: {hsp.score}") print(f" Identities: {hsp.identities}/{hsp.align_length} ({hsp.identities / hsp.align_length * 100:.1f}%)") print(f" Positives: {hsp.positives}/{hsp.align_length} ({hsp.positives / hsp.align_length * 100:.1f}%)") print(f" Gaps: {hsp.gaps}/{hsp.align_length}") print(f" Query range: {hsp.query_start} - {hsp.query_end}") print(f" Subject range: {hsp.sbjct_start} - {hsp.sbjct_end}") print() # Show alignment (first 100 characters) print(" Alignment preview:") print(f" Query: {hsp.query[:100]}") print(f" Match: {hsp.match[:100]}") print(f" Sbjct: {hsp.sbjct[:100]}") print() print(f"Total significant hits (E-value <= {evalue_threshold}): {hit_count}") print() return blast_record def parse_multiple_queries(xml_file): """Parse BLAST results with multiple queries.""" print(f"Parsing multiple queries from: {xml_file}") print("=" * 60) with open(xml_file) as result_handle: blast_records = NCBIXML.parse(result_handle) for i, blast_record in enumerate(blast_records, 1): print(f"\nQuery {i}: {blast_record.query}") print(f" Number of hits: {len(blast_record.alignments)}") if blast_record.alignments: best_hit = blast_record.alignments[0] best_hsp = best_hit.hsps[0] print(f" Best hit: {best_hit.title[:80]}...") print(f" Best E-value: {best_hsp.expect:.2e}") def filter_blast_results(blast_record, min_identity=0.7, min_coverage=0.5): """Filter BLAST results by identity and coverage.""" print(f"Filtering results:") print(f" Minimum identity: {min_identity * 100}%") print(f" Minimum coverage: {min_coverage * 100}%") print("-" * 60) filtered_hits = [] for alignment in blast_record.alignments: for hsp in alignment.hsps: identity_fraction = hsp.identities / hsp.align_length coverage = hsp.align_length / blast_record.query_length if identity_fraction >= min_identity and coverage >= min_coverage: filtered_hits.append( { "title": alignment.title, "length": alignment.length, "evalue": hsp.expect, "identity": identity_fraction, "coverage": coverage, "alignment": alignment, "hsp": hsp, } ) print(f"Found {len(filtered_hits)} hits matching criteria") print() # Sort by E-value filtered_hits.sort(key=lambda x: x["evalue"]) # Display top hits for i, hit in enumerate(filtered_hits[:5], 1): print(f"{i}. {hit['title'][:80]}") print(f" Identity: {hit['identity']*100:.1f}%, Coverage: {hit['coverage']*100:.1f}%, E-value: {hit['evalue']:.2e}") print() return filtered_hits def extract_hit_sequences(blast_record, output_file="blast_hits.fasta"): """Extract aligned sequences from BLAST results.""" print(f"Extracting hit sequences to {output_file}...") from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord records = [] for i, alignment in enumerate(blast_record.alignments[:10]): # Top 10 hits hsp = alignment.hsps[0] # Best HSP for this alignment # Extract accession from title accession = alignment.title.split()[0] # Create SeqRecord from aligned subject sequence record = SeqRecord( Seq(hsp.sbjct.replace("-", "")), # Remove gaps id=accession, description=f"E-value: {hsp.expect:.2e}, Identity: {hsp.identities}/{hsp.align_length}", ) records.append(record) # Write to FASTA SeqIO.write(records, output_file, "fasta") print(f"Extracted {len(records)} sequences") print() def analyze_blast_statistics(blast_record): """Compute statistics from BLAST results.""" print("BLAST Result Statistics:") print("-" * 60) if not blast_record.alignments: print("No hits found") return evalues = [] identities = [] scores = [] for alignment in blast_record.alignments: for hsp in alignment.hsps: evalues.append(hsp.expect) identities.append(hsp.identities / hsp.align_length) scores.append(hsp.score) import statistics print(f"Total HSPs: {len(evalues)}") print(f"\nE-values:") print(f" Min: {min(evalues):.2e}") print(f" Max: {max(evalues):.2e}") print(f" Median: {statistics.median(evalues):.2e}") print(f"\nIdentity percentages:") print(f" Min: {min(identities)*100:.1f}%") print(f" Max: {max(identities)*100:.1f}%") print(f" Mean: {statistics.mean(identities)*100:.1f}%") print(f"\nBit scores:") print(f" Min: {min(scores):.1f}") print(f" Max: {max(scores):.1f}") print(f" Mean: {statistics.mean(scores):.1f}") print() def example_workflow(): """Demonstrate BLAST workflow.""" print("=" * 60) print("BioPython BLAST Example Workflow") print("=" * 60) print() # Example sequence (human beta-globin) example_sequence = """ ATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGC """.replace("\n", "").replace(" ", "") print("Example: Human beta-globin sequence") print(f"Length: {len(example_sequence)} bp") print() # Note: Uncomment to run actual BLAST search (takes time) # xml_file = run_blast_online(example_sequence, program="blastn", database="nt", expect=0.001) # For demonstration, use a pre-existing results file print("To run a real BLAST search, uncomment the run_blast_online() line") print("For now, demonstrating parsing with example results file") print() # If you have results, parse them: # blast_record = parse_blast_results("blast_results.xml", max_hits=5) # filtered = filter_blast_results(blast_record, min_identity=0.9) # analyze_blast_statistics(blast_record) # extract_hit_sequences(blast_record) if __name__ == "__main__": example_workflow() print() print("Note: BLAST searches can take several minutes.") print("For production use, consider running local BLAST instead.")