Update biopython skill

This commit is contained in:
Timothy Kassis
2025-10-21 10:47:34 -07:00
parent 11da596765
commit 998a514f74
17 changed files with 3651 additions and 2796 deletions

View File

@@ -0,0 +1,577 @@
# Advanced Biopython Features
## Sequence Motifs with Bio.motifs
### Creating Motifs
```python
from Bio import motifs
from Bio.Seq import Seq
# Create motif from instances
instances = [
Seq("TACAA"),
Seq("TACGC"),
Seq("TACAC"),
Seq("TACCC"),
Seq("AACCC"),
Seq("AATGC"),
Seq("AATGC"),
]
motif = motifs.create(instances)
```
### Motif Consensus and Degenerate Sequences
```python
# Get consensus sequence
print(motif.counts.consensus)
# Get degenerate consensus (IUPAC ambiguity codes)
print(motif.counts.degenerate_consensus)
# Access counts matrix
print(motif.counts)
```
### Position Weight Matrix (PWM)
```python
# Create position weight matrix
pwm = motif.counts.normalize(pseudocounts=0.5)
print(pwm)
# Calculate information content
ic = motif.counts.information_content()
print(f"Information content: {ic:.2f} bits")
```
### Searching for Motifs
```python
from Bio.Seq import Seq
# Search sequence for motif
test_seq = Seq("ATACAGGACAGACATACGCATACAACATTACAC")
# Get Position Specific Scoring Matrix (PSSM)
pssm = pwm.log_odds()
# Search sequence
for position, score in pssm.search(test_seq, threshold=5.0):
print(f"Position {position}: score = {score:.2f}")
```
### Reading Motifs from Files
```python
# Read motif from JASPAR format
with open("motif.jaspar") as handle:
motif = motifs.read(handle, "jaspar")
# Read multiple motifs
with open("motifs.jaspar") as handle:
for m in motifs.parse(handle, "jaspar"):
print(m.name)
# Supported formats: jaspar, meme, transfac, pfm
```
### Writing Motifs
```python
# Write motif in JASPAR format
with open("output.jaspar", "w") as handle:
handle.write(motif.format("jaspar"))
```
## Population Genetics with Bio.PopGen
### Working with GenePop Files
```python
from Bio.PopGen import GenePop
# Read GenePop file
with open("data.gen") as handle:
record = GenePop.read(handle)
# Access populations
print(f"Number of populations: {len(record.populations)}")
print(f"Loci: {record.loci_list}")
# Iterate through populations
for pop_idx, pop in enumerate(record.populations):
print(f"\nPopulation {pop_idx + 1}:")
for individual in pop:
print(f" {individual[0]}: {individual[1]}")
```
### Calculating Population Statistics
```python
from Bio.PopGen.GenePop.Controller import GenePopController
# Create controller
ctrl = GenePopController()
# Calculate basic statistics
result = ctrl.calc_allele_genotype_freqs("data.gen")
# Calculate Fst
fst_result = ctrl.calc_fst_all("data.gen")
print(f"Fst: {fst_result}")
# Test Hardy-Weinberg equilibrium
hw_result = ctrl.test_hw_pop("data.gen", "probability")
```
## Sequence Utilities with Bio.SeqUtils
### GC Content
```python
from Bio.SeqUtils import gc_fraction
from Bio.Seq import Seq
seq = Seq("ATCGATCGATCG")
gc = gc_fraction(seq)
print(f"GC content: {gc:.2%}")
```
### Molecular Weight
```python
from Bio.SeqUtils import molecular_weight
# DNA molecular weight
dna_seq = Seq("ATCG")
mw = molecular_weight(dna_seq, seq_type="DNA")
print(f"DNA MW: {mw:.2f} g/mol")
# Protein molecular weight
protein_seq = Seq("ACDEFGHIKLMNPQRSTVWY")
mw = molecular_weight(protein_seq, seq_type="protein")
print(f"Protein MW: {mw:.2f} Da")
```
### Melting Temperature
```python
from Bio.SeqUtils import MeltingTemp as mt
# Calculate Tm using nearest-neighbor method
seq = Seq("ATCGATCGATCG")
tm = mt.Tm_NN(seq)
print(f"Tm: {tm:.1f}°C")
# Use different salt concentration
tm = mt.Tm_NN(seq, Na=50, Mg=1.5) # 50 mM Na+, 1.5 mM Mg2+
# Wallace rule (for primers)
tm_wallace = mt.Tm_Wallace(seq)
```
### GC Skew
```python
from Bio.SeqUtils import gc_skew
# Calculate GC skew
seq = Seq("ATCGATCGGGCCCAAATTT")
skew = gc_skew(seq, window=100)
print(f"GC skew: {skew}")
```
### ProtParam - Protein Analysis
```python
from Bio.SeqUtils.ProtParam import ProteinAnalysis
protein_seq = "ACDEFGHIKLMNPQRSTVWY"
analyzed_seq = ProteinAnalysis(protein_seq)
# Molecular weight
print(f"MW: {analyzed_seq.molecular_weight():.2f} Da")
# Isoelectric point
print(f"pI: {analyzed_seq.isoelectric_point():.2f}")
# Amino acid composition
print(f"Composition: {analyzed_seq.get_amino_acids_percent()}")
# Instability index
print(f"Instability: {analyzed_seq.instability_index():.2f}")
# Aromaticity
print(f"Aromaticity: {analyzed_seq.aromaticity():.2f}")
# Secondary structure fraction
ss = analyzed_seq.secondary_structure_fraction()
print(f"Helix: {ss[0]:.2%}, Turn: {ss[1]:.2%}, Sheet: {ss[2]:.2%}")
# Extinction coefficient (assumes Cys reduced, no disulfide bonds)
print(f"Extinction coefficient: {analyzed_seq.molar_extinction_coefficient()}")
# Gravy (grand average of hydropathy)
print(f"GRAVY: {analyzed_seq.gravy():.3f}")
```
## Restriction Analysis with Bio.Restriction
```python
from Bio import Restriction
from Bio.Seq import Seq
# Analyze sequence for restriction sites
seq = Seq("GAATTCATCGATCGATGAATTC")
# Use specific enzyme
ecori = Restriction.EcoRI
sites = ecori.search(seq)
print(f"EcoRI sites at: {sites}")
# Use multiple enzymes
rb = Restriction.RestrictionBatch(["EcoRI", "BamHI", "PstI"])
results = rb.search(seq)
for enzyme, sites in results.items():
if sites:
print(f"{enzyme}: {sites}")
# Get all enzymes that cut sequence
all_enzymes = Restriction.Analysis(rb, seq)
print(f"Cutting enzymes: {all_enzymes.with_sites()}")
```
## Sequence Translation Tables
```python
from Bio.Data import CodonTable
# Standard genetic code
standard_table = CodonTable.unambiguous_dna_by_id[1]
print(standard_table)
# Mitochondrial code
mito_table = CodonTable.unambiguous_dna_by_id[2]
# Get specific codon
print(f"ATG codes for: {standard_table.forward_table['ATG']}")
# Get stop codons
print(f"Stop codons: {standard_table.stop_codons}")
# Get start codons
print(f"Start codons: {standard_table.start_codons}")
```
## Cluster Analysis with Bio.Cluster
```python
from Bio.Cluster import kcluster
import numpy as np
# Sample data matrix (genes x conditions)
data = np.array([
[1.2, 0.8, 0.5, 1.5],
[0.9, 1.1, 0.7, 1.3],
[0.2, 0.3, 2.1, 2.5],
[0.1, 0.4, 2.3, 2.2],
])
# Perform k-means clustering
clusterid, error, nfound = kcluster(data, nclusters=2)
print(f"Cluster assignments: {clusterid}")
print(f"Error: {error}")
```
## Genome Diagrams with GenomeDiagram
```python
from Bio.Graphics import GenomeDiagram
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
from reportlab.lib import colors
# Read GenBank file
record = SeqIO.read("sequence.gb", "genbank")
# Create diagram
gd_diagram = GenomeDiagram.Diagram("Genome Diagram")
gd_track = gd_diagram.new_track(1, greytrack=True)
gd_feature_set = gd_track.new_set()
# Add features
for feature in record.features:
if feature.type == "CDS":
color = colors.blue
elif feature.type == "gene":
color = colors.lightblue
else:
color = colors.grey
gd_feature_set.add_feature(
feature,
color=color,
label=True,
label_size=6,
label_angle=45
)
# Draw and save
gd_diagram.draw(format="linear", pagesize="A4", fragments=1)
gd_diagram.write("genome_diagram.pdf", "PDF")
```
## Sequence Comparison with Bio.pairwise2
**Note**: Bio.pairwise2 is deprecated. Use Bio.Align.PairwiseAligner instead (see alignment.md).
However, for legacy code:
```python
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
# Global alignment
alignments = pairwise2.align.globalxx("ACCGT", "ACGT")
# Print top alignments
for alignment in alignments[:3]:
print(format_alignment(*alignment))
```
## Working with PubChem
```python
from Bio import Entrez
Entrez.email = "your.email@example.com"
# Search PubChem
handle = Entrez.esearch(db="pccompound", term="aspirin")
result = Entrez.read(handle)
handle.close()
compound_id = result["IdList"][0]
# Get compound information
handle = Entrez.efetch(db="pccompound", id=compound_id, retmode="xml")
compound_data = handle.read()
handle.close()
```
## Sequence Features with Bio.SeqFeature
```python
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# Create a feature
feature = SeqFeature(
location=FeatureLocation(start=10, end=50),
type="CDS",
strand=1,
qualifiers={"gene": ["ABC1"], "product": ["ABC protein"]}
)
# Add feature to record
record = SeqRecord(Seq("ATCG" * 20), id="seq1")
record.features.append(feature)
# Extract feature sequence
feature_seq = feature.extract(record.seq)
print(feature_seq)
```
## Sequence Ambiguity
```python
from Bio.Data import IUPACData
# DNA ambiguity codes
print(IUPACData.ambiguous_dna_letters)
# Protein ambiguity codes
print(IUPACData.ambiguous_protein_letters)
# Resolve ambiguous bases
print(IUPACData.ambiguous_dna_values["N"]) # Any base
print(IUPACData.ambiguous_dna_values["R"]) # A or G
```
## Quality Scores (FASTQ)
```python
from Bio import SeqIO
# Read FASTQ with quality scores
for record in SeqIO.parse("reads.fastq", "fastq"):
print(f"ID: {record.id}")
print(f"Sequence: {record.seq}")
print(f"Quality: {record.letter_annotations['phred_quality']}")
# Calculate average quality
avg_quality = sum(record.letter_annotations['phred_quality']) / len(record)
print(f"Average quality: {avg_quality:.2f}")
# Filter by quality
min_quality = min(record.letter_annotations['phred_quality'])
if min_quality >= 20:
print("High quality read")
```
## Best Practices
1. **Use appropriate modules** - Choose the right tool for your analysis
2. **Handle pseudocounts** - Important for motif analysis
3. **Validate input data** - Check file formats and data quality
4. **Consider performance** - Some operations can be computationally intensive
5. **Cache results** - Store intermediate results for large analyses
6. **Use proper genetic codes** - Select appropriate translation tables
7. **Document parameters** - Record thresholds and settings used
8. **Validate statistical results** - Understand limitations of tests
9. **Handle edge cases** - Check for empty results or invalid input
10. **Combine modules** - Leverage multiple Biopython tools together
## Common Use Cases
### Find ORFs
```python
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
def find_orfs(seq, min_length=100):
"""Find all ORFs in sequence."""
orfs = []
for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
for frame in range(3):
trans = nuc[frame:].translate()
trans_len = len(trans)
aa_start = 0
while aa_start < trans_len:
aa_end = trans.find("*", aa_start)
if aa_end == -1:
aa_end = trans_len
if aa_end - aa_start >= min_length // 3:
start = frame + aa_start * 3
end = frame + aa_end * 3
orfs.append({
'start': start,
'end': end,
'strand': strand,
'frame': frame,
'length': end - start,
'sequence': nuc[start:end]
})
aa_start = aa_end + 1
return orfs
# Use it
record = SeqIO.read("sequence.fasta", "fasta")
orfs = find_orfs(record.seq, min_length=300)
for orf in orfs:
print(f"ORF: {orf['start']}-{orf['end']}, strand={orf['strand']}, length={orf['length']}")
```
### Analyze Codon Usage
```python
from Bio import SeqIO
from Bio.SeqUtils import CodonUsage
def analyze_codon_usage(fasta_file):
"""Analyze codon usage in coding sequences."""
codon_counts = {}
for record in SeqIO.parse(fasta_file, "fasta"):
# Ensure sequence is multiple of 3
seq = record.seq[:len(record.seq) - len(record.seq) % 3]
# Count codons
for i in range(0, len(seq), 3):
codon = str(seq[i:i+3])
codon_counts[codon] = codon_counts.get(codon, 0) + 1
# Calculate frequencies
total = sum(codon_counts.values())
codon_freq = {k: v/total for k, v in codon_counts.items()}
return codon_freq
```
### Calculate Sequence Complexity
```python
def sequence_complexity(seq, k=2):
"""Calculate k-mer complexity (Shannon entropy)."""
import math
from collections import Counter
# Generate k-mers
kmers = [str(seq[i:i+k]) for i in range(len(seq) - k + 1)]
# Count k-mers
counts = Counter(kmers)
total = len(kmers)
# Calculate entropy
entropy = 0
for count in counts.values():
freq = count / total
entropy -= freq * math.log2(freq)
# Normalize by maximum possible entropy
max_entropy = math.log2(4 ** k) # For DNA
return entropy / max_entropy if max_entropy > 0 else 0
# Use it
from Bio.Seq import Seq
seq = Seq("ATCGATCGATCGATCG")
complexity = sequence_complexity(seq, k=2)
print(f"Sequence complexity: {complexity:.3f}")
```
### Extract Promoter Regions
```python
def extract_promoters(genbank_file, upstream=500):
"""Extract promoter regions upstream of genes."""
from Bio import SeqIO
record = SeqIO.read(genbank_file, "genbank")
promoters = []
for feature in record.features:
if feature.type == "gene":
if feature.strand == 1:
# Forward strand
start = max(0, feature.location.start - upstream)
end = feature.location.start
else:
# Reverse strand
start = feature.location.end
end = min(len(record.seq), feature.location.end + upstream)
promoter_seq = record.seq[start:end]
if feature.strand == -1:
promoter_seq = promoter_seq.reverse_complement()
promoters.append({
'gene': feature.qualifiers.get('gene', ['Unknown'])[0],
'sequence': promoter_seq,
'start': start,
'end': end
})
return promoters
```

View File

@@ -0,0 +1,362 @@
# Sequence Alignments with Bio.Align and Bio.AlignIO
## Overview
Bio.Align provides tools for pairwise sequence alignment using various algorithms, while Bio.AlignIO handles reading and writing multiple sequence alignment files in various formats.
## Pairwise Alignment with Bio.Align
### The PairwiseAligner Class
The `PairwiseAligner` class performs pairwise sequence alignments using Needleman-Wunsch (global), Smith-Waterman (local), Gotoh (three-state), and Waterman-Smith-Beyer algorithms. The appropriate algorithm is automatically selected based on gap score parameters.
### Creating an Aligner
```python
from Bio import Align
# Create aligner with default parameters
aligner = Align.PairwiseAligner()
# Default scores (as of Biopython 1.85+):
# - Match score: +1.0
# - Mismatch score: 0.0
# - All gap scores: -1.0
```
### Customizing Alignment Parameters
```python
# Set scoring parameters
aligner.match_score = 2.0
aligner.mismatch_score = -1.0
aligner.gap_score = -0.5
# Or use separate gap opening/extension penalties
aligner.open_gap_score = -2.0
aligner.extend_gap_score = -0.5
# Set internal gap scores separately
aligner.internal_open_gap_score = -2.0
aligner.internal_extend_gap_score = -0.5
# Set end gap scores (for semi-global alignment)
aligner.left_open_gap_score = 0.0
aligner.left_extend_gap_score = 0.0
aligner.right_open_gap_score = 0.0
aligner.right_extend_gap_score = 0.0
```
### Alignment Modes
```python
# Global alignment (default)
aligner.mode = 'global'
# Local alignment
aligner.mode = 'local'
```
### Performing Alignments
```python
from Bio.Seq import Seq
seq1 = Seq("ACCGGT")
seq2 = Seq("ACGGT")
# Get all optimal alignments
alignments = aligner.align(seq1, seq2)
# Iterate through alignments
for alignment in alignments:
print(alignment)
print(f"Score: {alignment.score}")
# Get just the score
score = aligner.score(seq1, seq2)
```
### Using Substitution Matrices
```python
from Bio.Align import substitution_matrices
# Load a substitution matrix
matrix = substitution_matrices.load("BLOSUM62")
aligner.substitution_matrix = matrix
# Align protein sequences
protein1 = Seq("KEVLA")
protein2 = Seq("KSVLA")
alignments = aligner.align(protein1, protein2)
```
### Available Substitution Matrices
Common matrices include:
- **BLOSUM** series (BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90)
- **PAM** series (PAM30, PAM70, PAM250)
- **MATCH** - Simple match/mismatch matrix
```python
# List available matrices
available = substitution_matrices.load()
print(available)
```
## Multiple Sequence Alignments with Bio.AlignIO
### Reading Alignments
Bio.AlignIO provides similar API to Bio.SeqIO but for alignment files:
```python
from Bio import AlignIO
# Read a single alignment
alignment = AlignIO.read("alignment.aln", "clustal")
# Parse multiple alignments from a file
for alignment in AlignIO.parse("alignments.aln", "clustal"):
print(f"Alignment with {len(alignment)} sequences")
print(f"Alignment length: {alignment.get_alignment_length()}")
```
### Supported Alignment Formats
Common formats include:
- **clustal** - Clustal format
- **phylip** - PHYLIP format
- **phylip-relaxed** - Relaxed PHYLIP (longer names)
- **stockholm** - Stockholm format
- **fasta** - FASTA format (aligned)
- **nexus** - NEXUS format
- **emboss** - EMBOSS alignment format
- **msf** - MSF format
- **maf** - Multiple Alignment Format
### Writing Alignments
```python
# Write alignment to file
AlignIO.write(alignment, "output.aln", "clustal")
# Convert between formats
count = AlignIO.convert("input.aln", "clustal", "output.phy", "phylip")
```
### Working with Alignment Objects
```python
from Bio import AlignIO
alignment = AlignIO.read("alignment.aln", "clustal")
# Get alignment properties
print(f"Number of sequences: {len(alignment)}")
print(f"Alignment length: {alignment.get_alignment_length()}")
# Access individual sequences
for record in alignment:
print(f"{record.id}: {record.seq}")
# Get alignment column
column = alignment[:, 0] # First column
# Get alignment slice
sub_alignment = alignment[:, 10:20] # Positions 10-20
# Get specific sequence
seq_record = alignment[0] # First sequence
```
### Alignment Analysis
```python
# Calculate alignment statistics
from Bio.Align import AlignInfo
summary = AlignInfo.SummaryInfo(alignment)
# Get consensus sequence
consensus = summary.gap_consensus(threshold=0.7)
# Position-specific scoring matrix (PSSM)
pssm = summary.pos_specific_score_matrix(consensus)
# Calculate information content
from Bio import motifs
motif = motifs.create([record.seq for record in alignment])
information = motif.counts.information_content()
```
## Creating Alignments Programmatically
### From SeqRecord Objects
```python
from Bio.Align import MultipleSeqAlignment
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
# Create records
records = [
SeqRecord(Seq("ACTGCTAGCTAG"), id="seq1"),
SeqRecord(Seq("ACT-CTAGCTAG"), id="seq2"),
SeqRecord(Seq("ACTGCTA-CTAG"), id="seq3"),
]
# Create alignment
alignment = MultipleSeqAlignment(records)
```
### Adding Sequences to Alignments
```python
# Start with empty alignment
alignment = MultipleSeqAlignment([])
# Add sequences (must have same length)
alignment.append(SeqRecord(Seq("ACTG"), id="seq1"))
alignment.append(SeqRecord(Seq("ACTG"), id="seq2"))
# Extend with another alignment
alignment.extend(other_alignment)
```
## Advanced Alignment Operations
### Removing Gaps
```python
# Remove all gap-only columns
from Bio.Align import AlignInfo
no_gaps = []
for i in range(alignment.get_alignment_length()):
column = alignment[:, i]
if set(column) != {'-'}: # Not all gaps
no_gaps.append(column)
```
### Alignment Sorting
```python
# Sort by sequence ID
sorted_alignment = sorted(alignment, key=lambda x: x.id)
alignment = MultipleSeqAlignment(sorted_alignment)
```
### Computing Pairwise Identities
```python
def pairwise_identity(seq1, seq2):
"""Calculate percent identity between two sequences."""
matches = sum(a == b for a, b in zip(seq1, seq2) if a != '-' and b != '-')
length = sum(1 for a, b in zip(seq1, seq2) if a != '-' and b != '-')
return matches / length if length > 0 else 0
# Calculate all pairwise identities
for i, record1 in enumerate(alignment):
for record2 in alignment[i+1:]:
identity = pairwise_identity(record1.seq, record2.seq)
print(f"{record1.id} vs {record2.id}: {identity:.2%}")
```
## Running External Alignment Tools
### Clustal Omega (via Command Line)
```python
from Bio.Align.Applications import ClustalOmegaCommandline
# Setup command
clustal_cmd = ClustalOmegaCommandline(
infile="sequences.fasta",
outfile="alignment.aln",
verbose=True,
auto=True
)
# Run alignment
stdout, stderr = clustal_cmd()
# Read result
alignment = AlignIO.read("alignment.aln", "clustal")
```
### MUSCLE (via Command Line)
```python
from Bio.Align.Applications import MuscleCommandline
muscle_cmd = MuscleCommandline(
input="sequences.fasta",
out="alignment.aln"
)
stdout, stderr = muscle_cmd()
```
## Best Practices
1. **Choose appropriate scoring schemes** - Use BLOSUM62 for proteins, custom scores for DNA
2. **Consider alignment mode** - Global for similar-length sequences, local for finding conserved regions
3. **Set gap penalties carefully** - Higher penalties create fewer, longer gaps
4. **Use appropriate formats** - FASTA for simple alignments, Stockholm for rich annotation
5. **Validate alignment quality** - Check for conserved regions and percent identity
6. **Handle large alignments carefully** - Use slicing and iteration for memory efficiency
7. **Preserve metadata** - Maintain SeqRecord IDs and annotations through alignment operations
## Common Use Cases
### Find Best Local Alignment
```python
from Bio.Align import PairwiseAligner
from Bio.Seq import Seq
aligner = PairwiseAligner()
aligner.mode = 'local'
aligner.match_score = 2
aligner.mismatch_score = -1
seq1 = Seq("AGCTTAGCTAGCTAGC")
seq2 = Seq("CTAGCTAGC")
alignments = aligner.align(seq1, seq2)
print(alignments[0])
```
### Protein Sequence Alignment
```python
from Bio.Align import PairwiseAligner, substitution_matrices
aligner = PairwiseAligner()
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
aligner.open_gap_score = -10
aligner.extend_gap_score = -0.5
protein1 = Seq("KEVLA")
protein2 = Seq("KEVLAEQP")
alignments = aligner.align(protein1, protein2)
```
### Extract Conserved Regions
```python
from Bio import AlignIO
alignment = AlignIO.read("alignment.aln", "clustal")
# Find columns with >80% identity
conserved_positions = []
for i in range(alignment.get_alignment_length()):
column = alignment[:, i]
most_common = max(set(column), key=column.count)
if column.count(most_common) / len(column) > 0.8:
conserved_positions.append(i)
print(f"Conserved positions: {conserved_positions}")
```

View File

@@ -0,0 +1,455 @@
# BLAST Operations with Bio.Blast
## Overview
Bio.Blast provides tools for running BLAST searches (both locally and via NCBI web services) and parsing BLAST results in various formats. The module handles the complexity of submitting queries and parsing outputs.
## Running BLAST via NCBI Web Services
### Bio.Blast.NCBIWWW
The `qblast()` function submits sequences to NCBI's online BLAST service:
```python
from Bio.Blast import NCBIWWW
from Bio import SeqIO
# Read sequence from file
record = SeqIO.read("sequence.fasta", "fasta")
# Run BLAST search
result_handle = NCBIWWW.qblast(
program="blastn", # BLAST program
database="nt", # Database to search
sequence=str(record.seq) # Query sequence
)
# Save results
with open("blast_results.xml", "w") as out_file:
out_file.write(result_handle.read())
result_handle.close()
```
### BLAST Programs Available
- **blastn** - Nucleotide vs nucleotide
- **blastp** - Protein vs protein
- **blastx** - Translated nucleotide vs protein
- **tblastn** - Protein vs translated nucleotide
- **tblastx** - Translated nucleotide vs translated nucleotide
### Common Databases
**Nucleotide databases:**
- `nt` - All GenBank+EMBL+DDBJ+PDB sequences
- `refseq_rna` - RefSeq RNA sequences
**Protein databases:**
- `nr` - All non-redundant GenBank CDS translations
- `refseq_protein` - RefSeq protein sequences
- `pdb` - Protein Data Bank sequences
- `swissprot` - Curated UniProtKB/Swiss-Prot
### Advanced qblast Parameters
```python
result_handle = NCBIWWW.qblast(
program="blastn",
database="nt",
sequence=str(record.seq),
expect=0.001, # E-value threshold
hitlist_size=50, # Number of hits to return
alignments=25, # Number of alignments to show
word_size=11, # Word size for initial match
gapcosts="5 2", # Gap costs (open extend)
format_type="XML" # Output format (default)
)
```
### Using Sequence Files or IDs
```python
# Use FASTA format string
fasta_string = open("sequence.fasta").read()
result_handle = NCBIWWW.qblast("blastn", "nt", fasta_string)
# Use GenBank ID
result_handle = NCBIWWW.qblast("blastn", "nt", "EU490707")
# Use GI number
result_handle = NCBIWWW.qblast("blastn", "nt", "160418")
```
## Parsing BLAST Results
### Bio.Blast.NCBIXML
NCBIXML provides parsers for BLAST XML output (the recommended format):
```python
from Bio.Blast import NCBIXML
# Parse single BLAST result
with open("blast_results.xml") as result_handle:
blast_record = NCBIXML.read(result_handle)
```
### Accessing BLAST Record Data
```python
# Query information
print(f"Query: {blast_record.query}")
print(f"Query length: {blast_record.query_length}")
print(f"Database: {blast_record.database}")
print(f"Number of sequences in database: {blast_record.database_sequences}")
# Iterate through alignments (hits)
for alignment in blast_record.alignments:
print(f"\nHit: {alignment.title}")
print(f"Length: {alignment.length}")
print(f"Accession: {alignment.accession}")
# Each alignment can have multiple HSPs (high-scoring pairs)
for hsp in alignment.hsps:
print(f" E-value: {hsp.expect}")
print(f" Score: {hsp.score}")
print(f" Bits: {hsp.bits}")
print(f" Identities: {hsp.identities}/{hsp.align_length}")
print(f" Gaps: {hsp.gaps}")
print(f" Query: {hsp.query}")
print(f" Match: {hsp.match}")
print(f" Subject: {hsp.sbjct}")
```
### Filtering Results
```python
# Only show hits with E-value < 0.001
E_VALUE_THRESH = 0.001
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
if hsp.expect < E_VALUE_THRESH:
print(f"Hit: {alignment.title}")
print(f"E-value: {hsp.expect}")
print(f"Identities: {hsp.identities}/{hsp.align_length}")
print()
```
### Multiple BLAST Results
For files containing multiple BLAST results (e.g., from batch searches):
```python
from Bio.Blast import NCBIXML
with open("batch_blast_results.xml") as result_handle:
blast_records = NCBIXML.parse(result_handle)
for blast_record in blast_records:
print(f"\nQuery: {blast_record.query}")
print(f"Hits: {len(blast_record.alignments)}")
if blast_record.alignments:
# Get best hit
best_alignment = blast_record.alignments[0]
best_hsp = best_alignment.hsps[0]
print(f"Best hit: {best_alignment.title}")
print(f"E-value: {best_hsp.expect}")
```
## Running Local BLAST
### Prerequisites
Local BLAST requires:
1. BLAST+ command-line tools installed
2. BLAST databases downloaded locally
### Using Command-Line Wrappers
```python
from Bio.Blast.Applications import NcbiblastnCommandline
# Setup BLAST command
blastn_cline = NcbiblastnCommandline(
query="input.fasta",
db="local_database",
evalue=0.001,
outfmt=5, # XML format
out="results.xml"
)
# Run BLAST
stdout, stderr = blastn_cline()
# Parse results
from Bio.Blast import NCBIXML
with open("results.xml") as result_handle:
blast_record = NCBIXML.read(result_handle)
```
### Available Command-Line Wrappers
- `NcbiblastnCommandline` - BLASTN wrapper
- `NcbiblastpCommandline` - BLASTP wrapper
- `NcbiblastxCommandline` - BLASTX wrapper
- `NcbitblastnCommandline` - TBLASTN wrapper
- `NcbitblastxCommandline` - TBLASTX wrapper
### Creating BLAST Databases
```python
from Bio.Blast.Applications import NcbimakeblastdbCommandline
# Create nucleotide database
makedb_cline = NcbimakeblastdbCommandline(
input_file="sequences.fasta",
dbtype="nucl",
out="my_database"
)
stdout, stderr = makedb_cline()
```
## Analyzing BLAST Results
### Extract Best Hits
```python
def get_best_hits(blast_record, num_hits=10, e_value_thresh=0.001):
"""Extract best hits from BLAST record."""
hits = []
for alignment in blast_record.alignments[:num_hits]:
for hsp in alignment.hsps:
if hsp.expect < e_value_thresh:
hits.append({
'title': alignment.title,
'accession': alignment.accession,
'length': alignment.length,
'e_value': hsp.expect,
'score': hsp.score,
'identities': hsp.identities,
'align_length': hsp.align_length,
'query_start': hsp.query_start,
'query_end': hsp.query_end,
'sbjct_start': hsp.sbjct_start,
'sbjct_end': hsp.sbjct_end
})
break # Only take best HSP per alignment
return hits
```
### Calculate Percent Identity
```python
def calculate_percent_identity(hsp):
"""Calculate percent identity for an HSP."""
return (hsp.identities / hsp.align_length) * 100
# Use it
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
if hsp.expect < 0.001:
identity = calculate_percent_identity(hsp)
print(f"{alignment.title}: {identity:.2f}% identity")
```
### Extract Hit Sequences
```python
from Bio import Entrez, SeqIO
Entrez.email = "your.email@example.com"
def fetch_hit_sequences(blast_record, num_sequences=5):
"""Fetch sequences for top BLAST hits."""
sequences = []
for alignment in blast_record.alignments[:num_sequences]:
accession = alignment.accession
# Fetch sequence from GenBank
handle = Entrez.efetch(
db="nucleotide",
id=accession,
rettype="fasta",
retmode="text"
)
record = SeqIO.read(handle, "fasta")
handle.close()
sequences.append(record)
return sequences
```
## Parsing Other BLAST Formats
### Tab-Delimited Output (outfmt 6/7)
```python
# Run BLAST with tabular output
blastn_cline = NcbiblastnCommandline(
query="input.fasta",
db="database",
outfmt=6,
out="results.txt"
)
# Parse tabular results
with open("results.txt") as f:
for line in f:
fields = line.strip().split('\t')
query_id = fields[0]
subject_id = fields[1]
percent_identity = float(fields[2])
align_length = int(fields[3])
e_value = float(fields[10])
bit_score = float(fields[11])
print(f"{query_id} -> {subject_id}: {percent_identity}% identity, E={e_value}")
```
### Custom Output Formats
```python
# Specify custom columns (outfmt 6 with custom fields)
blastn_cline = NcbiblastnCommandline(
query="input.fasta",
db="database",
outfmt="6 qseqid sseqid pident length evalue bitscore qseq sseq",
out="results.txt"
)
```
## Best Practices
1. **Use XML format** for parsing (outfmt 5) - most reliable and complete
2. **Save BLAST results** - Don't re-run searches unnecessarily
3. **Set appropriate E-value thresholds** - Default is 10, but 0.001-0.01 is often better
4. **Handle rate limits** - NCBI limits request frequency
5. **Use local BLAST** for large-scale searches or repeated queries
6. **Cache results** - Save parsed data to avoid re-parsing
7. **Check for empty results** - Handle cases with no hits gracefully
8. **Consider alternatives** - For large datasets, consider DIAMOND or other fast aligners
9. **Batch searches** - Submit multiple sequences together when possible
10. **Filter by identity** - E-value alone may not be sufficient
## Common Use Cases
### Basic BLAST Search and Parse
```python
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import SeqIO
# Read query sequence
record = SeqIO.read("query.fasta", "fasta")
# Run BLAST
print("Running BLAST search...")
result_handle = NCBIWWW.qblast("blastn", "nt", str(record.seq))
# Parse results
blast_record = NCBIXML.read(result_handle)
# Display top 5 hits
print(f"\nTop 5 hits for {blast_record.query}:")
for i, alignment in enumerate(blast_record.alignments[:5], 1):
hsp = alignment.hsps[0]
identity = (hsp.identities / hsp.align_length) * 100
print(f"{i}. {alignment.title}")
print(f" E-value: {hsp.expect}, Identity: {identity:.1f}%")
```
### Find Orthologs
```python
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import Entrez, SeqIO
Entrez.email = "your.email@example.com"
# Query gene sequence
query_record = SeqIO.read("gene.fasta", "fasta")
# BLAST against specific organism
result_handle = NCBIWWW.qblast(
"blastn",
"nt",
str(query_record.seq),
entrez_query="Mus musculus[Organism]" # Restrict to mouse
)
blast_record = NCBIXML.read(result_handle)
# Find best hit
if blast_record.alignments:
best_hit = blast_record.alignments[0]
print(f"Potential ortholog: {best_hit.title}")
print(f"Accession: {best_hit.accession}")
```
### Batch BLAST Multiple Sequences
```python
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import SeqIO
# Read multiple sequences
sequences = list(SeqIO.parse("queries.fasta", "fasta"))
# Create batch results file
with open("batch_results.xml", "w") as out_file:
for seq_record in sequences:
print(f"Searching for {seq_record.id}...")
result_handle = NCBIWWW.qblast("blastn", "nt", str(seq_record.seq))
out_file.write(result_handle.read())
result_handle.close()
# Parse batch results
with open("batch_results.xml") as result_handle:
for blast_record in NCBIXML.parse(result_handle):
print(f"\n{blast_record.query}: {len(blast_record.alignments)} hits")
```
### Reciprocal Best Hits
```python
def reciprocal_best_hit(seq1_id, seq2_id, database="nr", program="blastp"):
"""Check if two sequences are reciprocal best hits."""
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import Entrez
Entrez.email = "your.email@example.com"
# Forward BLAST
result1 = NCBIWWW.qblast(program, database, seq1_id)
record1 = NCBIXML.read(result1)
best_hit1 = record1.alignments[0].accession if record1.alignments else None
# Reverse BLAST
result2 = NCBIWWW.qblast(program, database, seq2_id)
record2 = NCBIXML.read(result2)
best_hit2 = record2.alignments[0].accession if record2.alignments else None
# Check reciprocity
return best_hit1 == seq2_id and best_hit2 == seq1_id
```
## Error Handling
```python
from Bio.Blast import NCBIWWW, NCBIXML
from urllib.error import HTTPError
try:
result_handle = NCBIWWW.qblast("blastn", "nt", "ATCGATCGATCG")
blast_record = NCBIXML.read(result_handle)
result_handle.close()
except HTTPError as e:
print(f"HTTP Error: {e.code}")
except Exception as e:
print(f"Error running BLAST: {e}")
```

View File

@@ -1,232 +0,0 @@
# BioPython Core Modules Reference
This document provides detailed information about BioPython's core modules and their capabilities.
## Sequence Handling
### Bio.Seq - Sequence Objects
Seq objects are BioPython's fundamental data structure for biological sequences, providing biological methods on top of string-like behavior.
**Creation:**
```python
from Bio.Seq import Seq
my_seq = Seq("AGTACACTGGT")
```
**Key Operations:**
- String methods: `find()`, `count()`, `count_overlap()` (for overlapping patterns)
- Complement/Reverse complement: Returns complementary sequences
- Transcription: DNA → RNA (T → U)
- Back transcription: RNA → DNA
- Translation: DNA/RNA → protein with customizable genetic codes and stop codon handling
**Use Cases:**
- DNA/RNA sequence manipulation
- Converting between nucleic acid types
- Protein translation from coding sequences
- Sequence searching and pattern counting
### Bio.SeqRecord - Sequence Metadata
SeqRecord wraps Seq objects with metadata like ID, description, and features.
**Attributes:**
- `seq`: The sequence itself (Seq object)
- `id`: Unique identifier
- `name`: Short name
- `description`: Longer description
- `features`: List of SeqFeature objects
- `annotations`: Dictionary of additional information
- `letter_annotations`: Per-letter annotations (e.g., quality scores)
### Bio.SeqFeature - Sequence Annotations
Manages sequence annotations and features such as genes, promoters, and coding regions.
**Common Features:**
- Gene locations
- CDS (coding sequences)
- Promoters and regulatory elements
- Exons and introns
- Protein domains
## File Input/Output
### Bio.SeqIO - Sequence File I/O
Unified interface for reading and writing sequence files in multiple formats.
**Supported Formats:**
- FASTA/FASTQ: Standard sequence formats
- GenBank/EMBL: Feature-rich annotation formats
- Clustal/Stockholm/PHYLIP: Alignment formats
- ABI/SFF: Trace and flowgram data
- Swiss-Prot/PIR: Protein databases
- PDB: Protein structure files
**Key Functions:**
**SeqIO.parse()** - Iterator for reading multiple records:
```python
from Bio import SeqIO
for record in SeqIO.parse("file.fasta", "fasta"):
print(record.id, len(record.seq))
```
**SeqIO.read()** - Read single record:
```python
record = SeqIO.read("file.fasta", "fasta")
```
**SeqIO.write()** - Write sequences:
```python
SeqIO.write(sequences, "output.fasta", "fasta")
```
**SeqIO.convert()** - Direct format conversion:
```python
count = SeqIO.convert("input.gb", "genbank", "output.fasta", "fasta")
```
**SeqIO.index()** - Memory-efficient random access for large files:
```python
record_dict = SeqIO.index("large_file.fasta", "fasta")
sequence = record_dict["seq_id"]
```
**SeqIO.to_dict()** - Load all records into dictionary (memory-based):
```python
record_dict = SeqIO.to_dict(SeqIO.parse("file.fasta", "fasta"))
```
**Common Patterns:**
- Format conversion between FASTA, GenBank, FASTQ
- Filtering sequences by length, ID, or content
- Extracting subsequences
- Batch processing large files with iterators
### Bio.AlignIO - Multiple Sequence Alignment I/O
Handles multiple sequence alignment files.
**Key Functions:**
- `write()`: Save alignments
- `parse()`: Read multiple alignments
- `read()`: Read single alignment
- `convert()`: Convert between formats
**Supported Formats:**
- Clustal
- PHYLIP (sequential and interleaved)
- Stockholm
- NEXUS
- FASTA (aligned)
- MAF (Multiple Alignment Format)
## Sequence Alignment
### Bio.Align - Alignment Tools
**PairwiseAligner** - High-performance pairwise alignment:
```python
from Bio import Align
aligner = Align.PairwiseAligner()
aligner.mode = 'global' # or 'local'
aligner.match_score = 2
aligner.mismatch_score = -1
aligner.gap_score = -2.5
alignments = aligner.align(seq1, seq2)
```
**CodonAligner** - Codon-aware alignment
**MultipleSeqAlignment** - Container for MSA with column access
### Bio.pairwise2 (Legacy)
Legacy pairwise alignment module with functions like `align.globalxx()`, `align.localxx()`.
## Sequence Analysis Utilities
### Bio.SeqUtils - Sequence Analysis
Collection of utility functions:
**CheckSum** - Calculate sequence checksums (CRC32, CRC64, GCG)
**MeltingTemp** - DNA melting temperature calculations:
- Nearest-neighbor method
- Wallace rule
- GC content method
**IsoelectricPoint** - Protein pI calculation
**ProtParam** - Protein analysis:
- Molecular weight
- Aromaticity
- Instability index
- Secondary structure fractions
**GC/GC_skew** - Calculate GC content and GC skew for sequence windows
### Bio.Data.CodonTable - Genetic Codes
Access to NCBI genetic code tables:
```python
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_id[1]
print(standard_table.forward_table) # codon to amino acid
print(standard_table.back_table) # amino acid to codons
print(standard_table.start_codons)
print(standard_table.stop_codons)
```
**Available codes:**
- Standard code (1)
- Vertebrate mitochondrial (2)
- Yeast mitochondrial (3)
- And many more organism-specific codes
## Sequence Motifs and Patterns
### Bio.motifs - Sequence Motif Analysis
Tools for working with sequence motifs:
**Position Weight Matrices (PWM):**
- Create PWM from aligned sequences
- Calculate information content
- Search sequences for motif matches
- Generate consensus sequences
**Position Specific Scoring Matrices (PSSM):**
- Convert PWM to PSSM
- Score sequences against motifs
- Determine significance thresholds
**Supported Formats:**
- JASPAR
- TRANSFAC
- MEME
- AlignAce
### Bio.Restriction - Restriction Enzymes
Comprehensive restriction enzyme database and analysis:
**Capabilities:**
- Search for restriction sites
- Predict digestion products
- Analyze restriction maps
- Access enzyme properties (recognition site, cut positions, isoschizomers)
**Example usage:**
```python
from Bio import Restriction
from Bio.Seq import Seq
seq = Seq("GAATTC...")
enzyme = Restriction.EcoRI
results = enzyme.search(seq)
```

View File

@@ -1,306 +0,0 @@
# BioPython Database Access and Search Tools
This document covers BioPython's capabilities for accessing biological databases and performing sequence searches.
## NCBI Database Access
### Bio.Entrez - NCBI E-utilities Interface
Provides programmatic access to NCBI databases including PubMed, GenBank, Protein, Nucleotide, and more.
**Important:** Always set your email before using Entrez:
```python
from Bio import Entrez
Entrez.email = "your.email@example.com"
```
#### Core Query Functions
**esearch** - Search databases and retrieve IDs:
```python
handle = Entrez.esearch(db="nucleotide", term="Homo sapiens[Organism] AND COX1")
record = Entrez.read(handle)
id_list = record["IdList"]
```
Parameters:
- `db`: Database to search (nucleotide, protein, pubmed, etc.)
- `term`: Search query
- `retmax`: Maximum number of IDs to return
- `sort`: Sort order (relevance, pub_date, etc.)
- `usehistory`: Store results on server (useful for large queries)
**efetch** - Retrieve full records:
```python
handle = Entrez.efetch(db="nucleotide", id="123456", rettype="gb", retmode="text")
record = SeqIO.read(handle, "genbank")
```
Parameters:
- `db`: Database name
- `id`: Single ID or comma-separated list
- `rettype`: Return type (gb, fasta, gp, xml, etc.)
- `retmode`: Return mode (text, xml, asn.1)
- Automatically uses POST for >200 IDs
**elink** - Find related records across databases:
```python
handle = Entrez.elink(dbfrom="protein", db="gene", id="15718680")
result = Entrez.read(handle)
```
Parameters:
- `dbfrom`: Source database
- `db`: Target database
- `id`: ID(s) to link from
- Returns LinkOut providers and relevancy scores
**esummary** - Get document summaries:
```python
handle = Entrez.esummary(db="protein", id="15718680")
summary = Entrez.read(handle)
print(summary[0]['Title'])
```
Returns quick overviews without full records.
**einfo** - Get database statistics:
```python
handle = Entrez.einfo(db="nucleotide")
info = Entrez.read(handle)
```
Provides field indices, term counts, update dates, and available links.
**epost** - Upload ID lists to server:
```python
handle = Entrez.epost("nucleotide", id="123456,789012")
result = Entrez.read(handle)
webenv = result["WebEnv"]
query_key = result["QueryKey"]
```
Useful for large queries split across multiple requests.
**espell** - Get spelling suggestions:
```python
handle = Entrez.espell(term="brest cancer")
result = Entrez.read(handle)
print(result["CorrectedQuery"]) # "breast cancer"
```
**ecitmatch** - Convert citations to PubMed IDs:
```python
citation = "proc natl acad sci u s a|1991|88|3248|mann bj|"
handle = Entrez.ecitmatch(db="pubmed", bdata=citation)
```
#### Data Processing Functions
**Entrez.read()** - Parse XML to Python dictionary:
```python
handle = Entrez.esearch(db="protein", term="insulin")
record = Entrez.read(handle)
```
**Entrez.parse()** - Generator for large XML results:
```python
handle = Entrez.efetch(db="protein", id=id_list, rettype="gp", retmode="xml")
for record in Entrez.parse(handle):
process(record)
```
#### Common Workflows
**Download sequences by accession:**
```python
handle = Entrez.efetch(db="nucleotide", id="NM_001301717", rettype="fasta", retmode="text")
record = SeqIO.read(handle, "fasta")
```
**Search and download multiple sequences:**
```python
# Search
search_handle = Entrez.esearch(db="nucleotide", term="human kinase", retmax="100")
search_results = Entrez.read(search_handle)
# Download
fetch_handle = Entrez.efetch(db="nucleotide", id=search_results["IdList"], rettype="gb", retmode="text")
for record in SeqIO.parse(fetch_handle, "genbank"):
print(record.id)
```
**Use WebEnv for large queries:**
```python
# Post IDs
post_handle = Entrez.epost(db="nucleotide", id=",".join(large_id_list))
post_result = Entrez.read(post_handle)
# Fetch in batches
batch_size = 500
for start in range(0, count, batch_size):
fetch_handle = Entrez.efetch(
db="nucleotide",
rettype="fasta",
retmode="text",
retstart=start,
retmax=batch_size,
webenv=post_result["WebEnv"],
query_key=post_result["QueryKey"]
)
# Process batch
```
### Bio.GenBank - GenBank Format Parsing
Low-level GenBank file parser (SeqIO is usually preferred).
### Bio.SwissProt - Swiss-Prot/UniProt Parsing
Parse Swiss-Prot and UniProtKB flat file format:
```python
from Bio import SwissProt
with open("uniprot.dat") as handle:
for record in SwissProt.parse(handle):
print(record.entry_name, record.organism)
```
## Sequence Similarity Searches
### Bio.Blast - BLAST Interface
Tools for running BLAST searches and parsing results.
#### Running BLAST
**NCBI QBLAST (online):**
```python
from Bio.Blast import NCBIWWW
result_handle = NCBIWWW.qblast("blastn", "nt", sequence)
```
Parameters:
- Program: blastn, blastp, blastx, tblastn, tblastx
- Database: nt, nr, refseq_rna, pdb, etc.
- Sequence: string or Seq object
- Additional parameters: `expect`, `word_size`, `hitlist_size`, `format_type`
**Local BLAST:**
Run standalone BLAST from command line, then parse results.
#### Parsing BLAST Results
**XML format (recommended):**
```python
from Bio.Blast import NCBIXML
result_handle = open("blast_results.xml")
blast_records = NCBIXML.parse(result_handle)
for blast_record in blast_records:
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
if hsp.expect < 0.001:
print(f"Hit: {alignment.title}")
print(f"Length: {alignment.length}")
print(f"E-value: {hsp.expect}")
print(f"Identities: {hsp.identities}/{hsp.align_length}")
```
**Functions:**
- `NCBIXML.read()`: Single query
- `NCBIXML.parse()`: Multiple queries (generator)
**Key Record Attributes:**
- `alignments`: List of matching sequences
- `query`: Query sequence ID
- `query_length`: Length of query
**Alignment Attributes:**
- `title`: Description of hit
- `length`: Length of hit sequence
- `hsps`: High-scoring segment pairs
**HSP Attributes:**
- `expect`: E-value
- `score`: Bit score
- `identities`: Number of identical residues
- `positives`: Number of positive scoring matches
- `gaps`: Number of gaps
- `align_length`: Length of alignment
- `query`: Aligned query sequence
- `match`: Match indicators
- `sbjct`: Aligned subject sequence
- `query_start`, `query_end`: Query coordinates
- `sbjct_start`, `sbjct_end`: Subject coordinates
#### Common BLAST Workflows
**Find homologs:**
```python
result = NCBIWWW.qblast("blastp", "nr", protein_sequence, expect=1e-10)
with open("results.xml", "w") as out:
out.write(result.read())
```
**Filter results by criteria:**
```python
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
if hsp.expect < 1e-5 and hsp.identities/hsp.align_length > 0.5:
# Process high-quality hits
pass
```
### Bio.SearchIO - Unified Search Results Parser
Modern interface for parsing various search tool outputs (BLAST, HMMER, BLAT, etc.).
**Key Functions:**
- `read()`: Parse single query
- `parse()`: Parse multiple queries (generator)
- `write()`: Write results to file
- `convert()`: Convert between formats
**Supported Tools:**
- BLAST (XML, tabular, plain text)
- HMMER (hmmscan, hmmsearch, phmmer)
- BLAT
- FASTA
- InterProScan
- Exonerate
**Example:**
```python
from Bio import SearchIO
results = SearchIO.parse("blast_output.xml", "blast-xml")
for result in results:
for hit in result:
if hit.hsps[0].evalue < 0.001:
print(hit.id, hit.hsps[0].evalue)
```
## Local Database Management
### BioSQL - SQL Database Interface
Store and manage biological sequences in SQL databases (PostgreSQL, MySQL, SQLite).
**Features:**
- Store SeqRecord objects with annotations
- Efficient querying and retrieval
- Cross-reference sequences
- Track relationships between sequences
**Example:**
```python
from BioSQL import BioSeqDatabase
server = BioSeqDatabase.open_database(driver="MySQLdb", user="user", passwd="pass", host="localhost", db="bioseqdb")
db = server["my_db"]
# Store sequences
db.load(SeqIO.parse("sequences.gb", "genbank"))
# Query
seq = db.lookup(accession="NC_005816")
```

View File

@@ -0,0 +1,484 @@
# Database Access with Bio.Entrez
## Overview
Bio.Entrez provides programmatic access to NCBI's Entrez databases, including PubMed, GenBank, Gene, Protein, Nucleotide, and many others. It handles all the complexity of API calls, rate limiting, and data parsing.
## Setup and Configuration
### Email Address (Required)
NCBI requires an email address to track usage and contact users if issues arise:
```python
from Bio import Entrez
# Always set your email
Entrez.email = "your.email@example.com"
```
### API Key (Recommended)
Using an API key increases rate limits from 3 to 10 requests per second:
```python
# Get API key from: https://www.ncbi.nlm.nih.gov/account/settings/
Entrez.api_key = "your_api_key_here"
```
### Rate Limiting
Biopython automatically respects NCBI rate limits:
- **Without API key**: 3 requests per second
- **With API key**: 10 requests per second
The module handles this automatically, so you don't need to add delays between requests.
## Core Entrez Functions
### EInfo - Database Information
Get information about available databases and their statistics:
```python
# List all databases
handle = Entrez.einfo()
result = Entrez.read(handle)
print(result["DbList"])
# Get information about a specific database
handle = Entrez.einfo(db="pubmed")
result = Entrez.read(handle)
print(result["DbInfo"]["Description"])
print(result["DbInfo"]["Count"]) # Number of records
```
### ESearch - Search Databases
Search for records and retrieve their IDs:
```python
# Search PubMed
handle = Entrez.esearch(db="pubmed", term="biopython")
result = Entrez.read(handle)
handle.close()
id_list = result["IdList"]
count = result["Count"]
print(f"Found {count} results")
print(f"Retrieved IDs: {id_list}")
```
### Advanced ESearch Parameters
```python
# Search with additional parameters
handle = Entrez.esearch(
db="pubmed",
term="biopython[Title]",
retmax=100, # Return up to 100 IDs
sort="relevance", # Sort by relevance
reldate=365, # Only results from last year
datetype="pdat" # Use publication date
)
result = Entrez.read(handle)
handle.close()
```
### ESummary - Get Record Summaries
Retrieve summary information for a list of IDs:
```python
# Get summaries for multiple records
handle = Entrez.esummary(db="pubmed", id="19304878,18606172")
results = Entrez.read(handle)
handle.close()
for record in results:
print(f"Title: {record['Title']}")
print(f"Authors: {record['AuthorList']}")
print(f"Journal: {record['Source']}")
print()
```
### EFetch - Retrieve Full Records
Fetch complete records in various formats:
```python
# Fetch a GenBank record
handle = Entrez.efetch(db="nucleotide", id="EU490707", rettype="gb", retmode="text")
record_text = handle.read()
handle.close()
# Parse with SeqIO
from Bio import SeqIO
handle = Entrez.efetch(db="nucleotide", id="EU490707", rettype="gb", retmode="text")
record = SeqIO.read(handle, "genbank")
handle.close()
print(record.description)
```
### EFetch Return Types
Different databases support different return types:
**Nucleotide/Protein:**
- `rettype="fasta"` - FASTA format
- `rettype="gb"` or `"genbank"` - GenBank format
- `rettype="gp"` - GenPept format (proteins)
**PubMed:**
- `rettype="medline"` - MEDLINE format
- `rettype="abstract"` - Abstract text
**Common modes:**
- `retmode="text"` - Plain text
- `retmode="xml"` - XML format
### ELink - Find Related Records
Find links between records in different databases:
```python
# Find protein records linked to a nucleotide record
handle = Entrez.elink(dbfrom="nucleotide", db="protein", id="EU490707")
result = Entrez.read(handle)
handle.close()
# Extract linked IDs
for linkset in result[0]["LinkSetDb"]:
if linkset["LinkName"] == "nucleotide_protein":
protein_ids = [link["Id"] for link in linkset["Link"]]
print(f"Linked protein IDs: {protein_ids}")
```
### EPost - Upload ID Lists
Upload large lists of IDs to the server for later use:
```python
# Post IDs to server
id_list = ["19304878", "18606172", "16403221"]
handle = Entrez.epost(db="pubmed", id=",".join(id_list))
result = Entrez.read(handle)
handle.close()
# Get query_key and WebEnv for later use
query_key = result["QueryKey"]
webenv = result["WebEnv"]
# Use in subsequent queries
handle = Entrez.efetch(
db="pubmed",
query_key=query_key,
WebEnv=webenv,
rettype="medline",
retmode="text"
)
```
### EGQuery - Global Query
Search across all Entrez databases at once:
```python
handle = Entrez.egquery(term="biopython")
result = Entrez.read(handle)
handle.close()
for row in result["eGQueryResult"]:
print(f"{row['DbName']}: {row['Count']} results")
```
### ESpell - Spelling Suggestions
Get spelling suggestions for search terms:
```python
handle = Entrez.espell(db="pubmed", term="biopythn")
result = Entrez.read(handle)
handle.close()
print(f"Original: {result['Query']}")
print(f"Suggestion: {result['CorrectedQuery']}")
```
## Working with Different Databases
### PubMed
```python
# Search for articles
handle = Entrez.esearch(db="pubmed", term="cancer genomics", retmax=10)
result = Entrez.read(handle)
handle.close()
# Fetch abstracts
handle = Entrez.efetch(
db="pubmed",
id=result["IdList"],
rettype="medline",
retmode="text"
)
records = handle.read()
handle.close()
print(records)
```
### GenBank / Nucleotide
```python
# Search for sequences
handle = Entrez.esearch(db="nucleotide", term="Cypripedioideae[Orgn] AND matK[Gene]")
result = Entrez.read(handle)
handle.close()
# Fetch sequences
if result["IdList"]:
handle = Entrez.efetch(
db="nucleotide",
id=result["IdList"][:5],
rettype="fasta",
retmode="text"
)
sequences = handle.read()
handle.close()
```
### Protein
```python
# Search for protein sequences
handle = Entrez.esearch(db="protein", term="human insulin")
result = Entrez.read(handle)
handle.close()
# Fetch protein records
from Bio import SeqIO
handle = Entrez.efetch(
db="protein",
id=result["IdList"][:5],
rettype="gp",
retmode="text"
)
records = SeqIO.parse(handle, "genbank")
for record in records:
print(f"{record.id}: {record.description}")
handle.close()
```
### Gene
```python
# Search for gene records
handle = Entrez.esearch(db="gene", term="BRCA1[Gene] AND human[Organism]")
result = Entrez.read(handle)
handle.close()
# Get gene information
handle = Entrez.efetch(db="gene", id=result["IdList"][0], retmode="xml")
record = Entrez.read(handle)
handle.close()
```
### Taxonomy
```python
# Search for organism
handle = Entrez.esearch(db="taxonomy", term="Homo sapiens")
result = Entrez.read(handle)
handle.close()
# Fetch taxonomic information
handle = Entrez.efetch(db="taxonomy", id=result["IdList"][0], retmode="xml")
records = Entrez.read(handle)
handle.close()
for record in records:
print(f"TaxID: {record['TaxId']}")
print(f"Scientific Name: {record['ScientificName']}")
print(f"Lineage: {record['Lineage']}")
```
## Parsing Entrez Results
### Reading XML Results
```python
# Most results can be parsed with Entrez.read()
handle = Entrez.efetch(db="pubmed", id="19304878", retmode="xml")
records = Entrez.read(handle)
handle.close()
# Access parsed data
article = records['PubmedArticle'][0]['MedlineCitation']['Article']
print(article['ArticleTitle'])
```
### Handling Large Result Sets
```python
# Batch processing for large searches
search_term = "cancer[Title]"
handle = Entrez.esearch(db="pubmed", term=search_term, retmax=0)
result = Entrez.read(handle)
handle.close()
total_count = int(result["Count"])
batch_size = 500
for start in range(0, total_count, batch_size):
# Fetch batch
handle = Entrez.esearch(
db="pubmed",
term=search_term,
retstart=start,
retmax=batch_size
)
result = Entrez.read(handle)
handle.close()
# Process IDs
id_list = result["IdList"]
print(f"Processing IDs {start} to {start + len(id_list)}")
```
## Advanced Patterns
### Search History with WebEnv
```python
# Perform search and store on server
handle = Entrez.esearch(
db="pubmed",
term="biopython",
usehistory="y"
)
result = Entrez.read(handle)
handle.close()
webenv = result["WebEnv"]
query_key = result["QueryKey"]
count = int(result["Count"])
# Fetch results in batches using history
batch_size = 100
for start in range(0, count, batch_size):
handle = Entrez.efetch(
db="pubmed",
retstart=start,
retmax=batch_size,
rettype="medline",
retmode="text",
webenv=webenv,
query_key=query_key
)
data = handle.read()
handle.close()
# Process data
```
### Combining Searches
```python
# Use boolean operators
complex_search = "(cancer[Title]) AND (genomics[Title]) AND 2020:2025[PDAT]"
handle = Entrez.esearch(db="pubmed", term=complex_search, retmax=100)
result = Entrez.read(handle)
handle.close()
```
## Best Practices
1. **Always set Entrez.email** - Required by NCBI
2. **Use API key** for higher rate limits (10 req/s vs 3 req/s)
3. **Close handles** after reading to free resources
4. **Batch large requests** - Use retstart and retmax for pagination
5. **Use WebEnv for large downloads** - Store results on server
6. **Cache locally** - Download once and save to avoid repeated requests
7. **Handle errors gracefully** - Network issues and API limits can occur
8. **Respect NCBI guidelines** - Don't overwhelm the service
9. **Use appropriate rettype** - Choose format that matches your needs
10. **Parse XML carefully** - Structure varies by database and record type
## Error Handling
```python
from urllib.error import HTTPError
from Bio import Entrez
Entrez.email = "your.email@example.com"
try:
handle = Entrez.efetch(db="nucleotide", id="invalid_id", rettype="gb")
record = handle.read()
handle.close()
except HTTPError as e:
print(f"HTTP Error: {e.code} - {e.reason}")
except Exception as e:
print(f"Error: {e}")
```
## Common Use Cases
### Download GenBank Records
```python
from Bio import Entrez, SeqIO
Entrez.email = "your.email@example.com"
# List of accession numbers
accessions = ["EU490707", "EU490708", "EU490709"]
for acc in accessions:
handle = Entrez.efetch(db="nucleotide", id=acc, rettype="gb", retmode="text")
record = SeqIO.read(handle, "genbank")
handle.close()
# Save to file
SeqIO.write(record, f"{acc}.gb", "genbank")
```
### Search and Download Papers
```python
# Search PubMed
handle = Entrez.esearch(db="pubmed", term="machine learning bioinformatics", retmax=20)
result = Entrez.read(handle)
handle.close()
# Get details
handle = Entrez.efetch(db="pubmed", id=result["IdList"], retmode="xml")
papers = Entrez.read(handle)
handle.close()
# Extract information
for paper in papers['PubmedArticle']:
article = paper['MedlineCitation']['Article']
print(f"Title: {article['ArticleTitle']}")
print(f"Journal: {article['Journal']['Title']}")
print()
```
### Find Related Sequences
```python
# Start with one sequence
handle = Entrez.efetch(db="nucleotide", id="EU490707", rettype="gb", retmode="text")
record = SeqIO.read(handle, "genbank")
handle.close()
# Find similar sequences
handle = Entrez.elink(dbfrom="nucleotide", db="nucleotide", id="EU490707")
result = Entrez.read(handle)
handle.close()
# Get related IDs
related_ids = []
for linkset in result[0]["LinkSetDb"]:
for link in linkset["Link"]:
related_ids.append(link["Id"])
```

View File

@@ -0,0 +1,566 @@
# Phylogenetics with Bio.Phylo
## Overview
Bio.Phylo provides a unified toolkit for reading, writing, analyzing, and visualizing phylogenetic trees. It supports multiple file formats including Newick, NEXUS, phyloXML, NeXML, and CDAO.
## Supported File Formats
- **Newick** - Simple tree representation (most common)
- **NEXUS** - Extended format with additional data
- **phyloXML** - XML-based format with rich annotations
- **NeXML** - Modern XML format
- **CDAO** - Comparative Data Analysis Ontology
## Reading and Writing Trees
### Reading Trees
```python
from Bio import Phylo
# Read a tree from file
tree = Phylo.read("tree.nwk", "newick")
# Parse multiple trees from a file
trees = list(Phylo.parse("trees.nwk", "newick"))
print(f"Found {len(trees)} trees")
```
### Writing Trees
```python
# Write tree to file
Phylo.write(tree, "output.nwk", "newick")
# Write multiple trees
Phylo.write(trees, "output.nex", "nexus")
```
### Format Conversion
```python
# Convert between formats
count = Phylo.convert("input.nwk", "newick", "output.xml", "phyloxml")
print(f"Converted {count} trees")
```
## Tree Structure and Navigation
### Basic Tree Components
Trees consist of:
- **Clade** - A node (internal or terminal) in the tree
- **Terminal clades** - Leaves/tips (taxa)
- **Internal clades** - Internal nodes
- **Branch length** - Evolutionary distance
### Accessing Tree Properties
```python
# Tree root
root = tree.root
# Terminal nodes (leaves)
terminals = tree.get_terminals()
print(f"Number of taxa: {len(terminals)}")
# Non-terminal nodes
nonterminals = tree.get_nonterminals()
print(f"Number of internal nodes: {len(nonterminals)}")
# All clades
all_clades = list(tree.find_clades())
print(f"Total clades: {len(all_clades)}")
```
### Traversing Trees
```python
# Iterate through all clades
for clade in tree.find_clades():
if clade.name:
print(f"Clade: {clade.name}, Branch length: {clade.branch_length}")
# Iterate through terminals only
for terminal in tree.get_terminals():
print(f"Taxon: {terminal.name}")
# Depth-first traversal
for clade in tree.find_clades(order="preorder"):
print(clade.name)
# Level-order (breadth-first) traversal
for clade in tree.find_clades(order="level"):
print(clade.name)
```
### Finding Specific Clades
```python
# Find clade by name
clade = tree.find_any(name="Species_A")
# Find all clades matching criteria
def is_long_branch(clade):
return clade.branch_length and clade.branch_length > 0.5
long_branches = tree.find_clades(is_long_branch)
```
## Tree Analysis
### Tree Statistics
```python
# Total branch length
total_length = tree.total_branch_length()
print(f"Total tree length: {total_length:.3f}")
# Tree depth (root to furthest leaf)
depths = tree.depths()
max_depth = max(depths.values())
print(f"Maximum depth: {max_depth:.3f}")
# Terminal count
terminal_count = tree.count_terminals()
print(f"Number of taxa: {terminal_count}")
```
### Distance Calculations
```python
# Distance between two taxa
distance = tree.distance("Species_A", "Species_B")
print(f"Distance: {distance:.3f}")
# Create distance matrix
from Bio import Phylo
terminals = tree.get_terminals()
taxa_names = [t.name for t in terminals]
print("Distance Matrix:")
for taxon1 in taxa_names:
row = []
for taxon2 in taxa_names:
if taxon1 == taxon2:
row.append(0)
else:
dist = tree.distance(taxon1, taxon2)
row.append(dist)
print(f"{taxon1}: {row}")
```
### Common Ancestors
```python
# Find common ancestor of two clades
clade1 = tree.find_any(name="Species_A")
clade2 = tree.find_any(name="Species_B")
ancestor = tree.common_ancestor(clade1, clade2)
print(f"Common ancestor: {ancestor.name}")
# Find common ancestor of multiple clades
clades = [tree.find_any(name=n) for n in ["Species_A", "Species_B", "Species_C"]]
ancestor = tree.common_ancestor(*clades)
```
### Tree Comparison
```python
# Compare tree topologies
def compare_trees(tree1, tree2):
"""Compare two trees."""
# Get terminal names
taxa1 = set(t.name for t in tree1.get_terminals())
taxa2 = set(t.name for t in tree2.get_terminals())
# Check if they have same taxa
if taxa1 != taxa2:
return False, "Different taxa"
# Compare distances
differences = []
for taxon1 in taxa1:
for taxon2 in taxa1:
if taxon1 < taxon2:
dist1 = tree1.distance(taxon1, taxon2)
dist2 = tree2.distance(taxon1, taxon2)
if abs(dist1 - dist2) > 0.01:
differences.append((taxon1, taxon2, dist1, dist2))
return len(differences) == 0, differences
```
## Tree Manipulation
### Pruning Trees
```python
# Prune (remove) specific taxa
tree_copy = tree.copy()
tree_copy.prune("Species_A")
# Keep only specific taxa
taxa_to_keep = ["Species_B", "Species_C", "Species_D"]
terminals = tree_copy.get_terminals()
for terminal in terminals:
if terminal.name not in taxa_to_keep:
tree_copy.prune(terminal)
```
### Collapsing Short Branches
```python
# Collapse branches shorter than threshold
def collapse_short_branches(tree, threshold=0.01):
"""Collapse branches shorter than threshold."""
for clade in tree.find_clades():
if clade.branch_length and clade.branch_length < threshold:
clade.branch_length = 0
return tree
```
### Ladderizing Trees
```python
# Ladderize tree (sort branches by size)
tree.ladderize() # ascending order
tree.ladderize(reverse=True) # descending order
```
### Rerooting Trees
```python
# Reroot at midpoint
tree.root_at_midpoint()
# Reroot with outgroup
outgroup = tree.find_any(name="Outgroup_Species")
tree.root_with_outgroup(outgroup)
# Reroot at internal node
internal = tree.get_nonterminals()[0]
tree.root_with_outgroup(internal)
```
## Tree Visualization
### Basic ASCII Drawing
```python
# Draw tree to console
Phylo.draw_ascii(tree)
# Draw with custom format
Phylo.draw_ascii(tree, column_width=80)
```
### Matplotlib Visualization
```python
import matplotlib.pyplot as plt
from Bio import Phylo
# Simple plot
fig = plt.figure(figsize=(10, 8))
axes = fig.add_subplot(1, 1, 1)
Phylo.draw(tree, axes=axes)
plt.show()
# Customize plot
fig = plt.figure(figsize=(10, 8))
axes = fig.add_subplot(1, 1, 1)
Phylo.draw(tree, axes=axes, do_show=False)
axes.set_title("Phylogenetic Tree")
plt.tight_layout()
plt.savefig("tree.png", dpi=300)
```
### Advanced Visualization Options
```python
# Radial (circular) tree
Phylo.draw(tree, branch_labels=lambda c: c.branch_length)
# Show branch support values
Phylo.draw(tree, label_func=lambda n: str(n.confidence) if n.confidence else "")
# Color branches
def color_by_length(clade):
if clade.branch_length:
if clade.branch_length > 0.5:
return "red"
elif clade.branch_length > 0.2:
return "orange"
return "black"
# Note: Direct branch coloring requires custom matplotlib code
```
## Building Trees
### From Distance Matrix
```python
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix
# Create distance matrix
dm = DistanceMatrix(
names=["Alpha", "Beta", "Gamma", "Delta"],
matrix=[
[],
[0.23],
[0.45, 0.34],
[0.67, 0.58, 0.29]
]
)
# Build tree using UPGMA
constructor = DistanceTreeConstructor()
tree = constructor.upgma(dm)
Phylo.draw_ascii(tree)
# Build tree using Neighbor-Joining
tree = constructor.nj(dm)
```
### From Multiple Sequence Alignment
```python
from Bio import AlignIO, Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
# Read alignment
alignment = AlignIO.read("alignment.fasta", "fasta")
# Calculate distance matrix
calculator = DistanceCalculator("identity")
distance_matrix = calculator.get_distance(alignment)
# Build tree
constructor = DistanceTreeConstructor()
tree = constructor.upgma(distance_matrix)
# Write tree
Phylo.write(tree, "output_tree.nwk", "newick")
```
### Distance Models
Available distance calculation models:
- **identity** - Simple identity
- **blastn** - BLASTN identity
- **trans** - Transition/transversion ratio
- **blosum62** - BLOSUM62 matrix
- **pam250** - PAM250 matrix
```python
# Use different model
calculator = DistanceCalculator("blosum62")
dm = calculator.get_distance(alignment)
```
## Consensus Trees
```python
from Bio.Phylo.Consensus import majority_consensus, strict_consensus
# Read multiple trees
trees = list(Phylo.parse("bootstrap_trees.nwk", "newick"))
# Majority-rule consensus
consensus = majority_consensus(trees, cutoff=0.5)
# Strict consensus
strict_cons = strict_consensus(trees)
# Write consensus tree
Phylo.write(consensus, "consensus.nwk", "newick")
```
## PhyloXML Features
PhyloXML format supports rich annotations:
```python
from Bio.Phylo.PhyloXML import Phylogeny, Clade
# Create PhyloXML tree
tree = Phylogeny(rooted=True)
tree.name = "Example Tree"
tree.description = "A sample phylogenetic tree"
# Add clades with rich annotations
clade = Clade(branch_length=0.5)
clade.name = "Species_A"
clade.color = "red"
clade.width = 2.0
# Add taxonomy information
from Bio.Phylo.PhyloXML import Taxonomy
taxonomy = Taxonomy(scientific_name="Homo sapiens", common_name="Human")
clade.taxonomies.append(taxonomy)
```
## Bootstrap Support
```python
# Add bootstrap support values to tree
def add_bootstrap_support(tree, support_values):
"""Add bootstrap support to internal nodes."""
internal_nodes = tree.get_nonterminals()
for node, support in zip(internal_nodes, support_values):
node.confidence = support
return tree
# Example
support_values = [95, 87, 76, 92]
tree_with_support = add_bootstrap_support(tree, support_values)
```
## Best Practices
1. **Choose appropriate file format** - Newick for simple trees, phyloXML for annotations
2. **Validate tree topology** - Check for polytomies and negative branch lengths
3. **Root trees appropriately** - Use midpoint or outgroup rooting
4. **Handle bootstrap values** - Store as clade confidence
5. **Consider tree size** - Large trees may need special handling
6. **Use tree copies** - Call `.copy()` before modifications
7. **Export publication-ready figures** - Use matplotlib for high-quality output
8. **Document tree construction** - Record alignment and parameters used
9. **Compare multiple trees** - Use consensus methods for bootstrap trees
10. **Validate taxon names** - Ensure consistent naming across files
## Common Use Cases
### Build Tree from Sequences
```python
from Bio import AlignIO, Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
# Read aligned sequences
alignment = AlignIO.read("sequences.aln", "clustal")
# Calculate distances
calculator = DistanceCalculator("identity")
dm = calculator.get_distance(alignment)
# Build neighbor-joining tree
constructor = DistanceTreeConstructor()
tree = constructor.nj(dm)
# Root at midpoint
tree.root_at_midpoint()
# Save tree
Phylo.write(tree, "tree.nwk", "newick")
# Visualize
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10, 8))
Phylo.draw(tree)
plt.show()
```
### Extract Subtree
```python
def extract_subtree(tree, taxa_list):
"""Extract subtree containing specific taxa."""
# Create a copy
subtree = tree.copy()
# Get all terminals
all_terminals = subtree.get_terminals()
# Prune taxa not in list
for terminal in all_terminals:
if terminal.name not in taxa_list:
subtree.prune(terminal)
return subtree
# Use it
subtree = extract_subtree(tree, ["Species_A", "Species_B", "Species_C"])
Phylo.write(subtree, "subtree.nwk", "newick")
```
### Calculate Phylogenetic Diversity
```python
def phylogenetic_diversity(tree, taxa_subset=None):
"""Calculate phylogenetic diversity (sum of branch lengths)."""
if taxa_subset:
# Prune to subset
tree = extract_subtree(tree, taxa_subset)
# Sum all branch lengths
total = 0
for clade in tree.find_clades():
if clade.branch_length:
total += clade.branch_length
return total
# Calculate PD for all taxa
pd_all = phylogenetic_diversity(tree)
print(f"Total phylogenetic diversity: {pd_all:.3f}")
# Calculate PD for subset
pd_subset = phylogenetic_diversity(tree, ["Species_A", "Species_B"])
print(f"Subset phylogenetic diversity: {pd_subset:.3f}")
```
### Annotate Tree with External Data
```python
def annotate_tree_from_csv(tree, csv_file):
"""Annotate tree leaves with data from CSV."""
import csv
# Read annotation data
annotations = {}
with open(csv_file) as f:
reader = csv.DictReader(f)
for row in reader:
annotations[row["species"]] = row
# Annotate tree
for terminal in tree.get_terminals():
if terminal.name in annotations:
# Add custom attributes
for key, value in annotations[terminal.name].items():
setattr(terminal, key, value)
return tree
```
### Compare Tree Topologies
```python
def robinson_foulds_distance(tree1, tree2):
"""Calculate Robinson-Foulds distance between two trees."""
# Get bipartitions for each tree
def get_bipartitions(tree):
bipartitions = set()
for clade in tree.get_nonterminals():
terminals = frozenset(t.name for t in clade.get_terminals())
bipartitions.add(terminals)
return bipartitions
bp1 = get_bipartitions(tree1)
bp2 = get_bipartitions(tree2)
# Symmetric difference
diff = len(bp1.symmetric_difference(bp2))
return diff
# Use it
tree1 = Phylo.read("tree1.nwk", "newick")
tree2 = Phylo.read("tree2.nwk", "newick")
rf_dist = robinson_foulds_distance(tree1, tree2)
print(f"Robinson-Foulds distance: {rf_dist}")
```

View File

@@ -0,0 +1,285 @@
# Sequence Handling with Bio.Seq and Bio.SeqIO
## Overview
Bio.Seq provides the `Seq` object for biological sequences with specialized methods, while Bio.SeqIO offers a unified interface for reading, writing, and converting sequence files across multiple formats.
## The Seq Object
### Creating Sequences
```python
from Bio.Seq import Seq
# Create a basic sequence
my_seq = Seq("AGTACACTGGT")
# Sequences support string-like operations
print(len(my_seq)) # Length
print(my_seq[0:5]) # Slicing
```
### Core Sequence Operations
```python
# Complement and reverse complement
complement = my_seq.complement()
rev_comp = my_seq.reverse_complement()
# Transcription (DNA to RNA)
rna = my_seq.transcribe()
# Translation (to protein)
protein = my_seq.translate()
# Back-transcription (RNA to DNA)
dna = rna_seq.back_transcribe()
```
### Sequence Methods
- `complement()` - Returns complementary strand
- `reverse_complement()` - Returns reverse complement
- `transcribe()` - DNA to RNA transcription
- `back_transcribe()` - RNA to DNA conversion
- `translate()` - Translate to protein sequence
- `translate(table=N)` - Use specific genetic code table
- `translate(to_stop=True)` - Stop at first stop codon
## Bio.SeqIO: Sequence File I/O
### Core Functions
**Bio.SeqIO.parse()**: The primary workhorse for reading sequence files as an iterator of `SeqRecord` objects.
```python
from Bio import SeqIO
# Parse a FASTA file
for record in SeqIO.parse("sequences.fasta", "fasta"):
print(record.id)
print(record.seq)
print(len(record))
```
**Bio.SeqIO.read()**: For single-record files (validates exactly one record exists).
```python
record = SeqIO.read("single.fasta", "fasta")
```
**Bio.SeqIO.write()**: Output SeqRecord objects to files.
```python
# Write records to file
count = SeqIO.write(seq_records, "output.fasta", "fasta")
print(f"Wrote {count} records")
```
**Bio.SeqIO.convert()**: Streamlined format conversion.
```python
# Convert between formats
count = SeqIO.convert("input.gbk", "genbank", "output.fasta", "fasta")
```
### Supported File Formats
Common formats include:
- **fasta** - FASTA format
- **fastq** - FASTQ format (with quality scores)
- **genbank** or **gb** - GenBank format
- **embl** - EMBL format
- **swiss** - SwissProt format
- **fasta-2line** - FASTA with sequence on one line
- **tab** - Simple tab-separated format
### The SeqRecord Object
`SeqRecord` objects combine sequence data with annotations:
```python
record.id # Primary identifier
record.name # Short name
record.description # Description line
record.seq # The actual sequence (Seq object)
record.annotations # Dictionary of additional info
record.features # List of SeqFeature objects
record.letter_annotations # Per-letter annotations (e.g., quality scores)
```
### Modifying Records
```python
# Modify record attributes
record.id = "new_id"
record.description = "New description"
# Extract subsequences
sub_record = record[10:30] # Slicing preserves annotations
# Modify sequence
record.seq = record.seq.reverse_complement()
```
## Working with Large Files
### Memory-Efficient Parsing
Use iterators to avoid loading entire files into memory:
```python
# Good for large files
for record in SeqIO.parse("large_file.fasta", "fasta"):
if len(record.seq) > 1000:
print(record.id)
```
### Dictionary-Based Access
Three approaches for random access:
**1. Bio.SeqIO.to_dict()** - Loads all records into memory:
```python
seq_dict = SeqIO.to_dict(SeqIO.parse("sequences.fasta", "fasta"))
record = seq_dict["sequence_id"]
```
**2. Bio.SeqIO.index()** - Lazy-loaded dictionary (memory efficient):
```python
seq_index = SeqIO.index("sequences.fasta", "fasta")
record = seq_index["sequence_id"]
seq_index.close()
```
**3. Bio.SeqIO.index_db()** - SQLite-based index for very large files:
```python
seq_index = SeqIO.index_db("index.idx", "sequences.fasta", "fasta")
record = seq_index["sequence_id"]
seq_index.close()
```
### Low-Level Parsers for High Performance
For high-throughput sequencing data, use low-level parsers that return tuples instead of objects:
```python
from Bio.SeqIO.FastaIO import SimpleFastaParser
with open("sequences.fasta") as handle:
for title, sequence in SimpleFastaParser(handle):
print(title, len(sequence))
from Bio.SeqIO.QualityIO import FastqGeneralIterator
with open("reads.fastq") as handle:
for title, sequence, quality in FastqGeneralIterator(handle):
print(title)
```
## Compressed Files
Bio.SeqIO automatically handles compressed files:
```python
# Works with gzip compression
for record in SeqIO.parse("sequences.fasta.gz", "fasta"):
print(record.id)
# BGZF format for random access
from Bio import bgzf
with bgzf.open("sequences.fasta.bgz", "r") as handle:
records = SeqIO.parse(handle, "fasta")
```
## Data Extraction Patterns
### Extract Specific Information
```python
# Get all IDs
ids = [record.id for record in SeqIO.parse("file.fasta", "fasta")]
# Get sequences above length threshold
long_seqs = [record for record in SeqIO.parse("file.fasta", "fasta")
if len(record.seq) > 500]
# Extract organism from GenBank
for record in SeqIO.parse("file.gbk", "genbank"):
organism = record.annotations.get("organism", "Unknown")
print(f"{record.id}: {organism}")
```
### Filter and Write
```python
# Filter sequences by criteria
long_sequences = (record for record in SeqIO.parse("input.fasta", "fasta")
if len(record) > 500)
SeqIO.write(long_sequences, "filtered.fasta", "fasta")
```
## Best Practices
1. **Use iterators** for large files rather than loading everything into memory
2. **Prefer index()** for repeated random access to large files
3. **Use index_db()** for millions of records or multi-file scenarios
4. **Use low-level parsers** for high-throughput data when speed is critical
5. **Download once, reuse locally** rather than repeated network access
6. **Close indexed files** explicitly or use context managers
7. **Validate input** before writing with SeqIO.write()
8. **Use appropriate format strings** - always lowercase (e.g., "fasta", not "FASTA")
## Common Use Cases
### Format Conversion
```python
# GenBank to FASTA
SeqIO.convert("input.gbk", "genbank", "output.fasta", "fasta")
# Multiple format conversion
for fmt in ["fasta", "genbank", "embl"]:
SeqIO.convert("input.fasta", "fasta", f"output.{fmt}", fmt)
```
### Quality Filtering (FASTQ)
```python
from Bio import SeqIO
good_reads = (record for record in SeqIO.parse("reads.fastq", "fastq")
if min(record.letter_annotations["phred_quality"]) >= 20)
count = SeqIO.write(good_reads, "filtered.fastq", "fastq")
```
### Sequence Statistics
```python
from Bio.SeqUtils import gc_fraction
for record in SeqIO.parse("sequences.fasta", "fasta"):
gc = gc_fraction(record.seq)
print(f"{record.id}: GC={gc:.2%}, Length={len(record)}")
```
### Creating Records Programmatically
```python
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# Create a new record
new_record = SeqRecord(
Seq("ATGCGATCGATCG"),
id="seq001",
name="MySequence",
description="Test sequence"
)
# Write to file
SeqIO.write([new_record], "new.fasta", "fasta")
```

View File

@@ -1,612 +0,0 @@
# BioPython Specialized Analysis Modules
This document covers BioPython's specialized modules for structural biology, phylogenetics, population genetics, and other advanced analyses.
## Structural Bioinformatics
### Bio.PDB - Protein Structure Analysis
Comprehensive tools for handling macromolecular crystal structures.
#### Structure Hierarchy
PDB structures are organized hierarchically:
- **Structure** → Models → Chains → Residues → Atoms
```python
from Bio.PDB import PDBParser
parser = PDBParser()
structure = parser.get_structure("protein", "1abc.pdb")
# Navigate hierarchy
for model in structure:
for chain in model:
for residue in chain:
for atom in residue:
print(atom.coord) # xyz coordinates
```
#### Parsing Structure Files
**PDB format:**
```python
from Bio.PDB import PDBParser
parser = PDBParser(QUIET=True)
structure = parser.get_structure("myprotein", "structure.pdb")
```
**mmCIF format:**
```python
from Bio.PDB import MMCIFParser
parser = MMCIFParser(QUIET=True)
structure = parser.get_structure("myprotein", "structure.cif")
```
**Fast mmCIF parser:**
```python
from Bio.PDB import FastMMCIFParser
parser = FastMMCIFParser(QUIET=True)
structure = parser.get_structure("myprotein", "structure.cif")
```
**MMTF format:**
```python
from Bio.PDB import MMTFParser
parser = MMTFParser()
structure = parser.get_structure("structure.mmtf")
```
**Binary CIF:**
```python
from Bio.PDB.binary_cif import BinaryCIFParser
parser = BinaryCIFParser()
structure = parser.get_structure("structure.bcif")
```
#### Downloading Structures
```python
from Bio.PDB import PDBList
pdbl = PDBList()
# Download specific structure
pdbl.retrieve_pdb_file("1ABC", file_format="pdb", pdir="structures/")
# Download entire PDB (obsolete entries)
pdbl.download_obsolete_entries(pdir="obsolete/")
# Update local PDB mirror
pdbl.update_pdb()
```
#### Structure Selection and Filtering
```python
# Select specific chains
chain_A = structure[0]['A']
# Select specific residues
residue_10 = chain_A[10]
# Select specific atoms
ca_atom = residue_10['CA']
# Iterate over specific atom types
for atom in structure.get_atoms():
if atom.name == 'CA': # Alpha carbons only
print(atom.coord)
```
**Structure selectors:**
```python
from Bio.PDB.Polypeptide import is_aa
# Filter by residue type
for residue in structure.get_residues():
if is_aa(residue):
print(f"Amino acid: {residue.resname}")
```
#### Secondary Structure Analysis
**DSSP integration:**
```python
from Bio.PDB import DSSP
# Requires DSSP program installed
model = structure[0]
dssp = DSSP(model, "structure.pdb")
# Access secondary structure
for key in dssp:
secondary_structure = dssp[key][2]
accessibility = dssp[key][3]
print(f"Residue {key}: {secondary_structure}, accessible: {accessibility}")
```
DSSP codes:
- H: Alpha helix
- B: Beta bridge
- E: Extended strand (beta sheet)
- G: 3-10 helix
- I: Pi helix
- T: Turn
- S: Bend
- -: Coil
#### Solvent Accessibility
**Shrake-Rupley algorithm:**
```python
from Bio.PDB import ShrakeRupley
sr = ShrakeRupley()
sr.compute(structure, level="R") # R=residue, A=atom, C=chain, M=model, S=structure
for residue in structure.get_residues():
print(f"{residue.resname} {residue.id[1]}: {residue.sasa} Ų")
```
**NACCESS wrapper:**
```python
from Bio.PDB import NACCESS
# Requires NACCESS program
naccess = NACCESS("structure.pdb")
for residue_id, data in naccess.items():
print(f"Residue {residue_id}: {data['all_atoms_abs']} Ų")
```
**Half-sphere exposure:**
```python
from Bio.PDB import HSExposure
# Requires DSSP
model = structure[0]
hse = HSExposure()
hse.calc_hs_exposure(model, "structure.pdb")
for chain in model:
for residue in chain:
if residue.has_id('EXP_HSE_A_U'):
hse_up = residue.xtra['EXP_HSE_A_U']
hse_down = residue.xtra['EXP_HSE_A_D']
```
#### Structural Alignment and Superimposition
**Standard superimposition:**
```python
from Bio.PDB import Superimposer
sup = Superimposer()
sup.set_atoms(ref_atoms, alt_atoms) # Lists of atoms to align
sup.apply(structure2.get_atoms()) # Apply transformation
print(f"RMSD: {sup.rms}")
print(f"Rotation matrix: {sup.rotran[0]}")
print(f"Translation vector: {sup.rotran[1]}")
```
**QCP (Quaternion Characteristic Polynomial) method:**
```python
from Bio.PDB import QCPSuperimposer
qcp = QCPSuperimposer()
qcp.set(ref_coords, alt_coords)
qcp.run()
print(f"RMSD: {qcp.get_rms()}")
```
#### Geometric Calculations
**Distances and angles:**
```python
# Distance between atoms
from Bio.PDB import Vector
dist = atom1 - atom2 # Returns distance
# Angle between three atoms
from Bio.PDB import calc_angle
angle = calc_angle(atom1.coord, atom2.coord, atom3.coord)
# Dihedral angle
from Bio.PDB import calc_dihedral
dihedral = calc_dihedral(atom1.coord, atom2.coord, atom3.coord, atom4.coord)
```
**Vector operations:**
```python
from Bio.PDB.Vector import Vector
v1 = Vector(atom1.coord)
v2 = Vector(atom2.coord)
# Vector operations
v3 = v1 + v2
v4 = v1 - v2
dot_product = v1 * v2
cross_product = v1 ** v2
magnitude = v1.norm()
normalized = v1.normalized()
```
#### Internal Coordinates
Advanced residue geometry representation:
```python
from Bio.PDB import internal_coords
# Enable internal coordinates
structure.atom_to_internal_coordinates()
# Access phi, psi angles
for residue in structure.get_residues():
if residue.internal_coord:
print(f"Phi: {residue.internal_coord.get_angle('phi')}")
print(f"Psi: {residue.internal_coord.get_angle('psi')}")
```
#### Writing Structures
```python
from Bio.PDB import PDBIO
io = PDBIO()
io.set_structure(structure)
io.save("output.pdb")
# Save specific selection
io.save("chain_A.pdb", select=ChainSelector("A"))
```
### Bio.SCOP - SCOP Database
Access to Structural Classification of Proteins database.
### Bio.KEGG - Pathway Analysis
Interface to KEGG (Kyoto Encyclopedia of Genes and Genomes) databases:
**Capabilities:**
- Access pathway maps
- Retrieve enzyme data
- Get compound information
- Query orthology relationships
## Phylogenetics
### Bio.Phylo - Phylogenetic Tree Analysis
Comprehensive phylogenetic tree manipulation and analysis.
#### Reading and Writing Trees
**Supported formats:**
- Newick: Simple, widely-used format
- NEXUS: Rich metadata format
- PhyloXML: XML-based with extensive annotations
- NeXML: Modern XML standard
```python
from Bio import Phylo
# Read tree
tree = Phylo.read("tree.nwk", "newick")
# Read multiple trees
trees = list(Phylo.parse("trees.nex", "nexus"))
# Write tree
Phylo.write(tree, "output.nwk", "newick")
```
#### Tree Visualization
**ASCII visualization:**
```python
Phylo.draw_ascii(tree)
```
**Matplotlib plotting:**
```python
import matplotlib.pyplot as plt
Phylo.draw(tree)
plt.show()
# With customization
fig, ax = plt.subplots(figsize=(10, 8))
Phylo.draw(tree, axes=ax, do_show=False)
ax.set_title("My Phylogenetic Tree")
plt.show()
```
#### Tree Navigation and Manipulation
**Find clades:**
```python
# Get all terminal nodes (leaves)
terminals = tree.get_terminals()
# Get all nonterminal nodes
nonterminals = tree.get_nonterminals()
# Find specific clade
target = tree.find_any(name="Species_A")
# Find all matching clades
matches = tree.find_clades(terminal=True)
```
**Tree properties:**
```python
# Count terminals
num_species = tree.count_terminals()
# Get total branch length
total_length = tree.total_branch_length()
# Check if tree is bifurcating
is_bifurcating = tree.is_bifurcating()
# Get maximum distance from root
max_dist = tree.distance(tree.root)
```
**Tree modification:**
```python
# Prune tree to specific taxa
keep_taxa = ["Species_A", "Species_B", "Species_C"]
tree.prune(keep_taxa)
# Collapse short branches
tree.collapse_all(lambda c: c.branch_length < 0.01)
# Ladderize (sort branches)
tree.ladderize()
# Root tree at midpoint
tree.root_at_midpoint()
# Root at specific clade
outgroup = tree.find_any(name="Outgroup_species")
tree.root_with_outgroup(outgroup)
```
**Calculate distances:**
```python
# Distance between two clades
dist = tree.distance(clade1, clade2)
# Distance from root
root_dist = tree.distance(tree.root, terminal_clade)
```
#### Tree Construction
**Distance-based methods:**
```python
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceCalculator
from Bio import AlignIO
# Load alignment
aln = AlignIO.read("alignment.fasta", "fasta")
# Calculate distance matrix
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(aln)
# Construct tree using UPGMA
constructor = DistanceTreeConstructor()
tree_upgma = constructor.upgma(dm)
# Or using Neighbor-Joining
tree_nj = constructor.nj(dm)
```
**Parsimony method:**
```python
from Bio.Phylo.TreeConstruction import ParsimonyScorer, NNITreeSearcher
scorer = ParsimonyScorer()
searcher = NNITreeSearcher(scorer)
tree = searcher.search(starting_tree, alignment)
```
**Distance calculators:**
- 'identity': Simple identity scoring
- 'blastn': BLAST nucleotide scoring
- 'blastp': BLAST protein scoring
- 'dnafull': EMBOSS DNA scoring matrix
- 'blosum62': BLOSUM62 protein matrix
- 'pam250': PAM250 protein matrix
#### Consensus Trees
```python
from Bio.Phylo.Consensus import majority_consensus, strict_consensus
# Strict consensus
consensus_strict = strict_consensus(trees)
# Majority rule consensus
consensus_majority = majority_consensus(trees, cutoff=0.5)
# Bootstrap consensus
from Bio.Phylo.Consensus import bootstrap_consensus
bootstrap_tree = bootstrap_consensus(trees, cutoff=0.7)
```
#### External Tool Wrappers
**PhyML:**
```python
from Bio.Phylo.Applications import PhymlCommandline
cmd = PhymlCommandline(input="alignment.phy", datatype="nt", model="HKY85", alpha="e", bootstrap=100)
stdout, stderr = cmd()
tree = Phylo.read("alignment.phy_phyml_tree.txt", "newick")
```
**RAxML:**
```python
from Bio.Phylo.Applications import RaxmlCommandline
cmd = RaxmlCommandline(
sequences="alignment.phy",
model="GTRGAMMA",
name="mytree",
parsimony_seed=12345
)
stdout, stderr = cmd()
```
**FastTree:**
```python
from Bio.Phylo.Applications import FastTreeCommandline
cmd = FastTreeCommandline(input="alignment.fasta", out="tree.nwk", gtr=True, gamma=True)
stdout, stderr = cmd()
```
### Bio.Phylo.PAML - Evolutionary Analysis
Interface to PAML (Phylogenetic Analysis by Maximum Likelihood):
**CODEML - Codon-based analysis:**
```python
from Bio.Phylo.PAML import codeml
cml = codeml.Codeml()
cml.alignment = "alignment.phy"
cml.tree = "tree.nwk"
cml.out_file = "results.out"
cml.working_dir = "./paml_wd"
# Set parameters
cml.set_options(
seqtype=1, # Codon sequences
model=0, # One omega ratio
NSsites=[0, 1, 2], # Test different models
CodonFreq=2, # F3x4 codon frequencies
)
results = cml.run()
```
**BaseML - Nucleotide-based analysis:**
```python
from Bio.Phylo.PAML import baseml
bml = baseml.Baseml()
bml.alignment = "alignment.phy"
bml.tree = "tree.nwk"
results = bml.run()
```
**YN00 - Yang-Nielsen method:**
```python
from Bio.Phylo.PAML import yn00
yn = yn00.Yn00()
yn.alignment = "alignment.phy"
results = yn.run()
```
## Population Genetics
### Bio.PopGen - Population Genetics Analysis
Tools for population-level genetic analysis.
**Capabilities:**
- Allele frequency calculations
- Hardy-Weinberg equilibrium testing
- Linkage disequilibrium analysis
- F-statistics (FST, FIS, FIT)
- Tajima's D
- Population structure analysis
## Clustering and Machine Learning
### Bio.Cluster - Clustering Algorithms
Statistical clustering for gene expression and other biological data:
**Hierarchical clustering:**
```python
from Bio.Cluster import treecluster
tree = treecluster(data, method='a', dist='e')
# method: 'a'=average, 's'=single, 'm'=maximum, 'c'=centroid
# dist: 'e'=Euclidean, 'c'=correlation, 'a'=absolute correlation
```
**k-means clustering:**
```python
from Bio.Cluster import kcluster
clusterid, error, nfound = kcluster(data, nclusters=5, npass=100)
```
**Self-Organizing Maps (SOM):**
```python
from Bio.Cluster import somcluster
clusterid, celldata = somcluster(data, nx=3, ny=3)
```
**Principal Component Analysis:**
```python
from Bio.Cluster import pca
columnmean, coordinates, components, eigenvalues = pca(data)
```
## Visualization
### Bio.Graphics - Genomic Visualization
Tools for creating publication-quality biological graphics.
**GenomeDiagram - Circular and linear genome maps:**
```python
from Bio.Graphics import GenomeDiagram
from Bio import SeqIO
record = SeqIO.read("genome.gb", "genbank")
gd_diagram = GenomeDiagram.Diagram("Genome Map")
gd_track = gd_diagram.new_track(1, greytrack=True)
gd_feature_set = gd_track.new_set()
# Add features
for feature in record.features:
if feature.type == "gene":
gd_feature_set.add_feature(feature, color="blue", label=True)
gd_diagram.draw(format="linear", pagesize='A4', fragments=1)
gd_diagram.write("genome_map.pdf", "PDF")
```
**Chromosomes - Chromosome visualization:**
```python
from Bio.Graphics.BasicChromosome import Chromosome
chr = Chromosome("Chromosome 1")
chr.add("gene1", 1000, 2000, color="red")
chr.add("gene2", 3000, 4500, color="blue")
```
## Phenotype Analysis
### Bio.phenotype - Phenotypic Microarray Analysis
Tools for analyzing phenotypic microarray data (e.g., Biolog plates):
**Capabilities:**
- Parse PM plate data
- Growth curve analysis
- Compare phenotypic profiles
- Calculate similarity metrics

View File

@@ -0,0 +1,564 @@
# Structural Bioinformatics with Bio.PDB
## Overview
Bio.PDB provides tools for working with macromolecular 3D structures from PDB and mmCIF files. The module uses the SMCRA (Structure/Model/Chain/Residue/Atom) architecture to represent protein structures hierarchically.
## SMCRA Architecture
The Bio.PDB module organizes structures hierarchically:
```
Structure
└── Model (multiple models for NMR structures)
└── Chain (e.g., chain A, B, C)
└── Residue (amino acids, nucleotides, heteroatoms)
└── Atom (individual atoms)
```
## Parsing Structure Files
### PDB Format
```python
from Bio.PDB import PDBParser
# Create parser
parser = PDBParser(QUIET=True) # QUIET=True suppresses warnings
# Parse structure
structure = parser.get_structure("1crn", "1crn.pdb")
# Access basic information
print(f"Structure ID: {structure.id}")
print(f"Number of models: {len(structure)}")
```
### mmCIF Format
mmCIF format is more modern and handles large structures better:
```python
from Bio.PDB import MMCIFParser
# Create parser
parser = MMCIFParser(QUIET=True)
# Parse structure
structure = parser.get_structure("1crn", "1crn.cif")
```
### Download from PDB
```python
from Bio.PDB import PDBList
# Create PDB list object
pdbl = PDBList()
# Download PDB file
pdbl.retrieve_pdb_file("1CRN", file_format="pdb", pdir="structures/")
# Download mmCIF file
pdbl.retrieve_pdb_file("1CRN", file_format="mmCif", pdir="structures/")
# Download obsolete structure
pdbl.retrieve_pdb_file("1CRN", obsolete=True, pdir="structures/")
```
## Navigating Structure Hierarchy
### Access Models
```python
# Get first model
model = structure[0]
# Iterate through all models
for model in structure:
print(f"Model {model.id}")
```
### Access Chains
```python
# Get specific chain
chain = model["A"]
# Iterate through all chains
for chain in model:
print(f"Chain {chain.id}")
```
### Access Residues
```python
# Iterate through residues in a chain
for residue in chain:
print(f"Residue: {residue.resname} {residue.id[1]}")
# Get specific residue by ID
# Residue ID is tuple: (hetfield, sequence_id, insertion_code)
residue = chain[(" ", 10, " ")] # Standard amino acid at position 10
```
### Access Atoms
```python
# Iterate through atoms in a residue
for atom in residue:
print(f"Atom: {atom.name}, Coordinates: {atom.coord}")
# Get specific atom
ca_atom = residue["CA"] # Alpha carbon
print(f"CA coordinates: {ca_atom.coord}")
```
### Alternative Access Patterns
```python
# Direct access through hierarchy
atom = structure[0]["A"][10]["CA"]
# Get all atoms
atoms = list(structure.get_atoms())
print(f"Total atoms: {len(atoms)}")
# Get all residues
residues = list(structure.get_residues())
# Get all chains
chains = list(structure.get_chains())
```
## Working with Atom Coordinates
### Accessing Coordinates
```python
# Get atom coordinates
coord = atom.coord
print(f"X: {coord[0]}, Y: {coord[1]}, Z: {coord[2]}")
# Get B-factor (temperature factor)
b_factor = atom.bfactor
# Get occupancy
occupancy = atom.occupancy
# Get element
element = atom.element
```
### Calculate Distances
```python
from Bio.PDB import Vector
# Calculate distance between two atoms
atom1 = residue1["CA"]
atom2 = residue2["CA"]
distance = atom1 - atom2 # Returns distance in Angstroms
print(f"Distance: {distance:.2f} Å")
```
### Calculate Angles
```python
from Bio.PDB.vectors import calc_angle
# Calculate angle between three atoms
angle = calc_angle(
atom1.get_vector(),
atom2.get_vector(),
atom3.get_vector()
)
print(f"Angle: {angle:.2f} radians")
```
### Calculate Dihedrals
```python
from Bio.PDB.vectors import calc_dihedral
# Calculate dihedral angle between four atoms
dihedral = calc_dihedral(
atom1.get_vector(),
atom2.get_vector(),
atom3.get_vector(),
atom4.get_vector()
)
print(f"Dihedral: {dihedral:.2f} radians")
```
## Structure Analysis
### Secondary Structure (DSSP)
DSSP assigns secondary structure to protein structures:
```python
from Bio.PDB import DSSP, PDBParser
# Parse structure
parser = PDBParser()
structure = parser.get_structure("1crn", "1crn.pdb")
# Run DSSP (requires DSSP executable installed)
model = structure[0]
dssp = DSSP(model, "1crn.pdb")
# Access results
for residue_key in dssp:
dssp_data = dssp[residue_key]
residue_id = residue_key[1]
ss = dssp_data[2] # Secondary structure code
phi = dssp_data[4] # Phi angle
psi = dssp_data[5] # Psi angle
print(f"Residue {residue_id}: {ss}, φ={phi:.1f}°, ψ={psi:.1f}°")
```
Secondary structure codes:
- `H` - Alpha helix
- `B` - Beta bridge
- `E` - Strand
- `G` - 3-10 helix
- `I` - Pi helix
- `T` - Turn
- `S` - Bend
- `-` - Coil/loop
### Solvent Accessibility (DSSP)
```python
# Get relative solvent accessibility
for residue_key in dssp:
acc = dssp[residue_key][3] # Relative accessibility
print(f"Residue {residue_key[1]}: {acc:.2f} relative accessibility")
```
### Neighbor Search
Find nearby atoms efficiently:
```python
from Bio.PDB import NeighborSearch
# Get all atoms
atoms = list(structure.get_atoms())
# Create neighbor search object
ns = NeighborSearch(atoms)
# Find atoms within radius
center_atom = structure[0]["A"][10]["CA"]
nearby_atoms = ns.search(center_atom.coord, 5.0) # 5 Å radius
print(f"Found {len(nearby_atoms)} atoms within 5 Å")
# Find residues within radius
nearby_residues = ns.search(center_atom.coord, 5.0, level="R")
# Find chains within radius
nearby_chains = ns.search(center_atom.coord, 10.0, level="C")
```
### Contact Map
```python
def calculate_contact_map(chain, distance_threshold=8.0):
"""Calculate residue-residue contact map."""
residues = list(chain.get_residues())
n = len(residues)
contact_map = [[0] * n for _ in range(n)]
for i, res1 in enumerate(residues):
for j, res2 in enumerate(residues):
if i < j:
# Get CA atoms
if res1.has_id("CA") and res2.has_id("CA"):
dist = res1["CA"] - res2["CA"]
if dist < distance_threshold:
contact_map[i][j] = 1
contact_map[j][i] = 1
return contact_map
```
## Structure Quality Assessment
### Ramachandran Plot Data
```python
from Bio.PDB import Polypeptide
def get_phi_psi(structure):
"""Extract phi and psi angles for Ramachandran plot."""
phi_psi = []
for model in structure:
for chain in model:
polypeptides = Polypeptide.PPBuilder().build_peptides(chain)
for poly in polypeptides:
angles = poly.get_phi_psi_list()
for residue, (phi, psi) in zip(poly, angles):
if phi and psi: # Skip None values
phi_psi.append((residue.resname, phi, psi))
return phi_psi
```
### Check for Missing Atoms
```python
def check_missing_atoms(structure):
"""Identify residues with missing atoms."""
missing = []
for residue in structure.get_residues():
if residue.id[0] == " ": # Standard amino acid
resname = residue.resname
# Expected backbone atoms
expected = ["N", "CA", "C", "O"]
for atom_name in expected:
if not residue.has_id(atom_name):
missing.append((residue.full_id, atom_name))
return missing
```
## Structure Manipulation
### Select Specific Atoms
```python
from Bio.PDB import Select
class CASelect(Select):
"""Select only CA atoms."""
def accept_atom(self, atom):
return atom.name == "CA"
class ChainASelect(Select):
"""Select only chain A."""
def accept_chain(self, chain):
return chain.id == "A"
# Use with PDBIO
from Bio.PDB import PDBIO
io = PDBIO()
io.set_structure(structure)
io.save("ca_only.pdb", CASelect())
io.save("chain_a.pdb", ChainASelect())
```
### Transform Structures
```python
import numpy as np
# Rotate structure
from Bio.PDB.vectors import rotaxis
# Define rotation axis and angle
axis = Vector(1, 0, 0) # X-axis
angle = np.pi / 4 # 45 degrees
# Create rotation matrix
rotation = rotaxis(angle, axis)
# Apply rotation to all atoms
for atom in structure.get_atoms():
atom.transform(rotation, Vector(0, 0, 0))
```
### Superimpose Structures
```python
from Bio.PDB import Superimposer, PDBParser
# Parse two structures
parser = PDBParser()
structure1 = parser.get_structure("ref", "reference.pdb")
structure2 = parser.get_structure("mov", "mobile.pdb")
# Get CA atoms from both structures
ref_atoms = [atom for atom in structure1.get_atoms() if atom.name == "CA"]
mov_atoms = [atom for atom in structure2.get_atoms() if atom.name == "CA"]
# Superimpose
super_imposer = Superimposer()
super_imposer.set_atoms(ref_atoms, mov_atoms)
# Apply transformation
super_imposer.apply(structure2.get_atoms())
# Get RMSD
rmsd = super_imposer.rms
print(f"RMSD: {rmsd:.2f} Å")
# Save superimposed structure
from Bio.PDB import PDBIO
io = PDBIO()
io.set_structure(structure2)
io.save("superimposed.pdb")
```
## Writing Structure Files
### Save PDB Files
```python
from Bio.PDB import PDBIO
io = PDBIO()
io.set_structure(structure)
io.save("output.pdb")
```
### Save mmCIF Files
```python
from Bio.PDB import MMCIFIO
io = MMCIFIO()
io.set_structure(structure)
io.save("output.cif")
```
## Sequence from Structure
### Extract Sequence
```python
from Bio.PDB import Polypeptide
# Get polypeptides from structure
ppb = Polypeptide.PPBuilder()
for model in structure:
for chain in model:
for pp in ppb.build_peptides(chain):
sequence = pp.get_sequence()
print(f"Chain {chain.id}: {sequence}")
```
### Map to FASTA
```python
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
# Extract sequences and create FASTA
records = []
ppb = Polypeptide.PPBuilder()
for model in structure:
for chain in model:
for pp in ppb.build_peptides(chain):
seq_record = SeqRecord(
pp.get_sequence(),
id=f"{structure.id}_{chain.id}",
description=f"Chain {chain.id}"
)
records.append(seq_record)
SeqIO.write(records, "structure_sequences.fasta", "fasta")
```
## Best Practices
1. **Use mmCIF** for large structures and modern data
2. **Set QUIET=True** to suppress parser warnings
3. **Check for missing atoms** before analysis
4. **Use NeighborSearch** for efficient spatial queries
5. **Validate structure quality** with DSSP or Ramachandran analysis
6. **Handle multiple models** appropriately (NMR structures)
7. **Be aware of heteroatoms** - they have different residue IDs
8. **Use Select classes** for targeted structure output
9. **Cache downloaded structures** locally
10. **Consider alternative conformations** - some residues have multiple positions
## Common Use Cases
### Calculate RMSD Between Structures
```python
from Bio.PDB import PDBParser, Superimposer
parser = PDBParser()
structure1 = parser.get_structure("s1", "structure1.pdb")
structure2 = parser.get_structure("s2", "structure2.pdb")
# Get CA atoms
atoms1 = [atom for atom in structure1[0]["A"].get_atoms() if atom.name == "CA"]
atoms2 = [atom for atom in structure2[0]["A"].get_atoms() if atom.name == "CA"]
# Ensure same number of atoms
min_len = min(len(atoms1), len(atoms2))
atoms1 = atoms1[:min_len]
atoms2 = atoms2[:min_len]
# Calculate RMSD
sup = Superimposer()
sup.set_atoms(atoms1, atoms2)
print(f"RMSD: {sup.rms:.3f} Å")
```
### Find Binding Site Residues
```python
def find_binding_site(structure, ligand_chain, ligand_res_id, distance=5.0):
"""Find residues near a ligand."""
from Bio.PDB import NeighborSearch
# Get ligand atoms
ligand = structure[0][ligand_chain][ligand_res_id]
ligand_atoms = list(ligand.get_atoms())
# Get all protein atoms
protein_atoms = []
for chain in structure[0]:
if chain.id != ligand_chain:
for residue in chain:
if residue.id[0] == " ": # Standard residue
protein_atoms.extend(residue.get_atoms())
# Find nearby atoms
ns = NeighborSearch(protein_atoms)
binding_site = set()
for ligand_atom in ligand_atoms:
nearby = ns.search(ligand_atom.coord, distance, level="R")
binding_site.update(nearby)
return list(binding_site)
```
### Calculate Center of Mass
```python
import numpy as np
def center_of_mass(entity):
"""Calculate center of mass for structure entity."""
masses = []
coords = []
# Atomic masses (simplified)
mass_dict = {"C": 12.0, "N": 14.0, "O": 16.0, "S": 32.0}
for atom in entity.get_atoms():
mass = mass_dict.get(atom.element, 12.0)
masses.append(mass)
coords.append(atom.coord)
masses = np.array(masses)
coords = np.array(coords)
com = np.sum(coords * masses[:, np.newaxis], axis=0) / np.sum(masses)
return com
```