Files
claude-scientific-skills/scientific-packages/biopython/scripts/file_io.py
2025-10-19 14:12:02 -07:00

216 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""
File I/O operations using BioPython SeqIO.
This script demonstrates:
- Reading sequences from various formats
- Writing sequences to files
- Converting between formats
- Filtering and processing sequences
- Working with large files efficiently
"""
from itertools import islice

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
def read_sequences(filename, format_type, preview_limit=3):
    """Print a preview of the leading records in a file, then the total count.

    The file is streamed through ``SeqIO.parse`` exactly once: the first
    ``preview_limit`` records are printed in detail and the remaining ones
    are only counted, so no list of records is ever held in memory.

    Args:
        filename: Sequence file to read; forwarded to ``SeqIO.parse``.
        format_type: SeqIO format name (for example ``"fasta"``).
        preview_limit: Number of leading records to print in detail.
    """
    print(f"Reading {format_type} file: {filename}")
    print("-" * 60)

    total = 0  # stays 0 when the file yields no records
    for total, record in enumerate(SeqIO.parse(filename, format_type), start=1):
        if total > preview_limit:
            # Past the preview window: keep iterating only to count.
            continue
        print(f"ID: {record.id}")
        print(f"Name: {record.name}")
        print(f"Description: {record.description}")
        print(f"Sequence length: {len(record.seq)}")
        print(f"Sequence: {record.seq[:50]}...")
        print()

    print(f"Total sequences in file: {total}")
    print()
def read_single_sequence(filename, format_type):
    """Load a file through ``SeqIO.read`` and print the record's ID and sequence.

    Args:
        filename: Sequence file to read; forwarded to ``SeqIO.read``.
        format_type: SeqIO format name (for example ``"fasta"``).
    """
    sole_record = SeqIO.read(filename, format_type)

    print("Single sequence record:")
    print(f"ID: {sole_record.id}")
    print(f"Sequence: {sole_record.seq}")
    print()
def write_sequences(records, output_filename, format_type):
    """Write records with ``SeqIO.write`` and report how many were written.

    Args:
        records: Records to write; forwarded to ``SeqIO.write``.
        output_filename: Destination file.
        format_type: SeqIO format name to write (for example ``"fasta"``).
    """
    written = SeqIO.write(records, output_filename, format_type)
    print(f"Wrote {written} sequences to {output_filename} in {format_type} format")
    print()
def convert_format(input_file, input_format, output_file, output_format):
    """Convert a sequence file to another format via ``SeqIO.convert``.

    Args:
        input_file: Source file to read.
        input_format: SeqIO format name of the source.
        output_file: Destination file to write.
        output_format: SeqIO format name of the destination.
    """
    converted = SeqIO.convert(
        input_file,
        input_format,
        output_file,
        output_format,
    )
    print(f"Converted {converted} sequences from {input_format} to {output_format}")
    print()
def filter_sequences(input_file, format_type, min_length=100, max_length=1000):
    """Collect the records whose sequence length lies in an inclusive range.

    Args:
        input_file: Sequence file to read; forwarded to ``SeqIO.parse``.
        format_type: SeqIO format name (for example ``"fasta"``).
        min_length: Smallest sequence length to keep (inclusive).
        max_length: Largest sequence length to keep (inclusive).

    Returns:
        A list of the records that passed the length check, in file order.
    """
    kept = [
        rec
        for rec in SeqIO.parse(input_file, format_type)
        if min_length <= len(rec.seq) <= max_length
    ]
    print(f"Found {len(kept)} sequences between {min_length} and {max_length} bp")
    return kept
def extract_subsequence(input_file, format_type, seq_id, start, end):
    """Extract the slice ``[start:end]`` of one record looked up by ID.

    The file is opened with ``SeqIO.index`` so only the requested record is
    parsed. The index is always closed before returning, on both the found
    and the not-found path.

    Args:
        input_file: Sequence file to index.
        format_type: SeqIO format name (for example ``"fasta"``).
        seq_id: ID of the record to look up in the index.
        start: Slice start applied to the record's sequence.
        end: Slice end applied to the record's sequence.

    Returns:
        The sliced sequence, or ``None`` when ``seq_id`` is not in the file.
    """
    record_dict = SeqIO.index(input_file, format_type)
    try:
        if seq_id not in record_dict:
            print(f"Sequence {seq_id} not found")
            return None
        # Slice while the index is still open; the result is used after close.
        subseq = record_dict[seq_id].seq[start:end]
    finally:
        # Release the file handle held by the index on every exit path.
        record_dict.close()

    print(f"Extracted subsequence from {seq_id} ({start}:{end}):")
    print(subseq)
    return subseq
def create_sequence_records():
    """Build two example ``SeqRecord`` objects from scratch.

    Both records carry a ``molecule_type`` annotation. Recent Biopython
    releases refuse to write GenBank output for a record without it, so
    leaving it off the simple record would make a GenBank write of the
    returned list fail.

    Returns:
        A list ``[simple_record, annotated_record]``.
    """
    # Minimal record: identifiers plus the annotation GenBank output needs.
    simple_record = SeqRecord(
        Seq("ATGCATGCATGC"),
        id="seq001",
        name="MySequence",
        description="Example sequence",
        annotations={"molecule_type": "DNA"},
    )

    # Record with additional annotations.
    annotated_record = SeqRecord(
        Seq("ATGGTGCATCTGACTCCTGAGGAG"),
        id="seq002",
        name="GeneX",
        description="Important gene",
        annotations={
            "molecule_type": "DNA",
            "organism": "Homo sapiens",
        },
    )

    return [simple_record, annotated_record]
def index_large_file(filename, format_type):
    """Index a file for random access and show a few of its record IDs.

    Only the first ten IDs are pulled from the index (via ``islice``), so
    the complete key list is never materialized. The index is closed even
    if a lookup raises.

    Args:
        filename: Sequence file to index.
        format_type: SeqIO format name (for example ``"fasta"``).
    """
    record_index = SeqIO.index(filename, format_type)
    try:
        print(f"Indexed {len(record_index)} sequences")

        # Take just a preview of the IDs instead of listing every key.
        preview_ids = list(islice(record_index, 10))
        print(f"Available IDs: {preview_ids}...")
        print()

        # Access a specific record by ID (the first one, when any exist).
        if preview_ids:
            record = record_index[preview_ids[0]]
            print(f"Accessed record: {record.id}")
            print()
    finally:
        # Release the file handle held by the index.
        record_index.close()
def parse_with_quality_scores(fastq_file):
    """Print the first FASTQ record together with its quality scores.

    Shows the ID, the first 50 bases, the first ten PHRED scores and the
    mean score of the first record only. A zero-length read has no scores
    to average, so that case is reported instead of dividing by zero.

    Args:
        fastq_file: FASTQ file to read; parsed with format ``"fastq"``.
    """
    print("Parsing FASTQ with quality scores:")
    print("-" * 60)

    for record in SeqIO.parse(fastq_file, "fastq"):
        qualities = record.letter_annotations["phred_quality"]

        print(f"ID: {record.id}")
        print(f"Sequence: {record.seq[:50]}...")
        print(f"Quality scores (first 10): {qualities[:10]}")

        if len(record) > 0:
            avg_quality = sum(qualities) / len(record)
            print(f"Average quality: {avg_quality:.2f}")
        else:
            # Empty read: the mean is undefined.
            print("Average quality: n/a (empty sequence)")
        print()
        break  # Just show first record
def batch_process_large_file(input_file, format_type, batch_size=100):
    """Stream a file and handle its records in fixed-size groups.

    Records are accumulated until ``batch_size`` of them are pending, at
    which point the group is reported and dropped. Any leftover records form
    a final, smaller group. The processing step itself is a placeholder.

    Args:
        input_file: Sequence file to read; forwarded to ``SeqIO.parse``.
        format_type: SeqIO format name (for example ``"fasta"``).
        batch_size: Number of records per full batch.
    """
    pending = []
    seen = 0

    for seen, record in enumerate(SeqIO.parse(input_file, format_type), start=1):
        pending.append(record)
        if len(pending) != batch_size:
            continue
        # A full batch is ready: real per-batch work would go here.
        print(f"Processing batch of {len(pending)} sequences...")
        pending = []

    # Whatever did not fill a complete batch is handled last.
    if pending:
        print(f"Processing final batch of {len(pending)} sequences...")

    print(f"Total sequences processed: {seen}")
def example_workflow():
    """Run the demo: build example records and write them in two formats.

    Writes ``example_output.fasta`` (FASTA) and ``example_output.gb``
    (GenBank) using the records from ``create_sequence_records``.
    """
    rule = "=" * 60
    print(rule)
    print("BioPython SeqIO Workflow Example")
    print(rule)
    print()

    demo_records = create_sequence_records()

    # Same records, one output file per format.
    outputs = (
        ("example_output.fasta", "fasta"),
        ("example_output.gb", "genbank"),
    )
    for out_path, out_format in outputs:
        write_sequences(demo_records, out_path, out_format)

    # Converting an existing FASTA file to GenBank would look like this
    # (needs a real input file, so it is left disabled):
    # convert_format("input.fasta", "fasta", "output.gb", "genbank")

    print("Example workflow completed!")
if __name__ == "__main__":
    # Script entry point: run the demo, then print a short usage hint.
    example_workflow()
    print()
    for closing_line in (
        "Note: This script demonstrates BioPython SeqIO operations.",
        "Uncomment and adapt the functions for your specific files.",
    ):
        print(closing_line)