mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-01-26 16:58:56 +08:00
216 lines
5.9 KiB
Python
216 lines
5.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
File I/O operations using BioPython SeqIO.
|
|
|
|
This script demonstrates:
|
|
- Reading sequences from various formats
|
|
- Writing sequences to files
|
|
- Converting between formats
|
|
- Filtering and processing sequences
|
|
- Working with large files efficiently
|
|
"""
|
|
|
|
from Bio import SeqIO
|
|
from Bio.Seq import Seq
|
|
from Bio.SeqRecord import SeqRecord
|
|
|
|
|
|
def read_sequences(filename, format_type):
|
|
"""Read and display sequences from a file."""
|
|
|
|
print(f"Reading {format_type} file: {filename}")
|
|
print("-" * 60)
|
|
|
|
count = 0
|
|
for record in SeqIO.parse(filename, format_type):
|
|
count += 1
|
|
print(f"ID: {record.id}")
|
|
print(f"Name: {record.name}")
|
|
print(f"Description: {record.description}")
|
|
print(f"Sequence length: {len(record.seq)}")
|
|
print(f"Sequence: {record.seq[:50]}...")
|
|
print()
|
|
|
|
# Only show first 3 sequences
|
|
if count >= 3:
|
|
break
|
|
|
|
# Count total sequences
|
|
total = len(list(SeqIO.parse(filename, format_type)))
|
|
print(f"Total sequences in file: {total}")
|
|
print()
|
|
|
|
|
|
def read_single_sequence(filename, format_type):
|
|
"""Read a single sequence from a file."""
|
|
|
|
record = SeqIO.read(filename, format_type)
|
|
|
|
print("Single sequence record:")
|
|
print(f"ID: {record.id}")
|
|
print(f"Sequence: {record.seq}")
|
|
print()
|
|
|
|
|
|
def write_sequences(records, output_filename, format_type):
|
|
"""Write sequences to a file."""
|
|
|
|
count = SeqIO.write(records, output_filename, format_type)
|
|
print(f"Wrote {count} sequences to {output_filename} in {format_type} format")
|
|
print()
|
|
|
|
|
|
def convert_format(input_file, input_format, output_file, output_format):
|
|
"""Convert sequences from one format to another."""
|
|
|
|
count = SeqIO.convert(input_file, input_format, output_file, output_format)
|
|
print(f"Converted {count} sequences from {input_format} to {output_format}")
|
|
print()
|
|
|
|
|
|
def filter_sequences(input_file, format_type, min_length=100, max_length=1000):
|
|
"""Filter sequences by length."""
|
|
|
|
filtered = []
|
|
|
|
for record in SeqIO.parse(input_file, format_type):
|
|
if min_length <= len(record.seq) <= max_length:
|
|
filtered.append(record)
|
|
|
|
print(f"Found {len(filtered)} sequences between {min_length} and {max_length} bp")
|
|
return filtered
|
|
|
|
|
|
def extract_subsequence(input_file, format_type, seq_id, start, end):
|
|
"""Extract a subsequence from a specific record."""
|
|
|
|
# Index for efficient access
|
|
record_dict = SeqIO.index(input_file, format_type)
|
|
|
|
if seq_id in record_dict:
|
|
record = record_dict[seq_id]
|
|
subseq = record.seq[start:end]
|
|
print(f"Extracted subsequence from {seq_id} ({start}:{end}):")
|
|
print(subseq)
|
|
return subseq
|
|
else:
|
|
print(f"Sequence {seq_id} not found")
|
|
return None
|
|
|
|
|
|
def create_sequence_records():
|
|
"""Create SeqRecord objects from scratch."""
|
|
|
|
# Simple record
|
|
simple_record = SeqRecord(
|
|
Seq("ATGCATGCATGC"),
|
|
id="seq001",
|
|
name="MySequence",
|
|
description="Example sequence"
|
|
)
|
|
|
|
# Record with annotations
|
|
annotated_record = SeqRecord(
|
|
Seq("ATGGTGCATCTGACTCCTGAGGAG"),
|
|
id="seq002",
|
|
name="GeneX",
|
|
description="Important gene"
|
|
)
|
|
annotated_record.annotations["molecule_type"] = "DNA"
|
|
annotated_record.annotations["organism"] = "Homo sapiens"
|
|
|
|
return [simple_record, annotated_record]
|
|
|
|
|
|
def index_large_file(filename, format_type):
|
|
"""Index a large file for random access without loading into memory."""
|
|
|
|
# Create index
|
|
record_index = SeqIO.index(filename, format_type)
|
|
|
|
print(f"Indexed {len(record_index)} sequences")
|
|
print(f"Available IDs: {list(record_index.keys())[:10]}...")
|
|
print()
|
|
|
|
# Access specific record by ID
|
|
if len(record_index) > 0:
|
|
first_id = list(record_index.keys())[0]
|
|
record = record_index[first_id]
|
|
print(f"Accessed record: {record.id}")
|
|
print()
|
|
|
|
# Close index
|
|
record_index.close()
|
|
|
|
|
|
def parse_with_quality_scores(fastq_file):
|
|
"""Parse FASTQ files with quality scores."""
|
|
|
|
print("Parsing FASTQ with quality scores:")
|
|
print("-" * 60)
|
|
|
|
for record in SeqIO.parse(fastq_file, "fastq"):
|
|
print(f"ID: {record.id}")
|
|
print(f"Sequence: {record.seq[:50]}...")
|
|
print(f"Quality scores (first 10): {record.letter_annotations['phred_quality'][:10]}")
|
|
|
|
# Calculate average quality
|
|
avg_quality = sum(record.letter_annotations["phred_quality"]) / len(record)
|
|
print(f"Average quality: {avg_quality:.2f}")
|
|
print()
|
|
break # Just show first record
|
|
|
|
|
|
def batch_process_large_file(input_file, format_type, batch_size=100):
|
|
"""Process large files in batches to manage memory."""
|
|
|
|
batch = []
|
|
count = 0
|
|
|
|
for record in SeqIO.parse(input_file, format_type):
|
|
batch.append(record)
|
|
count += 1
|
|
|
|
if len(batch) == batch_size:
|
|
# Process batch
|
|
print(f"Processing batch of {len(batch)} sequences...")
|
|
# Do something with batch
|
|
batch = [] # Clear for next batch
|
|
|
|
# Process remaining records
|
|
if batch:
|
|
print(f"Processing final batch of {len(batch)} sequences...")
|
|
|
|
print(f"Total sequences processed: {count}")
|
|
|
|
|
|
def example_workflow():
|
|
"""Demonstrate a complete workflow."""
|
|
|
|
print("=" * 60)
|
|
print("BioPython SeqIO Workflow Example")
|
|
print("=" * 60)
|
|
print()
|
|
|
|
# Create example sequences
|
|
records = create_sequence_records()
|
|
|
|
# Write as FASTA
|
|
write_sequences(records, "example_output.fasta", "fasta")
|
|
|
|
# Write as GenBank
|
|
write_sequences(records, "example_output.gb", "genbank")
|
|
|
|
# Convert FASTA to GenBank (would work if file exists)
|
|
# convert_format("input.fasta", "fasta", "output.gb", "genbank")
|
|
|
|
print("Example workflow completed!")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
example_workflow()
|
|
|
|
print()
|
|
print("Note: This script demonstrates BioPython SeqIO operations.")
|
|
print("Uncomment and adapt the functions for your specific files.")
|