claude-scientific-skills/scientific-packages/biopython/scripts/file_io.py

#!/usr/bin/env python3
"""
File I/O operations using BioPython SeqIO.

This script demonstrates:
- Reading sequences from various formats
- Writing sequences to files
- Converting between formats
- Filtering and processing sequences
- Working with large files efficiently
"""

from itertools import islice

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def read_sequences(filename, format_type):
    """Read a sequence file, preview the first records and report the total.

    Prints the ID, name, description, length and first 50 residues of up to
    three records, followed by the number of records found in the file.

    Args:
        filename: Input passed straight to ``SeqIO.parse``.
        format_type: SeqIO format name passed to ``SeqIO.parse``
            (the workflow in this file uses ``"fasta"`` and ``"genbank"``).
    """
    preview_limit = 3

    print(f"Reading {format_type} file: {filename}")
    print("-" * 60)

    # One streaming pass both previews and counts. The file is parsed once,
    # no list of all records is built, and the total stays correct even if
    # the input can only be consumed once.
    total = 0
    for total, record in enumerate(SeqIO.parse(filename, format_type), start=1):
        if total > preview_limit:
            # Past the preview window: keep counting, print nothing.
            continue

        print(f"ID: {record.id}")
        print(f"Name: {record.name}")
        print(f"Description: {record.description}")
        print(f"Sequence length: {len(record.seq)}")
        print(f"Sequence: {record.seq[:50]}...")
        print()

    print(f"Total sequences in file: {total}")
    print()


def read_single_sequence(filename, format_type):
    """Load one record with ``SeqIO.read`` and print its ID and sequence.

    Args:
        filename: Input passed to ``SeqIO.read``.
        format_type: SeqIO format name passed to ``SeqIO.read``.
    """
    sole_record = SeqIO.read(filename, format_type)
    identifier = sole_record.id
    sequence = sole_record.seq

    print("Single sequence record:")
    print(f"ID: {identifier}")
    print(f"Sequence: {sequence}")
    print()


def write_sequences(records, output_filename, format_type):
    """Save records with ``SeqIO.write`` and report how many were written.

    Args:
        records: Records handed to ``SeqIO.write``.
        output_filename: Destination passed to ``SeqIO.write``.
        format_type: SeqIO format name for the output.
    """
    written = SeqIO.write(records, output_filename, format_type)
    summary = f"Wrote {written} sequences to {output_filename} in {format_type} format"
    print(summary)
    print()


def convert_format(input_file, input_format, output_file, output_format):
    """Convert a file between formats via ``SeqIO.convert`` and print the count.

    Args:
        input_file: Source passed to ``SeqIO.convert``.
        input_format: SeqIO format name of the source.
        output_file: Destination passed to ``SeqIO.convert``.
        output_format: SeqIO format name of the destination.
    """
    converted = SeqIO.convert(input_file, input_format, output_file, output_format)
    summary = f"Converted {converted} sequences from {input_format} to {output_format}"
    print(summary)
    print()


def filter_sequences(input_file, format_type, min_length=100, max_length=1000):
    """Collect the records whose sequence length lies within the given bounds.

    Args:
        input_file: Input passed to ``SeqIO.parse``.
        format_type: SeqIO format name passed to ``SeqIO.parse``.
        min_length: Smallest sequence length to keep (inclusive).
        max_length: Largest sequence length to keep (inclusive).

    Returns:
        A list of the matching records, in file order.
    """
    kept = [
        rec
        for rec in SeqIO.parse(input_file, format_type)
        if min_length <= len(rec.seq) <= max_length
    ]

    print(f"Found {len(kept)} sequences between {min_length} and {max_length} bp")
    return kept


def extract_subsequence(input_file, format_type, seq_id, start, end):
    """Extract a subsequence from a specific record.

    Looks the record up through a ``SeqIO.index`` of the file, slices its
    sequence and prints the result.

    Args:
        input_file: File passed to ``SeqIO.index``.
        format_type: SeqIO format name passed to ``SeqIO.index``.
        seq_id: Key of the record to look up in the index.
        start: Slice start applied to the record's sequence.
        end: Slice end applied to the record's sequence.

    Returns:
        ``record.seq[start:end]`` for the requested record, or ``None`` when
        ``seq_id`` is not in the index.
    """
    # Index for efficient access
    record_dict = SeqIO.index(input_file, format_type)
    try:
        if seq_id not in record_dict:
            print(f"Sequence {seq_id} not found")
            return None
        subseq = record_dict[seq_id].seq[start:end]
    finally:
        # Always release the index, including on the not-found path and when
        # the lookup or slice raises; previously it was never closed.
        record_dict.close()

    print(f"Extracted subsequence from {seq_id} ({start}:{end}):")
    print(subseq)
    return subseq


def create_sequence_records():
    """Create SeqRecord objects from scratch.

    Returns:
        A list of two records: a minimal one and one that additionally
        carries an ``organism`` annotation. Both are annotated as DNA.
    """
    # Simple record. ``molecule_type`` is set even here because Biopython's
    # GenBank writer rejects records without it ("missing molecule_type in
    # annotations"), and these records are written as GenBank by the example
    # workflow.
    simple_record = SeqRecord(
        Seq("ATGCATGCATGC"),
        id="seq001",
        name="MySequence",
        description="Example sequence",
        annotations={"molecule_type": "DNA"},
    )

    # Record with additional annotations
    annotated_record = SeqRecord(
        Seq("ATGGTGCATCTGACTCCTGAGGAG"),
        id="seq002",
        name="GeneX",
        description="Important gene",
    )
    annotated_record.annotations["molecule_type"] = "DNA"
    annotated_record.annotations["organism"] = "Homo sapiens"

    return [simple_record, annotated_record]


def index_large_file(filename, format_type):
    """Index a large file for random access without loading into memory.

    Prints the number of indexed records, up to ten of their IDs, and the ID
    of the first record after fetching it through the index.

    Args:
        filename: File passed to ``SeqIO.index``.
        format_type: SeqIO format name passed to ``SeqIO.index``.
    """
    # Create index
    record_index = SeqIO.index(filename, format_type)
    try:
        print(f"Indexed {len(record_index)} sequences")

        # Pull only the IDs that are displayed instead of copying every key
        # of a potentially huge index into a list (twice).
        preview_ids = list(islice(record_index.keys(), 10))
        print(f"Available IDs: {preview_ids}...")
        print()

        # Access specific record by ID
        if preview_ids:
            record = record_index[preview_ids[0]]
            print(f"Accessed record: {record.id}")
            print()
    finally:
        # Close index even if printing or the record lookup fails.
        record_index.close()


def parse_with_quality_scores(fastq_file):
    """Parse a FASTQ file and summarise the quality scores of its first record.

    Prints the ID, the first 50 bases, the first ten ``phred_quality`` values
    and their mean for the first record only; later records are not read.

    Args:
        fastq_file: Input passed to ``SeqIO.parse`` with the ``"fastq"`` format.
    """
    print("Parsing FASTQ with quality scores:")
    print("-" * 60)

    for record in SeqIO.parse(fastq_file, "fastq"):
        print(f"ID: {record.id}")
        print(f"Sequence: {record.seq[:50]}...")

        qualities = record.letter_annotations["phred_quality"]
        print(f"Quality scores (first 10): {qualities[:10]}")

        # Calculate average quality. A zero-length read has no scores, so
        # guard the division instead of raising ZeroDivisionError.
        if qualities:
            avg_quality = sum(qualities) / len(qualities)
            print(f"Average quality: {avg_quality:.2f}")
        else:
            print("Average quality: n/a (empty read)")
        print()
        break  # Just show first record


def batch_process_large_file(input_file, format_type, batch_size=100):
    """Stream a file and handle its records in fixed-size groups.

    Records are accumulated until ``batch_size`` of them are pending, at which
    point the group is reported and dropped. Whatever is left over after the
    last full group is reported as a final batch, followed by the overall
    record count.

    Args:
        input_file: Input passed to ``SeqIO.parse``.
        format_type: SeqIO format name passed to ``SeqIO.parse``.
        batch_size: Number of records that make up a full batch.
    """
    pending = []
    total = 0

    for total, record in enumerate(SeqIO.parse(input_file, format_type), start=1):
        pending.append(record)
        if len(pending) != batch_size:
            continue

        # A full batch is ready: this is where real processing would go.
        print(f"Processing batch of {len(pending)} sequences...")
        pending = []  # Start collecting the next batch

    # Leftover records that did not fill a whole batch
    if pending:
        print(f"Processing final batch of {len(pending)} sequences...")

    print(f"Total sequences processed: {total}")


def example_workflow():
    """Run the demo: build example records and write them as FASTA and GenBank."""
    banner = "=" * 60
    print(banner)
    print("BioPython SeqIO Workflow Example")
    print(banner)
    print()

    # Build the in-memory example records once and reuse them for each output.
    records = create_sequence_records()

    # (output path, SeqIO format) pairs, written in this order.
    outputs = (
        ("example_output.fasta", "fasta"),
        ("example_output.gb", "genbank"),
    )
    for path, fmt in outputs:
        write_sequences(records, path, fmt)

    # Converting FASTA to GenBank would work if the input file exists:
    # convert_format("input.fasta", "fasta", "output.gb", "genbank")

    print("Example workflow completed!")


if __name__ == "__main__":
    # Script entry point: run the demo, then print a short usage note.
    example_workflow()

    print()
    for note_line in (
        "Note: This script demonstrates BioPython SeqIO operations.",
        "Uncomment and adapt the functions for your specific files.",
    ):
        print(note_line)