mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-28 07:33:45 +08:00
Add more scientific skills
This commit is contained in:
215
scientific-packages/biopython/scripts/file_io.py
Normal file
215
scientific-packages/biopython/scripts/file_io.py
Normal file
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
File I/O operations using BioPython SeqIO.
|
||||
|
||||
This script demonstrates:
|
||||
- Reading sequences from various formats
|
||||
- Writing sequences to files
|
||||
- Converting between formats
|
||||
- Filtering and processing sequences
|
||||
- Working with large files efficiently
|
||||
"""
|
||||
|
||||
from Bio import SeqIO
|
||||
from Bio.Seq import Seq
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
|
||||
|
||||
def read_sequences(filename, format_type):
|
||||
"""Read and display sequences from a file."""
|
||||
|
||||
print(f"Reading {format_type} file: {filename}")
|
||||
print("-" * 60)
|
||||
|
||||
count = 0
|
||||
for record in SeqIO.parse(filename, format_type):
|
||||
count += 1
|
||||
print(f"ID: {record.id}")
|
||||
print(f"Name: {record.name}")
|
||||
print(f"Description: {record.description}")
|
||||
print(f"Sequence length: {len(record.seq)}")
|
||||
print(f"Sequence: {record.seq[:50]}...")
|
||||
print()
|
||||
|
||||
# Only show first 3 sequences
|
||||
if count >= 3:
|
||||
break
|
||||
|
||||
# Count total sequences
|
||||
total = len(list(SeqIO.parse(filename, format_type)))
|
||||
print(f"Total sequences in file: {total}")
|
||||
print()
|
||||
|
||||
|
||||
def read_single_sequence(filename, format_type):
|
||||
"""Read a single sequence from a file."""
|
||||
|
||||
record = SeqIO.read(filename, format_type)
|
||||
|
||||
print("Single sequence record:")
|
||||
print(f"ID: {record.id}")
|
||||
print(f"Sequence: {record.seq}")
|
||||
print()
|
||||
|
||||
|
||||
def write_sequences(records, output_filename, format_type):
|
||||
"""Write sequences to a file."""
|
||||
|
||||
count = SeqIO.write(records, output_filename, format_type)
|
||||
print(f"Wrote {count} sequences to {output_filename} in {format_type} format")
|
||||
print()
|
||||
|
||||
|
||||
def convert_format(input_file, input_format, output_file, output_format):
|
||||
"""Convert sequences from one format to another."""
|
||||
|
||||
count = SeqIO.convert(input_file, input_format, output_file, output_format)
|
||||
print(f"Converted {count} sequences from {input_format} to {output_format}")
|
||||
print()
|
||||
|
||||
|
||||
def filter_sequences(input_file, format_type, min_length=100, max_length=1000):
|
||||
"""Filter sequences by length."""
|
||||
|
||||
filtered = []
|
||||
|
||||
for record in SeqIO.parse(input_file, format_type):
|
||||
if min_length <= len(record.seq) <= max_length:
|
||||
filtered.append(record)
|
||||
|
||||
print(f"Found {len(filtered)} sequences between {min_length} and {max_length} bp")
|
||||
return filtered
|
||||
|
||||
|
||||
def extract_subsequence(input_file, format_type, seq_id, start, end):
|
||||
"""Extract a subsequence from a specific record."""
|
||||
|
||||
# Index for efficient access
|
||||
record_dict = SeqIO.index(input_file, format_type)
|
||||
|
||||
if seq_id in record_dict:
|
||||
record = record_dict[seq_id]
|
||||
subseq = record.seq[start:end]
|
||||
print(f"Extracted subsequence from {seq_id} ({start}:{end}):")
|
||||
print(subseq)
|
||||
return subseq
|
||||
else:
|
||||
print(f"Sequence {seq_id} not found")
|
||||
return None
|
||||
|
||||
|
||||
def create_sequence_records():
|
||||
"""Create SeqRecord objects from scratch."""
|
||||
|
||||
# Simple record
|
||||
simple_record = SeqRecord(
|
||||
Seq("ATGCATGCATGC"),
|
||||
id="seq001",
|
||||
name="MySequence",
|
||||
description="Example sequence"
|
||||
)
|
||||
|
||||
# Record with annotations
|
||||
annotated_record = SeqRecord(
|
||||
Seq("ATGGTGCATCTGACTCCTGAGGAG"),
|
||||
id="seq002",
|
||||
name="GeneX",
|
||||
description="Important gene"
|
||||
)
|
||||
annotated_record.annotations["molecule_type"] = "DNA"
|
||||
annotated_record.annotations["organism"] = "Homo sapiens"
|
||||
|
||||
return [simple_record, annotated_record]
|
||||
|
||||
|
||||
def index_large_file(filename, format_type):
|
||||
"""Index a large file for random access without loading into memory."""
|
||||
|
||||
# Create index
|
||||
record_index = SeqIO.index(filename, format_type)
|
||||
|
||||
print(f"Indexed {len(record_index)} sequences")
|
||||
print(f"Available IDs: {list(record_index.keys())[:10]}...")
|
||||
print()
|
||||
|
||||
# Access specific record by ID
|
||||
if len(record_index) > 0:
|
||||
first_id = list(record_index.keys())[0]
|
||||
record = record_index[first_id]
|
||||
print(f"Accessed record: {record.id}")
|
||||
print()
|
||||
|
||||
# Close index
|
||||
record_index.close()
|
||||
|
||||
|
||||
def parse_with_quality_scores(fastq_file):
|
||||
"""Parse FASTQ files with quality scores."""
|
||||
|
||||
print("Parsing FASTQ with quality scores:")
|
||||
print("-" * 60)
|
||||
|
||||
for record in SeqIO.parse(fastq_file, "fastq"):
|
||||
print(f"ID: {record.id}")
|
||||
print(f"Sequence: {record.seq[:50]}...")
|
||||
print(f"Quality scores (first 10): {record.letter_annotations['phred_quality'][:10]}")
|
||||
|
||||
# Calculate average quality
|
||||
avg_quality = sum(record.letter_annotations["phred_quality"]) / len(record)
|
||||
print(f"Average quality: {avg_quality:.2f}")
|
||||
print()
|
||||
break # Just show first record
|
||||
|
||||
|
||||
def batch_process_large_file(input_file, format_type, batch_size=100):
|
||||
"""Process large files in batches to manage memory."""
|
||||
|
||||
batch = []
|
||||
count = 0
|
||||
|
||||
for record in SeqIO.parse(input_file, format_type):
|
||||
batch.append(record)
|
||||
count += 1
|
||||
|
||||
if len(batch) == batch_size:
|
||||
# Process batch
|
||||
print(f"Processing batch of {len(batch)} sequences...")
|
||||
# Do something with batch
|
||||
batch = [] # Clear for next batch
|
||||
|
||||
# Process remaining records
|
||||
if batch:
|
||||
print(f"Processing final batch of {len(batch)} sequences...")
|
||||
|
||||
print(f"Total sequences processed: {count}")
|
||||
|
||||
|
||||
def example_workflow():
|
||||
"""Demonstrate a complete workflow."""
|
||||
|
||||
print("=" * 60)
|
||||
print("BioPython SeqIO Workflow Example")
|
||||
print("=" * 60)
|
||||
print()
|
||||
|
||||
# Create example sequences
|
||||
records = create_sequence_records()
|
||||
|
||||
# Write as FASTA
|
||||
write_sequences(records, "example_output.fasta", "fasta")
|
||||
|
||||
# Write as GenBank
|
||||
write_sequences(records, "example_output.gb", "genbank")
|
||||
|
||||
# Convert FASTA to GenBank (would work if file exists)
|
||||
# convert_format("input.fasta", "fasta", "output.gb", "genbank")
|
||||
|
||||
print("Example workflow completed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
example_workflow()
|
||||
|
||||
print()
|
||||
print("Note: This script demonstrates BioPython SeqIO operations.")
|
||||
print("Uncomment and adapt the functions for your specific files.")
|
||||
Reference in New Issue
Block a user