Add more scientific skills

2026-03-28 07:33:45 +08:00 · 2025-10-19 14:12:02 -07:00
parent 78d5ac2b56
commit 660c8574d0
210 changed files with 88957 additions and 1 deletions
--- a/scientific-packages/biopython/scripts/file_io.py
+++ b/scientific-packages/biopython/scripts/file_io.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""
+File I/O operations using BioPython SeqIO.
+
+This script demonstrates:
+- Reading sequences from various formats
+- Writing sequences to files
+- Converting between formats
+- Filtering and processing sequences
+- Working with large files efficiently
+"""
+
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+
+def read_sequences(filename, format_type):
+    """Read and display sequences from a file."""
+
+    print(f"Reading {format_type} file: {filename}")
+    print("-" * 60)
+
+    count = 0
+    for record in SeqIO.parse(filename, format_type):
+        count += 1
+        print(f"ID: {record.id}")
+        print(f"Name: {record.name}")
+        print(f"Description: {record.description}")
+        print(f"Sequence length: {len(record.seq)}")
+        print(f"Sequence: {record.seq[:50]}...")
+        print()
+
+        # Only show first 3 sequences
+        if count >= 3:
+            break
+
+    # Count total sequences
+    total = len(list(SeqIO.parse(filename, format_type)))
+    print(f"Total sequences in file: {total}")
+    print()
+
+
+def read_single_sequence(filename, format_type):
+    """Read a single sequence from a file."""
+
+    record = SeqIO.read(filename, format_type)
+
+    print("Single sequence record:")
+    print(f"ID: {record.id}")
+    print(f"Sequence: {record.seq}")
+    print()
+
+
+def write_sequences(records, output_filename, format_type):
+    """Write sequences to a file."""
+
+    count = SeqIO.write(records, output_filename, format_type)
+    print(f"Wrote {count} sequences to {output_filename} in {format_type} format")
+    print()
+
+
+def convert_format(input_file, input_format, output_file, output_format):
+    """Convert sequences from one format to another."""
+
+    count = SeqIO.convert(input_file, input_format, output_file, output_format)
+    print(f"Converted {count} sequences from {input_format} to {output_format}")
+    print()
+
+
+def filter_sequences(input_file, format_type, min_length=100, max_length=1000):
+    """Filter sequences by length."""
+
+    filtered = []
+
+    for record in SeqIO.parse(input_file, format_type):
+        if min_length <= len(record.seq) <= max_length:
+            filtered.append(record)
+
+    print(f"Found {len(filtered)} sequences between {min_length} and {max_length} bp")
+    return filtered
+
+
+def extract_subsequence(input_file, format_type, seq_id, start, end):
+    """Extract a subsequence from a specific record."""
+
+    # Index for efficient access
+    record_dict = SeqIO.index(input_file, format_type)
+
+    if seq_id in record_dict:
+        record = record_dict[seq_id]
+        subseq = record.seq[start:end]
+        print(f"Extracted subsequence from {seq_id} ({start}:{end}):")
+        print(subseq)
+        return subseq
+    else:
+        print(f"Sequence {seq_id} not found")
+        return None
+
+
+def create_sequence_records():
+    """Create SeqRecord objects from scratch."""
+
+    # Simple record
+    simple_record = SeqRecord(
+        Seq("ATGCATGCATGC"),
+        id="seq001",
+        name="MySequence",
+        description="Example sequence"
+    )
+
+    # Record with annotations
+    annotated_record = SeqRecord(
+        Seq("ATGGTGCATCTGACTCCTGAGGAG"),
+        id="seq002",
+        name="GeneX",
+        description="Important gene"
+    )
+    annotated_record.annotations["molecule_type"] = "DNA"
+    annotated_record.annotations["organism"] = "Homo sapiens"
+
+    return [simple_record, annotated_record]
+
+
+def index_large_file(filename, format_type):
+    """Index a large file for random access without loading into memory."""
+
+    # Create index
+    record_index = SeqIO.index(filename, format_type)
+
+    print(f"Indexed {len(record_index)} sequences")
+    print(f"Available IDs: {list(record_index.keys())[:10]}...")
+    print()
+
+    # Access specific record by ID
+    if len(record_index) > 0:
+        first_id = list(record_index.keys())[0]
+        record = record_index[first_id]
+        print(f"Accessed record: {record.id}")
+        print()
+
+    # Close index
+    record_index.close()
+
+
+def parse_with_quality_scores(fastq_file):
+    """Parse FASTQ files with quality scores."""
+
+    print("Parsing FASTQ with quality scores:")
+    print("-" * 60)
+
+    for record in SeqIO.parse(fastq_file, "fastq"):
+        print(f"ID: {record.id}")
+        print(f"Sequence: {record.seq[:50]}...")
+        print(f"Quality scores (first 10): {record.letter_annotations['phred_quality'][:10]}")
+
+        # Calculate average quality
+        avg_quality = sum(record.letter_annotations["phred_quality"]) / len(record)
+        print(f"Average quality: {avg_quality:.2f}")
+        print()
+        break  # Just show first record
+
+
+def batch_process_large_file(input_file, format_type, batch_size=100):
+    """Process large files in batches to manage memory."""
+
+    batch = []
+    count = 0
+
+    for record in SeqIO.parse(input_file, format_type):
+        batch.append(record)
+        count += 1
+
+        if len(batch) == batch_size:
+            # Process batch
+            print(f"Processing batch of {len(batch)} sequences...")
+            # Do something with batch
+            batch = []  # Clear for next batch
+
+    # Process remaining records
+    if batch:
+        print(f"Processing final batch of {len(batch)} sequences...")
+
+    print(f"Total sequences processed: {count}")
+
+
+def example_workflow():
+    """Demonstrate a complete workflow."""
+
+    print("=" * 60)
+    print("BioPython SeqIO Workflow Example")
+    print("=" * 60)
+    print()
+
+    # Create example sequences
+    records = create_sequence_records()
+
+    # Write as FASTA
+    write_sequences(records, "example_output.fasta", "fasta")
+
+    # Write as GenBank
+    write_sequences(records, "example_output.gb", "genbank")
+
+    # Convert FASTA to GenBank (would work if file exists)
+    # convert_format("input.fasta", "fasta", "output.gb", "genbank")
+
+    print("Example workflow completed!")
+
+
+if __name__ == "__main__":
+    example_workflow()
+
+    print()
+    print("Note: This script demonstrates BioPython SeqIO operations.")
+    print("Uncomment and adapt the functions for your specific files.")