From 152d0d54de880fe54d07f74b46167829e4048e6d Mon Sep 17 00:00:00 2001 From: Timothy Kassis Date: Sun, 19 Oct 2025 14:01:29 -0700 Subject: [PATCH] Initial commit --- .claude-plugin | 33 ++ LICENSE.md | 56 ++ .../pubchem-database/SKILL.md | 557 ++++++++++++++++++ .../references/api_reference.md | 440 ++++++++++++++ .../scripts/bioactivity_query.py | 367 ++++++++++++ .../scripts/compound_search.py | 297 ++++++++++ scientific-packages/anndata/SKILL.md | 527 +++++++++++++++++ .../anndata/references/api_reference.md | 218 +++++++ .../anndata/references/concatenation_guide.md | 478 +++++++++++++++ .../references/workflows_best_practices.md | 438 ++++++++++++++ scientific-packages/arboreto/SKILL.md | 415 +++++++++++++ .../arboreto/references/api_reference.md | 271 +++++++++ .../arboreto/scripts/basic_grn_inference.py | 110 ++++ .../arboreto/scripts/compare_algorithms.py | 205 +++++++ .../arboreto/scripts/distributed_inference.py | 157 +++++ 15 files changed, 4569 insertions(+) create mode 100644 .claude-plugin create mode 100644 LICENSE.md create mode 100644 scientific-databases/pubchem-database/SKILL.md create mode 100644 scientific-databases/pubchem-database/references/api_reference.md create mode 100644 scientific-databases/pubchem-database/scripts/bioactivity_query.py create mode 100644 scientific-databases/pubchem-database/scripts/compound_search.py create mode 100644 scientific-packages/anndata/SKILL.md create mode 100644 scientific-packages/anndata/references/api_reference.md create mode 100644 scientific-packages/anndata/references/concatenation_guide.md create mode 100644 scientific-packages/anndata/references/workflows_best_practices.md create mode 100644 scientific-packages/arboreto/SKILL.md create mode 100644 scientific-packages/arboreto/references/api_reference.md create mode 100644 scientific-packages/arboreto/scripts/basic_grn_inference.py create mode 100644 scientific-packages/arboreto/scripts/compare_algorithms.py create mode 100644 scientific-packages/arboreto/scripts/distributed_inference.py diff --git a/.claude-plugin b/.claude-plugin new file mode 100644 index 0000000..4f76c55 --- /dev/null +++ b/.claude-plugin @@ -0,0 +1,33 @@ + +{ + "name": "claude-scientific-skills", + "owner": { + "name": "Timothy Kassis", + "email": "timothy.kassis@k-dense.ai" + }, + "metadata": { + "description": "Claude scientific skills from K-Dense Inc", + "version": "1.0.0" + }, + "plugins": [ + { + "name": "scientific-packages", + "description": "Collection of python scientific packages", + "source": "./", + "strict": false, + "skills": [ + "./scientific-packages/anndata", + "./scientific-packages/arboreto" + ] + }, + { + "name": "scientific-databases", + "description": "Collection of scientific databases", + "source": "./", + "strict": false, + "skills": [ + "./scientific-databases/pubchem-database" + ] + } + ] +} diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..9bc4adf --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,56 @@ +# PolyForm Noncommercial License 1.0.0 + + +> **Required Notice:** Copyright © K-Dense Inc. (https://k-dense.ai) + +## Acceptance +In order to get any license under these terms, you must agree to them as both strict obligations and conditions to all your licenses. + +## Copyright License +The licensor (**K-Dense Inc.**) grants you a copyright license for the software to do everything you might do with the software that would otherwise infringe the licensor's copyright in it for any permitted purpose. 
However, you may only distribute the software according to [Distribution License](#distribution-license) and make changes or new works based on the software according to [Changes and New Works License](#changes-and-new-works-license). + +## Distribution License +The licensor (**K-Dense Inc.**) grants you an additional copyright license to distribute copies of the software. Your license to distribute covers distributing the software with changes and new works permitted by [Changes and New Works License](#changes-and-new-works-license). + +## Notices +You must ensure that anyone who gets a copy of any part of the software from you also gets a copy of these terms or the URL for them above, as well as copies of any plain-text lines beginning with `Required Notice:` that the licensor provided with the software. For example: + +> Required Notice: Copyright © K-Dense Inc. (https://k-dense.ai) + +## Changes and New Works License +The licensor (**K-Dense Inc.**) grants you an additional copyright license to make changes and new works based on the software for any permitted purpose. + +## Patent License +The licensor (**K-Dense Inc.**) grants you a patent license for the software that covers patent claims the licensor can license, or becomes able to license, that you would infringe by using the software. + +## Noncommercial Purposes +Any noncommercial purpose is a permitted purpose. + +## Personal Uses +Personal use for research, experiment, and testing for the benefit of public knowledge, personal study, private entertainment, hobby projects, amateur pursuits, or religious observance, without any anticipated commercial application, is use for a permitted purpose. + +## Noncommercial Organizations +Use by any charitable organization, educational institution, public research organization, public safety or health organization, environmental protection organization, or government institution is use for a permitted purpose regardless of the source of funding or obligations resulting from the funding. + +## Fair Use +You may have "fair use" rights for the software under the law. These terms do not limit them. + +## No Other Rights +These terms do not allow you to sublicense or transfer any of your licenses to anyone else, or prevent the licensor (**K-Dense Inc.**) from granting licenses to anyone else. These terms do not imply any other licenses. + +## Patent Defense +If you make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company. + +## Violations +The first time you are notified in writing that you have violated any of these terms, or done anything with the software not covered by your licenses, your licenses can nonetheless continue if you come into full compliance with these terms and take practical steps to correct past violations within 32 days of receiving notice. Otherwise, all your licenses end immediately. + +## No Liability +***As far as the law allows, the software comes as is, without any warranty or condition, and K-Dense Inc. will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim.*** + +## Definitions +The **licensor** is **K-Dense Inc.**, the individual or entity offering these terms, and the **software** is the software K-Dense Inc. makes available under these terms. 
+ +**You** refers to the individual or entity agreeing to these terms. +**Your company** is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. **Control** means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect. +**Your licenses** are all the licenses granted to you for the software under these terms. +**Use** means anything you do with the software requiring one of your licenses. diff --git a/scientific-databases/pubchem-database/SKILL.md b/scientific-databases/pubchem-database/SKILL.md new file mode 100644 index 0000000..bd95b1c --- /dev/null +++ b/scientific-databases/pubchem-database/SKILL.md @@ -0,0 +1,557 @@ +--- +name: pubchem-database +description: Access chemical compound data from PubChem, the world's largest free chemical database. This skill should be used when retrieving compound properties, searching for chemicals by name/SMILES/InChI, performing similarity or substructure searches, accessing bioactivity data, converting between chemical formats, or generating chemical structure images. Works with over 110 million compounds and 270 million bioactivities through PUG-REST API and PubChemPy library. +--- + +# PubChem Database + +## Overview + +PubChem is the world's largest freely available chemical database maintained by the National Center for Biotechnology Information (NCBI). It contains over 110 million unique chemical structures and over 270 million bioactivities from more than 770 data sources. This skill provides guidance for programmatically accessing PubChem data using the PUG-REST API and PubChemPy Python library. + +## Core Capabilities + +### 1. Chemical Structure Search + +Search for compounds using multiple identifier types: + +**By Chemical Name**: +```python +import pubchempy as pcp +compounds = pcp.get_compounds('aspirin', 'name') +compound = compounds[0] +``` + +**By CID (Compound ID)**: +```python +compound = pcp.Compound.from_cid(2244) # Aspirin +``` + +**By SMILES**: +```python +compound = pcp.get_compounds('CC(=O)OC1=CC=CC=C1C(=O)O', 'smiles')[0] +``` + +**By InChI**: +```python +compound = pcp.get_compounds('InChI=1S/C9H8O4/...', 'inchi')[0] +``` + +**By Molecular Formula**: +```python +compounds = pcp.get_compounds('C9H8O4', 'formula') +# Returns all compounds matching this formula +``` + +### 2. 
Property Retrieval + +Retrieve molecular properties for compounds using either high-level or low-level approaches: + +**Using PubChemPy (Recommended)**: +```python +import pubchempy as pcp + +# Get compound object with all properties +compound = pcp.get_compounds('caffeine', 'name')[0] + +# Access individual properties +molecular_formula = compound.molecular_formula +molecular_weight = compound.molecular_weight +iupac_name = compound.iupac_name +smiles = compound.canonical_smiles +inchi = compound.inchi +xlogp = compound.xlogp # Partition coefficient +tpsa = compound.tpsa # Topological polar surface area +``` + +**Get Specific Properties**: +```python +# Request only specific properties +properties = pcp.get_properties( + ['MolecularFormula', 'MolecularWeight', 'CanonicalSMILES', 'XLogP'], + 'aspirin', + 'name' +) +# Returns list of dictionaries +``` + +**Batch Property Retrieval**: +```python +import pandas as pd + +compound_names = ['aspirin', 'ibuprofen', 'paracetamol'] +all_properties = [] + +for name in compound_names: + props = pcp.get_properties( + ['MolecularFormula', 'MolecularWeight', 'XLogP'], + name, + 'name' + ) + all_properties.extend(props) + +df = pd.DataFrame(all_properties) +``` + +**Available Properties**: MolecularFormula, MolecularWeight, CanonicalSMILES, IsomericSMILES, InChI, InChIKey, IUPACName, XLogP, TPSA, HBondDonorCount, HBondAcceptorCount, RotatableBondCount, Complexity, Charge, and many more (see `references/api_reference.md` for complete list). + +### 3. Similarity Search + +Find structurally similar compounds using Tanimoto similarity: + +```python +import pubchempy as pcp + +# Start with a query compound +query_compound = pcp.get_compounds('gefitinib', 'name')[0] +query_smiles = query_compound.canonical_smiles + +# Perform similarity search +similar_compounds = pcp.get_compounds( + query_smiles, + 'smiles', + searchtype='similarity', + Threshold=85, # Similarity threshold (0-100) + MaxRecords=50 +) + +# Process results +for compound in similar_compounds[:10]: + print(f"CID {compound.cid}: {compound.iupac_name}") + print(f" MW: {compound.molecular_weight}") +``` + +**Note**: Similarity searches are asynchronous for large queries and may take 15-30 seconds to complete. PubChemPy handles the asynchronous pattern automatically. + +### 4. Substructure Search + +Find compounds containing a specific structural motif: + +```python +import pubchempy as pcp + +# Search for compounds containing pyridine ring +pyridine_smiles = 'c1ccncc1' + +matches = pcp.get_compounds( + pyridine_smiles, + 'smiles', + searchtype='substructure', + MaxRecords=100 +) + +print(f"Found {len(matches)} compounds containing pyridine") +``` + +**Common Substructures**: +- Benzene ring: `c1ccccc1` +- Pyridine: `c1ccncc1` +- Phenol: `c1ccc(O)cc1` +- Carboxylic acid: `C(=O)O` + +### 5. Format Conversion + +Convert between different chemical structure formats: + +```python +import pubchempy as pcp + +compound = pcp.get_compounds('aspirin', 'name')[0] + +# Convert to different formats +smiles = compound.canonical_smiles +inchi = compound.inchi +inchikey = compound.inchikey +cid = compound.cid + +# Download structure files +pcp.download('SDF', 'aspirin', 'name', 'aspirin.sdf', overwrite=True) +pcp.download('JSON', '2244', 'cid', 'aspirin.json', overwrite=True) +``` + +### 6. 
Structure Visualization + +Generate 2D structure images: + +```python +import pubchempy as pcp + +# Download compound structure as PNG +pcp.download('PNG', 'caffeine', 'name', 'caffeine.png', overwrite=True) + +# Using direct URL (via requests) +import requests + +cid = 2244 # Aspirin +url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/PNG?image_size=large" +response = requests.get(url) + +with open('structure.png', 'wb') as f: + f.write(response.content) +``` + +### 7. Synonym Retrieval + +Get all known names and synonyms for a compound: + +```python +import pubchempy as pcp + +synonyms_data = pcp.get_synonyms('aspirin', 'name') + +if synonyms_data: + cid = synonyms_data[0]['CID'] + synonyms = synonyms_data[0]['Synonym'] + + print(f"CID {cid} has {len(synonyms)} synonyms:") + for syn in synonyms[:10]: # First 10 + print(f" - {syn}") +``` + +### 8. Bioactivity Data Access + +Retrieve biological activity data from assays: + +```python +import requests +import json + +# Get bioassay summary for a compound +cid = 2244 # Aspirin +url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/assaysummary/JSON" + +response = requests.get(url) +if response.status_code == 200: + data = response.json() + # Process bioassay information + table = data.get('Table', {}) + rows = table.get('Row', []) + print(f"Found {len(rows)} bioassay records") +``` + +**For more complex bioactivity queries**, use the `scripts/bioactivity_query.py` helper script which provides: +- Bioassay summaries with activity outcome filtering +- Assay target identification +- Search for compounds by biological target +- Active compound lists for specific assays + +### 9. Comprehensive Compound Annotations + +Access detailed compound information through PUG-View: + +```python +import requests + +cid = 2244 +url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON" + +response = requests.get(url) +if response.status_code == 200: + annotations = response.json() + # Contains extensive data including: + # - Chemical and Physical Properties + # - Drug and Medication Information + # - Pharmacology and Biochemistry + # - Safety and Hazards + # - Toxicity + # - Literature references + # - Patents +``` + +**Get Specific Section**: +```python +# Get only drug information +url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON?heading=Drug and Medication Information" +``` + +## Installation Requirements + +Install PubChemPy for Python-based access: + +```bash +pip install pubchempy +``` + +For direct API access and bioactivity queries: + +```bash +pip install requests +``` + +Optional for data analysis: + +```bash +pip install pandas +``` + +## Helper Scripts + +This skill includes Python scripts for common PubChem tasks: + +### scripts/compound_search.py + +Provides utility functions for searching and retrieving compound information: + +**Key Functions**: +- `search_by_name(name, max_results=10)`: Search compounds by name +- `search_by_smiles(smiles)`: Search by SMILES string +- `get_compound_by_cid(cid)`: Retrieve compound by CID +- `get_compound_properties(identifier, namespace, properties)`: Get specific properties +- `similarity_search(smiles, threshold, max_records)`: Perform similarity search +- `substructure_search(smiles, max_records)`: Perform substructure search +- `get_synonyms(identifier, namespace)`: Get all synonyms +- `batch_search(identifiers, namespace, properties)`: Batch search multiple compounds +- `download_structure(identifier, namespace, format, 
filename)`: Download structures +- `print_compound_info(compound)`: Print formatted compound information + +**Usage**: +```python +from scripts.compound_search import search_by_name, get_compound_properties + +# Search for a compound +compounds = search_by_name('ibuprofen') + +# Get specific properties +props = get_compound_properties('aspirin', 'name', ['MolecularWeight', 'XLogP']) +``` + +### scripts/bioactivity_query.py + +Provides functions for retrieving biological activity data: + +**Key Functions**: +- `get_bioassay_summary(cid)`: Get bioassay summary for compound +- `get_compound_bioactivities(cid, activity_outcome)`: Get filtered bioactivities +- `get_assay_description(aid)`: Get detailed assay information +- `get_assay_targets(aid)`: Get biological targets for assay +- `search_assays_by_target(target_name, max_results)`: Find assays by target +- `get_active_compounds_in_assay(aid, max_results)`: Get active compounds +- `get_compound_annotations(cid, section)`: Get PUG-View annotations +- `summarize_bioactivities(cid)`: Generate bioactivity summary statistics +- `find_compounds_by_bioactivity(target, threshold, max_compounds)`: Find compounds by target + +**Usage**: +```python +from scripts.bioactivity_query import get_bioassay_summary, summarize_bioactivities + +# Get bioactivity summary +summary = summarize_bioactivities(2244) # Aspirin +print(f"Total assays: {summary['total_assays']}") +print(f"Active: {summary['active']}, Inactive: {summary['inactive']}") +``` + +## API Rate Limits and Best Practices + +**Rate Limits**: +- Maximum 5 requests per second +- Maximum 400 requests per minute +- Maximum 300 seconds running time per minute + +**Best Practices**: +1. **Use CIDs for repeated queries**: CIDs are more efficient than names or structures +2. **Cache results locally**: Store frequently accessed data +3. **Batch requests**: Combine multiple queries when possible +4. **Implement delays**: Add 0.2-0.3 second delays between requests +5. **Handle errors gracefully**: Check for HTTP errors and missing data +6. **Use PubChemPy**: Higher-level abstraction handles many edge cases +7. **Leverage asynchronous pattern**: For large similarity/substructure searches +8. 
**Specify MaxRecords**: Limit results to avoid timeouts + +**Error Handling**: +```python +from pubchempy import BadRequestError, NotFoundError, TimeoutError + +try: + compound = pcp.get_compounds('query', 'name')[0] +except NotFoundError: + print("Compound not found") +except BadRequestError: + print("Invalid request format") +except TimeoutError: + print("Request timed out - try reducing scope") +except IndexError: + print("No results returned") +``` + +## Common Workflows + +### Workflow 1: Chemical Identifier Conversion Pipeline + +Convert between different chemical identifiers: + +```python +import pubchempy as pcp + +# Start with any identifier type +compound = pcp.get_compounds('caffeine', 'name')[0] + +# Extract all identifier formats +identifiers = { + 'CID': compound.cid, + 'Name': compound.iupac_name, + 'SMILES': compound.canonical_smiles, + 'InChI': compound.inchi, + 'InChIKey': compound.inchikey, + 'Formula': compound.molecular_formula +} +``` + +### Workflow 2: Drug-Like Property Screening + +Screen compounds using Lipinski's Rule of Five: + +```python +import pubchempy as pcp + +def check_drug_likeness(compound_name): + compound = pcp.get_compounds(compound_name, 'name')[0] + + # Lipinski's Rule of Five + rules = { + 'MW <= 500': compound.molecular_weight <= 500, + 'LogP <= 5': compound.xlogp <= 5 if compound.xlogp else None, + 'HBD <= 5': compound.h_bond_donor_count <= 5, + 'HBA <= 10': compound.h_bond_acceptor_count <= 10 + } + + violations = sum(1 for v in rules.values() if v is False) + return rules, violations + +rules, violations = check_drug_likeness('aspirin') +print(f"Lipinski violations: {violations}") +``` + +### Workflow 3: Finding Similar Drug Candidates + +Identify structurally similar compounds to a known drug: + +```python +import pubchempy as pcp + +# Start with known drug +reference_drug = pcp.get_compounds('imatinib', 'name')[0] +reference_smiles = reference_drug.canonical_smiles + +# Find similar compounds +similar = pcp.get_compounds( + reference_smiles, + 'smiles', + searchtype='similarity', + Threshold=85, + MaxRecords=20 +) + +# Filter by drug-like properties +candidates = [] +for comp in similar: + if comp.molecular_weight and 200 <= comp.molecular_weight <= 600: + if comp.xlogp and -1 <= comp.xlogp <= 5: + candidates.append(comp) + +print(f"Found {len(candidates)} drug-like candidates") +``` + +### Workflow 4: Batch Compound Property Comparison + +Compare properties across multiple compounds: + +```python +import pubchempy as pcp +import pandas as pd + +compound_list = ['aspirin', 'ibuprofen', 'naproxen', 'celecoxib'] + +properties_list = [] +for name in compound_list: + try: + compound = pcp.get_compounds(name, 'name')[0] + properties_list.append({ + 'Name': name, + 'CID': compound.cid, + 'Formula': compound.molecular_formula, + 'MW': compound.molecular_weight, + 'LogP': compound.xlogp, + 'TPSA': compound.tpsa, + 'HBD': compound.h_bond_donor_count, + 'HBA': compound.h_bond_acceptor_count + }) + except Exception as e: + print(f"Error processing {name}: {e}") + +df = pd.DataFrame(properties_list) +print(df.to_string(index=False)) +``` + +### Workflow 5: Substructure-Based Virtual Screening + +Screen for compounds containing specific pharmacophores: + +```python +import pubchempy as pcp + +# Define pharmacophore (e.g., sulfonamide group) +pharmacophore_smiles = 'S(=O)(=O)N' + +# Search for compounds containing this substructure +hits = pcp.get_compounds( + pharmacophore_smiles, + 'smiles', + searchtype='substructure', + MaxRecords=100 +) + +# Further 
filter by properties +filtered_hits = [ + comp for comp in hits + if comp.molecular_weight and comp.molecular_weight < 500 +] + +print(f"Found {len(filtered_hits)} compounds with desired substructure") +``` + +## Reference Documentation + +For detailed API documentation, including complete property lists, URL patterns, advanced query options, and more examples, consult `references/api_reference.md`. This comprehensive reference includes: + +- Complete PUG-REST API endpoint documentation +- Full list of available molecular properties +- Asynchronous request handling patterns +- PubChemPy API reference +- PUG-View API for annotations +- Common workflows and use cases +- Links to official PubChem documentation + +## Troubleshooting + +**Compound Not Found**: +- Try alternative names or synonyms +- Use CID if known +- Check spelling and chemical name format + +**Timeout Errors**: +- Reduce MaxRecords parameter +- Add delays between requests +- Use CIDs instead of names for faster queries + +**Empty Property Values**: +- Not all properties are available for all compounds +- Check if property exists before accessing: `if compound.xlogp:` +- Some properties only available for certain compound types + +**Rate Limit Exceeded**: +- Implement delays (0.2-0.3 seconds) between requests +- Use batch operations where possible +- Consider caching results locally + +**Similarity/Substructure Search Hangs**: +- These are asynchronous operations that may take 15-30 seconds +- PubChemPy handles polling automatically +- Reduce MaxRecords if timing out + +## Additional Resources + +- PubChem Home: https://pubchem.ncbi.nlm.nih.gov/ +- PUG-REST Documentation: https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest +- PUG-REST Tutorial: https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest-tutorial +- PubChemPy Documentation: https://pubchempy.readthedocs.io/ +- PubChemPy GitHub: https://github.com/mcs07/PubChemPy diff --git a/scientific-databases/pubchem-database/references/api_reference.md b/scientific-databases/pubchem-database/references/api_reference.md new file mode 100644 index 0000000..1653107 --- /dev/null +++ b/scientific-databases/pubchem-database/references/api_reference.md @@ -0,0 +1,440 @@ +# PubChem API Reference + +## Overview + +PubChem is the world's largest freely available chemical database maintained by the National Center for Biotechnology Information (NCBI). It contains over 110 million unique chemical structures and over 270 million bioactivities from more than 770 data sources. + +## Database Structure + +PubChem consists of three primary subdatabases: + +1. **Compound Database**: Unique validated chemical structures with computed properties +2. **Substance Database**: Deposited chemical substance records from data sources +3. **BioAssay Database**: Biological activity test results for chemical compounds + +## PubChem PUG-REST API + +### Base URL Structure + +``` +https://pubchem.ncbi.nlm.nih.gov/rest/pug/// +``` + +Components: +- ``: compound/cid, substance/sid, assay/aid, or search specifications +- ``: Optional operations like property, synonyms, classification, etc. +- ``: Format such as JSON, XML, CSV, PNG, SDF, etc. + +### Common Request Patterns + +#### 1. 
Retrieve by Identifier + +Get compound by CID (Compound ID): +``` +GET /rest/pug/compound/cid/{cid}/property/{properties}/JSON +``` + +Get compound by name: +``` +GET /rest/pug/compound/name/{name}/property/{properties}/JSON +``` + +Get compound by SMILES: +``` +GET /rest/pug/compound/smiles/{smiles}/property/{properties}/JSON +``` + +Get compound by InChI: +``` +GET /rest/pug/compound/inchi/{inchi}/property/{properties}/JSON +``` + +#### 2. Available Properties + +Common molecular properties that can be retrieved: +- `MolecularFormula` +- `MolecularWeight` +- `CanonicalSMILES` +- `IsomericSMILES` +- `InChI` +- `InChIKey` +- `IUPACName` +- `XLogP` +- `ExactMass` +- `MonoisotopicMass` +- `TPSA` (Topological Polar Surface Area) +- `Complexity` +- `Charge` +- `HBondDonorCount` +- `HBondAcceptorCount` +- `RotatableBondCount` +- `HeavyAtomCount` +- `IsotopeAtomCount` +- `AtomStereoCount` +- `BondStereoCount` +- `CovalentUnitCount` +- `Volume3D` +- `XStericQuadrupole3D` +- `YStericQuadrupole3D` +- `ZStericQuadrupole3D` +- `FeatureCount3D` + +To retrieve multiple properties, separate them with commas: +``` +/property/MolecularFormula,MolecularWeight,CanonicalSMILES/JSON +``` + +#### 3. Structure Search Operations + +**Similarity Search**: +``` +POST /rest/pug/compound/similarity/smiles/{smiles}/JSON +Parameters: Threshold (default 90%) +``` + +**Substructure Search**: +``` +POST /rest/pug/compound/substructure/smiles/{smiles}/cids/JSON +``` + +**Superstructure Search**: +``` +POST /rest/pug/compound/superstructure/smiles/{smiles}/cids/JSON +``` + +#### 4. Image Generation + +Get 2D structure image: +``` +GET /rest/pug/compound/cid/{cid}/PNG +Optional parameters: image_size=small|large +``` + +#### 5. Format Conversion + +Get compound as SDF (Structure-Data File): +``` +GET /rest/pug/compound/cid/{cid}/SDF +``` + +Get compound as MOL: +``` +GET /rest/pug/compound/cid/{cid}/record/SDF +``` + +#### 6. Synonym Retrieval + +Get all synonyms for a compound: +``` +GET /rest/pug/compound/cid/{cid}/synonyms/JSON +``` + +#### 7. Bioassay Data + +Get bioassay data for a compound: +``` +GET /rest/pug/compound/cid/{cid}/assaysummary/JSON +``` + +Get specific assay information: +``` +GET /rest/pug/assay/aid/{aid}/description/JSON +``` + +### Asynchronous Requests + +For large queries (similarity/substructure searches), PUG-REST uses an asynchronous pattern: + +1. Submit the query (returns ListKey) +2. Check status using the ListKey +3. Retrieve results when ready + +Example workflow: +```python +# Step 1: Submit similarity search +response = requests.post( + "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/similarity/smiles/{smiles}/cids/JSON", + data={"Threshold": 90} +) +listkey = response.json()["Waiting"]["ListKey"] + +# Step 2: Check status +status_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/{listkey}/cids/JSON" + +# Step 3: Poll until ready (with timeout) +# Step 4: Retrieve results from the same URL +``` + +### Usage Limits + +**Rate Limits**: +- Maximum 5 requests per second +- Maximum 400 requests per minute +- Maximum 300 seconds running time per minute + +**Best Practices**: +- Use batch requests when possible +- Implement exponential backoff for retries +- Cache results when appropriate +- Use asynchronous pattern for large queries + +## PubChemPy Python Library + +PubChemPy is a Python wrapper that simplifies PUG-REST API access. 
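+
+For large similarity and substructure queries, PubChemPy also handles the asynchronous ListKey pattern described above, submitting the search and polling until results are ready. For comparison, a minimal hand-rolled sketch of that polling loop with `requests` is shown below; the aspirin SMILES and the 60-second timeout are illustrative choices only, not part of the API.
+
+```python
+import time
+import requests
+from urllib.parse import quote
+
+smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"  # aspirin, used only as an example query
+submit_url = (
+    "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/similarity/smiles/"
+    f"{quote(smiles)}/cids/JSON"
+)
+
+# Step 1: submit the similarity search
+payload = requests.post(submit_url, data={"Threshold": 90}).json()
+
+if "IdentifierList" in payload:
+    # Small searches may return results immediately
+    cids = payload["IdentifierList"]["CID"]
+else:
+    # Steps 2-4: poll the ListKey until the result set is ready
+    listkey = payload["Waiting"]["ListKey"]
+    poll_url = (
+        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/"
+        f"{listkey}/cids/JSON"
+    )
+    for _ in range(30):  # roughly a 60-second timeout
+        time.sleep(2)
+        payload = requests.get(poll_url).json()
+        if "IdentifierList" in payload:
+            cids = payload["IdentifierList"]["CID"]
+            break
+    else:
+        raise TimeoutError("Similarity search did not complete in time")
+
+print(f"Retrieved {len(cids)} similar CIDs")
+```
+
+In everyday use the PubChemPy calls shown in the following sections are simpler; the raw loop is mainly useful when direct REST access is required.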
+ +### Installation + +```bash +pip install pubchempy +``` + +### Key Classes + +#### Compound Class + +Main class for representing chemical compounds: + +```python +import pubchempy as pcp + +# Get by CID +compound = pcp.Compound.from_cid(2244) + +# Access properties +compound.molecular_formula # 'C9H8O4' +compound.molecular_weight # 180.16 +compound.iupac_name # '2-acetyloxybenzoic acid' +compound.canonical_smiles # 'CC(=O)OC1=CC=CC=C1C(=O)O' +compound.isomeric_smiles # Same as canonical for non-stereoisomers +compound.inchi # InChI string +compound.inchikey # InChI Key +compound.xlogp # Partition coefficient +compound.tpsa # Topological polar surface area +``` + +#### Search Methods + +**By Name**: +```python +compounds = pcp.get_compounds('aspirin', 'name') +# Returns list of Compound objects +``` + +**By SMILES**: +```python +compound = pcp.get_compounds('CC(=O)OC1=CC=CC=C1C(=O)O', 'smiles')[0] +``` + +**By InChI**: +```python +compound = pcp.get_compounds('InChI=1S/C9H8O4/c1-6(10)13-8-5-3-2-4-7(8)9(11)12/h2-5H,1H3,(H,11,12)', 'inchi')[0] +``` + +**By Formula**: +```python +compounds = pcp.get_compounds('C9H8O4', 'formula') +# Returns all compounds with this formula +``` + +**Similarity Search**: +```python +results = pcp.get_compounds('CC(=O)OC1=CC=CC=C1C(=O)O', 'smiles', + searchtype='similarity', + Threshold=90) +``` + +**Substructure Search**: +```python +results = pcp.get_compounds('c1ccccc1', 'smiles', + searchtype='substructure') +# Returns all compounds containing benzene ring +``` + +#### Property Retrieval + +Get specific properties for multiple compounds: +```python +properties = pcp.get_properties( + ['MolecularFormula', 'MolecularWeight', 'CanonicalSMILES'], + 'aspirin', + 'name' +) +# Returns list of dictionaries +``` + +Get properties as pandas DataFrame: +```python +import pandas as pd +df = pd.DataFrame(properties) +``` + +#### Synonyms + +Get all synonyms for a compound: +```python +synonyms = pcp.get_synonyms('aspirin', 'name') +# Returns list of dictionaries with CID and synonym lists +``` + +#### Download Formats + +Download compound in various formats: +```python +# Get as SDF +sdf_data = pcp.download('SDF', 'aspirin', 'name', overwrite=True) + +# Get as JSON +json_data = pcp.download('JSON', '2244', 'cid') + +# Get as PNG image +pcp.download('PNG', '2244', 'cid', 'aspirin.png', overwrite=True) +``` + +### Error Handling + +```python +from pubchempy import BadRequestError, NotFoundError, TimeoutError + +try: + compound = pcp.get_compounds('nonexistent', 'name') +except NotFoundError: + print("Compound not found") +except BadRequestError: + print("Invalid request") +except TimeoutError: + print("Request timed out") +``` + +## PUG-View API + +PUG-View provides access to full textual annotations and specialized reports. + +### Key Endpoints + +Get compound annotations: +``` +GET /rest/pug_view/data/compound/{cid}/JSON +``` + +Get specific annotation sections: +``` +GET /rest/pug_view/data/compound/{cid}/JSON?heading={section_name} +``` + +Available sections include: +- Chemical and Physical Properties +- Drug and Medication Information +- Pharmacology and Biochemistry +- Safety and Hazards +- Toxicity +- Literature +- Patents +- Biomolecular Interactions and Pathways + +## Common Workflows + +### 1. 
Chemical Identifier Conversion + +Convert from name to SMILES to InChI: +```python +import pubchempy as pcp + +compound = pcp.get_compounds('caffeine', 'name')[0] +smiles = compound.canonical_smiles +inchi = compound.inchi +inchikey = compound.inchikey +cid = compound.cid +``` + +### 2. Batch Property Retrieval + +Get properties for multiple compounds: +```python +compound_names = ['aspirin', 'ibuprofen', 'paracetamol'] +properties = [] + +for name in compound_names: + props = pcp.get_properties( + ['MolecularFormula', 'MolecularWeight', 'XLogP'], + name, + 'name' + ) + properties.extend(props) + +import pandas as pd +df = pd.DataFrame(properties) +``` + +### 3. Finding Similar Compounds + +Find structurally similar compounds to a query: +```python +# Start with a known compound +query_compound = pcp.get_compounds('gefitinib', 'name')[0] +query_smiles = query_compound.canonical_smiles + +# Perform similarity search +similar = pcp.get_compounds( + query_smiles, + 'smiles', + searchtype='similarity', + Threshold=85 +) + +# Get properties for similar compounds +for compound in similar[:10]: # First 10 results + print(f"{compound.cid}: {compound.iupac_name}, MW: {compound.molecular_weight}") +``` + +### 4. Substructure Screening + +Find all compounds containing a specific substructure: +```python +# Search for compounds containing pyridine ring +pyridine_smiles = 'c1ccncc1' + +matches = pcp.get_compounds( + pyridine_smiles, + 'smiles', + searchtype='substructure', + MaxRecords=100 +) + +print(f"Found {len(matches)} compounds containing pyridine") +``` + +### 5. Bioactivity Data Retrieval + +```python +import requests + +cid = 2244 # Aspirin +url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/assaysummary/JSON" + +response = requests.get(url) +if response.status_code == 200: + bioassay_data = response.json() + # Process bioassay information +``` + +## Tips and Best Practices + +1. **Use CIDs for repeated queries**: CIDs are more efficient than names or structures +2. **Cache results**: Store frequently accessed data locally +3. **Batch requests**: Combine multiple queries when possible +4. **Handle rate limits**: Implement delays between requests +5. **Use appropriate search types**: Similarity for related compounds, substructure for motif finding +6. **Leverage PubChemPy**: Higher-level abstraction simplifies common tasks +7. **Handle missing data**: Not all properties are available for all compounds +8. **Use asynchronous pattern**: For large similarity/substructure searches +9. **Specify output format**: Choose JSON for programmatic access, SDF for cheminformatics tools +10. 
**Read documentation**: Full PUG-REST documentation available at https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest + +## Additional Resources + +- PubChem Home: https://pubchem.ncbi.nlm.nih.gov/ +- PUG-REST Documentation: https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest +- PUG-REST Tutorial: https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest-tutorial +- PubChemPy Documentation: https://pubchempy.readthedocs.io/ +- PubChemPy GitHub: https://github.com/mcs07/PubChemPy +- IUPAC Tutorial: https://iupac.github.io/WFChemCookbook/datasources/pubchem_pugrest.html diff --git a/scientific-databases/pubchem-database/scripts/bioactivity_query.py b/scientific-databases/pubchem-database/scripts/bioactivity_query.py new file mode 100644 index 0000000..6fcc8d9 --- /dev/null +++ b/scientific-databases/pubchem-database/scripts/bioactivity_query.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 +""" +PubChem Bioactivity Data Retrieval + +This script provides functions for retrieving biological activity data +from PubChem for compounds and assays. +""" + +import sys +import json +import time +from typing import Dict, List, Optional + +try: + import requests +except ImportError: + print("Error: requests is not installed. Install it with: pip install requests") + sys.exit(1) + + +BASE_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug" +PUG_VIEW_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view" + +# Rate limiting: 5 requests per second maximum +REQUEST_DELAY = 0.21 # seconds between requests + + +def rate_limited_request(url: str, method: str = 'GET', **kwargs) -> Optional[requests.Response]: + """ + Make a rate-limited request to PubChem API. + + Args: + url: Request URL + method: HTTP method ('GET' or 'POST') + **kwargs: Additional arguments for requests + + Returns: + Response object or None on error + """ + time.sleep(REQUEST_DELAY) + + try: + if method.upper() == 'GET': + response = requests.get(url, **kwargs) + else: + response = requests.post(url, **kwargs) + + response.raise_for_status() + return response + except requests.exceptions.RequestException as e: + print(f"Request error: {e}") + return None + + +def get_bioassay_summary(cid: int) -> Optional[Dict]: + """ + Get bioassay summary for a compound. + + Args: + cid: PubChem Compound ID + + Returns: + Dictionary containing bioassay summary data + """ + url = f"{BASE_URL}/compound/cid/{cid}/assaysummary/JSON" + response = rate_limited_request(url) + + if response and response.status_code == 200: + return response.json() + return None + + +def get_compound_bioactivities( + cid: int, + activity_outcome: Optional[str] = None +) -> List[Dict]: + """ + Get bioactivity data for a compound. + + Args: + cid: PubChem Compound ID + activity_outcome: Filter by activity ('active', 'inactive', 'inconclusive') + + Returns: + List of bioactivity records + """ + data = get_bioassay_summary(cid) + + if not data: + return [] + + activities = [] + table = data.get('Table', {}) + + for row in table.get('Row', []): + activity = {} + for i, cell in enumerate(row.get('Cell', [])): + column_name = table['Columns']['Column'][i] + activity[column_name] = cell + + if activity_outcome: + if activity.get('Activity Outcome', '').lower() == activity_outcome.lower(): + activities.append(activity) + else: + activities.append(activity) + + return activities + + +def get_assay_description(aid: int) -> Optional[Dict]: + """ + Get detailed description for a specific assay. 
+ + Args: + aid: PubChem Assay ID (AID) + + Returns: + Dictionary containing assay description + """ + url = f"{BASE_URL}/assay/aid/{aid}/description/JSON" + response = rate_limited_request(url) + + if response and response.status_code == 200: + return response.json() + return None + + +def get_assay_targets(aid: int) -> List[str]: + """ + Get biological targets for an assay. + + Args: + aid: PubChem Assay ID + + Returns: + List of target names + """ + description = get_assay_description(aid) + + if not description: + return [] + + targets = [] + assay_data = description.get('PC_AssayContainer', [{}])[0] + assay = assay_data.get('assay', {}) + + # Extract target information + descr = assay.get('descr', {}) + for target in descr.get('target', []): + mol_id = target.get('mol_id', '') + name = target.get('name', '') + if name: + targets.append(name) + elif mol_id: + targets.append(f"GI:{mol_id}") + + return targets + + +def search_assays_by_target( + target_name: str, + max_results: int = 100 +) -> List[int]: + """ + Search for assays targeting a specific protein or gene. + + Args: + target_name: Name of the target (e.g., 'EGFR', 'p53') + max_results: Maximum number of results + + Returns: + List of Assay IDs (AIDs) + """ + # Use PubChem's text search for assays + url = f"{BASE_URL}/assay/target/{target_name}/aids/JSON" + response = rate_limited_request(url) + + if response and response.status_code == 200: + data = response.json() + aids = data.get('IdentifierList', {}).get('AID', []) + return aids[:max_results] + return [] + + +def get_active_compounds_in_assay(aid: int, max_results: int = 1000) -> List[int]: + """ + Get list of active compounds in an assay. + + Args: + aid: PubChem Assay ID + max_results: Maximum number of results + + Returns: + List of Compound IDs (CIDs) that showed activity + """ + url = f"{BASE_URL}/assay/aid/{aid}/cids/JSON?cids_type=active" + response = rate_limited_request(url) + + if response and response.status_code == 200: + data = response.json() + cids = data.get('IdentifierList', {}).get('CID', []) + return cids[:max_results] + return [] + + +def get_compound_annotations(cid: int, section: Optional[str] = None) -> Optional[Dict]: + """ + Get comprehensive compound annotations from PUG-View. + + Args: + cid: PubChem Compound ID + section: Specific section to retrieve (e.g., 'Pharmacology and Biochemistry') + + Returns: + Dictionary containing annotation data + """ + url = f"{PUG_VIEW_URL}/data/compound/{cid}/JSON" + + if section: + url += f"?heading={section}" + + response = rate_limited_request(url) + + if response and response.status_code == 200: + return response.json() + return None + + +def get_drug_information(cid: int) -> Optional[Dict]: + """ + Get drug and medication information for a compound. + + Args: + cid: PubChem Compound ID + + Returns: + Dictionary containing drug information + """ + return get_compound_annotations(cid, section="Drug and Medication Information") + + +def get_safety_hazards(cid: int) -> Optional[Dict]: + """ + Get safety and hazard information for a compound. + + Args: + cid: PubChem Compound ID + + Returns: + Dictionary containing safety information + """ + return get_compound_annotations(cid, section="Safety and Hazards") + + +def summarize_bioactivities(cid: int) -> Dict: + """ + Generate a summary of bioactivity data for a compound. 
+ + Args: + cid: PubChem Compound ID + + Returns: + Dictionary with bioactivity summary statistics + """ + activities = get_compound_bioactivities(cid) + + summary = { + 'total_assays': len(activities), + 'active': 0, + 'inactive': 0, + 'inconclusive': 0, + 'unspecified': 0, + 'assay_types': {} + } + + for activity in activities: + outcome = activity.get('Activity Outcome', '').lower() + + if 'active' in outcome: + summary['active'] += 1 + elif 'inactive' in outcome: + summary['inactive'] += 1 + elif 'inconclusive' in outcome: + summary['inconclusive'] += 1 + else: + summary['unspecified'] += 1 + + return summary + + +def find_compounds_by_bioactivity( + target: str, + threshold: Optional[float] = None, + max_compounds: int = 100 +) -> List[Dict]: + """ + Find compounds with bioactivity against a specific target. + + Args: + target: Target name (e.g., 'EGFR') + threshold: Activity threshold (if applicable) + max_compounds: Maximum number of compounds to return + + Returns: + List of dictionaries with compound information and activity data + """ + # Step 1: Find assays for the target + assay_ids = search_assays_by_target(target, max_results=10) + + if not assay_ids: + print(f"No assays found for target: {target}") + return [] + + # Step 2: Get active compounds from these assays + compound_set = set() + compound_data = [] + + for aid in assay_ids[:5]: # Limit to first 5 assays + active_cids = get_active_compounds_in_assay(aid, max_results=max_compounds) + + for cid in active_cids: + if cid not in compound_set and len(compound_data) < max_compounds: + compound_set.add(cid) + compound_data.append({ + 'cid': cid, + 'aid': aid, + 'target': target + }) + + if len(compound_data) >= max_compounds: + break + + return compound_data + + +def main(): + """Example usage of bioactivity query functions.""" + + # Example 1: Get bioassay summary for aspirin (CID 2244) + print("Example 1: Getting bioassay summary for aspirin (CID 2244)...") + summary = summarize_bioactivities(2244) + print(json.dumps(summary, indent=2)) + + # Example 2: Get active bioactivities for a compound + print("\nExample 2: Getting active bioactivities for aspirin...") + activities = get_compound_bioactivities(2244, activity_outcome='active') + print(f"Found {len(activities)} active bioactivities") + if activities: + print(f"First activity: {activities[0].get('Assay Name', 'N/A')}") + + # Example 3: Get assay information + print("\nExample 3: Getting assay description...") + if activities: + aid = activities[0].get('AID', 0) + targets = get_assay_targets(aid) + print(f"Assay {aid} targets: {', '.join(targets) if targets else 'N/A'}") + + # Example 4: Search for compounds targeting EGFR + print("\nExample 4: Searching for EGFR inhibitors...") + egfr_compounds = find_compounds_by_bioactivity('EGFR', max_compounds=5) + print(f"Found {len(egfr_compounds)} compounds with EGFR activity") + for comp in egfr_compounds[:5]: + print(f" CID {comp['cid']} (from AID {comp['aid']})") + + +if __name__ == '__main__': + main() diff --git a/scientific-databases/pubchem-database/scripts/compound_search.py b/scientific-databases/pubchem-database/scripts/compound_search.py new file mode 100644 index 0000000..b6b8a8b --- /dev/null +++ b/scientific-databases/pubchem-database/scripts/compound_search.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 +""" +PubChem Compound Search Utility + +This script provides functions for searching and retrieving compound information +from PubChem using the PubChemPy library. 
+""" + +import sys +import json +from typing import List, Dict, Optional, Union + +try: + import pubchempy as pcp +except ImportError: + print("Error: pubchempy is not installed. Install it with: pip install pubchempy") + sys.exit(1) + + +def search_by_name(name: str, max_results: int = 10) -> List[pcp.Compound]: + """ + Search for compounds by name. + + Args: + name: Chemical name to search for + max_results: Maximum number of results to return + + Returns: + List of Compound objects + """ + try: + compounds = pcp.get_compounds(name, 'name') + return compounds[:max_results] + except Exception as e: + print(f"Error searching for '{name}': {e}") + return [] + + +def search_by_smiles(smiles: str) -> Optional[pcp.Compound]: + """ + Search for a compound by SMILES string. + + Args: + smiles: SMILES string + + Returns: + Compound object or None if not found + """ + try: + compounds = pcp.get_compounds(smiles, 'smiles') + return compounds[0] if compounds else None + except Exception as e: + print(f"Error searching for SMILES '{smiles}': {e}") + return None + + +def get_compound_by_cid(cid: int) -> Optional[pcp.Compound]: + """ + Retrieve a compound by its CID (Compound ID). + + Args: + cid: PubChem Compound ID + + Returns: + Compound object or None if not found + """ + try: + return pcp.Compound.from_cid(cid) + except Exception as e: + print(f"Error retrieving CID {cid}: {e}") + return None + + +def get_compound_properties( + identifier: Union[str, int], + namespace: str = 'name', + properties: Optional[List[str]] = None +) -> Dict: + """ + Get specific properties for a compound. + + Args: + identifier: Compound identifier (name, SMILES, CID, etc.) + namespace: Type of identifier ('name', 'smiles', 'cid', 'inchi', etc.) + properties: List of properties to retrieve. If None, returns common properties. + + Returns: + Dictionary of properties + """ + if properties is None: + properties = [ + 'MolecularFormula', + 'MolecularWeight', + 'CanonicalSMILES', + 'IUPACName', + 'XLogP', + 'TPSA', + 'HBondDonorCount', + 'HBondAcceptorCount' + ] + + try: + result = pcp.get_properties(properties, identifier, namespace) + return result[0] if result else {} + except Exception as e: + print(f"Error getting properties for '{identifier}': {e}") + return {} + + +def similarity_search( + smiles: str, + threshold: int = 90, + max_records: int = 10 +) -> List[pcp.Compound]: + """ + Perform similarity search for compounds similar to the query structure. + + Args: + smiles: Query SMILES string + threshold: Similarity threshold (0-100) + max_records: Maximum number of results + + Returns: + List of similar Compound objects + """ + try: + compounds = pcp.get_compounds( + smiles, + 'smiles', + searchtype='similarity', + Threshold=threshold, + MaxRecords=max_records + ) + return compounds + except Exception as e: + print(f"Error in similarity search: {e}") + return [] + + +def substructure_search( + smiles: str, + max_records: int = 100 +) -> List[pcp.Compound]: + """ + Perform substructure search for compounds containing the query structure. 
+ + Args: + smiles: Query SMILES string (substructure) + max_records: Maximum number of results + + Returns: + List of Compound objects containing the substructure + """ + try: + compounds = pcp.get_compounds( + smiles, + 'smiles', + searchtype='substructure', + MaxRecords=max_records + ) + return compounds + except Exception as e: + print(f"Error in substructure search: {e}") + return [] + + +def get_synonyms(identifier: Union[str, int], namespace: str = 'name') -> List[str]: + """ + Get all synonyms for a compound. + + Args: + identifier: Compound identifier + namespace: Type of identifier + + Returns: + List of synonym strings + """ + try: + results = pcp.get_synonyms(identifier, namespace) + if results: + return results[0].get('Synonym', []) + return [] + except Exception as e: + print(f"Error getting synonyms: {e}") + return [] + + +def batch_search( + identifiers: List[str], + namespace: str = 'name', + properties: Optional[List[str]] = None +) -> List[Dict]: + """ + Batch search for multiple compounds. + + Args: + identifiers: List of compound identifiers + namespace: Type of identifiers + properties: List of properties to retrieve + + Returns: + List of dictionaries containing properties for each compound + """ + results = [] + for identifier in identifiers: + props = get_compound_properties(identifier, namespace, properties) + if props: + props['query'] = identifier + results.append(props) + return results + + +def download_structure( + identifier: Union[str, int], + namespace: str = 'name', + format: str = 'SDF', + filename: Optional[str] = None +) -> Optional[str]: + """ + Download compound structure in specified format. + + Args: + identifier: Compound identifier + namespace: Type of identifier + format: Output format ('SDF', 'JSON', 'PNG', etc.) + filename: Output filename (if None, returns data as string) + + Returns: + Data string if filename is None, else None + """ + try: + if filename: + pcp.download(format, identifier, namespace, filename, overwrite=True) + return None + else: + return pcp.download(format, identifier, namespace) + except Exception as e: + print(f"Error downloading structure: {e}") + return None + + +def print_compound_info(compound: pcp.Compound) -> None: + """ + Print formatted compound information. 
+ + Args: + compound: PubChemPy Compound object + """ + print(f"\n{'='*60}") + print(f"Compound CID: {compound.cid}") + print(f"{'='*60}") + print(f"IUPAC Name: {compound.iupac_name or 'N/A'}") + print(f"Molecular Formula: {compound.molecular_formula or 'N/A'}") + print(f"Molecular Weight: {compound.molecular_weight or 'N/A'} g/mol") + print(f"Canonical SMILES: {compound.canonical_smiles or 'N/A'}") + print(f"InChI: {compound.inchi or 'N/A'}") + print(f"InChI Key: {compound.inchikey or 'N/A'}") + print(f"XLogP: {compound.xlogp or 'N/A'}") + print(f"TPSA: {compound.tpsa or 'N/A'} Ų") + print(f"H-Bond Donors: {compound.h_bond_donor_count or 'N/A'}") + print(f"H-Bond Acceptors: {compound.h_bond_acceptor_count or 'N/A'}") + print(f"{'='*60}\n") + + +def main(): + """Example usage of PubChem search functions.""" + + # Example 1: Search by name + print("Example 1: Searching for 'aspirin'...") + compounds = search_by_name('aspirin', max_results=1) + if compounds: + print_compound_info(compounds[0]) + + # Example 2: Get properties + print("\nExample 2: Getting properties for caffeine...") + props = get_compound_properties('caffeine', 'name') + print(json.dumps(props, indent=2)) + + # Example 3: Similarity search + print("\nExample 3: Finding compounds similar to benzene...") + benzene_smiles = 'c1ccccc1' + similar = similarity_search(benzene_smiles, threshold=95, max_records=5) + print(f"Found {len(similar)} similar compounds:") + for comp in similar: + print(f" CID {comp.cid}: {comp.iupac_name or 'N/A'}") + + # Example 4: Batch search + print("\nExample 4: Batch search for multiple compounds...") + names = ['aspirin', 'ibuprofen', 'paracetamol'] + results = batch_search(names, properties=['MolecularFormula', 'MolecularWeight']) + for result in results: + print(f" {result.get('query')}: {result.get('MolecularFormula')} " + f"({result.get('MolecularWeight')} g/mol)") + + +if __name__ == '__main__': + main() diff --git a/scientific-packages/anndata/SKILL.md b/scientific-packages/anndata/SKILL.md new file mode 100644 index 0000000..0fa9c84 --- /dev/null +++ b/scientific-packages/anndata/SKILL.md @@ -0,0 +1,527 @@ +--- +name: anndata +description: Work with AnnData objects for annotated data matrices, commonly used in single-cell genomics and other scientific domains. This skill should be used when working with .h5ad files, performing single-cell RNA-seq analysis, managing annotated datasets, concatenating multiple datasets, or working with sparse matrices and embeddings in a structured format. +--- + +# AnnData + +## Overview + +AnnData (Annotated Data) is Python's standard for storing and manipulating annotated data matrices, particularly in single-cell genomics. This skill provides comprehensive guidance for working with AnnData objects, including data creation, manipulation, file I/O, concatenation, and best practices for memory-efficient workflows. + +## Core Capabilities + +### 1. Creating and Structuring AnnData Objects + +Create AnnData objects from various data sources and organize multi-dimensional annotations. 
+ +**Basic creation:** +```python +import anndata as ad +import numpy as np +from scipy.sparse import csr_matrix + +# From dense or sparse arrays +counts = np.random.poisson(1, size=(100, 2000)) +adata = ad.AnnData(counts) + +# With sparse matrix (memory-efficient) +counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32) +adata = ad.AnnData(counts) +``` + +**With metadata:** +```python +import pandas as pd + +obs_meta = pd.DataFrame({ + 'cell_type': pd.Categorical(['B', 'T', 'Monocyte'] * 33 + ['B']), + 'batch': ['batch1'] * 50 + ['batch2'] * 50 +}) +var_meta = pd.DataFrame({ + 'gene_name': [f'Gene_{i}' for i in range(2000)], + 'highly_variable': np.random.choice([True, False], 2000) +}) + +adata = ad.AnnData(counts, obs=obs_meta, var=var_meta) +``` + +**Understanding the structure:** +- **X**: Primary data matrix (observations × variables) +- **obs**: Row (observation) annotations as DataFrame +- **var**: Column (variable) annotations as DataFrame +- **obsm**: Multi-dimensional observation annotations (e.g., PCA, UMAP coordinates) +- **varm**: Multi-dimensional variable annotations (e.g., gene loadings) +- **layers**: Alternative data matrices with same dimensions as X +- **uns**: Unstructured metadata dictionary +- **obsp/varp**: Pairwise relationship matrices (graphs) + +### 2. Adding Annotations and Layers + +Organize different data representations and metadata within a single object. + +**Cell-level metadata (obs):** +```python +adata.obs['n_genes'] = (adata.X > 0).sum(axis=1) +adata.obs['total_counts'] = adata.X.sum(axis=1) +adata.obs['condition'] = pd.Categorical(['control', 'treated'] * 50) +``` + +**Gene-level metadata (var):** +```python +adata.var['highly_variable'] = gene_variance > threshold +adata.var['chromosome'] = pd.Categorical(['chr1', 'chr2', ...]) +``` + +**Embeddings (obsm/varm):** +```python +# Dimensionality reduction results +adata.obsm['X_pca'] = pca_coordinates # Shape: (n_obs, n_components) +adata.obsm['X_umap'] = umap_coordinates # Shape: (n_obs, 2) +adata.obsm['X_tsne'] = tsne_coordinates + +# Gene loadings +adata.varm['PCs'] = principal_components # Shape: (n_vars, n_components) +``` + +**Alternative data representations (layers):** +```python +# Store multiple versions +adata.layers['counts'] = raw_counts +adata.layers['log1p'] = np.log1p(adata.X) +adata.layers['scaled'] = (adata.X - mean) / std +``` + +**Unstructured metadata (uns):** +```python +# Analysis parameters +adata.uns['preprocessing'] = { + 'normalization': 'TPM', + 'min_genes': 200, + 'date': '2024-01-15' +} + +# Results +adata.uns['pca'] = {'variance_ratio': variance_explained} +``` + +### 3. Subsetting and Views + +Efficiently subset data while managing memory through views and copies. 
+ +**Subsetting operations:** +```python +# By observation/variable names +subset = adata[['Cell_1', 'Cell_10'], ['Gene_5', 'Gene_1900']] + +# By boolean masks +b_cells = adata[adata.obs.cell_type == 'B'] +high_quality = adata[adata.obs.n_genes > 200] + +# By position +first_cells = adata[:100, :] +top_genes = adata[:, :500] + +# Combined conditions +filtered = adata[ + (adata.obs.batch == 'batch1') & (adata.obs.n_genes > 200), + adata.var.highly_variable +] +``` + +**Understanding views:** +- Subsetting returns **views** by default (memory-efficient, shares data with original) +- Modifying a view affects the original object +- Check with `adata.is_view` +- Convert to independent copy with `.copy()` + +```python +# View (memory-efficient) +subset = adata[adata.obs.condition == 'treated'] +print(subset.is_view) # True + +# Independent copy +subset_copy = adata[adata.obs.condition == 'treated'].copy() +print(subset_copy.is_view) # False +``` + +### 4. File I/O and Backed Mode + +Read and write data efficiently, with options for memory-limited environments. + +**Writing data:** +```python +# Standard format with compression +adata.write('results.h5ad', compression='gzip') + +# Alternative formats +adata.write_zarr('results.zarr') # For cloud storage +adata.write_loom('results.loom') # For compatibility +adata.write_csvs('results/') # As CSV files +``` + +**Reading data:** +```python +# Load into memory +adata = ad.read_h5ad('results.h5ad') + +# Backed mode (disk-backed, memory-efficient) +adata = ad.read_h5ad('large_file.h5ad', backed='r') +print(adata.isbacked) # True +print(adata.filename) # Path to file + +# Close file connection when done +adata.file.close() +``` + +**Reading from other formats:** +```python +# 10X format +adata = ad.read_mtx('matrix.mtx') + +# CSV +adata = ad.read_csv('data.csv') + +# Loom +adata = ad.read_loom('data.loom') +``` + +**Working with backed mode:** +```python +# Read in backed mode for large files +adata = ad.read_h5ad('large_dataset.h5ad', backed='r') + +# Process in chunks +for chunk in adata.chunk_X(chunk_size=1000): + result = process_chunk(chunk) + +# Load to memory if needed +adata_memory = adata.to_memory() +``` + +### 5. Concatenating Multiple Datasets + +Combine multiple AnnData objects with control over how data is merged. + +**Basic concatenation:** +```python +# Concatenate observations (most common) +combined = ad.concat([adata1, adata2, adata3], axis=0) + +# Concatenate variables (rare) +combined = ad.concat([adata1, adata2], axis=1) +``` + +**Join strategies:** +```python +# Inner join: only shared variables (no missing data) +combined = ad.concat([adata1, adata2], join='inner') + +# Outer join: all variables (fills missing with 0) +combined = ad.concat([adata1, adata2], join='outer') +``` + +**Tracking data sources:** +```python +# Add source labels +combined = ad.concat( + [adata1, adata2, adata3], + label='dataset', + keys=['exp1', 'exp2', 'exp3'] +) +# Creates combined.obs['dataset'] with values 'exp1', 'exp2', 'exp3' + +# Make duplicate indices unique +combined = ad.concat( + [adata1, adata2], + keys=['batch1', 'batch2'], + index_unique='-' +) +# Cell names become: Cell_0-batch1, Cell_0-batch2, etc. 
+``` + +**Merge strategies for metadata:** +```python +# merge=None: exclude variable annotations (default) +combined = ad.concat([adata1, adata2], merge=None) + +# merge='same': keep only identical annotations +combined = ad.concat([adata1, adata2], merge='same') + +# merge='first': use first occurrence +combined = ad.concat([adata1, adata2], merge='first') + +# merge='unique': keep annotations with single value +combined = ad.concat([adata1, adata2], merge='unique') +``` + +**Complete example:** +```python +# Load batches +batch1 = ad.read_h5ad('batch1.h5ad') +batch2 = ad.read_h5ad('batch2.h5ad') +batch3 = ad.read_h5ad('batch3.h5ad') + +# Concatenate with full tracking +combined = ad.concat( + [batch1, batch2, batch3], + axis=0, + join='outer', # Keep all genes + merge='first', # Use first batch's annotations + label='batch_id', # Track source + keys=['b1', 'b2', 'b3'], # Custom labels + index_unique='-' # Make cell names unique +) +``` + +### 6. Data Conversion and Extraction + +Convert between AnnData and other formats for interoperability. + +**To DataFrame:** +```python +# Convert X to DataFrame +df = adata.to_df() + +# Convert specific layer +df = adata.to_df(layer='log1p') +``` + +**Extract vectors:** +```python +# Get 1D arrays from data or annotations +gene_expression = adata.obs_vector('Gene_100') +cell_metadata = adata.obs_vector('n_genes') +``` + +**Transpose:** +```python +# Swap observations and variables +transposed = adata.T +``` + +### 7. Memory Optimization + +Strategies for working with large datasets efficiently. + +**Use sparse matrices:** +```python +from scipy.sparse import csr_matrix + +# Check sparsity +density = (adata.X != 0).sum() / adata.X.size +if density < 0.3: # Less than 30% non-zero + adata.X = csr_matrix(adata.X) +``` + +**Convert strings to categoricals:** +```python +# Automatic conversion +adata.strings_to_categoricals() + +# Manual conversion (more control) +adata.obs['cell_type'] = pd.Categorical(adata.obs['cell_type']) +``` + +**Use backed mode:** +```python +# Read without loading into memory +adata = ad.read_h5ad('large_file.h5ad', backed='r') + +# Work with subsets +subset = adata[:1000, :500].copy() # Only this subset in memory +``` + +**Chunked processing:** +```python +# Process data in chunks +results = [] +for chunk in adata.chunk_X(chunk_size=1000): + result = expensive_computation(chunk) + results.append(result) +``` + +## Common Workflows + +### Single-Cell RNA-seq Analysis + +Complete workflow from loading to analysis: + +```python +import anndata as ad +import numpy as np +import pandas as pd + +# 1. Load data +adata = ad.read_mtx('matrix.mtx') +adata.obs_names = pd.read_csv('barcodes.tsv', header=None)[0] +adata.var_names = pd.read_csv('genes.tsv', header=None)[0] + +# 2. Quality control +adata.obs['n_genes'] = (adata.X > 0).sum(axis=1) +adata.obs['total_counts'] = adata.X.sum(axis=1) +adata = adata[adata.obs.n_genes > 200] +adata = adata[adata.obs.total_counts < 10000] + +# 3. Filter genes +min_cells = 3 +adata = adata[:, (adata.X > 0).sum(axis=0) >= min_cells] + +# 4. Store raw counts +adata.layers['counts'] = adata.X.copy() + +# 5. Normalize +adata.X = adata.X / adata.obs.total_counts.values[:, None] * 1e4 +adata.X = np.log1p(adata.X) + +# 6. Feature selection +gene_var = adata.X.var(axis=0) +adata.var['highly_variable'] = gene_var > np.percentile(gene_var, 90) + +# 7. 
Dimensionality reduction (example with external tools) +# adata.obsm['X_pca'] = compute_pca(adata.X) +# adata.obsm['X_umap'] = compute_umap(adata.obsm['X_pca']) + +# 8. Save results +adata.write('analyzed.h5ad', compression='gzip') +``` + +### Batch Integration + +Combining multiple experimental batches: + +```python +# Load batches +batches = [ad.read_h5ad(f'batch_{i}.h5ad') for i in range(3)] + +# Concatenate with tracking +combined = ad.concat( + batches, + axis=0, + join='outer', + label='batch', + keys=['batch_0', 'batch_1', 'batch_2'], + index_unique='-' +) + +# Add batch as numeric for correction algorithms +combined.obs['batch_numeric'] = combined.obs['batch'].cat.codes + +# Perform batch correction (with external tools) +# corrected_pca = run_harmony(combined.obsm['X_pca'], combined.obs['batch']) +# combined.obsm['X_pca_corrected'] = corrected_pca + +# Save integrated data +combined.write('integrated.h5ad', compression='gzip') +``` + +### Memory-Efficient Large Dataset Processing + +Working with datasets too large for memory: + +```python +# Read in backed mode +adata = ad.read_h5ad('huge_dataset.h5ad', backed='r') + +# Compute statistics in chunks +total = 0 +for chunk in adata.chunk_X(chunk_size=1000): + total += chunk.sum() + +mean_expression = total / (adata.n_obs * adata.n_vars) + +# Work with subset +high_quality_cells = adata.obs.n_genes > 1000 +subset = adata[high_quality_cells, :].copy() + +# Close file +adata.file.close() +``` + +## Best Practices + +### Data Organization + +1. **Use layers for different representations**: Store raw counts, normalized, log-transformed, and scaled data in separate layers +2. **Use obsm/varm for multi-dimensional data**: Embeddings, loadings, and other matrix-like annotations +3. **Use uns for metadata**: Analysis parameters, dates, version information +4. **Use categoricals for efficiency**: Convert repeated strings to categorical types + +### Subsetting + +1. **Understand views vs copies**: Subsetting returns views by default; use `.copy()` when you need independence +2. **Chain conditions efficiently**: Combine boolean masks in a single subsetting operation +3. **Validate after subsetting**: Check dimensions and data integrity + +### File I/O + +1. **Use compression**: Always use `compression='gzip'` when writing h5ad files +2. **Choose the right format**: H5AD for general use, Zarr for cloud storage, Loom for compatibility +3. **Close backed files**: Always close file connections when done +4. **Use backed mode for large files**: Don't load everything into memory if not needed + +### Concatenation + +1. **Choose appropriate join**: Inner join for complete cases, outer join to preserve all features +2. **Track sources**: Use `label` and `keys` to track data origin +3. **Handle duplicates**: Use `index_unique` to make observation names unique +4. **Select merge strategy**: Choose appropriate merge strategy for variable annotations + +### Memory Management + +1. **Use sparse matrices**: For data with <30% non-zero values +2. **Convert to categoricals**: For repeated string values +3. **Process in chunks**: For operations on very large matrices +4. 
**Use backed mode**: Read large files with `backed='r'` + +### Naming Conventions + +Follow these conventions for consistency: + +- **Embeddings**: `X_pca`, `X_umap`, `X_tsne` +- **Layers**: Descriptive names like `counts`, `log1p`, `scaled` +- **Observations**: Use snake_case like `cell_type`, `n_genes`, `total_counts` +- **Variables**: Use snake_case like `highly_variable`, `gene_name` + +## Reference Documentation + +For detailed API information, usage patterns, and troubleshooting, refer to the comprehensive reference files in the `references/` directory: + +1. **api_reference.md**: Complete API documentation including all classes, methods, and functions with usage examples. Use `grep -r "pattern" references/api_reference.md` to search for specific functions or parameters. + +2. **workflows_best_practices.md**: Detailed workflows for common tasks (single-cell analysis, batch integration, large datasets), best practices for memory management, data organization, and common pitfalls to avoid. Use `grep -r "pattern" references/workflows_best_practices.md` to search for specific workflows. + +3. **concatenation_guide.md**: Comprehensive guide to concatenation strategies, join types, merge strategies, source tracking, and troubleshooting concatenation issues. Use `grep -r "pattern" references/concatenation_guide.md` to search for concatenation patterns. + +## When to Load References + +Load reference files into context when: +- Implementing complex concatenation with specific merge strategies +- Troubleshooting errors or unexpected behavior +- Optimizing memory usage for large datasets +- Implementing complete analysis workflows +- Understanding nuances of specific API methods + +To search within references without loading them: +```python +# Example: Search for information about backed mode +grep -r "backed mode" references/ +``` + +## Common Error Patterns + +### Memory Errors +**Problem**: "MemoryError: Unable to allocate array" +**Solution**: Use backed mode, sparse matrices, or process in chunks + +### Dimension Mismatch +**Problem**: "ValueError: operands could not be broadcast together" +**Solution**: Use outer join in concatenation or align indices before operations + +### View Modification +**Problem**: "ValueError: assignment destination is read-only" +**Solution**: Convert view to copy with `.copy()` before modification + +### File Already Open +**Problem**: "OSError: Unable to open file (file is already open)" +**Solution**: Close previous file connection with `adata.file.close()` diff --git a/scientific-packages/anndata/references/api_reference.md b/scientific-packages/anndata/references/api_reference.md new file mode 100644 index 0000000..ee828b0 --- /dev/null +++ b/scientific-packages/anndata/references/api_reference.md @@ -0,0 +1,218 @@ +# AnnData API Reference + +## Core AnnData Class + +The `AnnData` class is the central data structure for storing and manipulating annotated datasets in single-cell genomics and other domains. + +### Core Attributes + +| Attribute | Type | Description | +|-----------|------|-------------| +| **X** | array-like | Primary data matrix (#observations × #variables). Supports NumPy arrays, sparse matrices (CSR/CSC), HDF5 datasets, Zarr arrays, and Dask arrays | +| **obs** | DataFrame | One-dimensional annotation of observations (rows). Length equals observation count | +| **var** | DataFrame | One-dimensional annotation of variables/features (columns). 
Length equals variable count | +| **uns** | OrderedDict | Unstructured annotation for miscellaneous metadata | +| **obsm** | dict-like | Multi-dimensional observation annotations (structured arrays aligned to observation axis) | +| **varm** | dict-like | Multi-dimensional variable annotations (structured arrays aligned to variable axis) | +| **obsp** | dict-like | Pairwise observation annotations (square matrices representing graphs) | +| **varp** | dict-like | Pairwise variable annotations (graphs between features) | +| **layers** | dict-like | Additional data matrices matching X's dimensions | +| **raw** | AnnData | Stores original versions of X and var before transformations | + +### Dimensional Properties + +- **n_obs**: Number of observations (sample count) +- **n_vars**: Number of variables/features +- **shape**: Tuple returning (n_obs, n_vars) +- **T**: Transposed view of the entire object + +### State Properties + +- **isbacked**: Boolean indicating disk-backed storage status +- **is_view**: Boolean identifying whether object is a view of another AnnData +- **filename**: Path to backing .h5ad file; setting this enables disk-backed mode + +### Key Methods + +#### Construction and Copying +- **`AnnData(X=None, obs=None, var=None, ...)`**: Create new AnnData object +- **`copy(filename=None)`**: Create full copy, optionally stored on disk + +#### Subsetting and Views +- **`adata[obs_subset, var_subset]`**: Subset observations and variables (returns view by default) +- **`.copy()`**: Convert view to independent object + +#### Data Access +- **`to_df(layer=None)`**: Generate pandas DataFrame representation +- **`obs_vector(k, layer=None)`**: Extract 1D array from X, layers, or annotations +- **`var_vector(k, layer=None)`**: Extract 1D array for a variable +- **`chunk_X(chunk_size)`**: Iterate over data matrix in chunks +- **`chunked_X(chunk_size)`**: Context manager for chunked iteration + +#### Transformation +- **`transpose()`**: Return transposed object +- **`concatenate(*adatas, ...)`**: Combine multiple AnnData objects along observation axis +- **`to_memory(copy=False)`**: Load all backed arrays into RAM + +#### File I/O +- **`write_h5ad(filename, compression='gzip')`**: Save as .h5ad HDF5 format +- **`write_zarr(store, ...)`**: Export hierarchical Zarr store +- **`write_loom(filename, ...)`**: Output .loom format file +- **`write_csvs(dirname, ...)`**: Write annotations as separate CSV files + +#### Data Management +- **`strings_to_categoricals()`**: Convert string annotations to categorical types +- **`rename_categories(key, categories)`**: Update category labels in annotations +- **`obs_names_make_unique(sep='-')`**: Append numeric suffixes to duplicate observation names +- **`var_names_make_unique(sep='-')`**: Append numeric suffixes to duplicate variable names + +## Module-Level Functions + +### Reading Functions + +#### Native Formats +- **`read_h5ad(filename, backed=None, as_sparse=None)`**: Load HDF5-based .h5ad files +- **`read_zarr(store)`**: Access hierarchical Zarr array stores + +#### Alternative Formats +- **`read_csv(filename, ...)`**: Import from CSV files +- **`read_excel(filename, ...)`**: Import from Excel files +- **`read_hdf(filename, key)`**: Read from HDF5 files +- **`read_loom(filename, ...)`**: Import from .loom files +- **`read_mtx(filename, ...)`**: Import from Matrix Market format +- **`read_text(filename, ...)`**: Import from text files +- **`read_umi_tools(filename, ...)`**: Import from UMI-tools format + +#### Element-Level Access +- 
**`read_elem(elem)`**: Retrieve specific components from storage +- **`sparse_dataset(group)`**: Generate backed sparse matrix classes + +### Combining Operations +- **`concat(adatas, axis=0, join='inner', merge=None, ...)`**: Merge multiple AnnData objects + - **axis**: 0 (observations) or 1 (variables) + - **join**: 'inner' (intersection) or 'outer' (union) + - **merge**: Strategy for non-concatenation axis ('same', 'unique', 'first', 'only', or None) + - **label**: Column name for source tracking + - **keys**: Dataset identifiers for source annotation + - **index_unique**: Separator for making duplicate indices unique + +### Writing Functions +- **`write_h5ad(filename, adata, compression='gzip')`**: Export to HDF5 format +- **`write_zarr(store, adata, ...)`**: Save as Zarr hierarchical arrays +- **`write_elem(elem, ...)`**: Write individual components + +### Experimental Features +- **`AnnCollection`**: Batch processing for large collections +- **`AnnLoader`**: PyTorch DataLoader integration +- **`concat_on_disk(*adatas, filename, ...)`**: Memory-efficient out-of-core concatenation +- **`read_lazy(filename)`**: Lazy loading with deferred computation +- **`read_dispatched(filename, ...)`**: Custom I/O with callbacks +- **`write_dispatched(filename, ...)`**: Custom writing with callbacks + +### Configuration +- **`settings`**: Package-wide configuration object +- **`settings.override(**kwargs)`**: Context manager for temporary settings changes + +## Common Usage Patterns + +### Creating AnnData Objects + +```python +import anndata as ad +import numpy as np +from scipy.sparse import csr_matrix + +# From dense array +counts = np.random.poisson(1, size=(100, 2000)) +adata = ad.AnnData(counts) + +# From sparse matrix +counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32) +adata = ad.AnnData(counts) + +# With metadata +import pandas as pd +obs_meta = pd.DataFrame({'cell_type': ['B', 'T', 'Monocyte'] * 33 + ['B']}) +var_meta = pd.DataFrame({'gene_name': [f'Gene_{i}' for i in range(2000)]}) +adata = ad.AnnData(counts, obs=obs_meta, var=var_meta) +``` + +### Subsetting + +```python +# By names +subset = adata[['Cell_1', 'Cell_10'], ['Gene_5', 'Gene_1900']] + +# By boolean mask +b_cells = adata[adata.obs.cell_type == 'B'] + +# By position +first_five = adata[:5, :100] + +# Convert view to copy +adata_copy = adata[:5].copy() +``` + +### Adding Annotations + +```python +# Cell-level metadata +adata.obs['batch'] = pd.Categorical(['batch1', 'batch2'] * 50) + +# Gene-level metadata +adata.var['highly_variable'] = np.random.choice([True, False], size=adata.n_vars) + +# Embeddings +adata.obsm['X_pca'] = np.random.normal(size=(adata.n_obs, 50)) +adata.obsm['X_umap'] = np.random.normal(size=(adata.n_obs, 2)) + +# Alternative data representations +adata.layers['log_transformed'] = np.log1p(adata.X) +adata.layers['scaled'] = (adata.X - adata.X.mean(axis=0)) / adata.X.std(axis=0) + +# Unstructured metadata +adata.uns['experiment_date'] = '2024-01-15' +adata.uns['parameters'] = {'min_genes': 200, 'min_cells': 3} +``` + +### File I/O + +```python +# Write to disk +adata.write('my_results.h5ad', compression='gzip') + +# Read into memory +adata = ad.read_h5ad('my_results.h5ad') + +# Read in backed mode (memory-efficient) +adata = ad.read_h5ad('my_results.h5ad', backed='r') + +# Close file connection +adata.file.close() +``` + +### Concatenation + +```python +# Combine multiple datasets +adata1 = ad.AnnData(np.random.poisson(1, size=(100, 2000))) +adata2 = ad.AnnData(np.random.poisson(1, 
size=(150, 2000))) +adata3 = ad.AnnData(np.random.poisson(1, size=(80, 2000))) + +# Simple concatenation +combined = ad.concat([adata1, adata2, adata3], axis=0) + +# With source labels +combined = ad.concat( + [adata1, adata2, adata3], + axis=0, + label='dataset', + keys=['exp1', 'exp2', 'exp3'] +) + +# Inner join (only shared variables) +combined = ad.concat([adata1, adata2, adata3], axis=0, join='inner') + +# Outer join (all variables, pad with zeros) +combined = ad.concat([adata1, adata2, adata3], axis=0, join='outer') +``` diff --git a/scientific-packages/anndata/references/concatenation_guide.md b/scientific-packages/anndata/references/concatenation_guide.md new file mode 100644 index 0000000..9c1379b --- /dev/null +++ b/scientific-packages/anndata/references/concatenation_guide.md @@ -0,0 +1,478 @@ +# AnnData Concatenation Guide + +## Overview + +The `concat()` function combines multiple AnnData objects through two fundamental operations: +1. **Concatenation**: Stacking sub-elements in order +2. **Merging**: Combining collections into one result + +## Basic Concatenation + +### Syntax +```python +import anndata as ad + +combined = ad.concat( + adatas, # List of AnnData objects + axis=0, # 0=observations, 1=variables + join='inner', # 'inner' or 'outer' + merge=None, # Merge strategy for non-concat axis + label=None, # Column name for source tracking + keys=None, # Dataset identifiers + index_unique=None, # Separator for unique indices + fill_value=None, # Fill value for missing data + pairwise=False # Include pairwise matrices +) +``` + +### Concatenating Observations (Cells) +```python +# Most common: combining multiple samples/batches +adata1 = ad.AnnData(np.random.rand(100, 2000)) +adata2 = ad.AnnData(np.random.rand(150, 2000)) +adata3 = ad.AnnData(np.random.rand(80, 2000)) + +combined = ad.concat([adata1, adata2, adata3], axis=0) +# Result: (330 observations, 2000 variables) +``` + +### Concatenating Variables (Genes) +```python +# Less common: combining different feature sets +adata1 = ad.AnnData(np.random.rand(100, 1000)) +adata2 = ad.AnnData(np.random.rand(100, 500)) + +combined = ad.concat([adata1, adata2], axis=1) +# Result: (100 observations, 1500 variables) +``` + +## Join Strategies + +### Inner Join (Intersection) + +Keeps only shared features across all objects. + +```python +# Datasets with different genes +adata1 = ad.AnnData( + np.random.rand(100, 2000), + var=pd.DataFrame(index=[f'Gene_{i}' for i in range(2000)]) +) +adata2 = ad.AnnData( + np.random.rand(150, 1800), + var=pd.DataFrame(index=[f'Gene_{i}' for i in range(200, 2000)]) +) + +# Inner join: only genes present in both +combined = ad.concat([adata1, adata2], join='inner') +# Result: (250 observations, 1800 variables) +# Only Gene_200 through Gene_1999 +``` + +**Use when:** +- You want to analyze only features measured in all datasets +- Missing features would compromise analysis +- You need a complete case analysis + +**Trade-offs:** +- May lose many features +- Ensures no missing data +- Smaller result size + +### Outer Join (Union) + +Keeps all features from all objects, padding with fill values (default 0). 
+ +```python +# Outer join: all genes from both datasets +combined = ad.concat([adata1, adata2], join='outer') +# Result: (250 observations, 2000 variables) +# Missing values filled with 0 + +# Custom fill value +combined = ad.concat([adata1, adata2], join='outer', fill_value=np.nan) +``` + +**Use when:** +- You want to preserve all features +- Sparse data is acceptable +- Features are independent + +**Trade-offs:** +- Introduces zeros/missing values +- Larger result size +- May need imputation + +## Merge Strategies + +Merge strategies control how elements on the non-concatenation axis are combined. + +### merge=None (Default) + +Excludes all non-concatenation axis elements. + +```python +# Both datasets have var annotations +adata1.var['gene_type'] = ['protein_coding'] * 2000 +adata2.var['gene_type'] = ['protein_coding'] * 1800 + +# merge=None: var annotations excluded +combined = ad.concat([adata1, adata2], merge=None) +assert 'gene_type' not in combined.var.columns +``` + +**Use when:** +- Annotations are dataset-specific +- You'll add new annotations after merging + +### merge='same' + +Keeps only annotations with identical values across datasets. + +```python +# Same annotation values +adata1.var['chromosome'] = ['chr1'] * 1000 + ['chr2'] * 1000 +adata2.var['chromosome'] = ['chr1'] * 900 + ['chr2'] * 900 + +# merge='same': keeps chromosome annotation +combined = ad.concat([adata1, adata2], merge='same') +assert 'chromosome' in combined.var.columns +``` + +**Use when:** +- Annotations should be consistent +- You want to validate consistency +- Shared metadata is important + +**Note:** Comparison occurs after index alignment - only shared indices need to match. + +### merge='unique' + +Includes annotations with a single possible value. + +```python +# Unique values per gene +adata1.var['ensembl_id'] = [f'ENSG{i:08d}' for i in range(2000)] +adata2.var['ensembl_id'] = [f'ENSG{i:08d}' for i in range(2000)] + +# merge='unique': keeps ensembl_id +combined = ad.concat([adata1, adata2], merge='unique') +``` + +**Use when:** +- Each feature has a unique identifier +- Annotations are feature-specific + +### merge='first' + +Takes the first occurrence of each annotation. + +```python +# Different annotation versions +adata1.var['description'] = ['desc1'] * 2000 +adata2.var['description'] = ['desc2'] * 2000 + +# merge='first': uses adata1's descriptions +combined = ad.concat([adata1, adata2], merge='first') +# Uses descriptions from adata1 +``` + +**Use when:** +- One dataset has authoritative annotations +- Order matters +- You need a simple resolution strategy + +### merge='only' + +Retains annotations appearing in exactly one object. + +```python +# Dataset-specific annotations +adata1.var['dataset1_specific'] = ['value'] * 2000 +adata2.var['dataset2_specific'] = ['value'] * 2000 + +# merge='only': keeps both (no conflicts) +combined = ad.concat([adata1, adata2], merge='only') +``` + +**Use when:** +- Datasets have non-overlapping annotations +- You want to preserve all unique metadata + +## Source Tracking + +### Using label + +Add a categorical column to track data origin. + +```python +combined = ad.concat( + [adata1, adata2, adata3], + label='batch' +) + +# Creates obs['batch'] with values 0, 1, 2 +print(combined.obs['batch'].cat.categories) # ['0', '1', '2'] +``` + +### Using keys + +Provide custom names for source tracking. 
+ +```python +combined = ad.concat( + [adata1, adata2, adata3], + label='study', + keys=['control', 'treatment_a', 'treatment_b'] +) + +# Creates obs['study'] with custom names +print(combined.obs['study'].unique()) # ['control', 'treatment_a', 'treatment_b'] +``` + +### Making Indices Unique + +Append source identifiers to duplicate observation names. + +```python +# Both datasets have cells named "Cell_0", "Cell_1", etc. +adata1.obs_names = [f'Cell_{i}' for i in range(100)] +adata2.obs_names = [f'Cell_{i}' for i in range(150)] + +# index_unique adds suffix +combined = ad.concat( + [adata1, adata2], + keys=['batch1', 'batch2'], + index_unique='-' +) + +# Results in: Cell_0-batch1, Cell_0-batch2, etc. +print(combined.obs_names[:5]) +``` + +## Handling Different Attributes + +### X Matrix and Layers + +Follows join strategy. Missing values filled according to `fill_value`. + +```python +# Both have layers +adata1.layers['counts'] = adata1.X.copy() +adata2.layers['counts'] = adata2.X.copy() + +# Concatenates both X and layers +combined = ad.concat([adata1, adata2]) +assert 'counts' in combined.layers +``` + +### obs and var DataFrames + +- **obs**: Concatenated along concatenation axis +- **var**: Handled by merge strategy + +```python +adata1.obs['cell_type'] = ['B cell'] * 100 +adata2.obs['cell_type'] = ['T cell'] * 150 + +combined = ad.concat([adata1, adata2]) +# obs['cell_type'] preserved for all cells +``` + +### obsm and varm + +Multi-dimensional annotations follow same rules as layers. + +```python +adata1.obsm['X_pca'] = np.random.rand(100, 50) +adata2.obsm['X_pca'] = np.random.rand(150, 50) + +combined = ad.concat([adata1, adata2]) +# obsm['X_pca'] concatenated: shape (250, 50) +``` + +### obsp and varp + +Pairwise matrices excluded by default. Enable with `pairwise=True`. + +```python +# Distance matrices +adata1.obsp['distances'] = np.random.rand(100, 100) +adata2.obsp['distances'] = np.random.rand(150, 150) + +# Excluded by default +combined = ad.concat([adata1, adata2]) +assert 'distances' not in combined.obsp + +# Include if needed +combined = ad.concat([adata1, adata2], pairwise=True) +# Results in padded block diagonal matrix +``` + +### uns Dictionary + +Merged recursively, applying merge strategy at any nesting depth. 
+ +```python +adata1.uns['experiment'] = {'date': '2024-01', 'lab': 'A'} +adata2.uns['experiment'] = {'date': '2024-02', 'lab': 'A'} + +# merge='same' keeps 'lab', excludes 'date' +combined = ad.concat([adata1, adata2], merge='same') +# combined.uns['experiment'] = {'lab': 'A'} +``` + +## Advanced Patterns + +### Batch Integration Pipeline + +```python +import anndata as ad + +# Load batches +batches = [ + ad.read_h5ad(f'batch_{i}.h5ad') + for i in range(5) +] + +# Concatenate with tracking +combined = ad.concat( + batches, + axis=0, + join='outer', + merge='first', + label='batch_id', + keys=[f'batch_{i}' for i in range(5)], + index_unique='-' +) + +# Add batch effects +combined.obs['batch_numeric'] = combined.obs['batch_id'].cat.codes +``` + +### Multi-Study Meta-Analysis + +```python +# Different studies with varying gene coverage +studies = { + 'study_a': ad.read_h5ad('study_a.h5ad'), + 'study_b': ad.read_h5ad('study_b.h5ad'), + 'study_c': ad.read_h5ad('study_c.h5ad') +} + +# Outer join to keep all genes +combined = ad.concat( + list(studies.values()), + axis=0, + join='outer', + label='study', + keys=list(studies.keys()), + merge='unique', + fill_value=0 +) + +# Track coverage +for study in studies: + n_genes = studies[study].n_vars + combined.uns[f'{study}_n_genes'] = n_genes +``` + +### Incremental Concatenation + +```python +# For many datasets, concatenate in batches +chunk_size = 10 +all_files = [f'dataset_{i}.h5ad' for i in range(100)] + +# Process in chunks +result = None +for i in range(0, len(all_files), chunk_size): + chunk_files = all_files[i:i+chunk_size] + chunk_adatas = [ad.read_h5ad(f) for f in chunk_files] + chunk_combined = ad.concat(chunk_adatas) + + if result is None: + result = chunk_combined + else: + result = ad.concat([result, chunk_combined]) +``` + +### Memory-Efficient On-Disk Concatenation + +```python +# Experimental feature for large datasets +from anndata.experimental import concat_on_disk + +files = ['dataset1.h5ad', 'dataset2.h5ad', 'dataset3.h5ad'] +concat_on_disk( + files, + 'combined.h5ad', + join='outer' +) + +# Read result in backed mode +combined = ad.read_h5ad('combined.h5ad', backed='r') +``` + +## Troubleshooting + +### Issue: Dimension Mismatch + +```python +# Error: shapes don't match +adata1 = ad.AnnData(np.random.rand(100, 2000)) +adata2 = ad.AnnData(np.random.rand(150, 1500)) + +# Solution: use outer join +combined = ad.concat([adata1, adata2], join='outer') +``` + +### Issue: Memory Error + +```python +# Problem: too many large objects in memory +large_adatas = [ad.read_h5ad(f) for f in many_files] + +# Solution: read and concatenate incrementally +result = None +for file in many_files: + adata = ad.read_h5ad(file) + if result is None: + result = adata + else: + result = ad.concat([result, adata]) + del adata # Free memory +``` + +### Issue: Duplicate Indices + +```python +# Problem: same cell names in different batches +# Solution: use index_unique +combined = ad.concat( + [adata1, adata2], + keys=['batch1', 'batch2'], + index_unique='-' +) +``` + +### Issue: Lost Annotations + +```python +# Problem: annotations disappear +adata1.var['important'] = values1 +adata2.var['important'] = values2 + +combined = ad.concat([adata1, adata2]) # merge=None by default +# Solution: use appropriate merge strategy +combined = ad.concat([adata1, adata2], merge='first') +``` + +## Performance Tips + +1. **Pre-align indices**: Ensure consistent naming before concatenation +2. **Use sparse matrices**: Convert to sparse before concatenating +3. 
**Batch operations**: Concatenate in groups for many datasets +4. **Choose inner join**: When possible, to reduce result size +5. **Use categoricals**: Convert string annotations before concatenating +6. **Consider on-disk**: For very large datasets, use `concat_on_disk` diff --git a/scientific-packages/anndata/references/workflows_best_practices.md b/scientific-packages/anndata/references/workflows_best_practices.md new file mode 100644 index 0000000..41f91c5 --- /dev/null +++ b/scientific-packages/anndata/references/workflows_best_practices.md @@ -0,0 +1,438 @@ +# AnnData Workflows and Best Practices + +## Common Workflows + +### 1. Single-Cell RNA-seq Analysis Workflow + +#### Loading Data +```python +import anndata as ad +import numpy as np +import pandas as pd + +# Load from 10X format +adata = ad.read_mtx('matrix.mtx') +adata.var_names = pd.read_csv('genes.tsv', sep='\t', header=None)[0] +adata.obs_names = pd.read_csv('barcodes.tsv', sep='\t', header=None)[0] + +# Or load from pre-processed h5ad +adata = ad.read_h5ad('preprocessed_data.h5ad') +``` + +#### Quality Control +```python +# Calculate QC metrics +adata.obs['n_genes'] = (adata.X > 0).sum(axis=1) +adata.obs['total_counts'] = adata.X.sum(axis=1) + +# Filter cells +adata = adata[adata.obs.n_genes > 200] +adata = adata[adata.obs.total_counts < 10000] + +# Filter genes +min_cells = 3 +adata = adata[:, (adata.X > 0).sum(axis=0) >= min_cells] +``` + +#### Normalization and Preprocessing +```python +# Store raw counts +adata.layers['counts'] = adata.X.copy() + +# Normalize +adata.X = adata.X / adata.obs.total_counts.values[:, None] * 1e4 + +# Log transform +adata.layers['log1p'] = np.log1p(adata.X) +adata.X = adata.layers['log1p'] + +# Identify highly variable genes +gene_variance = adata.X.var(axis=0) +adata.var['highly_variable'] = gene_variance > np.percentile(gene_variance, 90) +``` + +#### Dimensionality Reduction +```python +# PCA +from sklearn.decomposition import PCA +pca = PCA(n_components=50) +adata.obsm['X_pca'] = pca.fit_transform(adata.X) + +# Store PCA variance +adata.uns['pca'] = {'variance_ratio': pca.explained_variance_ratio_} + +# UMAP +from umap import UMAP +umap = UMAP(n_components=2) +adata.obsm['X_umap'] = umap.fit_transform(adata.obsm['X_pca']) +``` + +#### Clustering +```python +# Store cluster assignments +adata.obs['clusters'] = pd.Categorical(['cluster_0', 'cluster_1', ...]) + +# Store cluster centroids +centroids = np.array([...]) +adata.varm['cluster_centroids'] = centroids +``` + +#### Save Results +```python +# Save complete analysis +adata.write('analyzed_data.h5ad', compression='gzip') +``` + +### 2. Batch Integration Workflow + +```python +import anndata as ad + +# Load multiple batches +batch1 = ad.read_h5ad('batch1.h5ad') +batch2 = ad.read_h5ad('batch2.h5ad') +batch3 = ad.read_h5ad('batch3.h5ad') + +# Concatenate with batch labels +adata = ad.concat( + [batch1, batch2, batch3], + axis=0, + label='batch', + keys=['batch1', 'batch2', 'batch3'], + index_unique='-' +) + +# Batch effect correction would go here +# (using external tools like Harmony, Scanorama, etc.) + +# Store corrected embeddings +adata.obsm['X_pca_corrected'] = corrected_pca +adata.obsm['X_umap_corrected'] = corrected_umap +``` + +### 3. 
Memory-Efficient Large Dataset Workflow + +```python +import anndata as ad + +# Read in backed mode +adata = ad.read_h5ad('large_dataset.h5ad', backed='r') + +# Check backing status +print(f"Is backed: {adata.isbacked}") +print(f"File: {adata.filename}") + +# Work with chunks +for chunk in adata.chunk_X(chunk_size=1000): + # Process chunk + result = process_chunk(chunk) + +# Close file when done +adata.file.close() +``` + +### 4. Multi-Dataset Comparison Workflow + +```python +import anndata as ad + +# Load datasets +datasets = { + 'study1': ad.read_h5ad('study1.h5ad'), + 'study2': ad.read_h5ad('study2.h5ad'), + 'study3': ad.read_h5ad('study3.h5ad') +} + +# Outer join to keep all genes +combined = ad.concat( + list(datasets.values()), + axis=0, + join='outer', + label='study', + keys=list(datasets.keys()), + merge='first' +) + +# Handle missing data +combined.X[np.isnan(combined.X)] = 0 + +# Add dataset-specific metadata +combined.uns['datasets'] = { + 'study1': {'date': '2023-01', 'n_samples': datasets['study1'].n_obs}, + 'study2': {'date': '2023-06', 'n_samples': datasets['study2'].n_obs}, + 'study3': {'date': '2024-01', 'n_samples': datasets['study3'].n_obs} +} +``` + +## Best Practices + +### Memory Management + +#### Use Sparse Matrices +```python +from scipy.sparse import csr_matrix + +# Convert to sparse if data is sparse +if density < 0.3: # Less than 30% non-zero + adata.X = csr_matrix(adata.X) +``` + +#### Use Backed Mode for Large Files +```python +# Read with backing +adata = ad.read_h5ad('large_file.h5ad', backed='r') + +# Only load what you need +subset = adata[:1000, :500].copy() # Now in memory +``` + +#### Convert Strings to Categoricals +```python +# Efficient storage for repeated strings +adata.strings_to_categoricals() + +# Or manually +adata.obs['cell_type'] = pd.Categorical(adata.obs['cell_type']) +``` + +### Data Organization + +#### Use Layers for Different Representations +```python +# Store multiple versions of the data +adata.layers['counts'] = raw_counts +adata.layers['normalized'] = normalized_data +adata.layers['log1p'] = log_transformed_data +adata.layers['scaled'] = scaled_data +``` + +#### Use obsm/varm for Multi-Dimensional Annotations +```python +# Embeddings +adata.obsm['X_pca'] = pca_coordinates +adata.obsm['X_umap'] = umap_coordinates +adata.obsm['X_tsne'] = tsne_coordinates + +# Gene loadings +adata.varm['PCs'] = principal_components +``` + +#### Use uns for Analysis Metadata +```python +# Store parameters +adata.uns['preprocessing'] = { + 'normalization': 'TPM', + 'min_genes': 200, + 'min_cells': 3, + 'date': '2024-01-15' +} + +# Store analysis results +adata.uns['differential_expression'] = { + 'method': 't-test', + 'p_value_threshold': 0.05 +} +``` + +### Subsetting and Views + +#### Understand View vs Copy +```python +# Subsetting returns a view +subset = adata[adata.obs.cell_type == 'B cell'] # View +print(subset.is_view) # True + +# Views are memory efficient but modifications affect original +subset.obs['new_column'] = value # Modifies original adata + +# Create independent copy when needed +subset_copy = adata[adata.obs.cell_type == 'B cell'].copy() +``` + +#### Chain Operations Efficiently +```python +# Bad - creates multiple intermediate views +temp1 = adata[adata.obs.batch == 'batch1'] +temp2 = temp1[temp1.obs.n_genes > 200] +result = temp2[:, temp2.var.highly_variable].copy() + +# Good - chain operations +result = adata[ + (adata.obs.batch == 'batch1') & (adata.obs.n_genes > 200), + adata.var.highly_variable +].copy() +``` + +### File I/O 
+ +#### Use Compression +```python +# Save with compression +adata.write('data.h5ad', compression='gzip') +``` + +#### Choose the Right Format +```python +# H5AD for general use (good compression, fast) +adata.write_h5ad('data.h5ad') + +# Zarr for cloud storage and parallel access +adata.write_zarr('data.zarr') + +# Loom for compatibility with other tools +adata.write_loom('data.loom') +``` + +#### Close File Connections +```python +# Use context manager pattern +adata = ad.read_h5ad('file.h5ad', backed='r') +try: + # Work with data + process(adata) +finally: + adata.file.close() +``` + +### Concatenation + +#### Choose Appropriate Join Strategy +```python +# Inner join - only common features (safe, may lose data) +combined = ad.concat([adata1, adata2], join='inner') + +# Outer join - all features (keeps all data, may introduce zeros) +combined = ad.concat([adata1, adata2], join='outer') +``` + +#### Track Data Sources +```python +# Add source labels +combined = ad.concat( + [adata1, adata2, adata3], + label='dataset', + keys=['exp1', 'exp2', 'exp3'] +) + +# Make indices unique +combined = ad.concat( + [adata1, adata2, adata3], + index_unique='-' +) +``` + +#### Handle Variable-Specific Metadata +```python +# Use merge strategy for var annotations +combined = ad.concat( + [adata1, adata2], + merge='same', # Keep only identical annotations + join='outer' +) +``` + +### Naming Conventions + +#### Use Consistent Naming +```python +# Embeddings: X_ +adata.obsm['X_pca'] +adata.obsm['X_umap'] +adata.obsm['X_tsne'] + +# Layers: descriptive names +adata.layers['counts'] +adata.layers['log1p'] +adata.layers['scaled'] + +# Observations: snake_case +adata.obs['cell_type'] +adata.obs['n_genes'] +adata.obs['total_counts'] +``` + +#### Make Indices Unique +```python +# Ensure unique names +adata.obs_names_make_unique() +adata.var_names_make_unique() +``` + +### Error Handling + +#### Validate Data Structure +```python +# Check dimensions +assert adata.n_obs > 0, "No observations in data" +assert adata.n_vars > 0, "No variables in data" + +# Check for NaN values +if np.isnan(adata.X).any(): + print("Warning: NaN values detected") + +# Check for negative values in count data +if (adata.X < 0).any(): + print("Warning: Negative values in count data") +``` + +#### Handle Missing Data +```python +# Check for missing annotations +if adata.obs['cell_type'].isna().any(): + print("Warning: Missing cell type annotations") + # Fill or remove + adata = adata[~adata.obs['cell_type'].isna()] +``` + +## Common Pitfalls + +### 1. Forgetting to Copy Views +```python +# BAD - modifies original +subset = adata[adata.obs.condition == 'treated'] +subset.X = transformed_data # Changes original adata! + +# GOOD +subset = adata[adata.obs.condition == 'treated'].copy() +subset.X = transformed_data # Only changes subset +``` + +### 2. Mixing Backed and In-Memory Operations +```python +# BAD - trying to modify backed data +adata = ad.read_h5ad('file.h5ad', backed='r') +adata.X[0, 0] = 100 # Error: can't modify backed data + +# GOOD - load to memory first +adata = ad.read_h5ad('file.h5ad', backed='r') +adata = adata.to_memory() +adata.X[0, 0] = 100 # Works +``` + +### 3. Not Using Categoricals for Metadata +```python +# BAD - stores as strings (memory inefficient) +adata.obs['cell_type'] = ['B cell', 'T cell', ...] * 1000 + +# GOOD - use categorical +adata.obs['cell_type'] = pd.Categorical(['B cell', 'T cell', ...] * 1000) +``` + +### 4. 
Incorrect Concatenation Axis +```python +# Concatenating observations (cells) +combined = ad.concat([adata1, adata2], axis=0) # Correct + +# Concatenating variables (genes) - rare +combined = ad.concat([adata1, adata2], axis=1) # Less common +``` + +### 5. Not Preserving Raw Data +```python +# BAD - loses original data +adata.X = normalized_data + +# GOOD - preserve original +adata.layers['counts'] = adata.X.copy() +adata.X = normalized_data +``` diff --git a/scientific-packages/arboreto/SKILL.md b/scientific-packages/arboreto/SKILL.md new file mode 100644 index 0000000..6c4fb21 --- /dev/null +++ b/scientific-packages/arboreto/SKILL.md @@ -0,0 +1,415 @@ +--- +name: arboreto +description: Toolkit for gene regulatory network (GRN) inference from expression data using machine learning. Use this skill when working with gene expression matrices to infer regulatory relationships, performing single-cell RNA-seq analysis, or integrating with pySCENIC workflows. Supports both GRNBoost2 (fast gradient boosting) and GENIE3 (Random Forest) algorithms with distributed computing via Dask. +--- + +# Arboreto - Gene Regulatory Network Inference + +## Overview + +Arboreto is a Python library for inferring gene regulatory networks (GRNs) from gene expression data using machine learning algorithms. It enables scalable GRN inference from single machines to multi-node clusters using Dask for distributed computing. The skill provides comprehensive support for both GRNBoost2 (fast gradient boosting) and GENIE3 (Random Forest) algorithms. + +## When to Use This Skill + +Apply this skill when: +- Inferring regulatory relationships between genes from expression data +- Analyzing single-cell or bulk RNA-seq data to identify transcription factor targets +- Building the GRN inference component of a pySCENIC pipeline +- Comparing GRNBoost2 and GENIE3 algorithm performance +- Setting up distributed computing for large-scale genomic analyses +- Troubleshooting arboreto installation or runtime issues + +## Core Capabilities + +### 1. Basic GRN Inference + +For standard gene regulatory network inference tasks: + +**Key considerations:** +- Expression data format: Rows = observations (cells/samples), Columns = genes +- If data has genes as rows, transpose it first: `expression_df.T` +- Always include `seed` parameter for reproducible results +- Transcription factor list is optional but recommended for focused analysis + +**Typical workflow:** +```python +import pandas as pd +from arboreto.algo import grnboost2 +from arboreto.utils import load_tf_names + +# Load expression data (ensure correct orientation) +expression_data = pd.read_csv('expression_data.tsv', sep='\t', index_col=0) + +# Optional: Load TF names +tf_names = load_tf_names('transcription_factors.txt') + +# Run inference +network = grnboost2( + expression_data=expression_data, + tf_names=tf_names, + seed=42 # For reproducibility +) + +# Save results +network.to_csv('network_output.tsv', sep='\t', index=False) +``` + +**Output format:** +- DataFrame with columns: `['TF', 'target', 'importance']` +- Higher importance scores indicate stronger predicted regulatory relationships +- Typically sorted by importance (descending) + +**Multiprocessing requirement:** +All arboreto code must include `if __name__ == '__main__':` protection due to Dask's multiprocessing requirements: + +```python +if __name__ == '__main__': + # Arboreto code goes here + network = grnboost2(expression_data=expr_data, seed=42) +``` + +### 2. 
Algorithm Selection + +**GRNBoost2 (Recommended for most cases):** +- ~10-100x faster than GENIE3 +- Uses stochastic gradient boosting with early-stopping +- Best for: Large datasets (>10k observations), time-sensitive analyses +- Function: `arboreto.algo.grnboost2()` + +**GENIE3:** +- Uses Random Forest regression +- More established, classical approach +- Best for: Small datasets, methodological comparisons, reproducing published results +- Function: `arboreto.algo.genie3()` + +**When to compare both algorithms:** +Use the provided `compare_algorithms.py` script when: +- Validating results for critical analyses +- Benchmarking performance on new datasets +- Publishing research requiring methodological comparisons + +### 3. Distributed Computing + +**Local execution (default):** +Arboreto automatically creates a local Dask client. No configuration needed: +```python +network = grnboost2(expression_data=expr_data) +``` + +**Custom local cluster (recommended for better control):** +```python +from dask.distributed import Client, LocalCluster + +# Configure cluster +cluster = LocalCluster( + n_workers=4, + threads_per_worker=2, + memory_limit='4GB', + diagnostics_port=8787 # Dashboard at http://localhost:8787 +) +client = Client(cluster) + +# Run inference +network = grnboost2( + expression_data=expr_data, + client_or_address=client +) + +# Clean up +client.close() +cluster.close() +``` + +**Distributed cluster (multi-node):** +On scheduler node: +```bash +dask-scheduler --no-bokeh +``` + +On worker nodes: +```bash +dask-worker scheduler-address:8786 --local-dir /tmp +``` + +In Python: +```python +from dask.distributed import Client + +client = Client('scheduler-address:8786') +network = grnboost2(expression_data=expr_data, client_or_address=client) +``` + +### 4. Data Preparation + +**Common data format issues:** + +1. **Transposed data** (genes as rows instead of columns): +```python +# If genes are rows, transpose +expression_data = pd.read_csv('data.tsv', sep='\t', index_col=0).T +``` + +2. **Missing gene names:** +```python +# Provide gene names if using numpy array +network = grnboost2( + expression_data=expr_array, + gene_names=['Gene1', 'Gene2', 'Gene3', ...], + seed=42 +) +``` + +3. **Transcription factor specification:** +```python +# Option 1: Python list +tf_names = ['Sox2', 'Oct4', 'Nanog', 'Klf4'] + +# Option 2: Load from file (one TF per line) +from arboreto.utils import load_tf_names +tf_names = load_tf_names('tf_names.txt') +``` + +### 5. Reproducibility + +Always specify a seed for consistent results: +```python +network = grnboost2(expression_data=expr_data, seed=42) +``` + +Without a seed, results will vary between runs due to algorithm randomness. + +### 6. Result Interpretation + +**Understanding the output:** +- `TF`: Transcription factor (regulator) gene +- `target`: Target gene being regulated +- `importance`: Strength of predicted regulatory relationship + +**Typical post-processing:** +```python +# Filter by importance threshold +high_confidence = network[network['importance'] > 10] + +# Get top N predictions +top_predictions = network.head(1000) + +# Find all targets of a specific TF +sox2_targets = network[network['TF'] == 'Sox2'] + +# Count regulations per TF +tf_counts = network['TF'].value_counts() +``` + +## Installation + +**Recommended (via conda):** +```bash +conda install -c bioconda arboreto +``` + +**Via pip:** +```bash +pip install arboreto +``` + +**From source:** +```bash +git clone https://github.com/tmoerman/arboreto.git +cd arboreto +pip install . 
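+# For development work, an editable install (standard pip behavior) is another option:
+# pip install -e .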
+``` + +**Dependencies:** +- pandas +- numpy +- scikit-learn +- scipy +- dask +- distributed + +## Troubleshooting + +### Issue: Bokeh error when launching Dask scheduler + +**Error:** `TypeError: got an unexpected keyword argument 'host'` + +**Solutions:** +- Use `dask-scheduler --no-bokeh` to disable Bokeh +- Upgrade to Dask distributed >= 0.20.0 + +### Issue: Workers not connecting to scheduler + +**Symptoms:** Worker processes start but fail to establish connections + +**Solutions:** +- Remove `dask-worker-space` directory before restarting workers +- Specify adequate `local_dir` when creating cluster: +```python +cluster = LocalCluster( + worker_kwargs={'local_dir': '/tmp'} +) +``` + +### Issue: Memory errors with large datasets + +**Solutions:** +- Increase worker memory limits: `memory_limit='8GB'` +- Distribute across more nodes +- Reduce dataset size through preprocessing (e.g., feature selection) +- Ensure expression matrix fits in available RAM + +### Issue: Inconsistent results across runs + +**Solution:** Always specify a `seed` parameter: +```python +network = grnboost2(expression_data=expr_data, seed=42) +``` + +### Issue: Import errors or missing dependencies + +**Solution:** Use conda installation to handle numerical library dependencies: +```bash +conda create --name arboreto-env +conda activate arboreto-env +conda install -c bioconda arboreto +``` + +## Provided Scripts + +This skill includes ready-to-use scripts for common workflows: + +### scripts/basic_grn_inference.py + +Command-line tool for standard GRN inference workflow. + +**Usage:** +```bash +python scripts/basic_grn_inference.py expression_data.tsv \ + -t tf_names.txt \ + -o network.tsv \ + -s 42 \ + --transpose # if genes are rows +``` + +**Features:** +- Automatic data loading and validation +- Optional TF list specification +- Configurable output format +- Data transposition support +- Summary statistics + +### scripts/distributed_inference.py + +GRN inference with custom Dask cluster configuration. + +**Usage:** +```bash +python scripts/distributed_inference.py expression_data.tsv \ + -t tf_names.txt \ + -w 8 \ + -m 4GB \ + --threads 2 \ + --dashboard-port 8787 +``` + +**Features:** +- Configurable worker count and memory limits +- Dask dashboard integration +- Thread configuration +- Resource monitoring + +### scripts/compare_algorithms.py + +Compare GRNBoost2 and GENIE3 side-by-side. + +**Usage:** +```bash +python scripts/compare_algorithms.py expression_data.tsv \ + -t tf_names.txt \ + --top-n 100 +``` + +**Features:** +- Runtime comparison +- Network statistics +- Prediction overlap analysis +- Top prediction comparison + +## Reference Documentation + +Detailed API documentation is available in [references/api_reference.md](references/api_reference.md), including: +- Complete parameter descriptions for all functions +- Data format specifications +- Distributed computing configuration +- Performance optimization tips +- Integration with pySCENIC +- Comprehensive examples + +Load this reference when: +- Working with advanced Dask configurations +- Troubleshooting complex deployment scenarios +- Understanding algorithm internals +- Optimizing performance for specific use cases + +## Integration with pySCENIC + +Arboreto is the first step in the pySCENIC single-cell analysis pipeline: + +1. **GRN Inference (arboreto)** ← This skill + - Input: Expression matrix + - Output: Regulatory network + +2. **Regulon Prediction (pySCENIC)** + - Input: Network from arboreto + - Output: Refined regulons + +3. 
**Cell Type Identification (pySCENIC)** + - Input: Regulons + - Output: Cell type scores + +When working with pySCENIC, use arboreto to generate the initial network, then pass results to the pySCENIC pipeline. + +## Best Practices + +1. **Always use seed parameter** for reproducible research +2. **Validate data orientation** (rows = observations, columns = genes) +3. **Specify TF list** when known to focus inference and improve speed +4. **Monitor with Dask dashboard** for distributed computing +5. **Save intermediate results** to avoid re-running long computations +6. **Filter results** by importance threshold for downstream analysis +7. **Use GRNBoost2 by default** unless specifically requiring GENIE3 +8. **Include multiprocessing guard** (`if __name__ == '__main__':`) in all scripts + +## Quick Reference + +**Basic inference:** +```python +from arboreto.algo import grnboost2 +network = grnboost2(expression_data=expr_df, seed=42) +``` + +**With TF specification:** +```python +network = grnboost2(expression_data=expr_df, tf_names=tf_list, seed=42) +``` + +**With custom Dask client:** +```python +from dask.distributed import Client, LocalCluster +cluster = LocalCluster(n_workers=4) +client = Client(cluster) +network = grnboost2(expression_data=expr_df, client_or_address=client, seed=42) +client.close() +cluster.close() +``` + +**Load TF names:** +```python +from arboreto.utils import load_tf_names +tf_names = load_tf_names('transcription_factors.txt') +``` + +**Transpose data:** +```python +expression_df = pd.read_csv('data.tsv', sep='\t', index_col=0).T +``` diff --git a/scientific-packages/arboreto/references/api_reference.md b/scientific-packages/arboreto/references/api_reference.md new file mode 100644 index 0000000..894f18f --- /dev/null +++ b/scientific-packages/arboreto/references/api_reference.md @@ -0,0 +1,271 @@ +# Arboreto API Reference + +This document provides comprehensive API documentation for the arboreto package, a Python library for gene regulatory network (GRN) inference. + +## Overview + +Arboreto enables inference of gene regulatory networks from expression data using machine learning algorithms. It supports distributed computing via Dask for scalability from single machines to multi-node clusters. + +**Current Version:** 0.1.5 +**GitHub:** https://github.com/tmoerman/arboreto +**License:** BSD 3-Clause + +## Core Algorithms + +### GRNBoost2 + +The flagship algorithm for fast gene regulatory network inference using stochastic gradient boosting. + +**Function:** `arboreto.algo.grnboost2()` + +**Parameters:** +- `expression_data` (pandas.DataFrame or numpy.ndarray): Expression matrix where rows are observations (cells/samples) and columns are genes. Required. +- `gene_names` (list, optional): List of gene names matching column order. If None, uses DataFrame column names. +- `tf_names` (list, optional): List of transcription factor names to consider as regulators. If None, all genes are considered potential regulators. +- `seed` (int, optional): Random seed for reproducibility. Recommended when consistent results are needed across runs. +- `client_or_address` (dask.distributed.Client or str, optional): Custom Dask client or scheduler address for distributed computing. If None, creates a default local client. +- `verbose` (bool, optional): Enable verbose output for debugging. + +**Returns:** +- pandas.DataFrame with columns `['TF', 'target', 'importance']` representing inferred regulatory links. Each row represents a regulatory relationship with an importance score. 
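+
+Because the result is a plain pandas DataFrame, ranking and summarizing the inferred links is ordinary pandas. A minimal sketch, assuming `network` holds the frame returned by `grnboost2()`:
+
+```python
+top_links = network.nlargest(20, 'importance')              # strongest predicted regulations
+targets_per_tf = network.groupby('TF')['target'].nunique()  # breadth of each regulator
+```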
+ +**Algorithm Details:** +- Uses stochastic gradient boosting with early-stopping regularization +- Much faster than GENIE3, especially for large datasets (tens of thousands of observations) +- Extracts important features from trained regression models to identify regulatory relationships +- Recommended as the default choice for most use cases + +**Example:** +```python +from arboreto.algo import grnboost2 +import pandas as pd + +# Load expression data +expression_matrix = pd.read_csv('expression_data.tsv', sep='\t') +tf_list = ['TF1', 'TF2', 'TF3'] # Optional: specify TFs + +# Run inference +network = grnboost2( + expression_data=expression_matrix, + tf_names=tf_list, + seed=42 # For reproducibility +) + +# Save results +network.to_csv('output_network.tsv', sep='\t', index=False) +``` + +### GENIE3 + +Classical gene regulatory network inference using Random Forest regression. + +**Function:** `arboreto.algo.genie3()` + +**Parameters:** +Same as GRNBoost2 (see above). + +**Returns:** +Same format as GRNBoost2 (see above). + +**Algorithm Details:** +- Uses Random Forest or ExtraTrees regression models +- Blueprint for multiple regression GRN inference strategy +- More computationally expensive than GRNBoost2 +- Better suited for smaller datasets or when maximum accuracy is needed + +**When to Use GENIE3 vs GRNBoost2:** +- **Use GRNBoost2:** For large datasets, faster results, or when computational resources are limited +- **Use GENIE3:** For smaller datasets, when following established protocols, or for comparison with published results + +## Module Structure + +### arboreto.algo + +Primary module for typical users. Contains high-level inference functions. + +**Main Functions:** +- `grnboost2()` - Fast GRN inference using gradient boosting +- `genie3()` - Classical GRN inference using Random Forest + +### arboreto.core + +Advanced module for power users. Contains low-level framework components for custom implementations. + +**Use cases:** +- Custom inference pipelines +- Algorithm modifications +- Performance tuning + +### arboreto.utils + +Utility functions for common data processing tasks. 
+ +**Key Functions:** +- `load_tf_names(filename)` - Load transcription factor names from file + - Reads a text file with one TF name per line + - Returns a list of TF names + - Example: `tf_names = load_tf_names('transcription_factors.txt')` + +## Data Format Requirements + +### Input Format + +**Expression Matrix:** +- **Format:** pandas DataFrame or numpy ndarray +- **Orientation:** Rows = observations (cells/samples), Columns = genes +- **Convention:** Follows scikit-learn format +- **Gene Names:** Column names (DataFrame) or separate `gene_names` parameter +- **Data Type:** Numeric (float or int) + +**Common Mistake:** If data is transposed (genes as rows), use pandas to transpose: +```python +expression_df = pd.read_csv('data.tsv', sep='\t', index_col=0).T +``` + +**Transcription Factor List:** +- **Format:** Python list of strings or text file (one TF per line) +- **Optional:** If not provided, all genes are considered potential regulators +- **Example:** `['Sox2', 'Oct4', 'Nanog']` + +### Output Format + +**Network DataFrame:** +- **Columns:** + - `TF` (str): Transcription factor (regulator) gene name + - `target` (str): Target gene name + - `importance` (float): Importance score of the regulatory relationship +- **Interpretation:** Higher importance scores indicate stronger predicted regulatory relationships +- **Sorting:** Typically sorted by importance (descending) for prioritization + +**Example Output:** +``` +TF target importance +Sox2 Gene1 15.234 +Oct4 Gene1 12.456 +Sox2 Gene2 8.901 +``` + +## Distributed Computing with Dask + +### Local Execution (Default) + +Arboreto automatically creates a local Dask client if none is provided: + +```python +network = grnboost2(expression_data=expr_matrix, tf_names=tf_list) +``` + +### Custom Local Cluster + +For better control over resources or multiple inferences: + +```python +from dask.distributed import Client, LocalCluster + +# Configure cluster +cluster = LocalCluster( + n_workers=4, + threads_per_worker=2, + memory_limit='4GB' +) +client = Client(cluster) + +# Run inference +network = grnboost2( + expression_data=expr_matrix, + tf_names=tf_list, + client_or_address=client +) + +# Clean up +client.close() +cluster.close() +``` + +### Distributed Cluster + +For multi-node computation: + +**On scheduler node:** +```bash +dask-scheduler --no-bokeh # Use --no-bokeh to avoid Bokeh errors +``` + +**On worker nodes:** +```bash +dask-worker scheduler-address:8786 --local-dir /tmp +``` + +**In Python script:** +```python +from dask.distributed import Client + +client = Client('scheduler-address:8786') +network = grnboost2( + expression_data=expr_matrix, + tf_names=tf_list, + client_or_address=client +) +``` + +### Dask Dashboard + +Monitor computation progress via the Dask dashboard: + +```python +from dask.distributed import Client, LocalCluster + +cluster = LocalCluster(diagnostics_port=8787) +client = Client(cluster) + +# Dashboard available at: http://localhost:8787 +``` + +## Reproducibility + +To ensure reproducible results across runs: + +```python +network = grnboost2( + expression_data=expr_matrix, + tf_names=tf_list, + seed=42 # Fixed seed ensures identical results +) +``` + +**Note:** Without a seed parameter, results may vary slightly between runs due to randomness in the algorithms. 
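+
+One way to verify this in practice is to run the inference twice with the same seed and confirm that the outputs match; a minimal sketch, assuming `expr_matrix` and `tf_list` are defined as in the examples above:
+
+```python
+from arboreto.algo import grnboost2
+
+net_a = grnboost2(expression_data=expr_matrix, tf_names=tf_list, seed=42)
+net_b = grnboost2(expression_data=expr_matrix, tf_names=tf_list, seed=42)
+
+# Compare in a canonical order; identical seeds should yield identical link tables
+canon = lambda df: df.sort_values(['TF', 'target']).reset_index(drop=True)
+assert canon(net_a).equals(canon(net_b))
+```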
+ +## Performance Considerations + +### Memory Management + +- Expression matrices should fit in memory (RAM) +- For very large datasets, consider: + - Using a machine with more RAM + - Distributing across multiple nodes + - Preprocessing to reduce dimensionality + +### Worker Configuration + +- **Local execution:** Number of workers = number of CPU cores (default) +- **Custom cluster:** Balance workers and threads based on available resources +- **Distributed execution:** Ensure adequate `local_dir` space on worker nodes + +### Algorithm Choice + +- **GRNBoost2:** ~10-100x faster than GENIE3 for large datasets +- **GENIE3:** More established but slower, better for small datasets (<10k observations) + +## Integration with pySCENIC + +Arboreto is a core component of the pySCENIC pipeline for single-cell RNA sequencing analysis: + +1. **GRN Inference (Arboreto):** Infer regulatory networks using GRNBoost2 +2. **Regulon Prediction:** Prune network and identify regulons +3. **Cell Type Identification:** Score regulons across cells + +For pySCENIC workflows, arboreto is typically used in the first step to generate the initial regulatory network. + +## Common Issues and Solutions + +See the main SKILL.md for troubleshooting guidance. diff --git a/scientific-packages/arboreto/scripts/basic_grn_inference.py b/scientific-packages/arboreto/scripts/basic_grn_inference.py new file mode 100644 index 0000000..7ae4ea3 --- /dev/null +++ b/scientific-packages/arboreto/scripts/basic_grn_inference.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +""" +Basic GRN inference script using arboreto GRNBoost2. + +This script demonstrates the standard workflow for gene regulatory network inference: +1. Load expression data +2. Optionally load transcription factor names +3. Run GRNBoost2 inference +4. 
Save results + +Usage: + python basic_grn_inference.py [options] + +Example: + python basic_grn_inference.py expression_data.tsv -t tf_names.txt -o network.tsv +""" + +import argparse +import pandas as pd +from arboreto.algo import grnboost2 +from arboreto.utils import load_tf_names + + +def main(): + parser = argparse.ArgumentParser( + description='Infer gene regulatory network using GRNBoost2' + ) + parser.add_argument( + 'expression_file', + help='Path to expression data file (TSV/CSV format)' + ) + parser.add_argument( + '-t', '--tf-file', + help='Path to file containing transcription factor names (one per line)', + default=None + ) + parser.add_argument( + '-o', '--output', + help='Output file path for network results', + default='network_output.tsv' + ) + parser.add_argument( + '-s', '--seed', + type=int, + help='Random seed for reproducibility', + default=42 + ) + parser.add_argument( + '--sep', + help='Separator for input file (default: tab)', + default='\t' + ) + parser.add_argument( + '--transpose', + action='store_true', + help='Transpose the expression matrix (use if genes are rows)' + ) + + args = parser.parse_args() + + # Load expression data + print(f"Loading expression data from {args.expression_file}...") + expression_data = pd.read_csv(args.expression_file, sep=args.sep, index_col=0) + + # Transpose if needed + if args.transpose: + print("Transposing expression matrix...") + expression_data = expression_data.T + + print(f"Expression data shape: {expression_data.shape}") + print(f" Observations (rows): {expression_data.shape[0]}") + print(f" Genes (columns): {expression_data.shape[1]}") + + # Load TF names if provided + tf_names = None + if args.tf_file: + print(f"Loading transcription factor names from {args.tf_file}...") + tf_names = load_tf_names(args.tf_file) + print(f" Found {len(tf_names)} transcription factors") + else: + print("No TF file provided. Using all genes as potential regulators.") + + # Run GRNBoost2 + print("\nRunning GRNBoost2 inference...") + print(" (This may take a while depending on dataset size)") + + network = grnboost2( + expression_data=expression_data, + tf_names=tf_names, + seed=args.seed + ) + + print(f"\nInference complete!") + print(f" Total regulatory links inferred: {len(network)}") + print(f" Unique TFs: {network['TF'].nunique()}") + print(f" Unique targets: {network['target'].nunique()}") + + # Save results + print(f"\nSaving results to {args.output}...") + network.to_csv(args.output, sep='\t', index=False) + + # Display top 10 predictions + print("\nTop 10 predicted regulatory relationships:") + print(network.head(10).to_string(index=False)) + + print("\nDone!") + + +if __name__ == '__main__': + main() diff --git a/scientific-packages/arboreto/scripts/compare_algorithms.py b/scientific-packages/arboreto/scripts/compare_algorithms.py new file mode 100644 index 0000000..8dde5bc --- /dev/null +++ b/scientific-packages/arboreto/scripts/compare_algorithms.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 +""" +Compare GRNBoost2 and GENIE3 algorithms on the same dataset. 
+ +This script runs both algorithms on the same expression data and compares: +- Runtime +- Number of predicted links +- Top predicted relationships +- Overlap between predictions + +Usage: + python compare_algorithms.py [options] + +Example: + python compare_algorithms.py expression_data.tsv -t tf_names.txt +""" + +import argparse +import time +import pandas as pd +from arboreto.algo import grnboost2, genie3 +from arboreto.utils import load_tf_names + + +def compare_networks(network1, network2, name1, name2, top_n=100): + """Compare two inferred networks.""" + print(f"\n{'='*60}") + print("Network Comparison") + print(f"{'='*60}") + + # Basic statistics + print(f"\n{name1} Statistics:") + print(f" Total links: {len(network1)}") + print(f" Unique TFs: {network1['TF'].nunique()}") + print(f" Unique targets: {network1['target'].nunique()}") + print(f" Importance range: [{network1['importance'].min():.3f}, {network1['importance'].max():.3f}]") + + print(f"\n{name2} Statistics:") + print(f" Total links: {len(network2)}") + print(f" Unique TFs: {network2['TF'].nunique()}") + print(f" Unique targets: {network2['target'].nunique()}") + print(f" Importance range: [{network2['importance'].min():.3f}, {network2['importance'].max():.3f}]") + + # Compare top predictions + print(f"\nTop {top_n} Predictions Overlap:") + + # Create edge sets for top N predictions + top_edges1 = set( + zip(network1.head(top_n)['TF'], network1.head(top_n)['target']) + ) + top_edges2 = set( + zip(network2.head(top_n)['TF'], network2.head(top_n)['target']) + ) + + # Calculate overlap + overlap = top_edges1 & top_edges2 + only_net1 = top_edges1 - top_edges2 + only_net2 = top_edges2 - top_edges1 + + overlap_pct = (len(overlap) / top_n) * 100 + + print(f" Shared edges: {len(overlap)} ({overlap_pct:.1f}%)") + print(f" Only in {name1}: {len(only_net1)}") + print(f" Only in {name2}: {len(only_net2)}") + + # Show some example overlapping edges + if overlap: + print(f"\nExample overlapping predictions:") + for i, (tf, target) in enumerate(list(overlap)[:5], 1): + print(f" {i}. 
{tf} -> {target}") + + +def main(): + parser = argparse.ArgumentParser( + description='Compare GRNBoost2 and GENIE3 algorithms' + ) + parser.add_argument( + 'expression_file', + help='Path to expression data file (TSV/CSV format)' + ) + parser.add_argument( + '-t', '--tf-file', + help='Path to file containing transcription factor names (one per line)', + default=None + ) + parser.add_argument( + '--grnboost2-output', + help='Output file path for GRNBoost2 results', + default='grnboost2_network.tsv' + ) + parser.add_argument( + '--genie3-output', + help='Output file path for GENIE3 results', + default='genie3_network.tsv' + ) + parser.add_argument( + '-s', '--seed', + type=int, + help='Random seed for reproducibility', + default=42 + ) + parser.add_argument( + '--sep', + help='Separator for input file (default: tab)', + default='\t' + ) + parser.add_argument( + '--transpose', + action='store_true', + help='Transpose the expression matrix (use if genes are rows)' + ) + parser.add_argument( + '--top-n', + type=int, + help='Number of top predictions to compare (default: 100)', + default=100 + ) + + args = parser.parse_args() + + # Load expression data + print(f"Loading expression data from {args.expression_file}...") + expression_data = pd.read_csv(args.expression_file, sep=args.sep, index_col=0) + + # Transpose if needed + if args.transpose: + print("Transposing expression matrix...") + expression_data = expression_data.T + + print(f"Expression data shape: {expression_data.shape}") + print(f" Observations (rows): {expression_data.shape[0]}") + print(f" Genes (columns): {expression_data.shape[1]}") + + # Load TF names if provided + tf_names = None + if args.tf_file: + print(f"Loading transcription factor names from {args.tf_file}...") + tf_names = load_tf_names(args.tf_file) + print(f" Found {len(tf_names)} transcription factors") + else: + print("No TF file provided. 
Using all genes as potential regulators.") + + # Run GRNBoost2 + print("\n" + "="*60) + print("Running GRNBoost2...") + print("="*60) + start_time = time.time() + + grnboost2_network = grnboost2( + expression_data=expression_data, + tf_names=tf_names, + seed=args.seed + ) + + grnboost2_time = time.time() - start_time + print(f"GRNBoost2 completed in {grnboost2_time:.2f} seconds") + + # Save GRNBoost2 results + grnboost2_network.to_csv(args.grnboost2_output, sep='\t', index=False) + print(f"Results saved to {args.grnboost2_output}") + + # Run GENIE3 + print("\n" + "="*60) + print("Running GENIE3...") + print("="*60) + start_time = time.time() + + genie3_network = genie3( + expression_data=expression_data, + tf_names=tf_names, + seed=args.seed + ) + + genie3_time = time.time() - start_time + print(f"GENIE3 completed in {genie3_time:.2f} seconds") + + # Save GENIE3 results + genie3_network.to_csv(args.genie3_output, sep='\t', index=False) + print(f"Results saved to {args.genie3_output}") + + # Compare runtimes + print("\n" + "="*60) + print("Runtime Comparison") + print("="*60) + print(f"GRNBoost2: {grnboost2_time:.2f} seconds") + print(f"GENIE3: {genie3_time:.2f} seconds") + speedup = genie3_time / grnboost2_time + print(f"Speedup: {speedup:.2f}x (GRNBoost2 is {speedup:.2f}x faster)") + + # Compare networks + compare_networks( + grnboost2_network, + genie3_network, + "GRNBoost2", + "GENIE3", + top_n=args.top_n + ) + + print("\n" + "="*60) + print("Comparison complete!") + print("="*60) + + +if __name__ == '__main__': + main() diff --git a/scientific-packages/arboreto/scripts/distributed_inference.py b/scientific-packages/arboreto/scripts/distributed_inference.py new file mode 100644 index 0000000..41c882a --- /dev/null +++ b/scientific-packages/arboreto/scripts/distributed_inference.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +""" +Distributed GRN inference script using arboreto with custom Dask configuration. + +This script demonstrates how to use arboreto with a custom Dask LocalCluster +for better control over computational resources. 
+ +Usage: + python distributed_inference.py [options] + +Example: + python distributed_inference.py expression_data.tsv -t tf_names.txt -w 8 -m 4GB +""" + +import argparse +import pandas as pd +from dask.distributed import Client, LocalCluster +from arboreto.algo import grnboost2 +from arboreto.utils import load_tf_names + + +def main(): + parser = argparse.ArgumentParser( + description='Distributed GRN inference using GRNBoost2 with custom Dask cluster' + ) + parser.add_argument( + 'expression_file', + help='Path to expression data file (TSV/CSV format)' + ) + parser.add_argument( + '-t', '--tf-file', + help='Path to file containing transcription factor names (one per line)', + default=None + ) + parser.add_argument( + '-o', '--output', + help='Output file path for network results', + default='network_output.tsv' + ) + parser.add_argument( + '-s', '--seed', + type=int, + help='Random seed for reproducibility', + default=42 + ) + parser.add_argument( + '-w', '--workers', + type=int, + help='Number of Dask workers', + default=4 + ) + parser.add_argument( + '-m', '--memory-limit', + help='Memory limit per worker (e.g., "4GB", "2000MB")', + default='4GB' + ) + parser.add_argument( + '--threads', + type=int, + help='Threads per worker', + default=2 + ) + parser.add_argument( + '--dashboard-port', + type=int, + help='Port for Dask dashboard (default: 8787)', + default=8787 + ) + parser.add_argument( + '--sep', + help='Separator for input file (default: tab)', + default='\t' + ) + parser.add_argument( + '--transpose', + action='store_true', + help='Transpose the expression matrix (use if genes are rows)' + ) + + args = parser.parse_args() + + # Load expression data + print(f"Loading expression data from {args.expression_file}...") + expression_data = pd.read_csv(args.expression_file, sep=args.sep, index_col=0) + + # Transpose if needed + if args.transpose: + print("Transposing expression matrix...") + expression_data = expression_data.T + + print(f"Expression data shape: {expression_data.shape}") + print(f" Observations (rows): {expression_data.shape[0]}") + print(f" Genes (columns): {expression_data.shape[1]}") + + # Load TF names if provided + tf_names = None + if args.tf_file: + print(f"Loading transcription factor names from {args.tf_file}...") + tf_names = load_tf_names(args.tf_file) + print(f" Found {len(tf_names)} transcription factors") + else: + print("No TF file provided. 
Using all genes as potential regulators.") + + # Set up Dask cluster + print(f"\nSetting up Dask LocalCluster...") + print(f" Workers: {args.workers}") + print(f" Threads per worker: {args.threads}") + print(f" Memory limit per worker: {args.memory_limit}") + print(f" Dashboard: http://localhost:{args.dashboard_port}") + + cluster = LocalCluster( + n_workers=args.workers, + threads_per_worker=args.threads, + memory_limit=args.memory_limit, + diagnostics_port=args.dashboard_port + ) + client = Client(cluster) + + print(f"\nDask cluster ready!") + print(f" Dashboard available at: {client.dashboard_link}") + + # Run GRNBoost2 + print("\nRunning GRNBoost2 inference with distributed computation...") + print(" (Monitor progress via the Dask dashboard)") + + try: + network = grnboost2( + expression_data=expression_data, + tf_names=tf_names, + seed=args.seed, + client_or_address=client + ) + + print(f"\nInference complete!") + print(f" Total regulatory links inferred: {len(network)}") + print(f" Unique TFs: {network['TF'].nunique()}") + print(f" Unique targets: {network['target'].nunique()}") + + # Save results + print(f"\nSaving results to {args.output}...") + network.to_csv(args.output, sep='\t', index=False) + + # Display top 10 predictions + print("\nTop 10 predicted regulatory relationships:") + print(network.head(10).to_string(index=False)) + + print("\nDone!") + + finally: + # Clean up Dask resources + print("\nClosing Dask cluster...") + client.close() + cluster.close() + + +if __name__ == '__main__': + main()