mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-27 07:09:27 +08:00
Add more scientific skills
This commit is contained in:
398
scientific-packages/medchem/SKILL.md
Normal file
398
scientific-packages/medchem/SKILL.md
Normal file
@@ -0,0 +1,398 @@
|
||||
---
|
||||
name: medchem
|
||||
description: Python library for molecular filtering and prioritization in drug discovery. Use when applying medicinal chemistry rules (Rule of Five, CNS, leadlike), detecting structural alerts (PAINS, NIBR, Lilly demerits), analyzing chemical groups, calculating molecular complexity, or filtering compound libraries. Works with SMILES strings and RDKit mol objects, with built-in parallelization for large datasets.
|
||||
---
|
||||
|
||||
# Medchem
|
||||
|
||||
## Overview
|
||||
|
||||
Medchem is a Python library for molecular filtering and prioritization in drug discovery workflows. It provides hundreds of well-established and novel molecular filters, structural alerts, and medicinal chemistry rules to efficiently triage and prioritize compound libraries at scale.
|
||||
|
||||
**Key Principle:** Rules and filters are always context-specific. Avoid blindly applying filters—marketed drugs often don't pass standard medchem filters, and prodrugs may intentionally violate rules. Use these tools as guidelines combined with domain expertise.
|
||||
|
||||
## Installation
|
||||
|
||||
Install medchem from conda-forge (shown here with micromamba) or from PyPI with pip:
|
||||
|
||||
```bash
|
||||
# Via conda
|
||||
micromamba install -c conda-forge medchem
|
||||
|
||||
# Via pip
|
||||
pip install medchem
|
||||
```
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### 1. Medicinal Chemistry Rules
|
||||
|
||||
Apply established drug-likeness rules to molecules using the `medchem.rules` module.
|
||||
|
||||
**Available Rules:**
|
||||
- Rule of Five (Lipinski)
|
||||
- Rule of Oprea
|
||||
- Rule of CNS
|
||||
- Rule of leadlike (soft and strict)
|
||||
- Rule of three
|
||||
- Rule of Reos
|
||||
- Rule of drug
|
||||
- Rule of Veber
|
||||
- Golden triangle
|
||||
- PAINS filters
|
||||
|
||||
**Single Rule Application:**
|
||||
|
||||
```python
|
||||
import medchem as mc
|
||||
|
||||
# Apply Rule of Five to a SMILES string
|
||||
smiles = "CC(=O)OC1=CC=CC=C1C(=O)O" # Aspirin
|
||||
passes = mc.rules.basic_rules.rule_of_five(smiles)
|
||||
# Returns: True
|
||||
|
||||
# Check specific rules
|
||||
passes_oprea = mc.rules.basic_rules.rule_of_oprea(smiles)
|
||||
passes_cns = mc.rules.basic_rules.rule_of_cns(smiles)
|
||||
```
|
||||
|
||||
**Multiple Rules with RuleFilters:**
|
||||
|
||||
```python
|
||||
import datamol as dm
|
||||
import medchem as mc
|
||||
|
||||
# Load molecules
|
||||
mols = [dm.to_mol(smiles) for smiles in smiles_list]
|
||||
|
||||
# Create filter with multiple rules
|
||||
rfilter = mc.rules.RuleFilters(
|
||||
rule_list=[
|
||||
"rule_of_five",
|
||||
"rule_of_oprea",
|
||||
"rule_of_cns",
|
||||
"rule_of_leadlike_soft"
|
||||
]
|
||||
)
|
||||
|
||||
# Apply filters with parallelization
|
||||
results = rfilter(
|
||||
mols=mols,
|
||||
n_jobs=-1, # Use all CPU cores
|
||||
progress=True
|
||||
)
|
||||
```
|
||||
|
||||
**Result Format:**
|
||||
Results are returned as dictionaries with pass/fail status and detailed information for each rule.
|
||||
|
||||
### 2. Structural Alert Filters
|
||||
|
||||
Detect potentially problematic structural patterns using the `medchem.structural` module.
|
||||
|
||||
**Available Filters:**
|
||||
|
||||
1. **Common Alerts** - General structural alerts derived from ChEMBL curation and literature
|
||||
2. **NIBR Filters** - Novartis Institutes for BioMedical Research filter set
|
||||
3. **Lilly Demerits** - Eli Lilly's demerit-based system (275 rules, molecules rejected at >100 demerits)
|
||||
|
||||
**Common Alerts:**
|
||||
|
||||
```python
|
||||
import datamol as dm
import medchem as mc
|
||||
|
||||
# Create filter
|
||||
alert_filter = mc.structural.CommonAlertsFilters()
|
||||
|
||||
# Check single molecule
|
||||
mol = dm.to_mol("c1ccccc1")
|
||||
has_alerts, details = alert_filter.check_mol(mol)
|
||||
|
||||
# Batch filtering with parallelization
|
||||
results = alert_filter(
|
||||
mols=mol_list,
|
||||
n_jobs=-1,
|
||||
progress=True
|
||||
)
|
||||
```
|
||||
|
||||
**NIBR Filters:**
|
||||
|
||||
```python
|
||||
import medchem as mc
|
||||
|
||||
# Apply NIBR filters
|
||||
nibr_filter = mc.structural.NIBRFilters()
|
||||
results = nibr_filter(mols=mol_list, n_jobs=-1)
|
||||
```
|
||||
|
||||
**Lilly Demerits:**
|
||||
|
||||
```python
|
||||
import medchem as mc
|
||||
|
||||
# Calculate Lilly demerits
|
||||
lilly = mc.structural.LillyDemeritsFilters()
|
||||
results = lilly(mols=mol_list, n_jobs=-1)
|
||||
|
||||
# Each result includes demerit score and whether it passes (≤100 demerits)
|
||||
```
|
||||
|
||||
### 3. Functional API for High-Level Operations
|
||||
|
||||
The `medchem.functional` module provides convenient functions for common workflows.
|
||||
|
||||
**Quick Filtering:**
|
||||
|
||||
```python
|
||||
import medchem as mc
|
||||
|
||||
# Apply NIBR filters to a list
|
||||
filter_ok = mc.functional.nibr_filter(
|
||||
mols=mol_list,
|
||||
n_jobs=-1
|
||||
)
|
||||
|
||||
# Apply common alerts
|
||||
alert_results = mc.functional.common_alerts_filter(
|
||||
mols=mol_list,
|
||||
n_jobs=-1
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Chemical Groups Detection
|
||||
|
||||
Identify specific chemical groups and functional groups using `medchem.groups`.
|
||||
|
||||
**Available Groups:**
|
||||
- Hinge binders
|
||||
- Phosphate binders
|
||||
- Michael acceptors
|
||||
- Reactive groups
|
||||
- Custom SMARTS patterns
|
||||
|
||||
**Usage:**
|
||||
|
||||
```python
|
||||
import medchem as mc
|
||||
|
||||
# Create group detector
|
||||
group = mc.groups.ChemicalGroup(groups=["hinge_binders"])
|
||||
|
||||
# Check for matches
|
||||
has_matches = group.has_match(mol_list)
|
||||
|
||||
# Get detailed match information
|
||||
matches = group.get_matches(mol)
|
||||
```
|
||||
|
||||
### 5. Named Catalogs
|
||||
|
||||
Access curated collections of chemical structures through `medchem.catalogs`.
|
||||
|
||||
**Available Catalogs:**
|
||||
- Functional groups
|
||||
- Protecting groups
|
||||
- Common reagents
|
||||
- Standard fragments
|
||||
|
||||
**Usage:**
|
||||
|
||||
```python
|
||||
import medchem as mc
|
||||
|
||||
# Access named catalogs
|
||||
catalogs = mc.catalogs.NamedCatalogs
|
||||
|
||||
# Use catalog for matching
|
||||
catalog = catalogs.get("functional_groups")
|
||||
matches = catalog.get_matches(mol)
|
||||
```
|
||||
|
||||
### 6. Molecular Complexity
|
||||
|
||||
Use `medchem.complexity` to calculate molecular complexity metrics, which serve as an approximation of synthetic accessibility.
|
||||
|
||||
**Common Metrics:**
|
||||
- Bertz complexity
|
||||
- Whitlock complexity
|
||||
- Barone complexity
|
||||
|
||||
**Usage:**
|
||||
|
||||
```python
|
||||
import medchem as mc
|
||||
|
||||
# Calculate complexity
|
||||
complexity_score = mc.complexity.calculate_complexity(mol)
|
||||
|
||||
# Filter by complexity threshold
|
||||
complex_filter = mc.complexity.ComplexityFilter(max_complexity=500)
|
||||
results = complex_filter(mols=mol_list)
|
||||
```
|
||||
|
||||
### 7. Constraints Filtering
|
||||
|
||||
Apply custom property-based constraints using `medchem.constraints`.
|
||||
|
||||
**Example Constraints:**
|
||||
- Molecular weight ranges
|
||||
- LogP bounds
|
||||
- TPSA limits
|
||||
- Rotatable bond counts
|
||||
|
||||
**Usage:**
|
||||
|
||||
```python
|
||||
import medchem as mc
|
||||
|
||||
# Define constraints
|
||||
constraints = mc.constraints.Constraints(
|
||||
mw_range=(200, 500),
|
||||
logp_range=(-2, 5),
|
||||
tpsa_max=140,
|
||||
rotatable_bonds_max=10
|
||||
)
|
||||
|
||||
# Apply constraints
|
||||
results = constraints(mols=mol_list, n_jobs=-1)
|
||||
```
|
||||
|
||||
### 8. Medchem Query Language
|
||||
|
||||
Use the specialized query language provided by `medchem.query` to express complex filtering criteria as a single string.
|
||||
|
||||
**Query Examples:**
|
||||
```
|
||||
# Molecules passing Ro5 AND not having common alerts
|
||||
"rule_of_five AND NOT common_alerts"
|
||||
|
||||
# CNS-like molecules with low complexity
|
||||
"rule_of_cns AND complexity < 400"
|
||||
|
||||
# Leadlike molecules without Lilly demerits
|
||||
"rule_of_leadlike_soft AND lilly_demerits == 0"
|
||||
```
|
||||
|
||||
**Usage:**
|
||||
|
||||
```python
|
||||
import medchem as mc
|
||||
|
||||
# Parse and apply query
|
||||
query = mc.query.parse("rule_of_five AND NOT common_alerts")
|
||||
results = query.apply(mols=mol_list, n_jobs=-1)
|
||||
```
|
||||
|
||||
## Workflow Patterns
|
||||
|
||||
### Pattern 1: Initial Triage of Compound Library
|
||||
|
||||
Filter a large compound collection to identify drug-like candidates.
|
||||
|
||||
```python
|
||||
import datamol as dm
|
||||
import medchem as mc
|
||||
import pandas as pd
|
||||
|
||||
# Load compound library
|
||||
df = pd.read_csv("compounds.csv")
|
||||
mols = [dm.to_mol(smi) for smi in df["smiles"]]
|
||||
|
||||
# Apply primary filters
|
||||
rule_filter = mc.rules.RuleFilters(rule_list=["rule_of_five", "rule_of_veber"])
|
||||
rule_results = rule_filter(mols=mols, n_jobs=-1, progress=True)
|
||||
|
||||
# Apply structural alerts
|
||||
alert_filter = mc.structural.CommonAlertsFilters()
|
||||
alert_results = alert_filter(mols=mols, n_jobs=-1, progress=True)
|
||||
|
||||
# Combine results
|
||||
df["passes_rules"] = rule_results["pass"]
|
||||
df["has_alerts"] = alert_results["has_alerts"]
|
||||
df["drug_like"] = df["passes_rules"] & ~df["has_alerts"]
|
||||
|
||||
# Save filtered compounds
|
||||
filtered_df = df[df["drug_like"]]
|
||||
filtered_df.to_csv("filtered_compounds.csv", index=False)
|
||||
```
|
||||
|
||||
### Pattern 2: Lead Optimization Filtering
|
||||
|
||||
Apply stricter criteria during lead optimization.
|
||||
|
||||
```python
|
||||
import medchem as mc
|
||||
|
||||
# Create comprehensive filter
|
||||
filters = {
|
||||
"rules": mc.rules.RuleFilters(rule_list=["rule_of_leadlike_strict"]),
|
||||
"alerts": mc.structural.NIBRFilters(),
|
||||
"lilly": mc.structural.LillyDemeritsFilters(),
|
||||
"complexity": mc.complexity.ComplexityFilter(max_complexity=400)
|
||||
}
|
||||
|
||||
# Apply all filters
|
||||
results = {}
|
||||
for name, filt in filters.items():
|
||||
results[name] = filt(mols=candidate_mols, n_jobs=-1)
|
||||
|
||||
# Identify compounds passing all filters
|
||||
passes_all = all(r["pass"] for r in results.values())
|
||||
```
|
||||
|
||||
### Pattern 3: Identify Specific Chemical Groups
|
||||
|
||||
Find molecules containing specific functional groups or scaffolds.
|
||||
|
||||
```python
|
||||
import medchem as mc
|
||||
|
||||
# Create group detector for multiple groups
|
||||
group_detector = mc.groups.ChemicalGroup(
|
||||
groups=["hinge_binders", "phosphate_binders"]
|
||||
)
|
||||
|
||||
# Screen library
|
||||
matches = group_detector.get_all_matches(mol_list)
|
||||
|
||||
# Filter molecules with desired groups
|
||||
mol_with_groups = [mol for mol, match in zip(mol_list, matches) if match]
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Context Matters**: Don't blindly apply filters. Understand the biological target and chemical space.
|
||||
|
||||
2. **Combine Multiple Filters**: Use rules, structural alerts, and domain knowledge together for better decisions.
|
||||
|
||||
3. **Use Parallelization**: For large datasets (>1000 molecules), always use `n_jobs=-1` for parallel processing.
|
||||
|
||||
4. **Iterative Refinement**: Start with broad filters (Ro5), then apply more specific criteria (CNS, leadlike) as needed.
|
||||
|
||||
5. **Document Filtering Decisions**: Track which molecules were filtered out and why for reproducibility.
|
||||
|
||||
6. **Validate Results**: Remember that marketed drugs often fail standard filters—use these as guidelines, not absolute rules.
|
||||
|
||||
7. **Consider Prodrugs**: Molecules designed as prodrugs may intentionally violate standard medicinal chemistry rules.
|
||||
|
||||
## Resources
|
||||
|
||||
### references/api_guide.md
|
||||
Comprehensive API reference covering all medchem modules with detailed function signatures, parameters, and return types.
|
||||
|
||||
### references/rules_catalog.md
|
||||
Complete catalog of available rules, filters, and alerts with descriptions, thresholds, and literature references.
|
||||
|
||||
### scripts/filter_molecules.py
|
||||
Production-ready script for batch filtering workflows. Supports multiple input formats (CSV, SDF, SMILES), configurable filter combinations, and detailed reporting.
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
python scripts/filter_molecules.py input.csv --rules rule_of_five,rule_of_cns --alerts nibr --output filtered.csv
|
||||
```
|
||||
|
||||
## Documentation
|
||||
|
||||
- Official documentation: https://medchem-docs.datamol.io/
|
||||
- GitHub repository: https://github.com/datamol-io/medchem
|
||||
600
scientific-packages/medchem/references/api_guide.md
Normal file
600
scientific-packages/medchem/references/api_guide.md
Normal file
@@ -0,0 +1,600 @@
|
||||
# Medchem API Reference
|
||||
|
||||
Comprehensive reference for all medchem modules and functions.
|
||||
|
||||
## Module: medchem.rules
|
||||
|
||||
### Class: RuleFilters
|
||||
|
||||
Filter molecules based on multiple medicinal chemistry rules.
|
||||
|
||||
**Constructor:**
|
||||
```python
|
||||
RuleFilters(rule_list: List[str])
|
||||
```
|
||||
|
||||
**Parameters:**
|
||||
- `rule_list`: List of rule names to apply. See available rules below.
|
||||
|
||||
**Methods:**
|
||||
|
||||
```python
|
||||
__call__(mols: List[Chem.Mol], n_jobs: int = 1, progress: bool = False) -> Dict
|
||||
```
|
||||
- `mols`: List of RDKit molecule objects
|
||||
- `n_jobs`: Number of parallel jobs (-1 uses all cores)
|
||||
- `progress`: Show progress bar
|
||||
- **Returns**: Dictionary with results for each rule
|
||||
|
||||
**Example:**
|
||||
```python
|
||||
rfilter = mc.rules.RuleFilters(rule_list=["rule_of_five", "rule_of_cns"])
|
||||
results = rfilter(mols=mol_list, n_jobs=-1, progress=True)
|
||||
```
|
||||
|
||||
### Module: medchem.rules.basic_rules
|
||||
|
||||
Individual rule functions that can be applied to single molecules.
|
||||
|
||||
#### rule_of_five()
|
||||
|
||||
```python
|
||||
rule_of_five(mol: Union[str, Chem.Mol]) -> bool
|
||||
```
|
||||
|
||||
Lipinski's Rule of Five for oral bioavailability.
|
||||
|
||||
**Criteria:**
|
||||
- Molecular weight ≤ 500 Da
|
||||
- LogP ≤ 5
|
||||
- H-bond donors ≤ 5
|
||||
- H-bond acceptors ≤ 10
|
||||
|
||||
**Parameters:**
|
||||
- `mol`: SMILES string or RDKit molecule object
|
||||
|
||||
**Returns:** True if molecule passes all criteria
|
||||
|
||||
#### rule_of_three()
|
||||
|
||||
```python
|
||||
rule_of_three(mol: Union[str, Chem.Mol]) -> bool
|
||||
```
|
||||
|
||||
Rule of Three for fragment screening libraries.
|
||||
|
||||
**Criteria:**
|
||||
- Molecular weight ≤ 300 Da
|
||||
- LogP ≤ 3
|
||||
- H-bond donors ≤ 3
|
||||
- H-bond acceptors ≤ 3
|
||||
- Rotatable bonds ≤ 3
|
||||
- Polar surface area ≤ 60 Ų
|
||||
|
||||
#### rule_of_oprea()
|
||||
|
||||
```python
|
||||
rule_of_oprea(mol: Union[str, Chem.Mol]) -> bool
|
||||
```
|
||||
|
||||
Oprea's lead-like criteria for hit-to-lead optimization.
|
||||
|
||||
**Criteria:**
|
||||
- Molecular weight: 200-350 Da
|
||||
- LogP: -2 to 4
|
||||
- Rotatable bonds ≤ 7
|
||||
- Rings ≤ 4
|
||||
|
||||
#### rule_of_cns()
|
||||
|
||||
```python
|
||||
rule_of_cns(mol: Union[str, Chem.Mol]) -> bool
|
||||
```
|
||||
|
||||
CNS drug-likeness rules.
|
||||
|
||||
**Criteria:**
|
||||
- Molecular weight ≤ 450 Da
|
||||
- LogP: -1 to 5
|
||||
- H-bond donors ≤ 2
|
||||
- TPSA ≤ 90 Ų
|
||||
|
||||
#### rule_of_leadlike_soft()
|
||||
|
||||
```python
|
||||
rule_of_leadlike_soft(mol: Union[str, Chem.Mol]) -> bool
|
||||
```
|
||||
|
||||
Soft lead-like criteria (more permissive).
|
||||
|
||||
**Criteria:**
|
||||
- Molecular weight: 250-450 Da
|
||||
- LogP: -3 to 4
|
||||
- Rotatable bonds ≤ 10
|
||||
|
||||
#### rule_of_leadlike_strict()
|
||||
|
||||
```python
|
||||
rule_of_leadlike_strict(mol: Union[str, Chem.Mol]) -> bool
|
||||
```
|
||||
|
||||
Strict lead-like criteria (more restrictive).
|
||||
|
||||
**Criteria:**
|
||||
- Molecular weight: 200-350 Da
|
||||
- LogP: -2 to 3.5
|
||||
- Rotatable bonds ≤ 7
|
||||
- Rings: 1-3
|
||||
|
||||
#### rule_of_veber()
|
||||
|
||||
```python
|
||||
rule_of_veber(mol: Union[str, Chem.Mol]) -> bool
|
||||
```
|
||||
|
||||
Veber's rules for oral bioavailability.
|
||||
|
||||
**Criteria:**
|
||||
- Rotatable bonds ≤ 10
|
||||
- TPSA ≤ 140 Ų
|
||||
|
||||
#### rule_of_reos()
|
||||
|
||||
```python
|
||||
rule_of_reos(mol: Union[str, Chem.Mol]) -> bool
|
||||
```
|
||||
|
||||
Rapid Elimination Of Swill (REOS) filter.
|
||||
|
||||
**Criteria:**
|
||||
- Molecular weight: 200-500 Da
|
||||
- LogP: -5 to 5
|
||||
- H-bond donors: 0-5
|
||||
- H-bond acceptors: 0-10
|
||||
|
||||
#### rule_of_drug()
|
||||
|
||||
```python
|
||||
rule_of_drug(mol: Union[str, Chem.Mol]) -> bool
|
||||
```
|
||||
|
||||
Combined drug-likeness criteria.
|
||||
|
||||
**Criteria:**
|
||||
- Passes Rule of Five
|
||||
- Passes Veber rules
|
||||
- No PAINS substructures
|
||||
|
||||
#### golden_triangle()
|
||||
|
||||
```python
|
||||
golden_triangle(mol: Union[str, Chem.Mol]) -> bool
|
||||
```
|
||||
|
||||
Golden Triangle for drug-likeness balance.
|
||||
|
||||
**Criteria:**
|
||||
- 200 ≤ MW ≤ 50×LogP + 400
|
||||
- LogP: -2 to 5
|
||||
|
||||
#### pains_filter()
|
||||
|
||||
```python
|
||||
pains_filter(mol: Union[str, Chem.Mol]) -> bool
|
||||
```
|
||||
|
||||
Pan Assay INterference compoundS (PAINS) filter.
|
||||
|
||||
**Returns:** True if molecule does NOT contain PAINS substructures
|
||||
|
||||
---
|
||||
|
||||
## Module: medchem.structural
|
||||
|
||||
### Class: CommonAlertsFilters
|
||||
|
||||
Filter for common structural alerts derived from ChEMBL and literature.
|
||||
|
||||
**Constructor:**
|
||||
```python
|
||||
CommonAlertsFilters()
|
||||
```
|
||||
|
||||
**Methods:**
|
||||
|
||||
```python
|
||||
__call__(mols: List[Chem.Mol], n_jobs: int = 1, progress: bool = False) -> List[Dict]
|
||||
```
|
||||
|
||||
Apply common alerts filter to a list of molecules.
|
||||
|
||||
**Returns:** List of dictionaries with keys:
|
||||
- `has_alerts`: Boolean indicating if molecule has alerts
|
||||
- `alert_details`: List of matched alert patterns
|
||||
- `num_alerts`: Number of alerts found
|
||||
|
||||
```python
|
||||
check_mol(mol: Chem.Mol) -> Tuple[bool, List[str]]
|
||||
```
|
||||
|
||||
Check a single molecule for structural alerts.
|
||||
|
||||
**Returns:** Tuple of (has_alerts, list_of_alert_names)
|
||||
|
||||
### Class: NIBRFilters
|
||||
|
||||
Novartis NIBR medicinal chemistry filters.
|
||||
|
||||
**Constructor:**
|
||||
```python
|
||||
NIBRFilters()
|
||||
```
|
||||
|
||||
**Methods:**
|
||||
|
||||
```python
|
||||
__call__(mols: List[Chem.Mol], n_jobs: int = 1, progress: bool = False) -> List[bool]
|
||||
```
|
||||
|
||||
Apply NIBR filters to molecules.
|
||||
|
||||
**Returns:** List of booleans (True if molecule passes)
|
||||
|
||||
### Class: LillyDemeritsFilters
|
||||
|
||||
Eli Lilly's demerit-based structural alert system (275 rules).
|
||||
|
||||
**Constructor:**
|
||||
```python
|
||||
LillyDemeritsFilters()
|
||||
```
|
||||
|
||||
**Methods:**
|
||||
|
||||
```python
|
||||
__call__(mols: List[Chem.Mol], n_jobs: int = 1, progress: bool = False) -> List[Dict]
|
||||
```
|
||||
|
||||
Calculate Lilly demerits for molecules.
|
||||
|
||||
**Returns:** List of dictionaries with keys:
|
||||
- `demerits`: Total demerit score
|
||||
- `passes`: Boolean (True if demerits ≤ 100)
|
||||
- `matched_patterns`: List of matched patterns with scores
|
||||
|
||||
---
|
||||
|
||||
## Module: medchem.functional
|
||||
|
||||
High-level functional API for common operations.
|
||||
|
||||
### nibr_filter()
|
||||
|
||||
```python
|
||||
nibr_filter(mols: List[Chem.Mol], n_jobs: int = 1) -> List[bool]
|
||||
```
|
||||
|
||||
Apply NIBR filters using functional API.
|
||||
|
||||
**Parameters:**
|
||||
- `mols`: List of molecules
|
||||
- `n_jobs`: Parallelization level
|
||||
|
||||
**Returns:** List of pass/fail booleans
|
||||
|
||||
### common_alerts_filter()
|
||||
|
||||
```python
|
||||
common_alerts_filter(mols: List[Chem.Mol], n_jobs: int = 1) -> List[Dict]
|
||||
```
|
||||
|
||||
Apply common alerts filter using functional API.
|
||||
|
||||
**Returns:** List of results dictionaries
|
||||
|
||||
### lilly_demerits_filter()
|
||||
|
||||
```python
|
||||
lilly_demerits_filter(mols: List[Chem.Mol], n_jobs: int = 1) -> List[Dict]
|
||||
```
|
||||
|
||||
Calculate Lilly demerits using functional API.
|
||||
|
||||
---
|
||||
|
||||
## Module: medchem.groups
|
||||
|
||||
### Class: ChemicalGroup
|
||||
|
||||
Detect specific chemical groups in molecules.
|
||||
|
||||
**Constructor:**
|
||||
```python
|
||||
ChemicalGroup(groups: List[str], custom_smarts: Optional[Dict[str, str]] = None)
|
||||
```
|
||||
|
||||
**Parameters:**
|
||||
- `groups`: List of predefined group names
|
||||
- `custom_smarts`: Dictionary mapping custom group names to SMARTS patterns
|
||||
|
||||
**Predefined Groups:**
|
||||
- `"hinge_binders"`: Kinase hinge binding motifs
|
||||
- `"phosphate_binders"`: Phosphate binding groups
|
||||
- `"michael_acceptors"`: Michael acceptor electrophiles
|
||||
- `"reactive_groups"`: General reactive functionalities
|
||||
|
||||
**Methods:**
|
||||
|
||||
```python
|
||||
has_match(mols: List[Chem.Mol]) -> List[bool]
|
||||
```
|
||||
|
||||
Check if molecules contain any of the specified groups.
|
||||
|
||||
```python
|
||||
get_matches(mol: Chem.Mol) -> Dict[str, List[Tuple]]
|
||||
```
|
||||
|
||||
Get detailed match information for a single molecule.
|
||||
|
||||
**Returns:** Dictionary mapping each group name to a list of matches, where each match is a tuple of atom indices
|
||||
|
||||
```python
|
||||
get_all_matches(mols: List[Chem.Mol]) -> List[Dict]
|
||||
```
|
||||
|
||||
Get match information for all molecules.
|
||||
|
||||
**Example:**
|
||||
```python
|
||||
group = mc.groups.ChemicalGroup(groups=["hinge_binders", "phosphate_binders"])
|
||||
matches = group.get_all_matches(mol_list)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Module: medchem.catalogs
|
||||
|
||||
### Class: NamedCatalogs
|
||||
|
||||
Access to curated chemical catalogs.
|
||||
|
||||
**Available Catalogs:**
|
||||
- `"functional_groups"`: Common functional groups
|
||||
- `"protecting_groups"`: Protecting group structures
|
||||
- `"reagents"`: Common reagents
|
||||
- `"fragments"`: Standard fragments
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
catalog = mc.catalogs.NamedCatalogs.get("functional_groups")
|
||||
matches = catalog.get_matches(mol)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Module: medchem.complexity
|
||||
|
||||
Calculate molecular complexity metrics.
|
||||
|
||||
### calculate_complexity()
|
||||
|
||||
```python
|
||||
calculate_complexity(mol: Chem.Mol, method: str = "bertz") -> float
|
||||
```
|
||||
|
||||
Calculate complexity score for a molecule.
|
||||
|
||||
**Parameters:**
|
||||
- `mol`: RDKit molecule
|
||||
- `method`: Complexity metric ("bertz", "whitlock", "barone")
|
||||
|
||||
**Returns:** Complexity score (higher = more complex)
|
||||
|
||||
### Class: ComplexityFilter
|
||||
|
||||
Filter molecules by complexity threshold.
|
||||
|
||||
**Constructor:**
|
||||
```python
|
||||
ComplexityFilter(max_complexity: float, method: str = "bertz")
|
||||
```
|
||||
|
||||
**Methods:**
|
||||
|
||||
```python
|
||||
__call__(mols: List[Chem.Mol], n_jobs: int = 1) -> List[bool]
|
||||
```
|
||||
|
||||
Check each molecule against the `max_complexity` threshold and return one boolean per molecule.
|
||||
|
||||
---
|
||||
|
||||
## Module: medchem.constraints
|
||||
|
||||
### Class: Constraints
|
||||
|
||||
Apply custom property-based constraints.
|
||||
|
||||
**Constructor:**
|
||||
```python
|
||||
Constraints(
|
||||
mw_range: Optional[Tuple[float, float]] = None,
|
||||
logp_range: Optional[Tuple[float, float]] = None,
|
||||
tpsa_max: Optional[float] = None,
|
||||
tpsa_range: Optional[Tuple[float, float]] = None,
|
||||
hbd_max: Optional[int] = None,
|
||||
hba_max: Optional[int] = None,
|
||||
rotatable_bonds_max: Optional[int] = None,
|
||||
rings_range: Optional[Tuple[int, int]] = None,
|
||||
aromatic_rings_max: Optional[int] = None,
|
||||
)
|
||||
```
|
||||
|
||||
**Parameters:** All parameters are optional. Specify only the constraints needed.
|
||||
|
||||
**Methods:**
|
||||
|
||||
```python
|
||||
__call__(mols: List[Chem.Mol], n_jobs: int = 1) -> List[Dict]
|
||||
```
|
||||
|
||||
Apply constraints to molecules.
|
||||
|
||||
**Returns:** List of dictionaries with keys:
|
||||
- `passes`: Boolean indicating if all constraints pass
|
||||
- `violations`: List of constraint names that failed
|
||||
|
||||
**Example:**
|
||||
```python
|
||||
constraints = mc.constraints.Constraints(
|
||||
mw_range=(200, 500),
|
||||
logp_range=(-2, 5),
|
||||
tpsa_max=140
|
||||
)
|
||||
results = constraints(mols=mol_list, n_jobs=-1)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Module: medchem.query
|
||||
|
||||
Query language for complex filtering.
|
||||
|
||||
### parse()
|
||||
|
||||
```python
|
||||
parse(query: str) -> Query
|
||||
```
|
||||
|
||||
Parse a medchem query string into a Query object.
|
||||
|
||||
**Query Syntax:**
|
||||
- Operators: `AND`, `OR`, `NOT`
|
||||
- Comparisons: `<`, `>`, `<=`, `>=`, `==`, `!=`
|
||||
- Properties: `complexity`, `lilly_demerits`, `mw`, `logp`, `tpsa`
|
||||
- Rules: `rule_of_five`, `rule_of_cns`, etc.
|
||||
- Filters: `common_alerts`, `nibr_filter`, `pains_filter`
|
||||
|
||||
**Example Queries:**
|
||||
```python
|
||||
"rule_of_five AND NOT common_alerts"
|
||||
"rule_of_cns AND complexity < 400"
|
||||
"mw > 200 AND mw < 500 AND logp < 5"
|
||||
"(rule_of_five OR rule_of_oprea) AND NOT pains_filter"
|
||||
```
|
||||
|
||||
### Class: Query
|
||||
|
||||
**Methods:**
|
||||
|
||||
```python
|
||||
apply(mols: List[Chem.Mol], n_jobs: int = 1) -> List[bool]
|
||||
```
|
||||
|
||||
Apply parsed query to molecules.
|
||||
|
||||
**Example:**
|
||||
```python
|
||||
query = mc.query.parse("rule_of_five AND NOT common_alerts")
|
||||
results = query.apply(mols=mol_list, n_jobs=-1)
|
||||
passing_mols = [mol for mol, passes in zip(mol_list, results) if passes]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Module: medchem.utils
|
||||
|
||||
Utility functions for working with molecules.
|
||||
|
||||
### batch_process()
|
||||
|
||||
```python
|
||||
batch_process(
|
||||
mols: List[Chem.Mol],
|
||||
func: Callable,
|
||||
n_jobs: int = 1,
|
||||
progress: bool = False,
|
||||
batch_size: Optional[int] = None
|
||||
) -> List
|
||||
```
|
||||
|
||||
Process molecules in parallel batches.
|
||||
|
||||
**Parameters:**
|
||||
- `mols`: List of molecules
|
||||
- `func`: Function to apply to each molecule
|
||||
- `n_jobs`: Number of parallel workers
|
||||
- `progress`: Show progress bar
|
||||
- `batch_size`: Size of processing batches
|
||||
|
||||
### standardize_mol()
|
||||
|
||||
```python
|
||||
standardize_mol(mol: Chem.Mol) -> Chem.Mol
|
||||
```
|
||||
|
||||
Standardize molecule representation (sanitize, neutralize charges, etc.).
|
||||
|
||||
---
|
||||
|
||||
## Common Patterns
|
||||
|
||||
### Pattern: Parallel Processing
|
||||
|
||||
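All filter objects accept `n_jobs` for parallel execution. The `progress` argument shown below is accepted by the rule and structural-alert filters; the `ComplexityFilter` and `Constraints` call signatures documented above list only `n_jobs`: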
|
||||
|
||||
```python
|
||||
# Use all CPU cores
|
||||
results = filter_object(mols=mol_list, n_jobs=-1, progress=True)
|
||||
|
||||
# Use specific number of cores
|
||||
results = filter_object(mols=mol_list, n_jobs=4, progress=True)
|
||||
```
|
||||
|
||||
### Pattern: Combining Multiple Filters
|
||||
|
||||
```python
|
||||
import medchem as mc
|
||||
|
||||
# Apply multiple filters
|
||||
rule_filter = mc.rules.RuleFilters(rule_list=["rule_of_five"])
|
||||
alert_filter = mc.structural.CommonAlertsFilters()
|
||||
lilly_filter = mc.structural.LillyDemeritsFilters()
|
||||
|
||||
# Get results
|
||||
rule_results = rule_filter(mols=mol_list, n_jobs=-1)
|
||||
alert_results = alert_filter(mols=mol_list, n_jobs=-1)
|
||||
lilly_results = lilly_filter(mols=mol_list, n_jobs=-1)
|
||||
|
||||
# Combine criteria
|
||||
passing_mols = [
|
||||
mol for i, mol in enumerate(mol_list)
|
||||
if rule_results[i]["passes"]
|
||||
and not alert_results[i]["has_alerts"]
|
||||
and lilly_results[i]["passes"]
|
||||
]
|
||||
```
|
||||
|
||||
### Pattern: Working with DataFrames
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
import datamol as dm
|
||||
import medchem as mc
|
||||
|
||||
# Load data
|
||||
df = pd.read_csv("molecules.csv")
|
||||
df["mol"] = df["smiles"].apply(dm.to_mol)
|
||||
|
||||
# Apply filters
|
||||
rfilter = mc.rules.RuleFilters(rule_list=["rule_of_five", "rule_of_cns"])
|
||||
results = rfilter(mols=df["mol"].tolist(), n_jobs=-1)
|
||||
|
||||
# Add results to dataframe
|
||||
df["passes_ro5"] = [r["rule_of_five"] for r in results]
|
||||
df["passes_cns"] = [r["rule_of_cns"] for r in results]
|
||||
|
||||
# Filter dataframe
|
||||
filtered_df = df[df["passes_ro5"] & df["passes_cns"]]
|
||||
```
|
||||
604
scientific-packages/medchem/references/rules_catalog.md
Normal file
604
scientific-packages/medchem/references/rules_catalog.md
Normal file
@@ -0,0 +1,604 @@
|
||||
# Medchem Rules and Filters Catalog
|
||||
|
||||
Comprehensive catalog of all available medicinal chemistry rules, structural alerts, and filters in medchem.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Drug-Likeness Rules](#drug-likeness-rules)
|
||||
2. [Lead-Likeness Rules](#lead-likeness-rules)
|
||||
3. [Fragment Rules](#fragment-rules)
|
||||
4. [CNS Rules](#cns-rules)
|
||||
5. [Structural Alert Filters](#structural-alert-filters)
|
||||
6. [Chemical Group Patterns](#chemical-group-patterns)
|
||||
|
||||
---
|
||||
|
||||
## Drug-Likeness Rules
|
||||
|
||||
### Rule of Five (Lipinski)
|
||||
|
||||
**Reference:** Lipinski et al., Adv Drug Deliv Rev (1997) 23:3-25
|
||||
|
||||
**Purpose:** Predict oral bioavailability
|
||||
|
||||
**Criteria:**
|
||||
- Molecular Weight ≤ 500 Da
|
||||
- LogP ≤ 5
|
||||
- Hydrogen Bond Donors ≤ 5
|
||||
- Hydrogen Bond Acceptors ≤ 10
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
mc.rules.basic_rules.rule_of_five(mol)
|
||||
```
|
||||
|
||||
**Notes:**
|
||||
- One of the most widely used filters in drug discovery
|
||||
- About 90% of orally active drugs comply with these rules
|
||||
- Exceptions exist, especially for natural products and antibiotics
|
||||
|
||||
---
|
||||
|
||||
### Rule of Veber
|
||||
|
||||
**Reference:** Veber et al., J Med Chem (2002) 45:2615-2623
|
||||
|
||||
**Purpose:** Additional criteria for oral bioavailability
|
||||
|
||||
**Criteria:**
|
||||
- Rotatable Bonds ≤ 10
|
||||
- Topological Polar Surface Area (TPSA) ≤ 140 Ų
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
mc.rules.basic_rules.rule_of_veber(mol)
|
||||
```
|
||||
|
||||
**Notes:**
|
||||
- Complements Rule of Five
|
||||
- TPSA correlates with cell permeability
|
||||
- Rotatable bonds affect molecular flexibility
|
||||
|
||||
---
|
||||
|
||||
### Rule of Drug
|
||||
|
||||
**Purpose:** Combined drug-likeness assessment
|
||||
|
||||
**Criteria:**
|
||||
- Passes Rule of Five
|
||||
- Passes Veber rules
|
||||
- Does not contain PAINS substructures
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
mc.rules.basic_rules.rule_of_drug(mol)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### REOS (Rapid Elimination Of Swill)
|
||||
|
||||
**Reference:** Walters & Murcko, Adv Drug Deliv Rev (2002) 54:255-271
|
||||
|
||||
**Purpose:** Filter out compounds unlikely to be drugs
|
||||
|
||||
**Criteria:**
|
||||
- Molecular Weight: 200-500 Da
|
||||
- LogP: -5 to 5
|
||||
- Hydrogen Bond Donors: 0-5
|
||||
- Hydrogen Bond Acceptors: 0-10
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
mc.rules.basic_rules.rule_of_reos(mol)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Golden Triangle
|
||||
|
||||
**Reference:** Johnson et al., J Med Chem (2009) 52:5487-5500
|
||||
|
||||
**Purpose:** Balance lipophilicity and molecular weight
|
||||
|
||||
**Criteria:**
|
||||
- 200 ≤ MW ≤ 50 × LogP + 400
|
||||
- LogP: -2 to 5
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
mc.rules.basic_rules.golden_triangle(mol)
|
||||
```
|
||||
|
||||
**Notes:**
|
||||
- Defines optimal physicochemical space
|
||||
- Visual representation resembles a triangle on MW vs LogP plot
|
||||
|
||||
---
|
||||
|
||||
## Lead-Likeness Rules
|
||||
|
||||
### Rule of Oprea
|
||||
|
||||
**Reference:** Oprea et al., J Chem Inf Comput Sci (2001) 41:1308-1315
|
||||
|
||||
**Purpose:** Identify lead-like compounds for optimization
|
||||
|
||||
**Criteria:**
|
||||
- Molecular Weight: 200-350 Da
|
||||
- LogP: -2 to 4
|
||||
- Rotatable Bonds ≤ 7
|
||||
- Number of Rings ≤ 4
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
mc.rules.basic_rules.rule_of_oprea(mol)
|
||||
```
|
||||
|
||||
**Rationale:** Lead compounds should have "room to grow" during optimization
|
||||
|
||||
---
|
||||
|
||||
### Rule of Leadlike (Soft)
|
||||
|
||||
**Purpose:** Permissive lead-like criteria
|
||||
|
||||
**Criteria:**
|
||||
- Molecular Weight: 250-450 Da
|
||||
- LogP: -3 to 4
|
||||
- Rotatable Bonds ≤ 10
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
mc.rules.basic_rules.rule_of_leadlike_soft(mol)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Rule of Leadlike (Strict)
|
||||
|
||||
**Purpose:** Restrictive lead-like criteria
|
||||
|
||||
**Criteria:**
|
||||
- Molecular Weight: 200-350 Da
|
||||
- LogP: -2 to 3.5
|
||||
- Rotatable Bonds ≤ 7
|
||||
- Number of Rings: 1-3
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
mc.rules.basic_rules.rule_of_leadlike_strict(mol)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Fragment Rules
|
||||
|
||||
### Rule of Three
|
||||
|
||||
**Reference:** Congreve et al., Drug Discov Today (2003) 8:876-877
|
||||
|
||||
**Purpose:** Screen fragment libraries for fragment-based drug discovery
|
||||
|
||||
**Criteria:**
|
||||
- Molecular Weight ≤ 300 Da
|
||||
- LogP ≤ 3
|
||||
- Hydrogen Bond Donors ≤ 3
|
||||
- Hydrogen Bond Acceptors ≤ 3
|
||||
- Rotatable Bonds ≤ 3
|
||||
- Polar Surface Area ≤ 60 Ų
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
mc.rules.basic_rules.rule_of_three(mol)
|
||||
```
|
||||
|
||||
**Notes:**
|
||||
- Fragments are grown into leads during optimization
|
||||
- Lower complexity allows more starting points
|
||||
|
||||
---
|
||||
|
||||
## CNS Rules
|
||||
|
||||
### Rule of CNS
|
||||
|
||||
**Purpose:** Central nervous system drug-likeness
|
||||
|
||||
**Criteria:**
|
||||
- Molecular Weight ≤ 450 Da
|
||||
- LogP: -1 to 5
|
||||
- Hydrogen Bond Donors ≤ 2
|
||||
- TPSA ≤ 90 Ų
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
mc.rules.basic_rules.rule_of_cns(mol)
|
||||
```
|
||||
|
||||
**Rationale:**
|
||||
- Blood-brain barrier penetration requires specific properties
|
||||
- Lower TPSA and HBD count improve BBB permeability
|
||||
- Tight constraints reflect CNS challenges
|
||||
|
||||
---
|
||||
|
||||
## Structural Alert Filters
|
||||
|
||||
### PAINS (Pan Assay INterference compoundS)
|
||||
|
||||
**Reference:** Baell & Holloway, J Med Chem (2010) 53:2719-2740
|
||||
|
||||
**Purpose:** Identify compounds that interfere with assays
|
||||
|
||||
**Categories:**
|
||||
- Catechols
|
||||
- Quinones
|
||||
- Rhodanines
|
||||
- Hydroxyphenylhydrazones
|
||||
- Alkyl/aryl aldehydes
|
||||
- Michael acceptors (specific patterns)
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
mc.rules.basic_rules.pains_filter(mol)
|
||||
# Returns True if NO PAINS found
|
||||
```
|
||||
|
||||
**Notes:**
|
||||
- PAINS compounds show activity in multiple assays through non-specific mechanisms
|
||||
- Common false positives in screening campaigns
|
||||
- Should be deprioritized in lead selection
|
||||
|
||||
---
|
||||
|
||||
### Common Alerts Filters
|
||||
|
||||
**Source:** Derived from ChEMBL curation and medicinal chemistry literature
|
||||
|
||||
**Purpose:** Flag common problematic structural patterns
|
||||
|
||||
**Alert Categories:**
|
||||
1. **Reactive Groups**
|
||||
- Epoxides
|
||||
- Aziridines
|
||||
- Acid halides
|
||||
- Isocyanates
|
||||
|
||||
2. **Metabolic Liabilities**
|
||||
- Hydrazines
|
||||
- Thioureas
|
||||
- Anilines (certain patterns)
|
||||
|
||||
3. **Aggregators**
|
||||
- Polyaromatic systems
|
||||
- Long aliphatic chains
|
||||
|
||||
4. **Toxicophores**
|
||||
- Nitro aromatics
|
||||
- Aromatic N-oxides
|
||||
- Certain heterocycles
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
alert_filter = mc.structural.CommonAlertsFilters()
|
||||
has_alerts, details = alert_filter.check_mol(mol)
|
||||
```
|
||||
|
||||
**Return Format:**
|
||||
```python
|
||||
{
|
||||
"has_alerts": True,
|
||||
"alert_details": ["reactive_epoxide", "metabolic_hydrazine"],
|
||||
"num_alerts": 2
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### NIBR Filters
|
||||
|
||||
**Source:** Novartis Institutes for BioMedical Research
|
||||
|
||||
**Purpose:** Industrial medicinal chemistry filtering rules
|
||||
|
||||
**Features:**
|
||||
- Proprietary filter set developed from Novartis experience
|
||||
- Balances drug-likeness with practical medicinal chemistry
|
||||
- Includes both structural alerts and property filters
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
nibr_filter = mc.structural.NIBRFilters()
|
||||
results = nibr_filter(mols=mol_list, n_jobs=-1)
|
||||
```
|
||||
|
||||
**Return Format:** Boolean list (True = passes)
|
||||
|
||||
---
|
||||
|
||||
### Lilly Demerits Filter
|
||||
|
||||
**Reference:** Based on Eli Lilly medicinal chemistry rules
|
||||
|
||||
**Source:** 275 structural patterns accumulated over 18 years
|
||||
|
||||
**Purpose:** Identify assay interference and problematic functionalities
|
||||
|
||||
**Mechanism:**
|
||||
- Each matched pattern adds demerits
|
||||
- Molecules with >100 demerits are rejected
|
||||
- Some patterns add 10-50 demerits, others add 100+ (instant rejection)
|
||||
|
||||
**Demerit Categories:**
|
||||
|
||||
1. **High Demerits (>50):**
|
||||
- Known toxic groups
|
||||
- Highly reactive functionalities
|
||||
- Strong metal chelators
|
||||
|
||||
2. **Medium Demerits (20-50):**
|
||||
- Metabolic liabilities
|
||||
- Aggregation-prone structures
|
||||
- Frequent hitters
|
||||
|
||||
3. **Low Demerits (5-20):**
|
||||
- Minor concerns
|
||||
- Context-dependent issues
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
lilly_filter = mc.structural.LillyDemeritsFilters()
|
||||
results = lilly_filter(mols=mol_list, n_jobs=-1)
|
||||
```
|
||||
|
||||
**Return Format:**
|
||||
```python
|
||||
{
|
||||
"demerits": 35,
|
||||
"passes": True, # (demerits ≤ 100)
|
||||
"matched_patterns": [
|
||||
{"pattern": "phenolic_ester", "demerits": 20},
|
||||
{"pattern": "aniline_derivative", "demerits": 15}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Chemical Group Patterns
|
||||
|
||||
### Hinge Binders
|
||||
|
||||
**Purpose:** Identify kinase hinge-binding motifs
|
||||
|
||||
**Common Patterns:**
|
||||
- Aminopyridines
|
||||
- Aminopyrimidines
|
||||
- Indazoles
|
||||
- Benzimidazoles
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
group = mc.groups.ChemicalGroup(groups=["hinge_binders"])
|
||||
has_hinge = group.has_match(mol_list)
|
||||
```
|
||||
|
||||
**Application:** Kinase inhibitor design
|
||||
|
||||
---
|
||||
|
||||
### Phosphate Binders
|
||||
|
||||
**Purpose:** Identify phosphate-binding groups
|
||||
|
||||
**Common Patterns:**
|
||||
- Basic amines in specific geometries
|
||||
- Guanidinium groups
|
||||
- Arginine mimetics
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
group = mc.groups.ChemicalGroup(groups=["phosphate_binders"])
|
||||
```
|
||||
|
||||
**Application:** Kinase inhibitors, phosphatase inhibitors
|
||||
|
||||
---
|
||||
|
||||
### Michael Acceptors
|
||||
|
||||
**Purpose:** Identify electrophilic Michael acceptor groups
|
||||
|
||||
**Common Patterns:**
|
||||
- α,β-Unsaturated carbonyls
|
||||
- α,β-Unsaturated nitriles
|
||||
- Vinyl sulfones
|
||||
- Acrylamides
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
group = mc.groups.ChemicalGroup(groups=["michael_acceptors"])
|
||||
```
|
||||
|
||||
**Notes:**
|
||||
- Can be desirable for covalent inhibitors
|
||||
- Often flagged as reactive alerts in screening
|
||||
|
||||
---
|
||||
|
||||
### Reactive Groups
|
||||
|
||||
**Purpose:** Identify generally reactive functionalities
|
||||
|
||||
**Common Patterns:**
|
||||
- Epoxides
|
||||
- Aziridines
|
||||
- Acyl halides
|
||||
- Isocyanates
|
||||
- Sulfonyl chlorides
|
||||
|
||||
**Usage:**
|
||||
```python
|
||||
group = mc.groups.ChemicalGroup(groups=["reactive_groups"])
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Custom SMARTS Patterns
|
||||
|
||||
Define custom structural patterns using SMARTS:
|
||||
|
||||
```python
|
||||
custom_patterns = {
|
||||
"my_warhead": "[C;H0](=O)C(F)(F)F", # Trifluoromethyl ketone
|
||||
"my_scaffold": "c1ccc2c(c1)ncc(n2)N", # 2-Aminoquinoxaline (benzene fused to an amino-substituted pyrazine ring)
|
||||
}
|
||||
|
||||
group = mc.groups.ChemicalGroup(
|
||||
groups=["hinge_binders"],
|
||||
custom_smarts=custom_patterns
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Filter Selection Guidelines
|
||||
|
||||
### Initial Screening (High-Throughput)
|
||||
|
||||
Recommended filters:
|
||||
- Rule of Five
|
||||
- PAINS filter
|
||||
- Common Alerts (permissive settings)
|
||||
|
||||
```python
|
||||
rfilter = mc.rules.RuleFilters(rule_list=["rule_of_five", "pains_filter"])
|
||||
alert_filter = mc.structural.CommonAlertsFilters()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Hit-to-Lead
|
||||
|
||||
Recommended filters:
|
||||
- Rule of Oprea or Leadlike (soft)
|
||||
- NIBR filters
|
||||
- Lilly Demerits
|
||||
|
||||
```python
|
||||
rfilter = mc.rules.RuleFilters(rule_list=["rule_of_oprea"])
|
||||
nibr_filter = mc.structural.NIBRFilters()
|
||||
lilly_filter = mc.structural.LillyDemeritsFilters()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Lead Optimization
|
||||
|
||||
Recommended filters:
|
||||
- Rule of Drug
|
||||
- Leadlike (strict)
|
||||
- Full structural alert analysis
|
||||
- Complexity filters
|
||||
|
||||
```python
|
||||
rfilter = mc.rules.RuleFilters(rule_list=["rule_of_drug", "rule_of_leadlike_strict"])
|
||||
alert_filter = mc.structural.CommonAlertsFilters()
|
||||
complexity_filter = mc.complexity.ComplexityFilter(max_complexity=400)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### CNS Targets
|
||||
|
||||
Recommended filters:
|
||||
- Rule of CNS
|
||||
- Reduced PAINS criteria (CNS-focused)
|
||||
- BBB permeability constraints
|
||||
|
||||
```python
|
||||
rfilter = mc.rules.RuleFilters(rule_list=["rule_of_cns"])
|
||||
constraints = mc.constraints.Constraints(
|
||||
tpsa_max=90,
|
||||
hbd_max=2,
|
||||
mw_range=(300, 450)
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Fragment-Based Drug Discovery
|
||||
|
||||
Recommended filters:
|
||||
- Rule of Three
|
||||
- Minimal complexity
|
||||
- Basic reactive group check
|
||||
|
||||
```python
|
||||
rfilter = mc.rules.RuleFilters(rule_list=["rule_of_three"])
|
||||
complexity_filter = mc.complexity.ComplexityFilter(max_complexity=250)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Important Considerations
|
||||
|
||||
### False Positives and False Negatives
|
||||
|
||||
**Filters are guidelines, not absolutes:**
|
||||
|
||||
1. **False Positives** (good drugs flagged):
|
||||
- ~10% of marketed drugs fail Rule of Five
|
||||
- Natural products often violate standard rules
|
||||
- Prodrugs intentionally break rules
|
||||
- Antibiotics and antivirals frequently non-compliant
|
||||
|
||||
2. **False Negatives** (bad compounds passing):
|
||||
- Passing filters doesn't guarantee success
|
||||
- Target-specific issues not captured
|
||||
- In vivo properties not fully predicted
|
||||
|
||||
### Context-Specific Application
|
||||
|
||||
**Different contexts require different criteria:**
|
||||
|
||||
- **Target Class:** Kinases vs GPCRs vs ion channels have different optimal spaces
|
||||
- **Modality:** Small molecules vs PROTACs vs molecular glues
|
||||
- **Administration Route:** Oral vs IV vs topical
|
||||
- **Disease Area:** CNS vs oncology vs infectious disease
|
||||
- **Stage:** Screening vs hit-to-lead vs lead optimization
|
||||
|
||||
### Complementing with Machine Learning
|
||||
|
||||
Modern approaches combine rules with ML:
|
||||
|
||||
```python
|
||||
# Rule-based pre-filtering
|
||||
rule_results = mc.rules.RuleFilters(rule_list=["rule_of_five"])(mols)
|
||||
filtered_mols = [mol for mol, r in zip(mols, rule_results) if r["passes"]]
|
||||
|
||||
# ML model scoring on filtered set
|
||||
ml_scores = ml_model.predict(filtered_mols)
|
||||
|
||||
# Combined decision
|
||||
final_candidates = [
|
||||
mol for mol, score in zip(filtered_mols, ml_scores)
|
||||
if score > threshold
|
||||
]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
1. Lipinski CA et al. Adv Drug Deliv Rev (1997) 23:3-25
|
||||
2. Veber DF et al. J Med Chem (2002) 45:2615-2623
|
||||
3. Oprea TI et al. J Chem Inf Comput Sci (2001) 41:1308-1315
|
||||
4. Congreve M et al. Drug Discov Today (2003) 8:876-877
|
||||
5. Baell JB & Holloway GA. J Med Chem (2010) 53:2719-2740
|
||||
6. Johnson TW et al. J Med Chem (2009) 52:5487-5500
|
||||
7. Walters WP & Murcko MA. Adv Drug Deliv Rev (2002) 54:255-271
|
||||
8. Hann MM & Oprea TI. Curr Opin Chem Biol (2004) 8:255-263
|
||||
9. Rishton GM. Drug Discov Today (1997) 2:382-384
|
||||
418
scientific-packages/medchem/scripts/filter_molecules.py
Normal file
418
scientific-packages/medchem/scripts/filter_molecules.py
Normal file
@@ -0,0 +1,418 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch molecular filtering using medchem library.
|
||||
|
||||
This script provides a production-ready workflow for filtering compound libraries
|
||||
using medchem rules, structural alerts, and custom constraints.
|
||||
|
||||
Usage:
|
||||
python filter_molecules.py input.csv --rules rule_of_five,rule_of_cns --nibr --output filtered.csv
|
||||
python filter_molecules.py input.sdf --rules rule_of_drug --lilly --complexity 400 --output results.csv
|
||||
python filter_molecules.py smiles.txt --nibr --pains --n-jobs -1 --output clean.csv
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
import json
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
import datamol as dm
|
||||
import medchem as mc
|
||||
from rdkit import Chem
|
||||
from tqdm import tqdm
|
||||
except ImportError as e:
|
||||
print(f"Error: Missing required package: {e}")
|
||||
print("Install dependencies: pip install medchem datamol pandas tqdm")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def load_molecules(input_file: Path, smiles_column: str = "smiles") -> Tuple[pd.DataFrame, List[Chem.Mol]]:
    """
    Read a compound file and return its metadata table alongside the parsed molecules.

    The reader is chosen from the (lower-cased) file extension:
    - ``.sdf``: records are read with ``Chem.SDMolSupplier``; records that come
      back as ``None`` are skipped and a ``smiles`` column is generated from
      each remaining molecule.
    - ``.csv`` / ``.tsv``: the table is read with pandas and SMILES are taken
      from ``smiles_column``.
    - ``.txt``: one SMILES per non-blank line.

    A missing SMILES column or an unsupported extension prints an error and
    terminates the process with exit status 1.

    Args:
        input_file: Path of the file to load.
        smiles_column: Name of the SMILES column (used for CSV/TSV input only).

    Returns:
        Tuple of (DataFrame with metadata, list of RDKit molecules); rows whose
        molecule is ``None`` are dropped from both.
    """
    extension = input_file.suffix.lower()

    if extension == ".sdf":
        print(f"Loading SDF file: {input_file}")
        mols = [entry for entry in Chem.SDMolSupplier(str(input_file)) if entry is not None]

        # One record per molecule: its SDF properties plus a generated SMILES.
        records = []
        for entry in mols:
            record = entry.GetPropsAsDict()
            record["smiles"] = Chem.MolToSmiles(entry)
            records.append(record)
        df = pd.DataFrame(records)

    elif extension in (".csv", ".tsv"):
        print(f"Loading CSV/TSV file: {input_file}")
        delimiter = "," if extension != ".tsv" else "\t"
        df = pd.read_csv(input_file, sep=delimiter)

        if smiles_column not in df.columns:
            print(f"Error: Column '{smiles_column}' not found in file")
            print(f"Available columns: {', '.join(df.columns)}")
            sys.exit(1)

        print("Converting SMILES to molecules...")
        mols = [dm.to_mol(smi) for smi in tqdm(df[smiles_column], desc="Parsing")]

    elif extension == ".txt":
        print(f"Loading text file: {input_file}")
        with open(input_file) as handle:
            smiles_list = [raw.strip() for raw in handle if raw.strip()]

        df = pd.DataFrame({"smiles": smiles_list})
        print("Converting SMILES to molecules...")
        mols = [dm.to_mol(smi) for smi in tqdm(smiles_list, desc="Parsing")]

    else:
        print(f"Error: Unsupported file format: {extension}")
        print("Supported formats: .csv, .tsv, .sdf, .txt")
        sys.exit(1)

    # Keep the table and the molecule list aligned after dropping parse failures.
    keep = [pos for pos, candidate in enumerate(mols) if candidate is not None]
    n_invalid = len(mols) - len(keep)
    if n_invalid > 0:
        print(f"Warning: {n_invalid} invalid molecules removed")
        df = df.iloc[keep].reset_index(drop=True)
        mols = [mols[pos] for pos in keep]

    print(f"Loaded {len(mols)} valid molecules")
    return df, mols
|
||||
|
||||
|
||||
def apply_rule_filters(mols: List[Chem.Mol], rules: List[str], n_jobs: int) -> pd.DataFrame:
    """
    Run the named medchem rules over ``mols`` and tabulate the outcome.

    Args:
        mols: Molecules to evaluate.
        rules: Rule names passed to ``mc.rules.RuleFilters`` as ``rule_list``.
        n_jobs: Worker count forwarded to the filter call.

    Returns:
        DataFrame built from the filter output, plus a ``passes_all_rules``
        column that is True where every other column in the row is truthy.
    """
    print(f"\nApplying rule filters: {', '.join(rules)}")

    rule_filter = mc.rules.RuleFilters(rule_list=rules)
    table = pd.DataFrame(rule_filter(mols=mols, n_jobs=n_jobs, progress=True))

    # Row-wise AND across whatever columns the filter produced.
    all_pass = table.all(axis=1)
    table["passes_all_rules"] = all_pass
    return table
|
||||
|
||||
|
||||
def apply_structural_alerts(mols: List[Chem.Mol], alert_type: str, n_jobs: int) -> pd.DataFrame:
    """
    Run one structural-alert filter over ``mols`` and return its result columns.

    Args:
        mols: Molecules to evaluate.
        alert_type: One of ``"common"``, ``"nibr"``, ``"lilly"`` or ``"pains"``.
        n_jobs: Worker count forwarded to the filter call (the PAINS branch
            loops over molecules directly and does not use it).

    Returns:
        DataFrame whose columns depend on ``alert_type``:
        ``has_common_alerts`` / ``num_common_alerts`` / ``common_alert_details``,
        ``passes_nibr``, ``lilly_demerits`` / ``passes_lilly`` / ``lilly_patterns``,
        or ``passes_pains``.

    Raises:
        ValueError: If ``alert_type`` is not one of the supported names.
    """
    print(f"\nApplying {alert_type} structural alerts...")

    if alert_type == "common":
        common = mc.structural.CommonAlertsFilters()
        hits = common(mols=mols, n_jobs=n_jobs, progress=True)

        # Each result is indexed by "has_alerts", "num_alerts" and "alert_details".
        flags = [hit["has_alerts"] for hit in hits]
        counts = [hit["num_alerts"] for hit in hits]
        details = [", ".join(hit["alert_details"]) if hit["alert_details"] else "" for hit in hits]
        return pd.DataFrame({
            "has_common_alerts": flags,
            "num_common_alerts": counts,
            "common_alert_details": details,
        })

    if alert_type == "nibr":
        nibr = mc.structural.NIBRFilters()
        verdicts = nibr(mols=mols, n_jobs=n_jobs, progress=True)
        return pd.DataFrame({"passes_nibr": verdicts})

    if alert_type == "lilly":
        lilly = mc.structural.LillyDemeritsFilters()
        scored = lilly(mols=mols, n_jobs=n_jobs, progress=True)

        # Each result is indexed by "demerits", "passes" and "matched_patterns".
        demerits = [item["demerits"] for item in scored]
        verdicts = [item["passes"] for item in scored]
        patterns = [", ".join([p["pattern"] for p in item["matched_patterns"]]) for item in scored]
        return pd.DataFrame({
            "lilly_demerits": demerits,
            "passes_lilly": verdicts,
            "lilly_patterns": patterns,
        })

    if alert_type == "pains":
        verdicts = []
        for mol in tqdm(mols, desc="PAINS"):
            verdicts.append(mc.rules.basic_rules.pains_filter(mol))
        return pd.DataFrame({"passes_pains": verdicts})

    raise ValueError(f"Unknown alert type: {alert_type}")
|
||||
|
||||
|
||||
def apply_complexity_filter(mols: List[Chem.Mol], max_complexity: float, method: str = "bertz") -> pd.DataFrame:
    """
    Score each molecule's complexity and flag those within the threshold.

    Args:
        mols: Molecules to score.
        max_complexity: Inclusive upper bound for a molecule to pass.
        method: Method name forwarded to ``mc.complexity.calculate_complexity``.

    Returns:
        DataFrame with ``complexity_score`` and ``passes_complexity``
        (True where the score is <= ``max_complexity``).
    """
    print(f"\nCalculating molecular complexity (method={method}, max={max_complexity})...")

    scores = []
    for mol in tqdm(mols, desc="Complexity"):
        scores.append(mc.complexity.calculate_complexity(mol, method=method))

    within_limit = [value <= max_complexity for value in scores]
    return pd.DataFrame({
        "complexity_score": scores,
        "passes_complexity": within_limit,
    })
|
||||
|
||||
|
||||
def apply_constraints(mols: List[Chem.Mol], constraints: Dict, n_jobs: int) -> pd.DataFrame:
    """
    Evaluate user-supplied property constraints on ``mols``.

    Args:
        mols: Molecules to evaluate.
        constraints: Keyword arguments unpacked into ``mc.constraints.Constraints``.
        n_jobs: Worker count forwarded to the constraint call.

    Returns:
        DataFrame with ``passes_constraints`` and ``constraint_violations``
        (violations joined with ", ", or an empty string when there are none).
    """
    print(f"\nApplying constraints: {constraints}")

    checker = mc.constraints.Constraints(**constraints)
    outcomes = checker(mols=mols, n_jobs=n_jobs, progress=True)

    # Each outcome is indexed by "passes" and "violations".
    verdicts = [outcome["passes"] for outcome in outcomes]
    violations = [", ".join(outcome["violations"]) if outcome["violations"] else "" for outcome in outcomes]

    return pd.DataFrame({
        "passes_constraints": verdicts,
        "constraint_violations": violations,
    })
|
||||
|
||||
|
||||
def apply_chemical_groups(mols: List[Chem.Mol], groups: List[str]) -> pd.DataFrame:
    """
    Flag which of the requested chemical groups occur in each molecule.

    Args:
        mols: Molecules to search.
        groups: Group names passed to ``mc.groups.ChemicalGroup``.

    Returns:
        DataFrame with one boolean ``has_<group>`` column per requested group,
        True where the per-molecule match record holds a truthy entry for it.
    """
    print(f"\nDetecting chemical groups: {', '.join(groups)}")

    detector = mc.groups.ChemicalGroup(groups=groups)
    matches = detector.get_all_matches(mols)

    table = pd.DataFrame()
    for name in groups:
        # A missing or empty entry for the group counts as "not present".
        present = [bool(match.get(name)) for match in matches]
        table[f"has_{name}"] = present
    return table
|
||||
|
||||
|
||||
def _pass_stats(mask, total: int):
    """Return ``(count, percent)`` of truthy entries in ``mask``; percent is 0.0 when ``total`` is 0."""
    count = int(mask.sum())
    percent = 100 * count / total if total else 0.0
    return count, percent


def generate_summary(df: pd.DataFrame, output_file: Path):
    """
    Write a plain-text filtering summary next to ``output_file``.

    The report is saved as ``<output stem>_summary.txt`` in the same directory
    and contains a section for each kind of result column present in ``df``
    (rule filters, structural alerts, complexity, constraints) followed by an
    overall pass rate.

    Side effect: when ``df`` has any ``passes_*`` column, a boolean
    ``passes_all_filters`` column (row-wise AND of those columns) is added to
    ``df`` in place.

    An empty ``df`` is reported with counts of 0 and percentages of 0.0
    instead of dividing by zero.

    Args:
        df: Per-molecule results table.
        output_file: Path of the main output file; only its parent directory
            and stem are used to name the summary file.
    """
    summary_file = output_file.parent / f"{output_file.stem}_summary.txt"
    total = len(df)
    divider = "-" * 40 + "\n"

    with open(summary_file, "w", encoding="utf-8") as f:
        f.write("=" * 80 + "\n")
        f.write("MEDCHEM FILTERING SUMMARY\n")
        f.write("=" * 80 + "\n\n")

        f.write(f"Total molecules processed: {total}\n\n")

        # Rule results
        rule_cols = [col for col in df.columns if col.startswith("rule_") or col == "passes_all_rules"]
        if rule_cols:
            f.write("RULE FILTERS:\n")
            f.write(divider)
            for col in rule_cols:
                # Only boolean columns are pass/fail flags worth counting.
                if df[col].dtype == bool:
                    n_pass, pct = _pass_stats(df[col], total)
                    f.write(f" {col}: {n_pass} passed ({pct:.1f}%)\n")
            f.write("\n")

        # Structural alerts
        alert_markers = ("alert", "nibr", "lilly", "pains")
        alert_cols = [col for col in df.columns if any(marker in col.lower() for marker in alert_markers)]
        if alert_cols:
            f.write("STRUCTURAL ALERTS:\n")
            f.write(divider)
            if "has_common_alerts" in df.columns:
                # This column flags problems, so invert it to count clean molecules.
                n_clean, pct = _pass_stats(~df["has_common_alerts"], total)
                f.write(f" No common alerts: {n_clean} ({pct:.1f}%)\n")
            if "passes_nibr" in df.columns:
                n_pass, pct = _pass_stats(df["passes_nibr"], total)
                f.write(f" Passes NIBR: {n_pass} ({pct:.1f}%)\n")
            if "passes_lilly" in df.columns:
                n_pass, pct = _pass_stats(df["passes_lilly"], total)
                f.write(f" Passes Lilly: {n_pass} ({pct:.1f}%)\n")
                avg_demerits = df["lilly_demerits"].mean()
                f.write(f" Average Lilly demerits: {avg_demerits:.1f}\n")
            if "passes_pains" in df.columns:
                n_pass, pct = _pass_stats(df["passes_pains"], total)
                f.write(f" Passes PAINS: {n_pass} ({pct:.1f}%)\n")
            f.write("\n")

        # Complexity
        if "complexity_score" in df.columns:
            f.write("COMPLEXITY:\n")
            f.write(divider)
            avg_complexity = df["complexity_score"].mean()
            f.write(f" Average complexity: {avg_complexity:.1f}\n")
            if "passes_complexity" in df.columns:
                n_pass, pct = _pass_stats(df["passes_complexity"], total)
                f.write(f" Within threshold: {n_pass} ({pct:.1f}%)\n")
            f.write("\n")

        # Constraints
        if "passes_constraints" in df.columns:
            f.write("CONSTRAINTS:\n")
            f.write(divider)
            n_pass, pct = _pass_stats(df["passes_constraints"], total)
            f.write(f" Passes all constraints: {n_pass} ({pct:.1f}%)\n")
            f.write("\n")

        # Overall pass rate
        pass_cols = [col for col in df.columns if col.startswith("passes_")]
        if pass_cols:
            # Stored on df (not just reported) so it is available after this call.
            df["passes_all_filters"] = df[pass_cols].all(axis=1)
            n_pass, pct = _pass_stats(df["passes_all_filters"], total)
            f.write("OVERALL:\n")
            f.write(divider)
            f.write(f" Molecules passing all filters: {n_pass} ({pct:.1f}%)\n")

        f.write("\n" + "=" * 80 + "\n")

    print(f"\nSummary report saved to: {summary_file}")
|
||||
|
||||
|
||||
def _split_csv_arg(raw: str) -> list:
    """Split a comma-separated CLI value, dropping blank entries."""
    return [item.strip() for item in raw.split(",") if item.strip()]


def _parse_range(parser: argparse.ArgumentParser, flag: str, raw: str):
    """Parse a ``"min,max"`` CLI value into a ``(float, float)`` tuple.

    Malformed values are reported through ``parser.error`` (usage message on
    stderr, exit status 2) instead of surfacing as a ``ValueError`` traceback.
    """
    parts = raw.split(",")
    if len(parts) != 2:
        parser.error(f"{flag} expects two comma-separated numbers (min,max), got {raw!r}")
    try:
        low, high = float(parts[0]), float(parts[1])
    except ValueError:
        parser.error(f"{flag} expects two comma-separated numbers (min,max), got {raw!r}")
    return low, high


def _collect_constraints(parser: argparse.ArgumentParser, args: argparse.Namespace) -> dict:
    """Build the constraints dict from the parsed command-line arguments.

    Options are compared against ``None`` rather than tested for truthiness so
    that an explicit ``0`` (e.g. ``--hbd-max 0``) is honoured, not ignored.
    """
    constraints = {}
    if args.mw_range is not None:
        constraints["mw_range"] = _parse_range(parser, "--mw-range", args.mw_range)
    if args.logp_range is not None:
        constraints["logp_range"] = _parse_range(parser, "--logp-range", args.logp_range)
    for key in ("tpsa_max", "hbd_max", "hba_max", "rotatable_bonds_max"):
        value = getattr(args, key)
        if value is not None:
            constraints[key] = value
    return constraints


def main():
    """Command-line entry point: load molecules, apply the requested filters, save results.

    Each requested filter family produces a DataFrame that is concatenated
    column-wise with the loaded input table. The combined table is written to
    ``--output`` as CSV and, unless ``--no-summary`` is given, handed to
    ``generate_summary``.
    """
    parser = argparse.ArgumentParser(
        description="Batch molecular filtering using medchem",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    # Input/Output
    parser.add_argument("input", type=Path, help="Input file (CSV, TSV, SDF, or TXT)")
    parser.add_argument("--output", "-o", type=Path, required=True, help="Output CSV file")
    parser.add_argument("--smiles-column", default="smiles", help="Name of SMILES column (default: smiles)")

    # Rule filters
    parser.add_argument("--rules", help="Comma-separated list of rules (e.g., rule_of_five,rule_of_cns)")

    # Structural alerts
    parser.add_argument("--common-alerts", action="store_true", help="Apply common structural alerts")
    parser.add_argument("--nibr", action="store_true", help="Apply NIBR filters")
    parser.add_argument("--lilly", action="store_true", help="Apply Lilly demerits filter")
    parser.add_argument("--pains", action="store_true", help="Apply PAINS filter")

    # Complexity
    parser.add_argument("--complexity", type=float, help="Maximum complexity threshold")
    parser.add_argument("--complexity-method", default="bertz", choices=["bertz", "whitlock", "barone"],
                        help="Complexity calculation method")

    # Constraints
    parser.add_argument("--mw-range", help="Molecular weight range (e.g., 200,500)")
    # A value starting with "-" that is not a plain number (such as "-2,5") is
    # taken by argparse for an option flag, so the "=" form is required here.
    parser.add_argument("--logp-range",
                        help="LogP range; use '=' when the lower bound is negative (e.g., --logp-range=-2,5)")
    parser.add_argument("--tpsa-max", type=float, help="Maximum TPSA")
    parser.add_argument("--hbd-max", type=int, help="Maximum H-bond donors")
    parser.add_argument("--hba-max", type=int, help="Maximum H-bond acceptors")
    parser.add_argument("--rotatable-bonds-max", type=int, help="Maximum rotatable bonds")

    # Chemical groups
    parser.add_argument("--groups", help="Comma-separated chemical groups to detect")

    # Processing options
    parser.add_argument("--n-jobs", type=int, default=-1, help="Number of parallel jobs (-1 = all cores)")
    parser.add_argument("--no-summary", action="store_true", help="Don't generate summary report")
    parser.add_argument("--filter-output", action="store_true", help="Only output molecules passing all filters")

    args = parser.parse_args()

    # Validate the range options before any expensive work so that a typo
    # fails fast with a usage message instead of after loading the input.
    constraints = _collect_constraints(parser, args)

    # Load molecules
    df, mols = load_molecules(args.input, args.smiles_column)

    # Apply filters; every result is appended in a fixed order so the output
    # column order is stable.
    result_dfs = [df]

    # Rules
    if args.rules:
        rule_list = _split_csv_arg(args.rules)
        if rule_list:
            result_dfs.append(apply_rule_filters(mols, rule_list, args.n_jobs))

    # Structural alerts: (argparse attribute, alert-set name) in output order
    alert_sets = (
        ("common_alerts", "common"),
        ("nibr", "nibr"),
        ("lilly", "lilly"),
        ("pains", "pains"),
    )
    for attr, alert_name in alert_sets:
        if getattr(args, attr):
            result_dfs.append(apply_structural_alerts(mols, alert_name, args.n_jobs))

    # Complexity (an explicit threshold of 0 is still a threshold)
    if args.complexity is not None:
        result_dfs.append(apply_complexity_filter(mols, args.complexity, args.complexity_method))

    # Constraints
    if constraints:
        result_dfs.append(apply_constraints(mols, constraints, args.n_jobs))

    # Chemical groups
    if args.groups:
        group_list = _split_csv_arg(args.groups)
        if group_list:
            result_dfs.append(apply_chemical_groups(mols, group_list))

    # Combine results
    # NOTE(review): axis=1 concat aligns on index; this assumes every filter
    # result shares the index of the loaded table -- confirm in load_molecules.
    df_final = pd.concat(result_dfs, axis=1)

    # Filter output if requested
    if args.filter_output:
        pass_cols = [col for col in df_final.columns if col.startswith("passes_")]
        if pass_cols:
            df_final["passes_all"] = df_final[pass_cols].all(axis=1)
            # .copy() detaches the subset so later column assignment (the
            # summary step adds one) does not write into a slice of the
            # unfiltered frame.
            df_final = df_final[df_final["passes_all"]].copy()
            print(f"\nFiltered to {len(df_final)} molecules passing all filters")

    # Save results
    args.output.parent.mkdir(parents=True, exist_ok=True)
    df_final.to_csv(args.output, index=False)
    print(f"\nResults saved to: {args.output}")

    # Generate summary
    if not args.no_summary:
        generate_summary(df_final, args.output)

    print("\nDone!")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user