#!/usr/bin/env python3 """ Substructure Filter Filter molecules based on substructure patterns using SMARTS. Supports inclusion and exclusion filters, and custom pattern libraries. Usage: python substructure_filter.py molecules.smi --pattern "c1ccccc1" --output filtered.smi python substructure_filter.py database.sdf --exclude "C(=O)Cl" --filter-type functional-groups """ import argparse import sys from pathlib import Path try: from rdkit import Chem except ImportError: print("Error: RDKit not installed. Install with: conda install -c conda-forge rdkit") sys.exit(1) # Common SMARTS pattern libraries PATTERN_LIBRARIES = { 'functional-groups': { 'alcohol': '[OH][C]', 'aldehyde': '[CH1](=O)', 'ketone': '[C](=O)[C]', 'carboxylic_acid': 'C(=O)[OH]', 'ester': 'C(=O)O[C]', 'amide': 'C(=O)N', 'amine': '[NX3]', 'ether': '[C][O][C]', 'nitrile': 'C#N', 'nitro': '[N+](=O)[O-]', 'halide': '[C][F,Cl,Br,I]', 'thiol': '[C][SH]', 'sulfide': '[C][S][C]', }, 'rings': { 'benzene': 'c1ccccc1', 'pyridine': 'n1ccccc1', 'pyrrole': 'n1cccc1', 'furan': 'o1cccc1', 'thiophene': 's1cccc1', 'imidazole': 'n1cncc1', 'indole': 'c1ccc2[nH]ccc2c1', 'naphthalene': 'c1ccc2ccccc2c1', }, 'pains': { 'rhodanine': 'S1C(=O)NC(=S)C1', 'catechol': 'c1ccc(O)c(O)c1', 'quinone': 'O=C1C=CC(=O)C=C1', 'michael_acceptor': 'C=CC(=O)', 'alkyl_halide': '[C][I,Br]', }, 'privileged': { 'biphenyl': 'c1ccccc1-c2ccccc2', 'piperazine': 'N1CCNCC1', 'piperidine': 'N1CCCCC1', 'morpholine': 'N1CCOCC1', } } def load_molecules(file_path, keep_props=True): """Load molecules from file.""" path = Path(file_path) if not path.exists(): print(f"Error: File not found: {file_path}") return [] molecules = [] if path.suffix.lower() in ['.sdf', '.mol']: suppl = Chem.SDMolSupplier(str(path)) elif path.suffix.lower() in ['.smi', '.smiles', '.txt']: suppl = Chem.SmilesMolSupplier(str(path), titleLine=False) else: print(f"Error: Unsupported file format: {path.suffix}") return [] for idx, mol in enumerate(suppl): if mol is None: print(f"Warning: Failed to parse molecule {idx+1}") continue molecules.append(mol) return molecules def create_pattern_query(pattern_string): """Create SMARTS query from string or SMILES.""" # Try as SMARTS first query = Chem.MolFromSmarts(pattern_string) if query is not None: return query # Try as SMILES query = Chem.MolFromSmiles(pattern_string) if query is not None: return query print(f"Error: Invalid pattern: {pattern_string}") return None def filter_molecules(molecules, include_patterns=None, exclude_patterns=None, match_all_include=False): """ Filter molecules based on substructure patterns. Args: molecules: List of RDKit Mol objects include_patterns: List of (name, pattern) tuples to include exclude_patterns: List of (name, pattern) tuples to exclude match_all_include: If True, molecule must match ALL include patterns Returns: Tuple of (filtered_molecules, match_info) """ filtered = [] match_info = [] for idx, mol in enumerate(molecules): if mol is None: continue # Check exclusion patterns first excluded = False exclude_matches = [] if exclude_patterns: for name, pattern in exclude_patterns: if mol.HasSubstructMatch(pattern): excluded = True exclude_matches.append(name) if excluded: match_info.append({ 'index': idx + 1, 'smiles': Chem.MolToSmiles(mol), 'status': 'excluded', 'matches': exclude_matches }) continue # Check inclusion patterns if include_patterns: include_matches = [] for name, pattern in include_patterns: if mol.HasSubstructMatch(pattern): include_matches.append(name) # Decide if molecule passes inclusion filter if match_all_include: passed = len(include_matches) == len(include_patterns) else: passed = len(include_matches) > 0 if passed: filtered.append(mol) match_info.append({ 'index': idx + 1, 'smiles': Chem.MolToSmiles(mol), 'status': 'included', 'matches': include_matches }) else: match_info.append({ 'index': idx + 1, 'smiles': Chem.MolToSmiles(mol), 'status': 'no_match', 'matches': [] }) else: # No inclusion patterns, keep all non-excluded filtered.append(mol) match_info.append({ 'index': idx + 1, 'smiles': Chem.MolToSmiles(mol), 'status': 'included', 'matches': [] }) return filtered, match_info def write_molecules(molecules, output_file): """Write molecules to file.""" output_path = Path(output_file) if output_path.suffix.lower() in ['.sdf']: writer = Chem.SDWriter(str(output_path)) for mol in molecules: writer.write(mol) writer.close() elif output_path.suffix.lower() in ['.smi', '.smiles', '.txt']: with open(output_path, 'w') as f: for mol in molecules: smiles = Chem.MolToSmiles(mol) name = mol.GetProp('_Name') if mol.HasProp('_Name') else '' f.write(f"{smiles} {name}\n") else: print(f"Error: Unsupported output format: {output_path.suffix}") return print(f"Wrote {len(molecules)} molecules to {output_file}") def write_report(match_info, output_file): """Write detailed match report.""" import csv with open(output_file, 'w', newline='') as f: fieldnames = ['Index', 'SMILES', 'Status', 'Matches'] writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() for info in match_info: writer.writerow({ 'Index': info['index'], 'SMILES': info['smiles'], 'Status': info['status'], 'Matches': ', '.join(info['matches']) }) def print_summary(total, filtered, match_info): """Print filtering summary.""" print("\n" + "="*60) print("Filtering Summary") print("="*60) print(f"Total molecules: {total}") print(f"Passed filter: {len(filtered)}") print(f"Filtered out: {total - len(filtered)}") print(f"Pass rate: {len(filtered)/total*100:.1f}%") # Count by status status_counts = {} for info in match_info: status = info['status'] status_counts[status] = status_counts.get(status, 0) + 1 print("\nBreakdown:") for status, count in status_counts.items(): print(f" {status:15s}: {count}") print("="*60) def main(): parser = argparse.ArgumentParser( description='Filter molecules by substructure patterns', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=f""" Pattern libraries: --filter-type functional-groups Common functional groups --filter-type rings Ring systems --filter-type pains PAINS (Pan-Assay Interference) --filter-type privileged Privileged structures Examples: # Include molecules with benzene ring python substructure_filter.py molecules.smi --pattern "c1ccccc1" -o filtered.smi # Exclude reactive groups python substructure_filter.py database.sdf --exclude "C(=O)Cl" -o clean.sdf # Filter by functional groups python substructure_filter.py molecules.smi --filter-type functional-groups -o fg.smi # Remove PAINS python substructure_filter.py compounds.smi --filter-type pains --exclude-mode -o clean.smi # Multiple patterns python substructure_filter.py mol.smi --pattern "c1ccccc1" --pattern "N" -o aromatic_amines.smi """ ) parser.add_argument('input', help='Input file (SDF or SMILES)') parser.add_argument('--pattern', '-p', action='append', help='SMARTS/SMILES pattern to include (can specify multiple)') parser.add_argument('--exclude', '-e', action='append', help='SMARTS/SMILES pattern to exclude (can specify multiple)') parser.add_argument('--filter-type', choices=PATTERN_LIBRARIES.keys(), help='Use predefined pattern library') parser.add_argument('--exclude-mode', action='store_true', help='Use filter-type patterns for exclusion instead of inclusion') parser.add_argument('--match-all', action='store_true', help='Molecule must match ALL include patterns') parser.add_argument('--output', '-o', help='Output file') parser.add_argument('--report', '-r', help='Write detailed report to CSV') parser.add_argument('--list-patterns', action='store_true', help='List available pattern libraries and exit') args = parser.parse_args() # List patterns mode if args.list_patterns: print("\nAvailable Pattern Libraries:") print("="*60) for lib_name, patterns in PATTERN_LIBRARIES.items(): print(f"\n{lib_name}:") for name, pattern in patterns.items(): print(f" {name:25s}: {pattern}") sys.exit(0) # Load molecules print(f"Loading molecules from: {args.input}") molecules = load_molecules(args.input) if not molecules: print("Error: No valid molecules loaded") sys.exit(1) print(f"Loaded {len(molecules)} molecules") # Prepare patterns include_patterns = [] exclude_patterns = [] # Add custom include patterns if args.pattern: for pattern_str in args.pattern: query = create_pattern_query(pattern_str) if query: include_patterns.append(('custom', query)) # Add custom exclude patterns if args.exclude: for pattern_str in args.exclude: query = create_pattern_query(pattern_str) if query: exclude_patterns.append(('custom', query)) # Add library patterns if args.filter_type: lib_patterns = PATTERN_LIBRARIES[args.filter_type] for name, pattern_str in lib_patterns.items(): query = create_pattern_query(pattern_str) if query: if args.exclude_mode: exclude_patterns.append((name, query)) else: include_patterns.append((name, query)) if not include_patterns and not exclude_patterns: print("Error: No patterns specified") sys.exit(1) # Print filter configuration print(f"\nFilter configuration:") if include_patterns: print(f" Include patterns: {len(include_patterns)}") if args.match_all: print(" Mode: Match ALL") else: print(" Mode: Match ANY") if exclude_patterns: print(f" Exclude patterns: {len(exclude_patterns)}") # Perform filtering print("\nFiltering...") filtered, match_info = filter_molecules( molecules, include_patterns=include_patterns if include_patterns else None, exclude_patterns=exclude_patterns if exclude_patterns else None, match_all_include=args.match_all ) # Print summary print_summary(len(molecules), filtered, match_info) # Write output if args.output: write_molecules(filtered, args.output) if args.report: write_report(match_info, args.report) print(f"Detailed report written to: {args.report}") if __name__ == '__main__': main()