Generating Reaction Library Compounds
Generating Reaction Library Compounds
from io import BytesIO
import pandas as pd
import numpy as np
from rdkit.Chem import PandasTools
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdRGroupDecomposition
from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules
from rdkit.Chem import Draw
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions #Only needed if modifying defaults
DrawingOptions.bondLineWidth=1.8
IPythonConsole.ipython_useSVG=True
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.warning')
import rdkit
print(rdkit.__version__)
Motivation
Modern compound library size is increasing fast thanks to parallel synthesis. Enamine REAL Space now have more than 10 billion compounds available for purchase and other vendors are also offering libraries that are much larger than traditional screening library size.
The papers (1 and 2) from Enamine scientists discusses how they have constructed such library in detail. I got interested in building such library myself and tested if I can use RDKit to attempt such library
Synthons
Prepare a three groups; core, block1, and block2. For this particular blocks, they all have carboxyl group. The core has one tertiary amine, which can be reacted first, and N-Boc group, which can be deprotected and reacted in a second step.
Because the reaction order can be swapped, this three synthons will result in four different arrangement of functional group depending on the order of reaction:
- deacylation of group1, deprotection, deacylation of group1
- deacylation of group1, deprotection, deacylation of group2
- deacylation of group2, deprotection, deacylation of group1
- deacylation of group2, deprotection, deacylation of group2
Below is a core molecule, which have two tertiary amines and one of the amines is protected by Boc group, and two reactant groups, each having one carboxylate group, which can be acylated and readily reacted with a tertiary amine.
core_smiles = 'C2CCNCC2C1CN(C(=O)OC(C)(C)(C))CC1'
core = Chem.MolFromSmiles(core_smiles)
core
group1_smiles = 'c12ccccc1NN=C2(C(=O)O)'
group1 = Chem.MolFromSmiles(group1_smiles)
group1
group2_smiles = 'C1CCCCC1C(=O)O'
group2 = Chem.MolFromSmiles(group2_smiles)
group2
rxn_acylation_smarts = '[C;D3:1](=O)[O;D1:2]>>[C:1](=O)[*]'
rxn_acylamine_smarts = '[*:1][C:2](=O)[*;D1:3].[ND2:4]([*:5])([*:6])>>[N:4]([*:5])([*:6])([C:2](=O)[*:1])'
rxn_deprotection_smarts = '[N:1]C(=O)OC(C)(C)(C)>>[N:1]'
rxn = AllChem.ReactionFromSmarts(rxn_acylation_smarts)
group1_product = rxn.RunReactants((group1,))[0][0]
group1_product
rxn = AllChem.ReactionFromSmarts(rxn_acylamine_smarts)
product1 = rxn.RunReactants((group1_product,core))[0][0]
product1
rxn = AllChem.ReactionFromSmarts(rxn_deprotection_smarts)
product2 = rxn.RunReactants((product1,))[0][0]
product2
rxn = AllChem.ReactionFromSmarts(rxn_acylation_smarts)
group2_product = rxn.RunReactants((group2,))[0][0]
group2_product
rxn = AllChem.ReactionFromSmarts(rxn_acylamine_smarts)
final_product = rxn.RunReactants((group2_product,product2))[0][0]
final_product
core_smiles_arr = [core_smiles]
block_smiles_arr = [group1_smiles, group2_smiles]
# turn smiles to mol object
core_arr = [Chem.MolFromSmiles(smiles) for smiles in core_smiles_arr]
block_arr = [Chem.MolFromSmiles(smiles) for smiles in block_smiles_arr]
# set names for image generation
for i, core in enumerate(core_arr):
core.SetProp("_Name", f'Core{i+1}')
for i, block in enumerate(block_arr):
block.SetProp("_Name", f'Block{i+1}')
from itertools import product
product_library = []
def reaction(core, group1, group2):
rxn = AllChem.ReactionFromSmarts(rxn_acylation_smarts)
group1_product = rxn.RunReactants((group1,))[0][0]
rxn = AllChem.ReactionFromSmarts(rxn_acylamine_smarts)
product1 = rxn.RunReactants((group1_product,core))[0][0]
rxn = AllChem.ReactionFromSmarts(rxn_deprotection_smarts)
product2 = rxn.RunReactants((product1,))[0][0]
rxn = AllChem.ReactionFromSmarts(rxn_acylation_smarts)
group2_product = rxn.RunReactants((group2,))[0][0]
rxn = AllChem.ReactionFromSmarts(rxn_acylamine_smarts)
product = rxn.RunReactants((group2_product,product2))[0][0]
return product
for core in core_arr:
for block1, block2 in product(block_arr, repeat=2):
compound = reaction(core, block1, block2)
compound.SetProp('_Name', f'{block1.GetProp("_Name")}-{core.GetProp("_Name")}-{block2.GetProp("_Name")}')
product_library.append(compound)
from rdkit.Chem.Draw import rdMolDraw2D
from io import BytesIO
ms = [Chem.RemoveHs(m) for m in product_library]
for m in ms: tmp=AllChem.Compute2DCoords(m)
legends=["%s" % (x.GetProp("_Name"), ) for i, x in enumerate(ms)]
molsPerRow = 4
subImgSize = (300, 300)
width = subImgSize[0] * molsPerRow
height = subImgSize[1] * int(len(ms) / molsPerRow)
d2d = rdMolDraw2D.MolDraw2DCairo(width, height, subImgSize[0], subImgSize[1])
d2d.drawOptions().legendFontSize=24
d2d.DrawMolecules(ms,legends=legends)
d2d.FinishDrawing()
img = BytesIO()
img.write(d2d.GetDrawingText())
from PIL import Image
Image.open(img)
Conclusion
The actual code for building library was straightforward, as long as the fragments and the reactions are well curated, but I can imagine maintaining such curated fragment/reaction library at scale will be certainly a challenge.
Also, if one attempts to build the entire possible library, the amount of computation will grow exponentially. For example, if we have 300 functional group blocks and 200 core blocks, which permits two reactions, the number of possible combinations are two billion already.
The paper also illustrates how such database can be constructed on-the-fly with clever use of virtual screening method, alleviate the need to store the entire library on disk and reduce computation significantly.