R-Group Decomposition highlighting made easier (and exposed to JS)¶
This work was inspired by a popular 2021 RDKit blog post by Greg,
https://greglandrum.github.io/rdkit-blog/posts/2021-08-07-rgd-and-highlighting.html
which was about highlighting R groups extracted through R-group decomposition on the original molecule.
The Datagrok team wished to include this sort of visualization in their web application for data analysis, but the original code written by Greg was a Python implementation relying on many molecule- and atom-level access function which are not exposed to RDKitJS.
Even more importantly, R group decomposition itself was not yet exposed to RDKitJS.
This Jupyter notebook demonstrates that the R-group highlighting is much easier to achieve from Python than it was originally thanks to dedicated functionality which I have implemented in the RDKit core.
import os
from io import StringIO, BytesIO
import json
import pandas as pd
import urllib.request
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.molSize=(450, 200)
from rdkit.Chem import rdRGroupDecomposition, rdqueries, rdDepictor, PandasTools
from rdkit.Chem.Draw import rdMolDraw2D, IPythonConsole
from rdkit import Geometry
from IPython.display import Image
from PIL import Image as pilImage
IPythonConsole.InteractiveRenderer.rdkitStructureRendererJsUrl = "https://unpkg.com/rdkit-structure-renderer@0.0.1/dist/rdkit-structure-renderer-module.js"
IPythonConsole.InteractiveRenderer.minimalLibJsUrl = "https://unpkg.com/rdkit-structure-renderer@0.0.1/public/RDKit_minimal.0.0.1.js"
IPythonConsole.InteractiveRenderer.setEnabled()
import rdkit
print(rdkit.__version__)
2024.09.1pre
cdk2_rgd_dataset = "cdk2_rgd_dataset.csv"
Fetch cdk2_rgd_dataset
from KNIME repo if not already cached locally:
if not os.path.exists(cdk2_rgd_dataset):
rdDepictor.SetPreferCoordGen(True)
cdk2_rgd_dataset_csv_url = "https://api.hub.knime.com/repository/Users/greglandrum/Public/Presentations/2020/2020_11_05_RDKitPython_Webinar/Data/cdk2_rgd_dataset.csv:data?version=current-state"
response = urllib.request.urlopen(cdk2_rgd_dataset_csv_url)
assert response.status == 200
with StringIO(response.read().decode("utf-8")) as hnd_in:
with open(cdk2_rgd_dataset, "w") as hnd_out:
hnd_out.write(hnd_in.read())
assert os.path.exists(cdk2_rgd_dataset)
with open(cdk2_rgd_dataset) as hnd:
df = pd.read_csv(hnd)
df_doc1 = df[df.assay_chembl_id == 'CHEMBL827377']
chembl_ids = df_doc1.compound_chembl_id.to_list()
mols = df_doc1["canonical_smiles"].apply(Chem.MolFromSmiles).to_list()
This is the core we are going to use for the R-group decomposition of cdk2_rgd_dataset
:
core = Chem.MolFromSmiles("c1cc(-c2c([*:1])nn3nc([*:2])ccc23)nc(N(c2ccc([*:4])c([*:3])c2))n1")
rdDepictor.Compute2DCoords(core)
core
We use default parameters for R-group decomposition, and we request that the target molecule is included in the results in addition to core and R groups:
params = rdRGroupDecomposition.RGroupDecompositionParameters()
params.includeTargetMolInResults = True
We run R-group decomposition:
rdkit.RDLogger.DisableLog("rdApp.warning")
groups, _ = rdRGroupDecomposition.RGroupDecompose([core], mols, asSmiles=False, asRows=True, options=params)
rdkit.RDLogger.EnableLog("rdApp.warning")
groups
is a list
of dict
; each element of the list
is a dict
where the keys are Core
, Mol
(the original molecule) and R groups R#
:
groups[0]
{'Core': <rdkit.Chem.rdchem.Mol at 0x7f1b8a0a05f0>, 'Mol': <rdkit.Chem.rdchem.Mol at 0x7f1b8a0a0740>, 'R1': <rdkit.Chem.rdchem.Mol at 0x7f1b8a0a07b0>, 'R2': <rdkit.Chem.rdchem.Mol at 0x7f1b8a0a0820>, 'R3': <rdkit.Chem.rdchem.Mol at 0x7f1b8a0a0890>, 'R4': <rdkit.Chem.rdchem.Mol at 0x7f1b8a0a0900>, 'R5': <rdkit.Chem.rdchem.Mol at 0x7f1b8a0a0970>, 'R6': <rdkit.Chem.rdchem.Mol at 0x7f1b8a0a09e0>, 'R7': <rdkit.Chem.rdchem.Mol at 0x7f1b8a0a0a50>}
class RGroupHighLights:
"""Class to extract and store highlight maps from R groups."""
def __init__(self):
self.atom_map = {}
self.bond_map = {}
self.radii = {}
self.linewidth_multipliers = {}
# ------------------
# set up our colormap
# the three choices here are all "colorblind" colormaps
# "Tol" colormap from https://davidmathlogic.com/colorblind
colors = [(51,34,136), (17,119,51), (68,170,153), (136,204,238), (221,204,119), (204,102,119), (170,68,153), (136,34,85)]
# "IBM" colormap from https://davidmathlogic.com/colorblind
colors = [(100, 143, 255), (120, 94, 240), (220, 38, 127), (254, 97, 0), (255, 176, 0)]
# Okabe_Ito colormap from https://jfly.uni-koeln.de/color/
colors = [(230, 159, 0),(86, 180, 233), (0, 158, 115), (240, 228, 66), (0, 114, 178), (213, 94, 0), (204, 121, 167)]
for i, x in enumerate(colors):
colors[i] = tuple(y/255 for y in x)
self.colors = colors
def update(self, rgroup, rlabel_idx):
"""Update highlights with information from this R group"""
target_atoms = json.loads(rgroup.GetProp("_rgroupTargetAtoms")) if rgroup.HasProp("_rgroupTargetAtoms") else []
target_bonds = json.loads(rgroup.GetProp("_rgroupTargetBonds")) if rgroup.HasProp("_rgroupTargetBonds") else []
color = self.colors[rlabel_idx % len(self.colors)]
for i in target_atoms:
v = self.atom_map.get(i, [])
v.append(color)
self.atom_map[i] = v
self.radii[i] = 0.4
for i in target_bonds:
v = self.bond_map.get(i, [])
v.append(color)
self.bond_map[i] = v
self.linewidth_multipliers[i] = 2
This function takes as input one element of the group
array and returns a PNG or SVG image aligned to the core and with highlighted R groups:
def highlight_rgroups(row, width=350, height=200, legend="",
labels=("R1", "R2", "R3", "R4"), svg=False):
"""
Params:
row: a dict whose keys are Mol, Core and all R extracted groups
(i.e., an element of the array generated by rdRGroupDecomposition.RGroupDecompose
width, height: size of each image
legend: optional legend to be associated to the mol image
labels: labels to be associated to highlights
svg: if True an SVG is generated, if False a PNG
"""
mol = row["Mol"]
core = row["Core"]
ps = Chem.AdjustQueryParameters.NoAdjustments()
ps.makeDummiesQueries = True
qcore = Chem.AdjustQueryProperties(core, ps)
highlights = RGroupHighLights()
for rlabel, rgroup in row.items():
if rlabel not in labels:
continue
rlabel_idx = labels.index(rlabel)
highlights.update(rgroup, rlabel_idx)
rdDepictor.GenerateDepictionMatching2DStructure(mol, qcore, allowRGroups=True)
if svg:
drawer = rdMolDraw2D.MolDraw2DSVG
else:
drawer = rdMolDraw2D.MolDraw2DCairo
d2d = drawer(width, height)
d2d.drawOptions().useBWAtomPalette()
#----------------------
# now draw the molecule, with highlights:
d2d.DrawMoleculeWithHighlights(mol, legend, highlights.atom_map, highlights.bond_map,
highlights.radii, highlights.linewidth_multipliers)
d2d.FinishDrawing()
img = d2d.GetDrawingText()
return img
The following function is largely taken from Greg's original blog post and generates a large static PNG image with all molecules drawn in a grid with a legend and colored highlights:
def draw_multiple(groups, labels, legends=None, nPerRow=4, subImageSize=(250, 200)):
nRows = (len(groups) - 1) // nPerRow + 1
imgSize = (subImageSize[0] * nPerRow, subImageSize[1] * nRows)
res = pilImage.new("RGB", imgSize, color="white")
for i, group in enumerate(groups):
col = i % nPerRow
row = i // nPerRow
if legends:
legend = legends[i]
else:
legend = ''
png = highlight_rgroups(group, labels=labels, legend=legend,
width=subImageSize[0], height=subImageSize[1])
with BytesIO(png) as hnd:
img = pilImage.open(hnd)
res.paste(img, box=(col * subImageSize[0], row * subImageSize[1]))
with BytesIO() as hnd:
res.save(hnd, format="PNG")
png = hnd.getvalue()
return png
Image(draw_multiple(groups, labels=("R1", "R2", "R3", "R4"), legends=chembl_ids, subImageSize=(300, 250)))
Thanks to the Jupyter RDKit InteractiveRenderer we can also visualize the highlights in a pandas
DataFrame with copy/paste and other nice interactive features enabled:
Here we set the default image size that will be used for R group images:
PandasTools.molSize = (200, 100)
Here we prepare custom drawing options to draw the molecules in black&white , to visualize the colored highlights, and to increase the size of the image with colored highlights such that it will be more visible:
bw_palette = {str(i): [0, 0, 0] for i in range(120)}
for group in groups:
highlights = RGroupHighLights()
for rlabel_idx, rlabel in enumerate(("R1", "R2", "R3", "R4")):
highlights.update(group[rlabel], rlabel_idx)
draw_opts_json = json.dumps({
"atomColourPalette": bw_palette,
"highlightAtomMultipleColors": highlights.atom_map,
"highlightBondMultipleColors": highlights.bond_map,
"highlightAtomRadii": highlights.radii,
"highlightLineWidthMultipliers": highlights.linewidth_multipliers
})
IPythonConsole.InteractiveRenderer.setOpts(group["Mol"], {
"width": 300,
"height": 200,
"drawOpts": draw_opts_json
})
Now it is just a matter of populating the DataFrame with molecules...
df = pd.DataFrame([{k: group[k] for k in ("Mol", "R1", "R2", "R3", "R4")} for group in groups])
...ChEMBL ids as DataFrame index...
df.insert(0, "ChEMBL id", [chembl_ids[i] for i in df.index])
df = df.set_index("ChEMBL id")
...ask PandasTools
to visualize RDKit molecules as images, and voila:
PandasTools.ChangeMoleculeRendering(df)
df
Mol | R1 | R2 | R3 | R4 | |
---|---|---|---|---|---|
ChEMBL id | |||||
CHEMBL182493 | |||||
CHEMBL182326 | |||||
CHEMBL183064 | |||||
CHEMBL361038 | |||||
CHEMBL362296 | |||||
... | ... | ... | ... | ... | ... |
CHEMBL364646 | |||||
CHEMBL189874 | |||||
CHEMBL186826 | |||||
CHEMBL434158 | |||||
CHEMBL434729 |
91 rows × 5 columns