Cheminformatics basics - Clustering molecules

This tutorials is made using references from : * https://github.com/PatWalters/practical_cheminformatics_tutorials * https://doc.datamol.io/stable/tutorials/Clustering.html * https://greglandrum.github.io/rdkit-blog/similarity/tutorial/2020/11/18/sphere-exclusion-clustering.html

import os 
import sys
import pandas as pd
import numpy as np
import operator

from rdkit import Chem
from rdkit import DataStructs

from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules
IPythonConsole.ipython_useSVG=True  #SVG's tend to look nicer than the png counterparts

from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor, rdMolDescriptors
import time
rdDepictor.SetPreferCoordGen(True)
import rdkit
%matplotlib inline
print(rdkit.__version__)

2022.03.5

import matplotlib.pyplot as plt
from matplotlib.pyplot import cm

# High DPI rendering for mac
%config InlineBackend.figure_format = 'retina'

# "Infinite" DPI vector output -- overkill 
%config InlineBackend.figure_format = 'svg'
 
# Plot matplotlib plots with white background: 
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

plot_params = {
'font.size' : 22,
'axes.titlesize' : 24,
'axes.labelsize' : 20,
'axes.labelweight' : 'bold',
'xtick.labelsize' : 16,
'ytick.labelsize' : 16,
}
 
plt.rcParams.update(plot_params)

# clustering specific imports from Rdkit 
from rdkit.Chem import DataStructs 
from rdkit.ML.Cluster import Butina
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

x = pd.read_csv('./data/small_molecule_data/tox21.csv')

x.sample(5)

	smiles	NR-AR	NR-AR-LBD	NR-AhR	NR-Aromatase	NR-ER	NR-ER-LBD	NR-PPAR-gamma	SR-ARE	SR-ATAD5	SR-HSE	SR-MMP	SR-p53
439	CC(Cl)(Cl)C(=O)O	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
5205	Nc1ccc([As](=O)(O)O)cc1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
7260	CCCCOC(=O)CC	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2195	COC(=O)c1ccccc1S(=O)(=O)NC(=O)Nc1nc(C)cc(C)n1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
5563	Cc1ccc(C)c(OCCCC(C)(C)C(=O)O)c1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

mol_obj = [Chem.MolFromSmiles(smi, sanitize=True) for smi in list(x['smiles'])]
len(mol_obj)

[16:41:47] WARNING: not removing hydrogen atom without neighbors

Generate fingerprints for the molecules - using Morgan FP2

from rdkit.Chem import rdMolDescriptors
fps = [rdMolDescriptors.GetMorganFingerprintAsBitVect(m, 3, nBits=2048) for m in mol_obj]

Calculate distance matrix for the molecules

dists  = [] 
n_fps = len(fps)
for i in range(1, n_fps):
    sim = DataStructs.cDataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
    dists.extend([ 1-x for x in sim ])

plt.hist(dists);

Clustering the molecules based on the FPs and the distance matrix

cutoff_distance = 0.35 # Any molecules closer than this are kept in one cluster so <= cutoff_distance
mol_clusters = Butina.ClusterData(dists, n_fps, distThresh=cutoff_distance, isDistData=True)
cluster_id_list = [0]*n_fps

for idx, cluster in enumerate(mol_clusters, 1):
    for member in cluster:
        cluster_id_list[member] = idx

# Add the cluster_id to the dataframe 
x['cluster'] = cluster_id_list

x['cluster'].value_counts(sort=True, ascending=False)

2       21
1       21
3       15
5       13
4       13
        ..
4643     1
4644     1
4645     1
4646     1
689      1
Name: cluster, Length: 6675, dtype: int64

Calculate intra cluster similarity for the molecules

x_42 = x.loc[ x['cluster'] ==  1]

x_42.shape

(21, 14)

Draw.MolsToGridImage( [mol_obj[x] for x in x_42.index], subImgSize=(200, 200), molsPerRow=4)

# intra-cluster similarity 
res = []
cfps = [ fps[i] for i in x_42.index ]
for i in range(1, x_42.shape[0]):
    sim = DataStructs.cDataStructs.BulkTanimotoSimilarity(cfps[i], cfps[:i])
    res.extend( [1-x for x in sim] )

plt.hist(res)

(array([26.,  8., 24.,  0., 40., 14., 58., 16., 14., 10.]),
 array([0.        , 0.04901961, 0.09803922, 0.14705882, 0.19607843,
        0.24509804, 0.29411765, 0.34313725, 0.39215686, 0.44117647,
        0.49019608]),
 <BarContainer object of 10 artists>)

Pick the most diverse molecules from the cluster

Implementation of Sphere exclusion algorithm (also called Leader) from Roger Sayles.

from rdkit.SimDivFilters import rdSimDivPickers
lp = rdSimDivPickers.LeaderPicker()

threshold = 0.65 # <- minimum distance between clusters 
picks = lp.LazyBitVectorPick(fps, len(fps), threshold)
len(picks)