Cheminformatics basics - Clustering molecules

Walkthrough on clustering molecules. This notebook is made using an eclectic collection of different walkthroughs and tutorials found on various blogs
Published

November 20, 2022

This tutorials is made using references from : * https://github.com/PatWalters/practical_cheminformatics_tutorials * https://doc.datamol.io/stable/tutorials/Clustering.html * https://greglandrum.github.io/rdkit-blog/similarity/tutorial/2020/11/18/sphere-exclusion-clustering.html

import os 
import sys
import pandas as pd
import numpy as np
import operator
from rdkit import Chem
from rdkit import DataStructs

from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules
IPythonConsole.ipython_useSVG=True  #SVG's tend to look nicer than the png counterparts

from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor, rdMolDescriptors
import time
rdDepictor.SetPreferCoordGen(True)
import rdkit
%matplotlib inline
print(rdkit.__version__)
2022.03.5
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm

# High DPI rendering for mac
%config InlineBackend.figure_format = 'retina'

# "Infinite" DPI vector output -- overkill 
%config InlineBackend.figure_format = 'svg'
 
# Plot matplotlib plots with white background: 
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

plot_params = {
'font.size' : 22,
'axes.titlesize' : 24,
'axes.labelsize' : 20,
'axes.labelweight' : 'bold',
'xtick.labelsize' : 16,
'ytick.labelsize' : 16,
}
 
plt.rcParams.update(plot_params)
# clustering specific imports from Rdkit 
from rdkit.Chem import DataStructs 
from rdkit.ML.Cluster import Butina
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
x = pd.read_csv('./data/small_molecule_data/tox21.csv')
x.sample(5)
smiles NR-AR NR-AR-LBD NR-AhR NR-Aromatase NR-ER NR-ER-LBD NR-PPAR-gamma SR-ARE SR-ATAD5 SR-HSE SR-MMP SR-p53
439 CC(Cl)(Cl)C(=O)O 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5205 Nc1ccc([As](=O)(O)O)cc1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
7260 CCCCOC(=O)CC 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2195 COC(=O)c1ccccc1S(=O)(=O)NC(=O)Nc1nc(C)cc(C)n1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5563 Cc1ccc(C)c(OCCCC(C)(C)C(=O)O)c1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
mol_obj = [Chem.MolFromSmiles(smi, sanitize=True) for smi in list(x['smiles'])]
len(mol_obj)
[16:41:47] WARNING: not removing hydrogen atom without neighbors
7831

Generate fingerprints for the molecules - using Morgan FP2

from rdkit.Chem import rdMolDescriptors
fps = [rdMolDescriptors.GetMorganFingerprintAsBitVect(m, 3, nBits=2048) for m in mol_obj]

Calculate distance matrix for the molecules

dists  = [] 
n_fps = len(fps)
for i in range(1, n_fps):
    sim = DataStructs.cDataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
    dists.extend([ 1-x for x in sim ])
plt.hist(dists);

Clustering the molecules based on the FPs and the distance matrix

cutoff_distance = 0.35 # Any molecules closer than this are kept in one cluster so <= cutoff_distance
mol_clusters = Butina.ClusterData(dists, n_fps, distThresh=cutoff_distance, isDistData=True)
cluster_id_list = [0]*n_fps

for idx, cluster in enumerate(mol_clusters, 1):
    for member in cluster:
        cluster_id_list[member] = idx
# Add the cluster_id to the dataframe 
x['cluster'] = cluster_id_list
x['cluster'].value_counts(sort=True, ascending=False)
2       21
1       21
3       15
5       13
4       13
        ..
4643     1
4644     1
4645     1
4646     1
689      1
Name: cluster, Length: 6675, dtype: int64

Calculate intra cluster similarity for the molecules

x_42 = x.loc[ x['cluster'] ==  1]
x_42.shape
(21, 14)
Draw.MolsToGridImage( [mol_obj[x] for x in x_42.index], subImgSize=(200, 200), molsPerRow=4)

# intra-cluster similarity 
res = []
cfps = [ fps[i] for i in x_42.index ]
for i in range(1, x_42.shape[0]):
    sim = DataStructs.cDataStructs.BulkTanimotoSimilarity(cfps[i], cfps[:i])
    res.extend( [1-x for x in sim] )
plt.hist(res)
(array([26.,  8., 24.,  0., 40., 14., 58., 16., 14., 10.]),
 array([0.        , 0.04901961, 0.09803922, 0.14705882, 0.19607843,
        0.24509804, 0.29411765, 0.34313725, 0.39215686, 0.44117647,
        0.49019608]),
 <BarContainer object of 10 artists>)

Pick the most diverse molecules from the cluster

Implementation of Sphere exclusion algorithm (also called Leader) from Roger Sayles.

from rdkit.SimDivFilters import rdSimDivPickers
lp = rdSimDivPickers.LeaderPicker()
threshold = 0.65 # <- minimum distance between clusters 
picks = lp.LazyBitVectorPick(fps, len(fps), threshold)
len(picks)
3291