import os
import sys
import pandas as pd
import numpy as np
import operator
This tutorials is made using references from : * https://github.com/PatWalters/practical_cheminformatics_tutorials * https://doc.datamol.io/stable/tutorials/Clustering.html * https://greglandrum.github.io/rdkit-blog/similarity/tutorial/2020/11/18/sphere-exclusion-clustering.html
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules
=True #SVG's tend to look nicer than the png counterparts
IPythonConsole.ipython_useSVG
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor, rdMolDescriptors
import time
True)
rdDepictor.SetPreferCoordGen(import rdkit
%matplotlib inline
print(rdkit.__version__)
2022.03.5
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
# High DPI rendering for mac
%config InlineBackend.figure_format = 'retina'
# "Infinite" DPI vector output -- overkill
%config InlineBackend.figure_format = 'svg'
# Plot matplotlib plots with white background:
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
= {
plot_params 'font.size' : 22,
'axes.titlesize' : 24,
'axes.labelsize' : 20,
'axes.labelweight' : 'bold',
'xtick.labelsize' : 16,
'ytick.labelsize' : 16,
}
plt.rcParams.update(plot_params)
# clustering specific imports from Rdkit
from rdkit.Chem import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
= pd.read_csv('./data/small_molecule_data/tox21.csv') x
5) x.sample(
smiles | NR-AR | NR-AR-LBD | NR-AhR | NR-Aromatase | NR-ER | NR-ER-LBD | NR-PPAR-gamma | SR-ARE | SR-ATAD5 | SR-HSE | SR-MMP | SR-p53 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
439 | CC(Cl)(Cl)C(=O)O | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5205 | Nc1ccc([As](=O)(O)O)cc1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
7260 | CCCCOC(=O)CC | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2195 | COC(=O)c1ccccc1S(=O)(=O)NC(=O)Nc1nc(C)cc(C)n1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5563 | Cc1ccc(C)c(OCCCC(C)(C)C(=O)O)c1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
= [Chem.MolFromSmiles(smi, sanitize=True) for smi in list(x['smiles'])]
mol_obj len(mol_obj)
[16:41:47] WARNING: not removing hydrogen atom without neighbors
7831
Generate fingerprints for the molecules - using Morgan FP2
from rdkit.Chem import rdMolDescriptors
= [rdMolDescriptors.GetMorganFingerprintAsBitVect(m, 3, nBits=2048) for m in mol_obj] fps
Calculate distance matrix for the molecules
= []
dists = len(fps)
n_fps for i in range(1, n_fps):
= DataStructs.cDataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
sim 1-x for x in sim ]) dists.extend([
; plt.hist(dists)
Clustering the molecules based on the FPs and the distance matrix
= 0.35 # Any molecules closer than this are kept in one cluster so <= cutoff_distance
cutoff_distance = Butina.ClusterData(dists, n_fps, distThresh=cutoff_distance, isDistData=True)
mol_clusters = [0]*n_fps
cluster_id_list
for idx, cluster in enumerate(mol_clusters, 1):
for member in cluster:
= idx cluster_id_list[member]
# Add the cluster_id to the dataframe
'cluster'] = cluster_id_list x[
'cluster'].value_counts(sort=True, ascending=False) x[
2 21
1 21
3 15
5 13
4 13
..
4643 1
4644 1
4645 1
4646 1
689 1
Name: cluster, Length: 6675, dtype: int64
Calculate intra cluster similarity for the molecules
= x.loc[ x['cluster'] == 1] x_42
x_42.shape
(21, 14)
for x in x_42.index], subImgSize=(200, 200), molsPerRow=4) Draw.MolsToGridImage( [mol_obj[x]
# intra-cluster similarity
= []
res = [ fps[i] for i in x_42.index ]
cfps for i in range(1, x_42.shape[0]):
= DataStructs.cDataStructs.BulkTanimotoSimilarity(cfps[i], cfps[:i])
sim 1-x for x in sim] ) res.extend( [
plt.hist(res)
(array([26., 8., 24., 0., 40., 14., 58., 16., 14., 10.]),
array([0. , 0.04901961, 0.09803922, 0.14705882, 0.19607843,
0.24509804, 0.29411765, 0.34313725, 0.39215686, 0.44117647,
0.49019608]),
<BarContainer object of 10 artists>)
Pick the most diverse molecules from the cluster
Implementation of Sphere exclusion algorithm (also called Leader) from Roger Sayles.
from rdkit.SimDivFilters import rdSimDivPickers
= rdSimDivPickers.LeaderPicker() lp
= 0.65 # <- minimum distance between clusters
threshold = lp.LazyBitVectorPick(fps, len(fps), threshold)
picks len(picks)
3291