import os 
import sys
import pandas as pd
import numpy as np
import operator
from rdkit import Chem
from rdkit import DataStructs

from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules
IPythonConsole.ipython_useSVG=True  #SVG's tend to look nicer than the png counterparts

from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor, rdMolDescriptors
import time
rdDepictor.SetPreferCoordGen(True)
import rdkit
%matplotlib inline
print(rdkit.__version__)
2022.03.5
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm

# High DPI rendering for mac
%config InlineBackend.figure_format = 'retina'

# "Infinite" DPI vector output -- overkill 
%config InlineBackend.figure_format = 'svg'
 
# Plot matplotlib plots with white background: 
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

plot_params = {
'font.size' : 22,
'axes.titlesize' : 24,
'axes.labelsize' : 20,
'axes.labelweight' : 'bold',
'xtick.labelsize' : 16,
'ytick.labelsize' : 16,
}
 
plt.rcParams.update(plot_params)
from rdkit.Chem import DataStructs 
from rdkit.ML.Cluster import Butina
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
x = pd.read_csv('./data/small_molecule_data/tox21.csv')
x.sample(5)
smiles NR-AR NR-AR-LBD NR-AhR NR-Aromatase NR-ER NR-ER-LBD NR-PPAR-gamma SR-ARE SR-ATAD5 SR-HSE SR-MMP SR-p53
439 CC(Cl)(Cl)C(=O)O 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5205 Nc1ccc([As](=O)(O)O)cc1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
7260 CCCCOC(=O)CC 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2195 COC(=O)c1ccccc1S(=O)(=O)NC(=O)Nc1nc(C)cc(C)n1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5563 Cc1ccc(C)c(OCCCC(C)(C)C(=O)O)c1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
mol_obj = [Chem.MolFromSmiles(smi, sanitize=True) for smi in list(x['smiles'])]
len(mol_obj)
[16:41:47] WARNING: not removing hydrogen atom without neighbors
7831

Generate fingerprints for the molecules - using Morgan FP2

from rdkit.Chem import rdMolDescriptors
fps = [rdMolDescriptors.GetMorganFingerprintAsBitVect(m, 3, nBits=2048) for m in mol_obj]

Calculate distance matrix for the molecules

dists  = [] 
n_fps = len(fps)
for i in range(1, n_fps):
    sim = DataStructs.cDataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
    dists.extend([ 1-x for x in sim ])
plt.hist(dists);
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> 2022-11-27T16:43:00.690058 image/svg+xml Matplotlib v3.5.3, https://matplotlib.org/

Clustering the molecules based on the FPs and the distance matrix

cutoff_distance = 0.35 # Any molecules closer than this are kept in one cluster so <= cutoff_distance
mol_clusters = Butina.ClusterData(dists, n_fps, distThresh=cutoff_distance, isDistData=True)
cluster_id_list = [0]*n_fps

for idx, cluster in enumerate(mol_clusters, 1):
    for member in cluster:
        cluster_id_list[member] = idx
x['cluster'] = cluster_id_list
x['cluster'].value_counts(sort=True, ascending=False)
2       21
1       21
3       15
5       13
4       13
        ..
4643     1
4644     1
4645     1
4646     1
689      1
Name: cluster, Length: 6675, dtype: int64

Calculate intra cluster similarity for the molecules

x_42 = x.loc[ x['cluster'] ==  1]
x_42.shape
(21, 14)
Draw.MolsToGridImage( [mol_obj[x] for x in x_42.index], subImgSize=(200, 200), molsPerRow=4)
res = []
cfps = [ fps[i] for i in x_42.index ]
for i in range(1, x_42.shape[0]):
    sim = DataStructs.cDataStructs.BulkTanimotoSimilarity(cfps[i], cfps[:i])
    res.extend( [1-x for x in sim] )
plt.hist(res)
(array([26.,  8., 24.,  0., 40., 14., 58., 16., 14., 10.]),
 array([0.        , 0.04901961, 0.09803922, 0.14705882, 0.19607843,
        0.24509804, 0.29411765, 0.34313725, 0.39215686, 0.44117647,
        0.49019608]),
 <BarContainer object of 10 artists>)
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> 2022-11-27T16:47:33.927849 image/svg+xml Matplotlib v3.5.3, https://matplotlib.org/

Pick the most diverse molecules from the cluster

Implementation of Sphere exclusion algorithm (also called Leader) from Roger Sayles.

from rdkit.SimDivFilters import rdSimDivPickers
lp = rdSimDivPickers.LeaderPicker()
threshold = 0.65 # <- minimum distance between clusters 
picks = lp.LazyBitVectorPick(fps, len(fps), threshold)
len(picks)
3291