import os
import sys
import pandas as pd
import numpy as np
import operator
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules
IPythonConsole.ipython_useSVG=True #SVG's tend to look nicer than the png counterparts
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor, rdMolDescriptors
import time
rdDepictor.SetPreferCoordGen(True)
import rdkit
%matplotlib inline
print(rdkit.__version__)
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
# High DPI rendering for mac
%config InlineBackend.figure_format = 'retina'
# "Infinite" DPI vector output -- overkill
%config InlineBackend.figure_format = 'svg'
# Plot matplotlib plots with white background:
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
plot_params = {
'font.size' : 22,
'axes.titlesize' : 24,
'axes.labelsize' : 20,
'axes.labelweight' : 'bold',
'xtick.labelsize' : 16,
'ytick.labelsize' : 16,
}
plt.rcParams.update(plot_params)
from rdkit.Chem import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
x = pd.read_csv('./data/small_molecule_data/tox21.csv')
x.sample(5)
mol_obj = [Chem.MolFromSmiles(smi, sanitize=True) for smi in list(x['smiles'])]
len(mol_obj)
Generate fingerprints for the molecules - using Morgan FP2
from rdkit.Chem import rdMolDescriptors
fps = [rdMolDescriptors.GetMorganFingerprintAsBitVect(m, 3, nBits=2048) for m in mol_obj]
Calculate distance matrix for the molecules
dists = []
n_fps = len(fps)
for i in range(1, n_fps):
sim = DataStructs.cDataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
dists.extend([ 1-x for x in sim ])
plt.hist(dists);
Clustering the molecules based on the FPs and the distance matrix
cutoff_distance = 0.35 # Any molecules closer than this are kept in one cluster so <= cutoff_distance
mol_clusters = Butina.ClusterData(dists, n_fps, distThresh=cutoff_distance, isDistData=True)
cluster_id_list = [0]*n_fps
for idx, cluster in enumerate(mol_clusters, 1):
for member in cluster:
cluster_id_list[member] = idx
x['cluster'] = cluster_id_list
x['cluster'].value_counts(sort=True, ascending=False)
Calculate intra cluster similarity for the molecules
x_42 = x.loc[ x['cluster'] == 1]
x_42.shape
Draw.MolsToGridImage( [mol_obj[x] for x in x_42.index], subImgSize=(200, 200), molsPerRow=4)
res = []
cfps = [ fps[i] for i in x_42.index ]
for i in range(1, x_42.shape[0]):
sim = DataStructs.cDataStructs.BulkTanimotoSimilarity(cfps[i], cfps[:i])
res.extend( [1-x for x in sim] )
plt.hist(res)
from rdkit.SimDivFilters import rdSimDivPickers
lp = rdSimDivPickers.LeaderPicker()
threshold = 0.65 # <- minimum distance between clusters
picks = lp.LazyBitVectorPick(fps, len(fps), threshold)
len(picks)