Cheminformatics basics - Clustering molecules
Walkthrough on clustering molecules. This notebook is made using an eclectic collection of different walkthroughs and tutorials found on various blogs
import os
import sys
import pandas as pd
import numpy as np
import operator
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules
IPythonConsole.ipython_useSVG=True #SVG's tend to look nicer than the png counterparts
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor, rdMolDescriptors
import time
rdDepictor.SetPreferCoordGen(True)
import rdkit
%matplotlib inline
print(rdkit.__version__)
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
# High DPI rendering for mac
%config InlineBackend.figure_format = 'retina'
# "Infinite" DPI vector output -- overkill
%config InlineBackend.figure_format = 'svg'
# Plot matplotlib plots with white background:
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
plot_params = {
'font.size' : 22,
'axes.titlesize' : 24,
'axes.labelsize' : 20,
'axes.labelweight' : 'bold',
'xtick.labelsize' : 16,
'ytick.labelsize' : 16,
}
plt.rcParams.update(plot_params)
from rdkit.Chem import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
x = pd.read_csv('./data/small_molecule_data/tox21.csv')
x.sample(5)
mol_obj = [Chem.MolFromSmiles(smi, sanitize=True) for smi in list(x['smiles'])]
len(mol_obj)
Generate fingerprints for the molecules - using Morgan FP2
from rdkit.Chem import rdMolDescriptors
fps = [rdMolDescriptors.GetMorganFingerprintAsBitVect(m, 3, nBits=2048) for m in mol_obj]
Calculate distance matrix for the molecules
dists = []
n_fps = len(fps)
for i in range(1, n_fps):
sim = DataStructs.cDataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
dists.extend([ 1-x for x in sim ])
plt.hist(dists);
Clustering the molecules based on the FPs and the distance matrix
cutoff_distance = 0.35 # Any molecules closer than this are kept in one cluster so <= cutoff_distance
mol_clusters = Butina.ClusterData(dists, n_fps, distThresh=cutoff_distance, isDistData=True)
cluster_id_list = [0]*n_fps
for idx, cluster in enumerate(mol_clusters, 1):
for member in cluster:
cluster_id_list[member] = idx
x['cluster'] = cluster_id_list
x['cluster'].value_counts(sort=True, ascending=False)
Calculate intra cluster similarity for the molecules
x_42 = x.loc[ x['cluster'] == 1]
x_42.shape
Draw.MolsToGridImage( [mol_obj[x] for x in x_42.index], subImgSize=(200, 200), molsPerRow=4)
res = []
cfps = [ fps[i] for i in x_42.index ]
for i in range(1, x_42.shape[0]):
sim = DataStructs.cDataStructs.BulkTanimotoSimilarity(cfps[i], cfps[:i])
res.extend( [1-x for x in sim] )
plt.hist(res)
from rdkit.SimDivFilters import rdSimDivPickers
lp = rdSimDivPickers.LeaderPicker()
threshold = 0.65 # <- minimum distance between clusters
picks = lp.LazyBitVectorPick(fps, len(fps), threshold)
len(picks)