Rdkit code snippets and recipes that I revisit now and again. The snippets are adopted from different python scripts written over time, ignore the variable names.
Fingerprints
Quick ECFP fingerprint
from rdkit.Chem import AllChem
from rdkit import Chem, DataStructs
from rdkit.Chem import rdFingerprintGenerator
# Convert to Chem.Mol:
mol = Chem.MolFromSmiles(smiles)
# Counts by default - unfolded
rdMolDescriptors.GetMorganFingerprint(mol, radius)
# Folded counts
rdMolDescriptors.GetHashedMorganFingerprint(mol, radius, nBits=num_bits)
#Folded FP bit vectors as per the size of the bits
morgan_fp_bit_vect = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
# Convert to numpy
fp = np.zeros((0,), dtype=np.int16)
DataStructs.ConvertToNumpyArray(morgan_fp_bit_vect, fp)
Loading data
Tanimoto similarity matrix
Adapted from Andrew White:
import itertools
def tanimoto_matrix(slist):
'''
Compute pair-wise Tanimoto similarity between a list of smiles with ECFP4 FPs
'''
fp = [ AllChem.GetMorganFingerprint( Chem.MolFromSmiles(s), 2 ) for s in slist ]
ts = list(
DataStructs.cDataStructs.TanimotoSimilarity(x,y) for x, y, in itertools.product(fp, repeat=2)
)
return np.array(ts).reshape(len(fp), len(fp))
Loading ZINC dataset
Adapted from Andrew White:
tranches = pd.read_csv('https://gist.githubusercontent.com/whitead/f47887e45bbd2f38332182d2d422da6b/raw/a3948beac9b9034dab432b697c5ec238503ac5d0/tranches.txt')
def get_mol_batch(batch_size = 32):
for t in tranches.values:
d = pd.read_csv(t[0], sep=' ')
for i in range(len(d) // batch_size):
yield d.iloc[i * batch_size:(i + 1) * batch_size, 0].values
Viewing molecules
SDF
Simple implementation
inf = open('./example.sdf','rb')
#import gzip
#inf = gzip.open('gzip_file')
fsuppl = Chem.ForwardSDMolSupplier(inf)
mol_list = []
for mol in fsuppl:
if mol is None:
continue
print(mol.GetNumAtoms())
mol_list.append(mol)
As a Pandas DataFrame
sdf_df = PandasTools.LoadSDF('./example.sdf')
sdf_df['NumHeavyAtoms']= sdf_df.apply(lambda x: x['ROMol'].GetNumHeavyAtoms(), axis=1)
PandasTools.WriteSDF(sdf_df, 'out.sdf', molColName='ROMol', idName='ID', properties=list(sdf_df.columns), allNumeric=False)
Viewing molecules inline
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import MolsToGridImage
IPythonConsole.ipython_useSVG=True #SVG's tend to look nicer than the png counterparts
sample_mol_list = [Chem.MolFromSmiles(x) for x in smiles_list]
MolsToGridImage(sample_mol_list, molsPerRow=5)
Viewing molecules in a grid
import pandas as pd
from rdkit.Chem import PandasTools
>> PandasTools.AddMoleculeColumnToFrame(df, smilesCol='smiles')
>> esol_data.head(1)
>> PandasTools.FrameToGridImage(df.head(8), legendsCol="logSolubility", molsPerRow=4)
Adding new values as a column
df["n_Atoms"] = df['ROMol'].map(lambda x: x.GetNumAtoms())
df.head(1)
Molecules in a xlsx file
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
>> smiles = ['c1ccccc1', 'c1ccccc1O', 'c1cc(O)ccc1O']
>> df = pd.DataFrame({'ID':['Benzene', 'Phenol', 'Hydroquinone'], 'SMILES':smiles})
>> df['Mol Image'] = [Chem.MolFromSmiles(s) for s in df['SMILES']]
>> PandasTools.SaveXlsxFromFrame(df, 'test.xlsx', molCol='Mol Image')
Viewing substructures
def viz_substruct(main_smile, substructure_smarts):
mol_file = Chem.MolFromSmiles(main_smile)
sub_pattern = Chem.MolFromSmarts(substructure_smarts)
hit_ats = list(mol_file.GetSubstructMatch(sub_pattern)) # Returns the indices of the molecule’s atoms that match a substructure query
hit_bonds = []
for bond in sub_pattern.GetBonds():
aid1 = hit_ats[bond.GetBeginAtomIdx()]
aid2 = hit_ats[bond.GetEndAtomIdx()]
hit_bonds.append( mol_file.GetBondBetweenAtoms(aid1, aid2).GetIdx() )
d2d = rdMolDraw2D.MolDraw2DSVG(400, 400) # or MolDraw2DCairo to get PNGs
rdMolDraw2D.PrepareAndDrawMolecule(d2d, mol_file, highlightAtoms=hit_ats, highlightBonds=hit_bonds)
d2d.FinishDrawing()
return SVG(d2d.GetDrawingText())
>> diclofenac = 'O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl'
>> substruct_smarts = 'O=CCccN'
>> viz_substruct(diclofenac, substruct_smarts)
Reactions
View a reaction
rxn = AllChem.ReactionFromSmarts('[C:1]=[C:2].[C:3]=[*:4][*:5]=[C:6]>>[C:1]1[C:2][C:3][*:4]=[*:5][C:6]1')
Get changed atoms in a reaction: https://greglandrum.github.io/rdkit-blog/tutorial/reactions/2021/11/26/highlighting-changed-bonds-in-reactions.html
Run enumerations
Borrowed and inspired from Pat Walters gist
rxn = AllChem.ReactionFromSmarts("[#6:10]-[#7H2:9].[#7]-[c:4]1[c:5][c:6][c:7][c:8][c:3]1-[#6](-[OH])=O.[#6H:1](-[#6:2])=O>>[#6:10]-[#7:9]-c1n[c:1](-[#6:2])n[c:4]2[c:5][c:6][c:7][c:8][c:3]12")
react_dict = {'R1':[],'R2':[],'R3':[], 'product':[]}
for r1, r2, r3 in product( *[primary_amines_list, amino_benzoic_list, aldehyde_list]):
react_dict['R1'].append(r1)
react_dict['R2'].append(r2)
react_dict['R3'].append(r3)
r1_mol = Chem.MolFromSmiles(r1)
r2_mol = Chem.MolFromSmiles(r2)
r3_mol = Chem.MolFromSmiles(r3)
for prod in rxn.RunReactants([r1_mol,r2_mol,r3_mol]):
prod_mol = prod[0]
Chem.SanitizeMol(prod_mol)
react_dict['product'].append(Chem.MolToSmiles(prod_mol))
react_df = pd.DataFrame(react_dict)
More examples here
Edit, merge, react molecules
Molecule tinkering using Rdkit: http://asteeves.github.io/blog/2015/01/14/editing-in-rdkit/
Using mol2grid
mols2grid is an interactive chemical viewer for 2D structures of small molecules, based on RDKit.