Rdkit code snippets and recipes that I revisit now and again. The snippets are adopted from different python scripts written over time, ignore the variable names.
Tutorials & Walkthroughs
*Mute warnings
from rdkit import RDLogger
'rdApp.*') RDLogger.DisableLog(
Fingerprints
Quick ECFP fingerprint
from rdkit.Chem import AllChem
from rdkit import Chem, DataStructs
from rdkit.Chem import rdFingerprintGenerator
# Convert to Chem.Mol:
= Chem.MolFromSmiles(smiles)
mol
# Counts by default - unfolded
rdMolDescriptors.GetMorganFingerprint(mol, radius)
# Folded counts
=num_bits)
rdMolDescriptors.GetHashedMorganFingerprint(mol, radius, nBits
#Folded FP bit vectors as per the size of the bits
= rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
morgan_fp_bit_vect
# Convert to numpy
= np.zeros((0,), dtype=np.int16)
fp DataStructs.ConvertToNumpyArray(morgan_fp_bit_vect, fp)
Loading data
Tanimoto similarity matrix
Adapted from Andrew White:
import itertools
def tanimoto_matrix(slist):
'''
Compute pair-wise Tanimoto similarity between a list of smiles with ECFP4 FPs
'''
= [ AllChem.GetMorganFingerprint( Chem.MolFromSmiles(s), 2 ) for s in slist ]
fp = list(
ts for x, y, in itertools.product(fp, repeat=2)
DataStructs.cDataStructs.TanimotoSimilarity(x,y)
)return np.array(ts).reshape(len(fp), len(fp))
Loading ZINC dataset
Adapted from Andrew White:
= pd.read_csv('https://gist.githubusercontent.com/whitead/f47887e45bbd2f38332182d2d422da6b/raw/a3948beac9b9034dab432b697c5ec238503ac5d0/tranches.txt')
tranches def get_mol_batch(batch_size = 32):
for t in tranches.values:
= pd.read_csv(t[0], sep=' ')
d for i in range(len(d) // batch_size):
yield d.iloc[i * batch_size:(i + 1) * batch_size, 0].values
Viewing molecules
SDF
Simple implementation
= open('./example.sdf','rb')
inf #import gzip
#inf = gzip.open('gzip_file')
= Chem.ForwardSDMolSupplier(inf)
fsuppl = []
mol_list for mol in fsuppl:
if mol is None:
continue
print(mol.GetNumAtoms())
mol_list.append(mol)
As a Pandas DataFrame
= PandasTools.LoadSDF('./example.sdf')
sdf_df 'NumHeavyAtoms']= sdf_df.apply(lambda x: x['ROMol'].GetNumHeavyAtoms(), axis=1)
sdf_df['out.sdf', molColName='ROMol', idName='ID', properties=list(sdf_df.columns), allNumeric=False) PandasTools.WriteSDF(sdf_df,
Viewing molecules inline
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import MolsToGridImage
=True #SVG's tend to look nicer than the png counterparts
IPythonConsole.ipython_useSVG
= [Chem.MolFromSmiles(x) for x in smiles_list]
sample_mol_list =5) MolsToGridImage(sample_mol_list, molsPerRow
Viewing molecules in a grid
import pandas as pd
from rdkit.Chem import PandasTools
>> PandasTools.AddMoleculeColumnToFrame(df, smilesCol='smiles')
>> esol_data.head(1)
>> PandasTools.FrameToGridImage(df.head(8), legendsCol="logSolubility", molsPerRow=4)
Adding new values as a column
"n_Atoms"] = df['ROMol'].map(lambda x: x.GetNumAtoms())
df[1) df.head(
Molecules in a xlsx file
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
>> smiles = ['c1ccccc1', 'c1ccccc1O', 'c1cc(O)ccc1O']
>> df = pd.DataFrame({'ID':['Benzene', 'Phenol', 'Hydroquinone'], 'SMILES':smiles})
>> df['Mol Image'] = [Chem.MolFromSmiles(s) for s in df['SMILES']]
>> PandasTools.SaveXlsxFromFrame(df, 'test.xlsx', molCol='Mol Image')
Viewing substructures
def viz_substruct(main_smile, substructure_smarts):
= Chem.MolFromSmiles(main_smile)
mol_file = Chem.MolFromSmarts(substructure_smarts)
sub_pattern
= list(mol_file.GetSubstructMatch(sub_pattern)) # Returns the indices of the molecule’s atoms that match a substructure query
hit_ats = []
hit_bonds
for bond in sub_pattern.GetBonds():
= hit_ats[bond.GetBeginAtomIdx()]
aid1 = hit_ats[bond.GetEndAtomIdx()]
aid2
hit_bonds.append( mol_file.GetBondBetweenAtoms(aid1, aid2).GetIdx() )
= rdMolDraw2D.MolDraw2DSVG(400, 400) # or MolDraw2DCairo to get PNGs
d2d =hit_ats, highlightBonds=hit_bonds)
rdMolDraw2D.PrepareAndDrawMolecule(d2d, mol_file, highlightAtoms
d2d.FinishDrawing()return SVG(d2d.GetDrawingText())
>> diclofenac = 'O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl'
>> substruct_smarts = 'O=CCccN'
>> viz_substruct(diclofenac, substruct_smarts)
Quick substructure filter in-line to Pandas df
def NO_NO(smile):
''' Detects a Tau binder fragment'''
= Chem.MolFromSmiles(smile)
mol = Chem.MolFromSmarts('[NR1][OR1]')
smt1if mol.HasSubstructMatch(smt1):
return True
else:
return False
>> linker_2_stereos['Ring_NO'] = linker_2_stereos['SMILES'].apply(NO_NO)
Standardize molecules
Borrowed from here
from rdkit.Chem.MolStandardize import rdMolStandardize
def standardize(mol):
# follows the steps in
# https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
# as described **excellently** (by Greg) in
# https://www.youtube.com/watch?v=eWTApNX8dJQ
# removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
= rdMolStandardize.Cleanup(mol)
clean_mol
# if many fragments, get the "parent" (the actual mol we are interested in)
= rdMolStandardize.FragmentParent(clean_mol)
parent_clean_mol
# try to neutralize molecule
= rdMolStandardize.Uncharger() # annoying, but necessary as no convenience method exists
uncharger = uncharger.uncharge(parent_clean_mol)
uncharged_parent_clean_mol
# note that no attempt is made at reionization at this step
# nor at ionization at some pH (rdkit has no pKa caculator)
# the main aim to to represent all molecules from different sources
# in a (single) standard way, for use in ML, catalogue, etc.
= rdMolStandardize.TautomerEnumerator() # idem
te = te.Canonicalize(uncharged_parent_clean_mol)
taut_uncharged_parent_clean_mol
return taut_uncharged_parent_clean_mol
Find and change atoms in the molecule
def find_UI_change_to_12(smile, attach_type='UI'):
= Chem.MolFromSmiles(smile)
mol1 = Chem.MolFromSmarts('[U]*.[I]*')
sub_pattern = list(mol1.GetSubstructMatch(sub_pattern))
hit_ats = Chem.EditableMol(mol1) #Chem.RWMol()
emol 0], hit_ats[1])
emol.RemoveBond(hit_ats[2], hit_ats[3])
emol.RemoveBond(hit_ats[= emol.GetMol()
mol2
if attach_type == 'UI':
# Attached to U
1]).SetIsotope(2)
mol2.GetAtomWithIdx(hit_ats[# Attached to I
3]).SetIsotope(1)
mol2.GetAtomWithIdx(hit_ats[
else:
# Attached to U
1]).SetIsotope(1)
mol2.GetAtomWithIdx(hit_ats[# Attached to I
3]).SetIsotope(2)
mol2.GetAtomWithIdx(hit_ats[
= standardize(mol2)
mol2 return(mol2, Chem.MolToSmiles(mol2))
Reactions
View a reaction
= AllChem.ReactionFromSmarts('[C:1]=[C:2].[C:3]=[*:4][*:5]=[C:6]>>[C:1]1[C:2][C:3][*:4]=[*:5][C:6]1') rxn
Get changed atoms in a reaction: https://greglandrum.github.io/rdkit-blog/tutorial/reactions/2021/11/26/highlighting-changed-bonds-in-reactions.html
Run enumerations
Borrowed and inspired from Pat Walters gist
= AllChem.ReactionFromSmarts("[#6:10]-[#7H2:9].[#7]-[c:4]1[c:5][c:6][c:7][c:8][c:3]1-[#6](-[OH])=O.[#6H:1](-[#6:2])=O>>[#6:10]-[#7:9]-c1n[c:1](-[#6:2])n[c:4]2[c:5][c:6][c:7][c:8][c:3]12")
rxn
= {'R1':[],'R2':[],'R3':[], 'product':[]}
react_dict for r1, r2, r3 in product( *[primary_amines_list, amino_benzoic_list, aldehyde_list]):
'R1'].append(r1)
react_dict['R2'].append(r2)
react_dict['R3'].append(r3)
react_dict[= Chem.MolFromSmiles(r1)
r1_mol = Chem.MolFromSmiles(r2)
r2_mol = Chem.MolFromSmiles(r3)
r3_mol for prod in rxn.RunReactants([r1_mol,r2_mol,r3_mol]):
= prod[0]
prod_mol
Chem.SanitizeMol(prod_mol)'product'].append(Chem.MolToSmiles(prod_mol))
react_dict[
= pd.DataFrame(react_dict) react_df
Zip molecules
molzip
lets you take a molecule containing multiple fragments and ‘zip’ them together. Atoms which should be bonded to the final molecule are labelled by connecting them to dummy atoms. The code looks at matching dummy atoms (with same isotopic labels) in the fragment and adds bonds between them.
= Chem.MolFromSmiles('[*:1]c1nc([*:2])nc([*:3])c1.CO[*:1].[*:2]N(CO)C.Cl[*:3]')
sample
sample
Chem.molzip(sample)
RDKit documentation on Molzip and R-group decomposition
MolZip documentation and early version here
More examples here
Edit, merge, react molecules
Molecule tinkering using Rdkit: http://asteeves.github.io/blog/2015/01/14/editing-in-rdkit/
Using mol2grid
mols2grid is an interactive chemical viewer for 2D structures of small molecules, based on RDKit.
Useful set of functions in RDkit
Pat Walters has made a GitHub repo collecting useful RDKit functions here.