Classifying reactions using machine-learning

python
chemical-science
data-analysis
Author

Pushkar G. Ghanekar

What?

Using data-driven methods to classify reactions in different categories.

Why?

Categorically sorting (new) reactions can help with better documentation and developing a broader understanding of mechanisms possibles in the reactions.

How?

A chemical reaction is descirbed using a three-level reaction ontology based on the hierarchy proposed by Carey, Laffan, Thomson and Williams in 2006.

In this scheme, every reaction is grouped using 3 layers of information: superclass >> class >> type

Fo example: Suzuki reaction is as follows:

"3 Carbon-Carbon bond formation" (Superclass)
    |- "3.1. Suzuki coupling" (Class)
        |- 3.1.1 Bromo OR 3.1.2 Chloro OR 3.1.3 Iodo Suzuki Coupling (Type)
    |- "3.5 Palladium-catalyzed C-C bond formation" (Class) 
        |- 3.5.3 Negishi coupling (Type)

Researchers at NextMove software were among the first groups to scrap US Patent literature for chemical reactions and use the categories defined above to systematically classify the reactions.

Another important step in this process is the atom-atom mapping of the chemical reactions. While not a crucial step (as newer algorithms can perform this task without explicit atom-mapping) it is an important pre-processing standardization operation.

Atom-atom mapping helps to understand which reactant atom becomes which product atom during the reaction. From this information it is possible to identify reaction centers and sets of bonds made and broken during the reaction.

This is also useful in distinguishing reactants and products.

By convention:

Reactant: Contribute one more more atoms to the product

Reagents (solvent, catalyst): Do not contribute any atom to the product(s)

Relevant papers in this field can be found here

Using the Schneider et. al. paper for reference - https://pubs.acs.org/doi/10.1021/ci5006614

import os 
import pandas as pd 
import numpy as np 
import rdkit
from rdkit import Chem 
from rdkit.Chem import AllChem
from rdkit.Chem import Draw 
from rdkit.Chem.Draw import IPythonConsole 
from IPython.display import Image

IPythonConsole.ipython_useSVG=True 
try: 
    import cPickle as pickle 
except: 
    import pickle 
# View reactions 
def display_rxn(rxn_smarts):
    rxn = AllChem.ReactionFromSmarts(rxn_smarts,useSmiles=True)
    d2d = Draw.MolDraw2DCairo(800,200)
    d2d.DrawReaction(rxn)
    png = d2d.GetDrawingText()
    return Image(png)
# Mute all errors except critical
Chem.WrapLogs()
lg = rdkit.RDLogger.logger() 
lg.setLevel(rdkit.RDLogger.CRITICAL)
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm

# High DPI rendering for mac
%config InlineBackend.figure_format = 'retina'

# Plot matplotlib plots with white background: 
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
# Data directories 
data_dir = 'DATA/Schneider_etal_ChemReactionClassification/data'
# reaction types
with open(os.path.join(data_dir, 'reactionTypes_training_test_set_patent_data.pkl'), 'rb') as f: 
    reaction_types = pickle.load(f)

# reaction classification data
with open(os.path.join(data_dir, 'names_rTypes_classes_superclasses_training_test_set_patent_data.pkl'), 'rb') as f: 
    names_rTypes = pickle.load(f)
len(reaction_types)
50

names_rTypes is a super set of all possible reaction there are

names_rTypes
{'3.1.1': 'Bromo Suzuki coupling',
 '6.1.5': 'N-Bn deprotection',
 '3.1.6': 'Chloro Suzuki-type coupling',
 '3.1.5': 'Bromo Suzuki-type coupling',
 '6.1.1': 'N-Boc deprotection',
 '9.1.6': 'Hydroxy to chloro',
 '7.2': 'Amide to amine reduction',
 '7.3': 'Cyano or imine to amine',
 '7.1': 'Nitro to amine reduction',
 '6.3': 'ROH deprotections',
 '6.2': 'RCO2H deprotections',
 '6.1': 'NH deprotections',
 '7.9': 'Other reductions',
 '6.1.3': 'N-Cbz deprotection',
 '10.1': 'Halogenation',
 '10.2': 'Nitration',
 '10.4': 'Other functional group addition',
 '1.6.2': 'Bromo N-alkylation',
 '1.6.4': 'Chloro N-alkylation',
 '8': 'Oxidations',
 '1.6.8': 'Iodo N-alkylation',
 '1.7.7': 'Mitsunobu aryl ether synthesis',
 '1.8.5': 'Thioether synthesis',
 '10.1.1': 'Bromination',
 '10.1.2': 'Chlorination',
 '10.1.5': 'Wohl-Ziegler bromination',
 '9.3.1': 'Carboxylic acid to acid chloride',
 '7.9.2': 'Carboxylic acid to alcohol reduction',
 '3.4': 'Stille reaction',
 '3.3': 'Sonogashira reaction',
 '3.1': 'Suzuki coupling',
 '2.3': 'N-acylation to urea',
 '2.2': 'N-sulfonylation',
 '2.1': 'N-acylation to amide',
 '2.7': 'O-sulfonylation',
 '2.6': 'O-acylation to ester',
 '7.2.1': 'Amide to amine reduction',
 '3': 'C-C bond formation',
 '7': 'Reductions',
 '10.4.2': 'Methylation',
 '3.4.1': 'Stille reaction',
 '6.2.1': 'CO2H-Et deprotection',
 '6.2.3': 'CO2H-tBu deprotection',
 '6.2.2': 'CO2H-Me deprotection',
 '2.2.3': 'Sulfonamide Schotten-Baumann',
 '8.1': 'Alcohols to aldehydes',
 '8.2': 'Oxidations at sulfur',
 '10.2.1': 'Nitration',
 '2': 'Acylation and related processes',
 '6': 'Deprotections',
 '9.1': 'Alcohol to halide',
 '9.3': 'Acid to acid chloride',
 '1.3.7': 'Chloro N-arylation',
 '1.3.6': 'Bromo N-arylation',
 '1.3.8': 'Fluoro N-arylation',
 '8.2.1': 'Sulfanyl to sulfinyl',
 '10': 'Functional group addition (FGA)',
 '2.6.1': 'Ester Schotten-Baumann',
 '2.6.3': 'Fischer-Speier esterification',
 '3.3.1': 'Sonogashira coupling',
 '6.3.7': 'Methoxy to hydroxy',
 '6.3.1': 'O-Bn deprotection',
 '1.6': 'Heteroaryl N-alkylation',
 '1.7': 'O-substitution',
 '1.2': 'Reductive amination',
 '1.3': 'N-arylation with Ar-X',
 '1.8': 'S-substitution',
 '2.7.2': 'Sulfonic ester Schotten-Baumann',
 '2.1.2': 'Carboxylic acid + amine reaction',
 '2.1.1': 'Amide Schotten-Baumann',
 '2.1.7': 'N-acetylation',
 '5.1': 'NH protections',
 '1': 'Heteroatom alkylation and arylation',
 '5': 'Protections',
 '1.7.9': 'Williamson ether synthesis',
 '9': 'Functional group interconversion (FGI)',
 '1.7.6': 'Methyl esterification',
 '1.7.4': 'Hydroxy to methoxy',
 '2.3.1': 'Isocyanate + amine reaction',
 '1.2.4': 'Eschweiler-Clarke methylation',
 '1.2.5': 'Ketone reductive amination',
 '1.2.1': 'Aldehyde reductive amination',
 '8.1.4': 'Alcohol to aldehyde oxidation',
 '8.1.5': 'Alcohol to ketone oxidation',
 '5.1.1': 'N-Boc protection',
 '7.1.1': 'Nitro to amino',
 '7.3.1': 'Nitrile reduction'}
# Loading the rxn files  
import gzip 
infile = gzip.open( os.path.join(data_dir, 'training_test_set_patent_data.pkl.gz'), 'rb' )
rxn_data_list = []

lineNo = 0
while True:
    lineNo+=1
    
    try:
        smi,lbl,klass = pickle.load(infile) 
        
    except EOFError:
        break
        
    rxn_data_list.append([smi,lbl,klass])
    
    if lineNo%10000 == 0:
        print("Done "+str(lineNo))        
Done 10000
Done 20000
Done 30000
Done 40000
Done 50000
len(rxn_data_list)
50000

Viewing it as Pandas dataframe

column_names = ['SMILES', 'Patent No', 'Rxn Class']
df_rxn = pd.DataFrame(rxn_data_list, columns=column_names)
df_rxn
SMILES Patent No Rxn Class
0 [CH3:17][S:14](=[O:15])(=[O:16])[N:11]1[CH2:10... US06887874 6.1.5
1 O.O.[Na+].[CH3:1][c:2]1[cH:7][c:6]([N+:8](=O)[... US07056926 7.1.1
2 [CH3:1][O:2][c:3]1[cH:4][cH:5][c:6](-[c:9]2[cH... US08492378 1.8.5
3 Cl.[CH3:43][CH2:42][S:44](=[O:45])(=[O:46])Cl.... US08592454 2.2.3
4 [CH3:25][O:24][c:21]1[cH:22][cH:23][c:17]([O:1... US06716851 1.3.7
... ... ... ...
49995 [BH4-].[Na+].[CH3:25][O:24][c:19]1[cH:18][c:17... US08324216 7.3.1
49996 [BH4-].[Na+].[N:30]#[C:29][c:26]1[cH:25][cH:24... US07595398 7.3.1
49997 [N:15]#[C:14][CH2:13][c:1]1[cH:2][n:3][n:4]2[c... US08273761 7.3.1
49998 B.Cl.CO.[CH3:12][C:8]([OH:13])([CH2:9][C:10]#[... US08609849 7.3.1
49999 [CH3:2][CH2:1][O:3][C:4](=[O:5])[C:6]1([C:14]#... US07030267 7.3.1

50000 rows × 3 columns

df_rxn.dtypes
SMILES       object
Patent No    object
Rxn Class    object
dtype: object
df_rxn['Rxn Class'].value_counts()
6.1.5     1000
3.3.1     1000
1.3.8     1000
1.3.6     1000
3.1.5     1000
6.2.3     1000
3.4.1     1000
6.1.3     1000
1.7.6     1000
10.1.2    1000
9.1.6     1000
10.1.5    1000
10.4.2    1000
7.1.1     1000
6.3.1     1000
1.7.7     1000
7.9.2     1000
8.1.5     1000
1.7.4     1000
7.2.1     1000
8.1.4     1000
8.2.1     1000
7.3.1     1000
2.1.7     1000
9.3.1     1000
6.1.1     1000
6.3.7     1000
2.1.2     1000
1.8.5     1000
2.2.3     1000
1.3.7     1000
1.7.9     1000
6.2.2     1000
2.7.2     1000
2.6.1     1000
1.6.8     1000
3.1.1     1000
1.6.2     1000
1.2.1     1000
1.6.4     1000
1.2.5     1000
2.3.1     1000
5.1.1     1000
10.1.1    1000
2.1.1     1000
2.6.3     1000
6.2.1     1000
10.2.1    1000
1.2.4     1000
3.1.6     1000
Name: Rxn Class, dtype: int64
df_rxn.iloc[42069]
SMILES       [H][H].[O:32]=[C:18]1[NH:17][C:16](=[O:33])[C@...
Patent No                                           US08377927
Rxn Class                                                6.3.1
Name: 42069, dtype: object
df_rxn.SMILES[42069]
'[H][H].[O:32]=[C:18]1[NH:17][C:16](=[O:33])[C@@H:15]([c:12]2[cH:11][cH:10][c:9]([O:8]Cc3ccccc3)[cH:14][cH:13]2)[C@@H:19]1[c:20]1[cH:21][n:22]2[c:31]3[c:30]1[cH:29][cH:28][cH:27][c:26]3[CH2:25][CH2:24][CH2:23]2>>[O:32]=[C:18]1[NH:17][C:16](=[O:33])[C@@H:15]([c:12]2[cH:13][cH:14][c:9]([OH:8])[cH:10][cH:11]2)[C@@H:19]1[c:20]1[cH:21][n:22]2[c:31]3[c:30]1[cH:29][cH:28][cH:27][c:26]3[CH2:25][CH2:24][CH2:23]2'
display_rxn(df_rxn.SMILES[42069])

Generate Chemical Entries object in Rdkit from the RXN SMILES

%%time 
# Convert Smiles strings to reaction objects - this takes the most time and might be helpful if parallelized 
from rdkit.Chem import rdChemReactions # Main reaction analysis class 
df_rxn['rxn_obj'] = df_rxn['SMILES'].apply(rdChemReactions.ReactionFromSmarts)
CPU times: user 13.9 s, sys: 1.61 s, total: 15.5 s
Wall time: 15.5 s
df_rxn['rxn_obj'][42069]

temp_rxn = df_rxn['rxn_obj'][42069]
type(temp_rxn)
rdkit.Chem.rdChemReactions.ChemicalReaction

Fingerprints in RDkit

More information here: https://www.rdkit.org/UGM/2012/Landrum_RDKit_UGM.Fingerprints.Final.pptx.pdf

Base reaction class in RDKit reaction class now moved to a new class name: http://rdkit.org/docs/source/rdkit.Chem.rdChemReactions.html

Here I am using Reaction Difference FPs for converting to FPs - another option is to use the Transformation FPs

Fingerprint Type Meaning
Difference FPs Take difference of structural FPs of reactant and product
Structural FPs Concatenate the FPs of reactant and product in 1 vector

Another option: - Adding in agent during the fingerprint generation – weighting its importance - Appending the agent after the FP formation

# Check to see if you can convert this to RDkit FPs  
AllChem.ReactionFingerprintParams()
<rdkit.Chem.rdChemReactions.ReactionFingerprintParams at 0x2ba8fcb9f670>
Chem.rdChemReactions.ReactionFingerprintParams()
<rdkit.Chem.rdChemReactions.ReactionFingerprintParams at 0x2ba8fcb988b0>
rdChemReactions.CreateDifferenceFingerprintForReaction(temp_rxn)
<rdkit.DataStructs.cDataStructs.UIntSparseIntVect at 0x2ba8fcfa87b0>

Function to include agent in the FPs

# Featurize the agents in the rxn 
## This is taken from the paper SI 

def create_agent_feature_FP(rxn):    
    rxn.RemoveUnmappedReactantTemplates()
    agent_feature_Fp = [0.0]*9
    
    for nra in range(rxn.GetNumAgentTemplates()):
        mol = rxn.GetAgentTemplate(nra)
        mol.UpdatePropertyCache(strict=False)
        Chem.GetSSSR(mol)
        
        try:
            ri = mol.GetRingInfo()
            agent_feature_Fp[0] += Descriptors.MolWt(mol)
            agent_feature_Fp[1] += mol.GetNumAtoms()
            agent_feature_Fp[2] += ri.NumRings()
            agent_feature_Fp[3] += Descriptors.MolLogP(mol)
            agent_feature_Fp[4] += Descriptors.NumRadicalElectrons(mol)
            agent_feature_Fp[5] += Descriptors.TPSA(mol)
            agent_feature_Fp[6] += Descriptors.NumHeteroatoms(mol)
            agent_feature_Fp[7] += Descriptors.NumHAcceptors(mol)
            agent_feature_Fp[8] += Descriptors.NumHDonors(mol)
        except:
            continue 
            
    return agent_feature_Fp

def create_agent_morgan2_FP(rxn):    
    rxn.RemoveUnmappedReactantTemplates()
    morgan2 = None
    
    for nra in range(rxn.GetNumAgentTemplates()):
        mol = rxn.GetAgentTemplate(nra)
        mol.UpdatePropertyCache(strict=False)
        Chem.GetSSSR(mol)
        try:
            mg2 = AllChem.GetMorganFingerprint(mol,radius=2)
            if morgan2 is None and mg2 is not None:
                morgan2 = mg2
            elif mg2 is not None:
                morgan2 += mg2
        except:
            print("Cannot build agent Fp\n")
    
    if morgan2 is None: 
        morgan2 = DataStructs.UIntSparseIntVect(2048)
        
    return morgan2

# Include agents in the fingerprint as either a reactant or product
## Inputs are reaction object, fp_type object, int, int

# Create dictionary of all Molecular Fingerprinting types with names
fptype_dict = {"AtomPairFP": AllChem.FingerprintType.AtomPairFP,
               "MorganFP": AllChem.FingerprintType.MorganFP, 
               "TopologicalFP": AllChem.FingerprintType.TopologicalTorsion, 
               "PatternFP": AllChem.FingerprintType.PatternFP, 
               "RDKitFP": AllChem.FingerprintType.RDKitFP}

# Construct a difference fingerprint for a ChemicalReaction by subtracting the reactant fingerprint from the product fingerprint
def diff_fpgen(rxn, fptype_dict = fptype_dict, fp_type = 'MorganFP', include_agent=True, agent_weight=1, nonagent_weight=10):
    params = rdChemReactions.ReactionFingerprintParams()
    params.fptype = fptype_dict[fp_type]
    params.includeAgents = include_agent
    
    if include_agent == True: 
        '''
        If including agent then how is it weighted? 
        '''
        params.agentWeight = agent_weight
        params.nonAgentWeight = nonagent_weight
        
    fp = rdChemReactions.CreateDifferenceFingerprintForReaction(rxn,params)
    return fp

Function to convert sparse to numpy array

from rdkit import DataStructs
def fingerprint2Numpy(FPs):
    fp_np = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(FPs, fp_np)
    return fp_np

# convert a hashed SparseIntvect into a numpy float vector
def hashedFPToNPfloat(fp,fpsz=2048):
    nfp = np.zeros((fpsz,), float)
    for idx,v in fp.GetNonzeroElements().items():
        nfp[idx]+=float(v)
    return nfp

Convert the rxn objects to FPs and save pickle

df_rxn.sample(2)
SMILES Patent No Rxn Class rxn_obj
37512 [OH-].[Na+].Cl.[K+].[BH3-]C#N.[CH3:5][CH2:4][N... US06964966 1.2.5 <rdkit.Chem.rdChemReactions.ChemicalReaction o...
934 [OH-].[K+].[CH3:14][C@H:5]([CH2:6][c:7]1[cH:8]... 05166218 1.7.9 <rdkit.Chem.rdChemReactions.ChemicalReaction o...
%%time
df_rxn['FP_Morgan_wo_agents'] = df_rxn['rxn_obj'].apply(diff_fpgen)
CPU times: user 18.5 s, sys: 1.05 s, total: 19.5 s
Wall time: 19.6 s

Adding in agents is giving me problem right now - debug it eventually

df_rxn[‘Agent_Morgan_FP2’] = df_rxn[‘rxn_obj’].apply(create_agent_feature_FP)

Make training and test set

%%time 
X_FPs = np.array( [hashedFPToNPfloat(x) for x in df_rxn['FP_Morgan_wo_agents']] )
CPU times: user 3.38 s, sys: 591 ms, total: 3.97 s
Wall time: 4 s
Y_class = np.array( df_rxn['Rxn Class'] )
rtypes = sorted(list(reaction_types))
rtype_int = [int(''.join(entry.split('.'))) for entry in rtypes]
len(set(rtype_int))
50

Note on multi-class classification:

https://scikit-learn.org/stable/modules/multiclass.html#multiclass-classification

LabelBinarizer is not needed if you are using an estimator that already supports multiclass data.

https://scikit-learn.org/stable/modules/preprocessing_targets.html#preprocessing-targets

Option 1: OHE

Create one hot encoding – does it help to create OHE now? Not sure but doing it here as a first pass.

Y_class_labels = [ rtypes.index(i) for i in Y_class]

Y_class_OHE = np.zeros(shape=(len(Y_class_labels), len(rtypes)), dtype=int) for i, j in enumerate(Y_class_labels): Y_class_OHE[i][j] = 1

rxn_dict = {i:0 for i in rtypes} for i, j in enumerate(Y_train): rxn_class_id = int(np.argmax(j)) rxn_dict[ rtypes[rxn_class_id] ] += 1

rxn_dict

Option 2: Leave as is

leave_as_is = True 
if leave_as_is == True:
    Y_target = Y_class
else: 
    Y_target = Y_class_OHE 
from sklearn.model_selection import StratifiedShuffleSplit
stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
for train_idx, test_idx in stratSplit.split(X_FPs, Y_target):
    X_train = X_FPs[train_idx]
    Y_train = Y_target[train_idx]
    
    X_test = X_FPs[test_idx]
    Y_test = Y_target[test_idx]

Random Forest

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=200,n_estimators=250,random_state=42)
model.fit(X_train, Y_train)
RandomForestClassifier(max_depth=200, n_estimators=250, random_state=42)
Y_test_predict = model.predict(X_test)

Y_test_predict_classes = [ np.argmax(i) for i in Y_test_predict ]

Y_test_class = [ np.argmax(i) for i in Y_test ]

from sklearn.metrics import confusion_matrix, classification_report
report_real = classification_report(Y_test, Y_test_predict, output_dict=True)
cmat_real = confusion_matrix(Y_test,Y_test_predict)
sum(cmat_real,0)
array([488, 517, 503, 515, 508, 494, 498, 473, 503, 529, 489, 442, 510,
       500, 480, 475, 504, 498, 475, 520, 476, 489, 502, 503, 519, 503,
       498, 479, 521, 495, 500, 497, 484, 534, 515, 516, 498, 494, 502,
       534, 499, 509, 506, 501, 503, 496, 498, 497, 504, 507])
from sklearn import metrics
# evaluate model calculating recall, precision and F-score, return the confusion matrix
def evaluateModel(_model, _testFPs, _test_rxn_labels, _sorted_rxn_label, _names_rTypes):
    
    preds = _model.predict(_testFPs)
    
    #pred_class = [ int(np.argmax(pred_entry)) for pred_entry in preds ]
    #testReactionTypes_class = [ int(np.argmax(test_entry))for test_entry in testReactionTypes ]

    cmat = metrics.confusion_matrix(_test_rxn_labels, preds)
    
    colCounts = sum(cmat,0)
    rowCounts = sum(cmat,1)

    print('%2s %7s %7s %7s     %s'%("ID","recall","prec","F-score ","reaction class"))
    
    sum_recall=0
    sum_prec=0
    
    for i, rxn_class_label in enumerate(_sorted_rxn_label):
        
        recall = 0
        if rowCounts[i] > 0:
            recall = float(cmat[i,i])/rowCounts[i]
        sum_recall += recall
        
        prec = 0
        if colCounts[i] > 0:
            prec = float(cmat[i,i])/colCounts[i]
        sum_prec += prec
        
        f_score = 0
        if (recall + prec) > 0:
            f_score = 2 * (recall * prec) / (recall + prec)   
        
        print('%2d % .4f % .4f % .4f % 9s %s'%(i, recall, prec, f_score,  rxn_class_label, _names_rTypes[rxn_class_label]))
    
    mean_recall = sum_recall/len(_sorted_rxn_label)
    mean_prec = sum_prec/len(_sorted_rxn_label)
    
    if (mean_recall + mean_prec) > 0:
        mean_fscore = 2*(mean_recall*mean_prec)/(mean_recall+mean_prec)
    
    print("Mean:% 3.2f % 7.2f % 7.2f"%(mean_recall,mean_prec,mean_fscore))
    
    return cmat
cmat_rFP_agentFeature = evaluateModel(model, X_test, Y_test, rtypes, names_rTypes)
ID  recall    prec F-score      reaction class
 0  0.9939  0.9959  0.9949     1.2.1 Aldehyde reductive amination
 1  0.9459  0.9478  0.9469     1.2.4 Eschweiler-Clarke methylation
 2  0.9821  0.9841  0.9831     1.2.5 Ketone reductive amination
 3  0.9516  0.9534  0.9525     1.3.6 Bromo N-arylation
 4  0.9666  0.9685  0.9676     1.3.7 Chloro N-arylation
 5  0.9818  0.9838  0.9828     1.3.8 Fluoro N-arylation
 6  0.9639  0.9659  0.9649     1.6.2 Bromo N-alkylation
 7  0.9810  0.9831  0.9820     1.6.4 Chloro N-alkylation
 8  0.9365  0.9384  0.9374     1.6.8 Iodo N-alkylation
 9  0.9245  0.9263  0.9254     1.7.4 Hydroxy to methoxy
10  0.9837  0.9857  0.9847     1.7.6 Methyl esterification
11  0.9865  0.9887  0.9876     1.7.7 Mitsunobu aryl ether synthesis
12  0.9413  0.9431  0.9422     1.7.9 Williamson ether synthesis
13  0.9900  0.9920  0.9910     1.8.5 Thioether synthesis
14  0.9854  0.9875  0.9865    10.1.1 Bromination
15  0.9874  0.9895  0.9884    10.1.2 Chlorination
16  0.9901  0.9921  0.9911    10.1.5 Wohl-Ziegler bromination
17  0.9920  0.9940  0.9930    10.2.1 Nitration
18  0.8634  0.8653  0.8644    10.4.2 Methylation
19  0.9347  0.9365  0.9356     2.1.1 Amide Schotten-Baumann
20  0.9748  0.9769  0.9759     2.1.2 Carboxylic acid + amine reaction
21  0.9776  0.9796  0.9785     2.1.7 N-acetylation
22  0.9901  0.9920  0.9910     2.2.3 Sulfonamide Schotten-Baumann
23  0.9921  0.9940  0.9930     2.3.1 Isocyanate + amine reaction
24  0.9558  0.9576  0.9567     2.6.1 Ester Schotten-Baumann
25  0.9841  0.9861  0.9851     2.6.3 Fischer-Speier esterification
26  0.9980  1.0000  0.9990     2.7.2 Sulfonic ester Schotten-Baumann
27  0.9792  0.9812  0.9802     3.1.1 Bromo Suzuki coupling
28  0.9368  0.9386  0.9377     3.1.5 Bromo Suzuki-type coupling
29  0.9980  1.0000  0.9990     3.1.6 Chloro Suzuki-type coupling
30  0.9940  0.9960  0.9950     3.3.1 Sonogashira coupling
31  0.9900  0.9920  0.9910     3.4.1 Stille reaction
32  0.9856  0.9876  0.9866     5.1.1 N-Boc protection
33  0.9327  0.9345  0.9336     6.1.1 N-Boc deprotection
34  0.9690  0.9709  0.9699     6.1.3 N-Cbz deprotection
35  0.9632  0.9651  0.9642     6.1.5 N-Bn deprotection
36  0.9800  0.9819  0.9809     6.2.1 CO2H-Et deprotection
37  0.9879  0.9899  0.9889     6.2.2 CO2H-Me deprotection
38  0.9901  0.9920  0.9910     6.2.3 CO2H-tBu deprotection
39  0.9327  0.9345  0.9336     6.3.1 O-Bn deprotection
40  0.9880  0.9900  0.9890     6.3.7 Methoxy to hydroxy
41  0.9784  0.9804  0.9794     7.1.1 Nitro to amino
42  0.9783  0.9802  0.9793     7.2.1 Amide to amine reduction
43  0.9861  0.9880  0.9870     7.3.1 Nitrile reduction
44  0.9881  0.9901  0.9891     7.9.2 Carboxylic acid to alcohol reduction
45  0.9980  1.0000  0.9990     8.1.4 Alcohol to aldehyde oxidation
46  0.9920  0.9940  0.9930     8.1.5 Alcohol to ketone oxidation
47  0.9960  0.9980  0.9970     8.2.1 Sulfanyl to sulfinyl
48  0.9703  0.9722  0.9713     9.1.6 Hydroxy to chloro
49  0.9764  0.9783  0.9773     9.3.1 Carboxylic acid to acid chloride
Mean: 0.97    0.97    0.97
def labelled_cmat(cmat, labels, figsize=(20,15), labelExtras=None, dpi=300, threshold=0.01, xlabel=True, ylabel=True, rotation=90):
    
    rowCounts = np.array(sum(cmat,1),dtype=float)
    cmat_percent = cmat / rowCounts[:,None]
    
    #zero all elements that are less than 1% of the row contents
    ncm = cmat_percent*(cmat_percent>threshold)

    fig, ax = plt.subplots(1,1, figsize=figsize)
    pax=ax.pcolor(ncm,cmap=cm.ocean_r)
    ax.set_frame_on(True)

    # put the major ticks at the middle of each cell
    ax.set_yticks(np.arange(cmat.shape[0])+0.5, minor=False)
    ax.set_xticks(np.arange(cmat.shape[1])+0.5, minor=False)

    # want a more natural, table-like display
    ax.invert_yaxis()
    ax.xaxis.tick_top()

    if labelExtras is not None:
        labels = [' %s %s'%(x,labelExtras[x].strip()) for x in labels]
    
    ax.set_xticklabels([], minor=False) 
    ax.set_yticklabels([], minor=False)

    if xlabel:
        ax.set_xticklabels(labels, minor=False, rotation=rotation, horizontalalignment='left') 
    if ylabel:
        ax.set_yticklabels(labels, minor=False)

    ax.grid(True)
    fig.colorbar(pax)
    plt.axis('tight')
rowCounts = np.array(sum(cmat_rFP_agentFeature,1),dtype=float)
cmat_percent = cmat_rFP_agentFeature/rowCounts[:,None]

#zero all elements that are less than 1% of the row contents
ncm = cmat_percent*(cmat_percent>0.01)


fig, ax = plt.subplots(1,1, figsize=(20,15))
pax=ax.pcolor(ncm,cmap=cm.ocean_r)
ax.set_frame_on(True)

labels = [' %s %s'%(x,names_rTypes[x].strip()) for x in rtypes]
ax.set_yticks(np.arange(cmat_rFP_agentFeature.shape[0])+0.5, minor=False)
ax.set_xticks(np.arange(cmat_rFP_agentFeature.shape[1])+0.5, minor=False)

ax.set_xticklabels(labels, minor=False, rotation=90, horizontalalignment='left') 
ax.set_yticklabels(labels, minor=False)
        
# want a more natural, table-like display
ax.invert_yaxis()
ax.xaxis.tick_top()
plt.show()