Macs in Chemistry

Insanely great science

HERG_RFclassifierModelOnly
In [23]:
import pandas as pd
import numpy as np
import warnings; warnings.simplefilter('ignore')

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Draw, PandasTools
from rdkit.Chem.Draw import IPythonConsole

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import recall_score, roc_auc_score
from matplotlib import cm


import math
import pickle
import os
In [24]:
if os.path.exists('rf_clf.pickle'):
    print('Loading from pickle')
    # pickle file exists, just load it
    with open('rf_clf.pickle', 'rb') as f:
        rf_clf = pickle.load(f)
Loading from pickle
In [25]:
class FP:
    """
    A fingerprint class that inserts molecular fingerprints into pandas data frame
    """
    def __init__(self, fp):
        self.fp = fp
    def __str__(self):
        return "%d bit FP" % len(self.fp)
    def __len__(self):
        return len(self.fp)

def get_morgan_fp(mol):
    """
    Returns the RDKit Morgan fingerprint for a molecule
    """
    info = {}
    arr = np.zeros((1,))
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024, useFeatures=False, bitInfo=info)
    DataStructs.ConvertToNumpyArray(fp, arr)
    arr = np.array([len(info[x]) if x in info else 0 for x in range(1024)])

    return FP(arr)
In [26]:
mySMILES ='Fc1ccc(cc1)n3c2ccc(Cl)cc2c(c3)C5CCN(CCN4C(=O)NCC4)CC5'
from rdkit import RDConfig
mySMILESinput = pd.DataFrame(columns=['ID','my_smiles']) #need ID to match dataframes later
mySMILESinput = mySMILESinput.append({'ID':123, 'my_smiles':mySMILES}, ignore_index=True)   #assign ID
PandasTools.AddMoleculeColumnToFrame(mySMILESinput,'my_smiles','ROMol')
In [27]:
mySMILESinput['fp'] = mySMILESinput.apply(lambda x: get_morgan_fp(x['ROMol']), axis=1) #add fingerprint
In [28]:
mySMILESinput
Out[28]:
ID my_smiles ROMol fp
0 123 Fc1ccc(cc1)n3c2ccc(Cl)cc2c(c3)C5CCN(CCN4C(=O)NCC4)CC5 Mol 1024 bit FP
In [29]:
resQuery = np.array([x.fp for x in mySMILESinput.fp])
In [30]:
resQuery
Out[30]:
array([[0, 0, 0, ..., 0, 0, 0]])
In [31]:
y_pred = rf_clf.predict(resQuery)
if y_pred == 1:
    result = "HERGActive"
if y_pred == 0:
    result = "Inactive" 
In [32]:
print(rf_clf.predict(resQuery))
print(rf_clf.predict_proba(resQuery))
[1]
[[ 0.05  0.95]]

Fingerprint Contributions

In [33]:
mols = [None, Chem.MolFromSmiles(mySMILES)]

radius = 2
bit_size = 1024
fps_morgan2 = []
info_morgan2 = []
num_mols = len(mols) - 1
mols = [mols[i+1] for i in range(num_mols)] # remove reference cmp from list
for m in mols:
    info = {}
    fps_morgan2.append(AllChem.GetMorganFingerprintAsBitVect(m, radius, bit_size, bitInfo=info))
    info_morgan2.append(info)
    
mol_weights = []
for i,m in enumerate(mols):
    weights = []
    orig_pp = rf_clf.predict_proba(np.array(fps_morgan2[i]).reshape(1, -1))[0][1]
    # get bits for each atom
    bitmap = [~DataStructs.ExplicitBitVect(1024) for x in range(m.GetNumAtoms())]
    for bit, es in info_morgan2[i].items():
        for at1, rad in es:
            if rad == 0: # for radius 0
                bitmap[at1][bit] = 0
            else: # for radii > 0
                env = Chem.FindAtomEnvironmentOfRadiusN(m, rad, at1)
                amap = {}
                submol = Chem.PathToSubmol(m, env, atomMap=amap)
                for at2 in amap.keys():
                    bitmap[at2][bit] = 0
    # loop over atoms
    for at1 in range(m.GetNumAtoms()):
        new_fp = np.array(fps_morgan2[i] & bitmap[at1]).reshape(1, -1)
        new_pp = rf_clf.predict_proba(new_fp)[0][1]
        weights.append(orig_pp-new_pp)
    mol_weights.append(weights)
In [34]:
def generateSimilarityMaps(mols, weights, fp):
    '''Generates a similarity map for a set of molecules and weights'''
    # colormap to use
    mycm = cm.PiYG
    # loop over molecules
    for i,m in enumerate(mols):
        fig = Draw.MolToMPL(m, coordScale=1.5, size=(250,250))
        # the values 0.02 and 0.01 can be adjusted for the size of the molecule
        x,y,z = Draw.calcAtomGaussians(m, 0.02, step=0.01, weights=weights[i])
        # use the maximum absolute peak as maximum scale
        maxscale = max(math.fabs(np.min(z)), math.fabs(np.max(z)))
        # this does the coloring
        fig.axes[0].imshow(z, cmap=mycm, interpolation='bilinear', origin='lower', extent=(0,1,0,1), vmin=-maxscale, vmax=maxscale)
        # this draws 10 contour lines
        # alternatively also the z values for the lines can be specified
        fig.axes[0].contour(x, y, z, 10, colors='k', alpha=0.5)
        ax = fig.axes[0]
        ax.set_title(result) #adds HERG result to title
        # this writes the figure in a file
       # fig.savefig('pics/mol'+str(i+1)+'_'+fp+'.png', bbox_inches='tight') #if generating multiple molecules
        fig.savefig('pics/mol1_rf.png', bbox_inches='tight')

Plot Output

In [35]:
generateSimilarityMaps(mols, mol_weights, 'rf')
from IPython.display import Image
Image('pics/mol1_rf.png')
Out[35]:
In [ ]: