Macs in Chemistry

Insanely great science

rf_classifier

Imports

In [5]:
#Based on "The Catch-22 of Predicting hERG Blockade Using Publicly Accessible Bioactivity Data" DOI http://dx.doi.org/10.1021/acs.jcim.8b00150
#https://github.com/AGPreissner/Publications
In [8]:
import pandas as pd
import numpy as np
import warnings; warnings.simplefilter('ignore')

from rdkit import Chem, DataStructs
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import recall_score, roc_auc_score

import pickle
import os

Helper functions

In [9]:
class FP:
    """
    A fingerprint class that inserts molecular fingerprints into pandas data frame
    """
    def __init__(self, fp):
        self.fp = fp
    def __str__(self):
        return "%d bit FP" % len(self.fp)
    def __len__(self):
        return len(self.fp)

def get_morgan_fp(mol):
    """
    Returns the RDKit Morgan fingerprint for a molecule
    """
    info = {}
    arr = np.zeros((1,))
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024, useFeatures=False, bitInfo=info)
    DataStructs.ConvertToNumpyArray(fp, arr)
    arr = np.array([len(info[x]) if x in info else 0 for x in range(1024)])

    return FP(arr)

Data preparation

In [10]:
# The data file should contain three columns 
# 1. molecule ID;
# 2. canonical SMILES; and 
# 3. activity (which is either 1 or 0)

# reading the data file into a pandas data frame
df = pd.read_csv("/Users/Chris/Projects/HERG/PublicationsMaster/hERG/datasets/training/chembl_training_T3.csv", index_col=0)

# Build ROMol objects 
PandasTools.AddMoleculeColumnToFrame(df, smilesCol='can_smiles')

# Remove molecules that could not be parsed from SMILES
df = df[~df.ROMol.isnull()]

# Calculate fingerprints and store them in df
# Note: if additional fingerprints are needed that are not available in RDkit, they must be imported with the data
df['fp'] = df.apply(lambda x: get_morgan_fp(x['ROMol']), axis=1)
#df.shape
In [11]:
df.head(n=3)
Out[11]:
can_smiles ac ROMol fp
id
CHEMBL410234 Cc1cc(C)n(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n2)n1 1 Mol 1024 bit FP
CHEMBL2407175 CCCN1C(=O)CC2(CCN(CC3CCN(C(=O)OCC)CC3)CC2)c2cccnc21 1 Mol 1024 bit FP
CHEMBL100415 CCCCN(Cc1cncn1Cc1ccc(C#N)cc1)C1CCN(Cc2ccccc2)C1=O 1 Mol 1024 bit FP

Defining X and Y

In [12]:
# create X variable (=features i.e. molecular fingerprints)
X = np.array([x.fp for x in df.fp])
#X.shape

# create Y variable (=activity values i.e. blocker;1 or non-blocker;0)
y = np.array(df.ac)
#y.shape

Cross Validation

In [13]:
# Initialize performance measures
sens     = np.array([])
spec     = np.array([])
auc      = np.array([])

# 10-fold cross-validation split
kfolds = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)

for train, test in kfolds:
    # Split data to training and test set
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]    

    # if undersampling is required - random undersampling of the majority class can be done as follows
    
    # undersampling training set
    #uSampler = RandomUnderSampler(ratio=1., replacement=False)
    #X_train, y_train = uSampler.fit_sample(X_train, y_train)
    
    # undersampling test set
    #uSampler = RandomUnderSampler(ratio=1., replacement=False)
    #X_test, y_test = uSampler.fit_sample(X_test, y_test)
    
    # Training a random forest classifier
    rf_clf = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=1)
    rf_clf.fit(X_train, y_train)
    
    # Predicting the test set
    y_pred       = rf_clf.predict(X_test)
    y_pred_proba = rf_clf.predict_proba(X_test).T[1]
    
    # Append performance measures
    auc  = np.append(auc, roc_auc_score(y_test, y_pred_proba))
    sens = np.append(sens, recall_score(y_test, y_pred, pos_label=1))
    spec = np.append(spec, recall_score(y_test, y_pred, pos_label=0))
    
    
# 10-fold cross-validation performance
print('AUC:\t\t\t%.2f +/- %.2f' % (auc.mean(), auc.std()))
print('Sensitivity:\t\t%.2f +/- %.2f' % (sens.mean(), sens.std()))
print('Specificity:\t\t%.2f +/- %.2f' % (spec.mean(), spec.std()))
AUC:			0.95 +/- 0.01
Sensitivity:		0.84 +/- 0.03
Specificity:		0.92 +/- 0.02
In [14]:
print('Generating model')
    # No pickle file, generate model
rf_clf = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=1)
rf_clf.fit(X_train, y_train)
with open('rf_clf.pickle', 'wb') as f:
    pickle.dump(rf_clf, f)

# pickle file saved in same directory as Jupyter Notebook
Generating model

Testing with single molecule

In [15]:
mySMILES ='Fc1ccc(cc1)n3c2ccc(Cl)cc2c(c3)C5CCN(CCN4C(=O)NCC4)CC5'
from rdkit import RDConfig
mySMILESinput = pd.DataFrame(columns=['ID','my_smiles']) #need ID to match dataframes later
mySMILESinput = mySMILESinput.append({'ID':123, 'my_smiles':mySMILES}, ignore_index=True)   #assign ID
PandasTools.AddMoleculeColumnToFrame(mySMILESinput,'my_smiles','ROMol')
In [16]:
mySMILESinput['fp'] = mySMILESinput.apply(lambda x: get_morgan_fp(x['ROMol']), axis=1) #add fingerprint
In [17]:
mySMILESinput
Out[17]:
ID my_smiles ROMol fp
0 123 Fc1ccc(cc1)n3c2ccc(Cl)cc2c(c3)C5CCN(CCN4C(=O)NCC4)CC5 Mol 1024 bit FP
In [18]:
resQuery = np.array([x.fp for x in mySMILESinput.fp])
In [ ]:
 
In [19]:
 y_pred = rf_clf.predict(resQuery)
In [20]:
y_pred
Out[20]:
array([1])
In [21]:
print(rf_clf.predict(resQuery))
print(rf_clf.predict_proba(resQuery))
[1]
[[ 0.06  0.94]]
In [ ]:
 
In [ ]:
 

Last Updated 13 June 2018