# Standard Data Science Boilerplate
# Notebook-wide setup: numeric/plotting imports, offline plotting configuration,
# IPython display options, widget helpers and the RDKit cheminformatics imports.
import numpy as np
import pandas as pd
import scipy
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Initialise plotly's notebook mode before any figure is drawn.
init_notebook_mode(connected=True)
import cufflinks as cf
# Switch cufflinks to offline mode as well; cufflinks is presumably what
# provides the DataFrame.iplot() method used by the plotting cells below
# -- TODO confirm.
cf.go_offline(connected=True)
cf.set_config_file(colorscale='plotly', world_readable=True)
# Extra options
#pd.options.display.max_rows = 30
#pd.options.display.max_columns = 25
# Show all code cells outputs
# (with 'all', every bare expression in a cell is displayed, not only the last
# one; later cells rely on this, e.g. `moldf.head(2)` followed by `moldf.dtypes`)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import os
from IPython.display import Image, display, HTML
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
#Boilerplate RDKit
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
import rdkit.rdBase
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit import DataStructs
from rdkit.DataStructs import BitVectToText
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
# NOTE: DataStructs and pandas are already imported above; the two repeated
# imports below are harmless no-ops.
from rdkit import DataStructs
import pandas as pd
from rdkit.Chem import PandasTools as PandasTools
from rdkit.Chem import Descriptors as Descriptors
# Load the SD file into a DataFrame. `molColName='SMILES'` names the column
# that receives the molecule read from each record.
sdfFilePath = 'FDA2019smallmols.sdf'
moldf = PandasTools.LoadSDF(sdfFilePath, molColName='SMILES')
#moldf["HBA"].fillna(0, inplace=True)

# Property columns that must be numeric for the interactive filters and plots
# below (they select columns with `select_dtypes('number')`). The values are
# presumably loaded from the SD file as strings -- TODO confirm.
NUMERIC_COLUMNS = [
    "logP",
    "logD",
    "mass",
    "HBA",
    "HBD",
    "atomCount",
    "TPSA",
    "RBC",
    "MR",
    "HAC",
    "FractionAromatic",
]
# Cast each column in place on the same DataFrame object (rather than
# rebinding `moldf` to a converted copy) so anything attached to the frame
# by LoadSDF is kept. A missing column raises KeyError, and a value that
# cannot be parsed as a float raises ValueError.
for column_name in NUMERIC_COLUMNS:
    moldf[column_name] = moldf[column_name].astype(float)

# Bare expressions: shown as cell output in the notebook.
moldf.head(2)
moldf.dtypes
#Need to work out how to compute range appropriate for property
@interact
def show_molecules_more_than(column=list(moldf.select_dtypes('number').columns), x=(-4, 10, 1),):
    """Show the molecules whose value in ``column`` is strictly greater than ``x``.

    Decorated with ``@interact``, so ipywidgets builds the controls from the
    default values: the list of numeric column names becomes a dropdown and
    the ``(min, max, step)`` tuple becomes an integer slider.

    Args:
        column: Name of a numeric column of ``moldf`` to filter on.
        x: Threshold; only rows with ``moldf[column] > x`` are displayed.
            The slider range (-4 to 10) is fixed and is not adapted to the
            selected column.
    """
    # Heading first; the closing tag must be </h2> or the heading is left
    # unterminated.
    display(HTML(f'<h2>Showing molecules with {column} > {x}</h2>'))
    # Fixed subset of columns shown for the matching rows.
    shown_columns = ['Active Ingredient', 'logP', 'logD', 'atomCount', 'mass', 'HBA', 'SMILES', 'CSID']
    display(moldf.loc[moldf[column] > x, shown_columns])
@interact
def scatter(x=list(moldf.select_dtypes('number').columns),
            y=list(moldf.select_dtypes('number').columns)[1:]):
    """Draw an interactive 2-D scatter plot of two numeric columns of ``moldf``.

    Decorated with ``@interact``: each list default becomes a dropdown. The
    ``y`` list skips the first numeric column, so the initial x and y
    selections differ.

    Args:
        x: Column name plotted on the x axis.
        y: Column name plotted on the y axis.
    """
    # NOTE: str.title() title-cases the column name for the labels, so a
    # name such as 'logP' is shown as 'Logp'.
    x_label = x.title()
    y_label = y.title()
    plot_options = dict(
        kind='scatter',
        x=x,
        y=y,
        mode='markers',
        xTitle=x_label,
        yTitle=y_label,
        title=f'{y_label} vs {x_label}',
    )
    moldf.iplot(**plot_options)
@interact
def scatter_plot(x=list(moldf.select_dtypes('number').columns),
                 y=list(moldf.select_dtypes('number').columns)[1:]):
    """Draw an interactive 3-D scatter plot of ``moldf``.

    The x and y axes are chosen from dropdowns of numeric column names
    (built by ``@interact`` from the list defaults). The z axis is always
    the 'abzn' column, and points are grouped by 'Active Ingredient'.

    Args:
        x: Column name plotted on the x axis.
        y: Column name plotted on the y axis.
    """
    # NOTE(review): 'abzn' is hard-coded and is not one of the columns cast to
    # float when the data is loaded -- confirm it exists in the SD file.
    # Labels are title-cased column names ('logP' is shown as 'Logp').
    label_for_x, label_for_y = x.title(), y.title()
    heading = f'{label_for_y} vs {label_for_x}'
    moldf.iplot(
        kind='scatter3d',
        x=x, y=y, z='abzn',
        mode='markers',
        categories='Active Ingredient',
        xTitle=label_for_x,
        yTitle=label_for_y,
        title=heading,
    )
#possible options for structures
# Option 1: display the value stored in the 'SMILES' column for row 3
# (bare expression, shown as cell output).
mol = moldf.loc[3, 'SMILES']
mol

# Option 2: fetch a PNG depiction from the PubChem PUG REST service using the
# identifier stored in the 'Pubchem' column of the same row.
import requests
import IPython.display as Disp
Mycid = moldf.loc[3, 'Pubchem']
url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/%s/PNG' % Mycid)
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (unknown CID, throttling, ...) as an
# exception instead of passing an error body to Image as if it were PNG data.
response = requests.get(url, timeout=30)
response.raise_for_status()
Disp.Image(response.content)
The Jupyter notebook and data file can be downloaded from here