Flattened Jupyter notebooks containing the analysis workflow from the manuscript "A quantitative model for the prediction of sooting tendency from molecular structure."
The full repository (with data files) is available at pstjohn/ysi_qsar_energy_fuels.
name: ysi
channels:
  - https://conda.anaconda.org/rdkit
dependencies:
  - python=3.5
  - jupyter
  - pandas
  - seaborn
  - rdkit
  - jinja2
  - pip:
    - https://github.com/pstjohn/scikit-learn/archive/pstjohn-changes.zip
    - pubchempy
    - keras==0.3.3
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from rdkit import Chem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def process_mol(mol):\n",
    "    \n",
    "    props = mol.GetPropsAsDict()\n",
    "    try: cas = props['CAS Registry Numbers']\n",
    "    except KeyError: cas = None\n",
    "    \n",
    "    return pd.Series({\n",
    "        'SMILES' : Chem.MolToSmiles(mol, isomericSmiles=True),\n",
    "        'Name' : props['ChEBI Name'],\n",
    "        'ChEBI' : props['ChEBI ID'],\n",
    "        'CAS' : cas,\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "mols = pd.Series(Chem.SDMolSupplier('chebi_subset.sdf'))\n",
    "mol_df = mols.apply(process_mol)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "dragon_descriptors = pd.read_csv('chebi_dragon_output.tsv.gz', sep='\\t', index_col=0)\n",
    "dragon_descriptors.index = mol_df.SMILES\n",
    "dragon_descriptors.drop('NAME', 1, inplace=True)\n",
    "de_duplicated = dragon_descriptors[~dragon_descriptors.index.duplicated()]\n",
    "\n",
    "de_duplicated.to_pickle('../ysi_utils/descriptors/dragon_chebi.p')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "mol_df.to_pickle('../ysi_utils/data/chebi.p')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
import numpy as np
# Seed numpy's global RNG *before* keras is imported so that weight
# initialization is reproducible across runs.
np.random.seed(0)
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (StandardScaler, Normalizer, RobustScaler,
                                   MaxAbsScaler, Imputer)
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, f_regression
from sklearn import svm

# Project-local data: `dragon` holds DRAGON molecular descriptors indexed to
# match the YSI targets; y_train / y_test are the train / hold-out targets.
# NOTE(review): presumably indexed by SMILES as in the ChEBI notebook above —
# confirm against ysi_utils.
from ysi_utils.descriptors import dragon
from ysi_utils.validation import y_train as y
from ysi_utils.validation import y_test

# Align the descriptor rows with the train and test target indices.
X = dragon.loc[y.index]
X_test = dragon.loc[y_test.index]
# Optimal hyperparameters selected by the hyperopt fitting routine.
params = {
    'activation': 'relu',
    'batch_size': 39,
    'dropout1': 0.36632971139053216,
    'dropout2': 0.030115015871571206,
    'feature_extractor': {'n_features_to_select': 390, 'step': 10, 'type': 'RFE'},
    'nb_epochs': 100,
    'num_layers': {
        'dropout3': 0.31435729897990794,
        'layers': 'three',
        'units3': 161,
    },
    'optimizer': 'adam',
    'preprocessor': 'MaxAbsScaler',
    'units1': 140,
    'units2': 140,
}
def get_input_size(params):
    """Return the number of features emitted by the feature-extraction step.

    The hyperopt search space encodes the output width under a different key
    depending on the extractor type: 'n_components' for PCA, 'k' for
    SelectKBest, and 'n_features_to_select' for RFE.  Keys are checked in
    that priority order (matching the original if/elif chain).

    Raises:
        ValueError: if none of the known size keys is present.  (The
            original code silently returned None here, which would later
            surface as an opaque keras error via Dense(input_dim=None).)
    """
    extractor = params['feature_extractor']
    for size_key in ('n_components', 'k', 'n_features_to_select'):
        if size_key in extractor:
            return extractor[size_key]
    raise ValueError(
        "feature_extractor must define one of 'n_components', 'k', or "
        "'n_features_to_select'; got keys %s" % sorted(extractor))
def build_model(params=params):
    """Construct and compile the feed-forward keras regression network.

    Two dense hidden layers are always present; a third is appended when
    params['num_layers']['layers'] == 'three'.  Uses the keras 0.x API
    (`output_dim` / `init` keywords) pinned by the environment file.
    """
    activation = params['activation']
    net = Sequential()

    # First hidden layer: input width matches the feature-extraction output.
    net.add(Dense(output_dim=params['units1'],
                  input_dim=get_input_size(params)))
    net.add(Activation(activation))
    net.add(Dropout(params['dropout1']))

    # Second hidden layer.
    net.add(Dense(output_dim=params['units2'], init="glorot_uniform"))
    net.add(Activation(activation))
    net.add(Dropout(params['dropout2']))

    # Optional third hidden layer, chosen by the hyperparameter search.
    layer_cfg = params['num_layers']
    if layer_cfg['layers'] == 'three':
        net.add(Dense(output_dim=layer_cfg['units3'], init="glorot_uniform"))
        net.add(Activation(activation))
        net.add(Dropout(layer_cfg['dropout3']))

    # Single linear output unit, trained with mean-absolute-error loss.
    net.add(Dense(output_dim=1))
    net.compile(loss='mae', optimizer=params['optimizer'])
    return net
# Wrap the keras builder so it exposes the sklearn estimator API and can be
# used as the final step of a Pipeline.
ffann = KerasRegressor(build_fn=build_model,
                       nb_epoch=params['nb_epochs'],
                       batch_size=params['batch_size'],
                       verbose=0)
# Set up preprocessing pipeline
imputer = Imputer()  # fill missing descriptor values (sklearn default: column mean)
var_filter = VarianceThreshold()  # drop constant (zero-variance) descriptor columns
# Map the hyperopt-selected preprocessor name onto its sklearn scaler class.
preprocessor_dict = {
    'StandardScaler' : StandardScaler,
    'MaxAbsScaler' : MaxAbsScaler,
    'Normalizer' : Normalizer,
    'RobustScaler' : RobustScaler,
}
scaler = preprocessor_dict[params['preprocessor']]()
# Build the feature-extraction step chosen by the hyperparameter search.
# Every key in the 'feature_extractor' dict other than 'type' is forwarded
# as a keyword argument to the selected transformer, so the copy/pop is
# hoisted once instead of being repeated in every branch.
extractor_opts = dict(params['feature_extractor'])
extractor_type = extractor_opts.pop('type')

if extractor_type == 'pca':
    feature_extraction = PCA(**extractor_opts)
elif extractor_type == 'RFE':
    # Recursive feature elimination, ranking features with a linear SVR.
    svr = svm.SVR(kernel='linear')
    feature_extraction = RFE(estimator=svr, **extractor_opts)
elif extractor_type == 'SelectKBest':
    feature_extraction = SelectKBest(score_func=f_regression, **extractor_opts)
else:
    # Previously an unknown type fell through silently, leaving
    # `feature_extraction` unbound and raising a confusing NameError only
    # when the Pipeline was assembled below.  Fail fast instead.
    raise ValueError("Unknown feature_extractor type: %r" % extractor_type)
# Full model: impute missing values -> drop zero-variance columns -> scale ->
# select features -> feed-forward neural-network regressor.
model = Pipeline(steps=[
    ('imputer', imputer),
    ('filter', var_filter),
    ('scaler', scaler),
    ('feature_extraction', feature_extraction),
    ('ffann', ffann)
])