Skip to content

Instantly share code, notes, and snippets.

@pstjohn
Last active September 5, 2019 22:53
Show Gist options
  • Save pstjohn/b7e2f05136bf8bfd22f68f375437452d to your computer and use it in GitHub Desktop.
Save pstjohn/b7e2f05136bf8bfd22f68f375437452d to your computer and use it in GitHub Desktop.

YSI Prediction and Analysis code

Flattened Jupyter notebooks containing the analysis workflow for the manuscript "A quantitative model for the prediction of sooting tendency from molecular structure".

Full repository (with data files) is available at pstjohn/ysi_qsar_energy_fuels

name: ysi
channels:
- https://conda.anaconda.org/rdkit
dependencies:
- python=3.5
- jupyter
- pandas
- seaborn
- rdkit
- jinja2
- pip:
  - https://github.com/pstjohn/scikit-learn/archive/pstjohn-changes.zip
  - pubchempy
  - keras==0.3.3
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import numpy as np
# Seed NumPy's global RNG before the Keras / scikit-learn imports that follow.
# NOTE(review): presumably done so that weight initialisation and shuffling
# are reproducible between runs -- confirm against the library versions used.
np.random.seed(0)
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (StandardScaler, Normalizer, RobustScaler,
MaxAbsScaler, Imputer)
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, f_regression
from sklearn import svm
from ysi_utils.descriptors import dragon
from ysi_utils.validation import y_train as y
from ysi_utils.validation import y_test
# Select the rows of ``dragon`` labelled by the training-target index and by
# the test-target index, giving the two feature tables used further down.
X, X_test = dragon.loc[y.index], dragon.loc[y_test.index]
# Optimal hyper-parameters, as selected by the hyperopt fitting routine.
# The nested 'feature_extractor' and 'num_layers' entries carry the options
# for the chosen feature-selection method and the optional third hidden layer.
params = {
    'activation': 'relu',
    'batch_size': 39,
    'dropout1': 0.36632971139053216,
    'dropout2': 0.030115015871571206,
    'feature_extractor': {
        'n_features_to_select': 390,
        'step': 10,
        'type': 'RFE',
    },
    'nb_epochs': 100,
    'num_layers': {
        'dropout3': 0.31435729897990794,
        'layers': 'three',
        'units3': 161,
    },
    'optimizer': 'adam',
    'preprocessor': 'MaxAbsScaler',
    'units1': 140,
    'units2': 140,
}
def get_input_size(params):
    """Return the number of features produced by the feature-extraction step.

    The value is read from ``params['feature_extractor']`` and is used as the
    input dimension of the first dense layer of the network.

    Parameters
    ----------
    params : dict
        Hyper-parameter dictionary containing a ``'feature_extractor'``
        mapping with one of the keys ``'n_components'``, ``'k'`` or
        ``'n_features_to_select'``.

    Returns
    -------
    The value stored under the first of those keys that is present.

    Raises
    ------
    KeyError
        If ``params`` has no ``'feature_extractor'`` entry.
    ValueError
        If none of the size keys is present. Previously this case fell
        through and returned ``None``, which only surfaced later as an
        obscure failure when the network was built.
    """
    extractor_opts = params['feature_extractor']

    # Keys are checked in a fixed priority order (same as the original chain).
    size_keys = ('n_components', 'k', 'n_features_to_select')
    for size_key in size_keys:
        if size_key in extractor_opts:
            return extractor_opts[size_key]

    raise ValueError(
        "params['feature_extractor'] must define one of {}; got keys {}".format(
            ', '.join(repr(key) for key in size_keys),
            sorted(extractor_opts),
        )
    )
def build_model(params=params):
    """Assemble and compile the feed-forward network described by ``params``.

    The network is a stack of two hidden stages (dense layer, activation,
    dropout), an optional third stage that is added when
    ``params['num_layers']['layers'] == 'three'``, and a single-unit dense
    output layer. It is compiled with the ``'mae'`` loss and the optimizer
    named in ``params['optimizer']``.

    Parameters
    ----------
    params : dict, optional
        Hyper-parameter dictionary; defaults to the module-level ``params``.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    activation_name = params['activation']
    third_layer = params['num_layers']

    # One (Dense keyword arguments, dropout rate) pair per hidden stage, in
    # stacking order. Only the first layer is told its input dimension.
    hidden_specs = [
        (dict(output_dim=params['units1'], input_dim=get_input_size(params)),
         params['dropout1']),
        (dict(output_dim=params['units2'], init="glorot_uniform"),
         params['dropout2']),
    ]
    if third_layer['layers'] == 'three':
        hidden_specs.append(
            (dict(output_dim=third_layer['units3'], init="glorot_uniform"),
             third_layer['dropout3'])
        )

    network = Sequential()
    for dense_kwargs, dropout_rate in hidden_specs:
        network.add(Dense(**dense_kwargs))
        network.add(Activation(activation_name))
        network.add(Dropout(dropout_rate))

    # Single linear output unit for the regression target.
    network.add(Dense(output_dim=1))
    network.compile(loss='mae', optimizer=params['optimizer'])
    return network
# Wrap the network builder in a scikit-learn compatible regressor. The number
# of training epochs and the batch size are taken from ``params``; verbose=0
# turns off Keras' progress output.
ffann = KerasRegressor(
    build_fn=build_model,
    nb_epoch=params['nb_epochs'],
    batch_size=params['batch_size'],
    verbose=0,
)
# Preprocessing steps of the pipeline: a default-configured imputer and
# variance filter, followed by the scaler named in params['preprocessor'].
imputer = Imputer()
var_filter = VarianceThreshold()

# Maps the scaler name stored in ``params`` to the class implementing it.
preprocessor_dict = {
    'StandardScaler': StandardScaler,
    'MaxAbsScaler': MaxAbsScaler,
    'Normalizer': Normalizer,
    'RobustScaler': RobustScaler,
}

scaler = preprocessor_dict[params['preprocessor']]()
# Build the feature-extraction step selected by
# params['feature_extractor']['type']. Every entry of that mapping other than
# 'type' is forwarded as a keyword argument to the chosen estimator.
opts = {
    option: value
    for option, value in params['feature_extractor'].items()
    if option != 'type'
}

if params['feature_extractor']['type'] == 'pca':
    feature_extraction = PCA(**opts)
elif params['feature_extractor']['type'] == 'RFE':
    # Recursive feature elimination ranked by a linear-kernel SVR.
    svr = svm.SVR(kernel='linear')
    feature_extraction = RFE(estimator=svr, **opts)
elif params['feature_extractor']['type'] == 'SelectKBest':
    feature_extraction = SelectKBest(score_func=f_regression, **opts)
else:
    # Without this branch an unknown type left ``feature_extraction``
    # undefined and only failed later with a NameError.
    raise ValueError(
        "Unknown feature extractor type {!r}; expected 'pca', 'RFE' or "
        "'SelectKBest'".format(params['feature_extractor']['type'])
    )
# Full estimator: impute -> variance filter -> scale -> feature extraction ->
# neural-network regressor, applied in this order.
model = Pipeline(
    steps=[
        ('imputer', imputer),
        ('filter', var_filter),
        ('scaler', scaler),
        ('feature_extraction', feature_extraction),
        ('ffann', ffann),
    ]
)
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment