Skip to content

Instantly share code, notes, and snippets.

@rmania
Last active March 30, 2017 19:36
Show Gist options
  • Save rmania/b1847215bc95c0f72056d9160133e676 to your computer and use it in GitHub Desktop.
Save rmania/b1847215bc95c0f72056d9160133e676 to your computer and use it in GitHub Desktop.
machine learning preprocessing and feature generation code snippets
import datetime as dt

import numpy as np
import pandas as pd
# NOTE: `Imputer` was removed in scikit-learn 0.22; `sklearn.impute.SimpleImputer`
# is its replacement.
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
# Helper functions that perform basic preprocessing steps for machine learning.
# Target column(s) the model should predict.
predict_cols = ['x']
# Every other column except 'source' is treated as a feature.
# A comprehension is used instead of set arithmetic so the features keep the
# DataFrame's column order: a list built from a set of strings can come out in
# a different order on every run (string hashing is randomised per process),
# which would silently permute the columns of the feature matrix.
# NOTE(review): `df` is not defined in the visible part of this file; it must
# already exist when this line runs.
_non_feature_cols = set(predict_cols) | {'source'}
feature_cols = [col for col in df.columns if col not in _non_feature_cols]
# PREPROCESSING STEPS
def _preprocess(df, predict_cols, feature_cols, do_outlier_removal=False):
    """
    Encode, impute and (optionally) de-outlier a DataFrame for modelling.

    Steps, in order:
      1. Label-encode every non-numeric, non-target column (missing values
         are encoded as the category 'unknown').
      2. Fill missing values of numeric, non-target columns with the column
         median; columns without a single observed value fall back to -1000.
      3. If requested, replace outliers in those numeric columns by the
         column median.
      4. Fill missing target values with -1.

    Parameters:
    -----------
    df : DataFrame to process. It is modified in place and also returned.
    predict_cols : List of target column names.
    feature_cols : Not used by this function; kept so existing callers keep
        working.
    do_outlier_removal : When True, run step 3.

    Returns:
    --------
    df : The same DataFrame object, after processing.
    """
    numeric_dtypes = ('int64', 'int32', 'float64')
    targets = set(predict_cols)
    # Series.items() replaces iteritems(), which no longer exists in pandas 2.
    feature_dtypes = [(c, dtype) for c, dtype in df.dtypes.items() if c not in targets]
    cat_features = [c for c, dtype in feature_dtypes if dtype not in numeric_dtypes]
    num_features = [c for c, dtype in feature_dtypes if dtype in numeric_dtypes]

    print("Encoding...")
    for c in cat_features:
        # Plain column assignment (not df.loc[:, c]) so the column takes the
        # integer dtype of the encoded labels.
        df[c] = LabelEncoder().fit_transform(df[c].fillna('unknown'))

    print("Imputing...")
    if num_features:
        numeric = df[num_features]
        # Median imputation done with pandas. A column that is entirely NaN
        # has a NaN median and is therefore left to the -1000 fallback.
        df[num_features] = numeric.fillna(numeric.median()).fillna(-1000)

    if do_outlier_removal:
        # Only numeric features are checked: label-encoded categories are
        # arbitrary codes and the targets may still contain NaN at this point.
        for col in num_features:
            values = df[col].to_numpy()
            outliers = _is_outlier(values)  # boolean mask, True = outlier
            if outliers.any():
                df[col] = np.where(outliers, np.median(values), values)

    # The message is historical: rows with a missing target are not dropped,
    # the missing targets are filled with -1 instead.
    print("Dropping NaN prediction rows...")
    df[predict_cols] = df[predict_cols].fillna(-1)
    return df
def _is_outlier(points, thresh=3.5):
    """
    Flag points as outliers based on their "median absolute deviation".

    Nothing is removed: the function only returns a boolean mask with True
    where a point is an outlier and False otherwise.

    Parameters:
    -----------
        points : An numobservations by numdimensions array of observations.
            A 1-D sequence (list, numpy array, pandas Series) is treated as
            numobservations by 1.
        thresh : The modified z-score to use as a threshold. Observations with
            a modified z-score (based on the median absolute deviation) greater
            than this value will be classified as outliers.

    Returns:
    --------
        mask : A numobservations-length boolean array.

    References:
    ----------
        Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
        Handle Outliers", The ASQC Basic References in Quality Control:
        Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
    """
    # Convert first: indexing a pandas Series with [:, None] is not supported
    # by current pandas, while a plain ndarray always accepts it.
    points = np.asarray(points, dtype=float)
    if points.ndim == 1:
        points = points[:, None]
    if points.shape[0] == 0:
        # No observations: return an empty mask instead of taking the median
        # of nothing.
        return np.zeros(0, dtype=bool)

    median = np.median(points, axis=0)
    # Euclidean distance of every observation to the per-dimension median.
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)

    # When the median absolute deviation is 0 the division yields inf for
    # points away from the median (flagged) and NaN for points on the median
    # (not flagged); errstate only silences the accompanying warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        modified_z_score = 0.6745 * diff / med_abs_deviation  # tweak if necessary
        return modified_z_score > thresh
# create datetime features
def add_datetime_features(df, date_col=None):
    """
    Adds features that are derived from a datetime column.

    Added columns: hour, time_qhour (quarter-hour index within the day),
    date, year, month, week (ISO week number), dayofweek, dayofyear.

    Parameters:
    -----------
    df : Input DataFrame. It is not modified; a new frame is returned.
    date_col : Name of the column the date features are extracted from.
        Must be given: the default of None is not a usable value.

    Returns:
    --------
    df : New DataFrame sorted ascending by `date_col`, carrying the extra
        columns. The result comes out of a merge, so it has a fresh integer
        index. Pre-existing columns that clash with year/month/week/
        dayofweek/dayofyear are kept with an '_old' suffix.
    """
    # Work on a copy so the conversion below does not overwrite the caller's
    # column.
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(date_col, ascending=True)

    timestamps = df[date_col]
    hour = timestamps.dt.hour
    df = df.assign(
        hour=hour,
        # Four quarter-hours per hour, plus the quarter within the hour.
        time_qhour=timestamps.dt.minute // 15 + hour * 4,
        date=timestamps.dt.date,
    )

    # Day-level features are computed once per distinct date and merged back.
    unique_dates = df['date'].unique()
    date_features = pd.DataFrame(unique_dates, columns=['date']).assign(
        year=[d.year for d in unique_dates],
        month=[d.month for d in unique_dates],
        week=[d.isocalendar()[1] for d in unique_dates],
        dayofweek=[d.weekday() for d in unique_dates],
        dayofyear=[d.timetuple().tm_yday for d in unique_dates],
    )

    n_rows_before = df.shape[0]
    df = pd.merge(df, date_features, on=['date'], suffixes=('_old', ''))
    # Sanity check that the merge neither dropped nor duplicated rows.
    assert df.shape[0] == n_rows_before
    return df
def add_holiday_features(df, holidays_file):
    """
    Adds holiday features by left-joining a holidays table onto `df`.

    Source of the approach: https://gist.github.com/rok?direction=asc&sort=updated

    Per the original author's note, the columns dc, dn and ds mean the days up
    until the next vacation in the c (Central), n (North) and s (South)
    regions in Holland.

    Parameters:
    -----------
    df : DataFrame with a 'date' column to join on.
        NOTE(review): presumably holding datetime.date values, as produced by
        add_datetime_features - confirm against the caller.
    holidays_file : Path or file-like object handed to pandas.read_csv. The
        file must contain a 'dt' column (parsed as dates) and a 'regions'
        column; both are dropped from the result.

    Returns:
    --------
    df : `df` with the remaining holiday columns appended. Rows without a
        matching holiday date get NaN in those columns. Columns of `df` that
        clash with holiday columns are kept with an 'old' suffix.
    """
    holidays = pd.read_csv(holidays_file, parse_dates=['dt'])
    # Reduce the parsed timestamps to plain dates so they compare equal to the
    # values in df['date']. Item assignment is used because attribute-style
    # assignment only works while the column already exists.
    holidays['dt'] = holidays['dt'].dt.date
    df = pd.merge(df, holidays, left_on=['date'], right_on=['dt'], how='left', suffixes=('old', ''))
    return df.drop(['dt', 'regions'], axis=1)
class Parameters:
    """
    Parameters for the ML model, notably on the creation of predefined
    train and test sets and their periods.

    Each period starts on (start_day, start_month) of its year and ends on
    the same day and month one year later.
    """

    def __init__(self):
        self.predict_cols = ['X']
        self.feature_cols = []  # fill with columns
        # possible train, validate, test splits
        self.year_train = 2015
        self.year_test = 2016
        self.start_day = 1
        self.start_month = 1

    def _period_year(self, period):
        """Return the configured start year for `period` ('train' or 'test')."""
        if period == 'train':
            return self.year_train
        if period == 'test':
            return self.year_test
        # An explicit exception, because an `assert` disappears under
        # `python -O` and the method would then silently return None.
        raise ValueError("unknown period %r; expected 'train' or 'test'" % (period,))

    def get_start_date(self, period):
        """Return the first date of `period` as a datetime.date."""
        return dt.date(self._period_year(period), self.start_month, self.start_day)

    def get_end_date(self, period):
        """Return the end date of `period`: its start date one year later."""
        return dt.date(self._period_year(period) + 1, self.start_month, self.start_day)

    def get_dates(self, period):
        """Return [start_date, end_date] for `period`."""
        return [self.get_start_date(period), self.get_end_date(period)]
# Example usage: cut the history into the configured train and test periods
# and build the model matrices from them.
# NOTE(review): `ml` and `history` are not defined in this file; presumably
# `ml` is the module holding these helpers and `history` the prepared
# DataFrame - confirm before running.
p = Parameters()
hist_train = ml.select_period(history, *p.get_dates('train'))
hist_test = ml.select_period(history, *p.get_dates('test'))
# Features as a 2-D array, targets flattened to 1-D.
X_train, y_train = (
    hist_train[p.feature_cols].values,
    hist_train[p.predict_cols].values.ravel(),
)
X_test, y_test = (
    hist_test[p.feature_cols].values,
    hist_test[p.predict_cols].values.ravel(),
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment