rmania · March 30, 2017 19:36
diff --git a/ml_preprocessing_code_snippets.py b/ml_preprocessing_code_snippets.py
 import datetime as dt

 import numpy as np
 import pandas as pd
 # NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
 # this import fails on newer releases.
 from sklearn.preprocessing import Imputer
 from sklearn.preprocessing import LabelEncoder

 # some functions executing some basic preprocessing steps for ml
 # NOTE: `df` is not defined in this snippet; it must already exist when these lines run.
 predict_cols = ['x']
 # Every column that is neither a target nor the 'source' column is a feature.
 # The frame's own column order is kept so the feature layout is reproducible
 # (iterating a set of strings can yield a different order on every interpreter run).
 feature_cols = [c for c in df.columns if c not in predict_cols and c != 'source']

 # PREPROCESSING STEPS
 def _preprocess(df, predict_cols, feature_cols, do_outlier_removal=False):
     """
     Encode, impute and (optionally) de-outlier a frame for model training.
     The frame is modified in place and is also returned.
     Parameters:
     -----------
         df : pandas DataFrame holding both the feature and the target columns.
         predict_cols : list of target column names. They are left out of the
                        encoding/imputation steps; their NaN's become -1.
         feature_cols : not used by this function; kept so existing calls keep
                        working.
         do_outlier_removal : when True, values of the numerical feature columns
                        that `_is_outlier` flags are replaced by the column median.
     Returns:
     --------
         df : the same DataFrame object, preprocessed.
     """
     numeric_dtypes = ['int64', 'int32', 'float64']
     candidates = [c for c in df.columns if c not in predict_cols]
     col_dtypes = df[candidates].dtypes
     # Series.items() replaces iteritems(), which was removed in pandas 2.0.
     cat_features = [c for c, dtype in col_dtypes.items() if dtype not in numeric_dtypes]
     num_features = [c for c, dtype in col_dtypes.items() if dtype in numeric_dtypes]

     print("Encoding...")
     for c in cat_features:
         # Plain column assignment replaces the column. `df.loc[:, c] = codes`
         # writes into the existing object column on pandas >= 2.0 and would
         # leave the integer codes stored with object dtype.
         df[c] = LabelEncoder().fit_transform(df[c].fillna('unknown'))

     print("Imputing...") # outcomment if not needed
     if num_features:
         # Median imputation done with pandas, because sklearn's `Imputer` is gone
         # from scikit-learn >= 0.22. The cast mirrors the float64 array an
         # imputer hands back.
         numeric = df[num_features].astype('float64')
         numeric = numeric.fillna(numeric.median())
         # A column that is entirely NaN has no median; it gets a sentinel value.
         df[num_features] = numeric.fillna(-1000)

     if do_outlier_removal:
         # Only real numerical features are treated: label codes and targets
         # have no meaningful distance to their median.
         for col in num_features:
             values = df[col].to_numpy()
             outliers = _is_outlier(values) # refers to outlier function
             if outliers.any():
                 df.loc[outliers, col] = np.median(values)

     print("Dropping NaN prediction rows...")
     # NOTE: despite the message above no rows are dropped; NaN targets are
     # imputed with -1. Use df.dropna(subset=predict_cols) to drop them instead.
     df[predict_cols] = df[predict_cols].fillna(-1)

     return df
  
 def _is_outlier(points, thresh=3.5):
     """
     Flag points based on their "median absolute deviation".
     Returns a boolean array with True if points are outliers and False
     otherwise. Nothing is removed; the caller decides what to do with the mask.
     Parameters:
     -----------
         points : An numobservations by numdimensions array of observations.
                  A 1-d input (ndarray, list or pandas Series) is treated as
                  numobservations by 1.
         thresh : The modified z-score to use as a threshold. Observations with
                  a modified z-score (based on the median absolute deviation) greater
                  than this value will be classified as outliers.
     Returns:
     --------
         mask : A numobservations-length boolean array.
     References:
     ----------
         Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
         Handle Outliers", The ASQC Basic References in Quality Control:
         Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
     """
     # np.asarray lets lists and pandas Series in; `[:, None]` only works on ndarrays.
     points = np.asarray(points)
     if points.ndim == 1:
         points = points[:, None]
     median = np.median(points, axis=0)
     # Euclidean distance of every observation to the per-dimension median.
     diff = np.sqrt(np.sum((points - median)**2, axis=-1))
     med_abs_deviation = np.median(diff)

     # The MAD is 0 when at least half of the distances are 0. The division then
     # yields nan (0/0, never an outlier) or inf (always an outlier); only the
     # floating point warnings are silenced, the outcome is left as it is.
     with np.errstate(divide='ignore', invalid='ignore'):
         modified_z_score = 0.6745 * diff / med_abs_deviation # tweak if necessary
         mask = modified_z_score > thresh

     return mask

 # create datetime features
 def add_datetime_features(df, date_col=None):
     """
     Adds features that are derived from datetime:
     hour, time_qhour, date, year, month, week number, dayofweek, dayofyear.
     Input : dataframe and the column the date features should be extracted from.
     Output : a new dataframe, sorted ascending on date_col and with a fresh
     index (the merge does not keep the original one). The input frame is not
     modified.
     Raises ValueError when date_col is not given.
     """
     if date_col is None:
         raise ValueError("date_col must name the column to derive the datetime features from")

     # Work on a copy so the conversion below does not leak into the caller's frame.
     df = df.copy()
     df[date_col] = pd.to_datetime(df[date_col])
     df = df.sort_values(date_col, ascending=True)

     timestamps = df[date_col].dt
     df['hour'] = timestamps.hour
     # Quarter-hour slot within the day: truncated minute/15 plus four slots per hour.
     df['time_qhour'] = (timestamps.minute/15).astype(int) + df['hour']*4
     df['date'] = timestamps.date

     # Compute day-like features once per distinct date and merge that back onto dataframe.
     unique_dates = df['date'].unique()
     date_features = pd.DataFrame({
         'date': unique_dates,
         'year': [d.year for d in unique_dates],
         'month': [d.month for d in unique_dates],
         'week': [d.isocalendar()[1] for d in unique_dates],
         'dayofweek': [d.weekday() for d in unique_dates],
         'dayofyear': [d.timetuple().tm_yday for d in unique_dates],
     })

     # Merge back onto dataframe. A column of df that clashes with a new
     # feature name is kept with an '_old' suffix.
     n_rows_before = df.shape[0]
     df = pd.merge(df, date_features, on=['date'], suffixes=('_old', ''))

     # Check that no rows are dropped.
     assert df.shape[0] == n_rows_before

     return df

 def add_holiday_features(df, holidays_file):
     """
     Adds holiday features by left-joining the holidays table onto df['date'].
     Source of the holidays data: https://gist.github.com/rok?direction=asc&sort=updated
     Per the original author, columns dc, dn and ds mean the days up until the
     next vacation in the c (Central), n (North) and s (South) regions in Holland.
     Input : dataframe with a 'date' column, and a csv (path or buffer) with at
     least the columns 'dt' and 'regions'.
     Output : the merged dataframe without the 'dt' and 'regions' columns.
     """
     holiday_table = pd.read_csv(holidays_file, parse_dates=['dt'])

     # Reduce the parsed timestamps to plain dates so they can be matched to df['date'].
     holiday_table['dt'] = holiday_table['dt'].dt.date

     merged = df.merge(
         holiday_table,
         how='left',
         left_on=['date'],
         right_on=['dt'],
         suffixes=('old', ''),
     )
     # The join key and the region labels are not features.
     return merged.drop(columns=['dt', 'regions'])

 class Parameters:
     """
     class to define parameters for the ML model, notably on the creation of predefined train test sets and periods

     Every period ('train' or 'test') starts on start_day/start_month of its
     year and ends on the same day one year later.
     Relies on the module-level `import datetime as dt`.
     """
     def __init__(self):
         self.predict_cols = ['X']
         self.feature_cols = [] # fill with columns
         # possible train, validate , test splits
         self.year_train = 2015
         self.year_test = 2016
         self.start_day = 1
         self.start_month = 1

     def _period_year(self, period):
         """Return the start year of `period`; raise ValueError for an unknown period."""
         if period == 'train':
             return self.year_train
         if period == 'test':
             return self.year_test
         # An explicit exception survives `python -O`, unlike `assert False`.
         raise ValueError("period must be 'train' or 'test', got %r" % (period,))

     def get_start_date(self, period):
         """Return the first date of `period` as a datetime.date."""
         return dt.date(self._period_year(period), self.start_month, self.start_day)

     def get_end_date(self, period):
         """Return the date one year after the start of `period` as a datetime.date."""
         return dt.date(self._period_year(period) + 1, self.start_month, self.start_day)

     def get_dates(self, period):
         """Return [start date, end date] of `period`."""
         return [self.get_start_date(period), self.get_end_date(period)]
            
 p = Parameters()

 # Example usage. `ml` and `history` are not defined in this file: presumably
 # `ml` is a project module whose select_period(frame, start, end) returns the
 # rows of that period and `history` is the full DataFrame -- TODO confirm.
 hist_train = ml.select_period(history, *p.get_dates('train'))
 hist_test = ml.select_period(history, *p.get_dates('test')) 

 # Feature matrix and flattened target vector for the train period.
 X_train = hist_train[p.feature_cols].values
 y_train = hist_train[p.predict_cols].values.ravel()

 # Same for the test period.
 X_test = hist_test[p.feature_cols].values
 y_test = hist_test[p.predict_cols].values.ravel()
	import pandas as pd
	import numpy as np
	from sklearn.preprocessing import LabelEncoder
	from sklearn.preprocessing import Imputer

	# some functions executing some basic preprocessing steps for ml
	# NOTE: `df` is not defined in this snippet; it must already exist when these lines run.
	predict_cols = ['x']
	# Every column that is neither a target nor the 'source' column is a feature.
	# The frame's own column order is kept so the feature layout is reproducible
	# (iterating a set of strings can yield a different order on every interpreter run).
	feature_cols = [c for c in df.columns if c not in predict_cols and c != 'source']

	# PREPROCESSING STEPS
	def _preprocess(df, predict_cols, feature_cols, do_outlier_removal=False):
	    """
	    Encode, impute and (optionally) de-outlier a frame for model training.
	    The frame is modified in place and is also returned.
	    Parameters:
	    -----------
	        df : pandas DataFrame holding both the feature and the target columns.
	        predict_cols : list of target column names. They are left out of the
	                       encoding/imputation steps; their NaN's become -1.
	        feature_cols : not used by this function; kept so existing calls keep
	                       working.
	        do_outlier_removal : when True, values of the numerical feature columns
	                       that `_is_outlier` flags are replaced by the column median.
	    Returns:
	    --------
	        df : the same DataFrame object, preprocessed.
	    """
	    numeric_dtypes = ['int64', 'int32', 'float64']
	    candidates = [c for c in df.columns if c not in predict_cols]
	    col_dtypes = df[candidates].dtypes
	    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
	    cat_features = [c for c, dtype in col_dtypes.items() if dtype not in numeric_dtypes]
	    num_features = [c for c, dtype in col_dtypes.items() if dtype in numeric_dtypes]

	    print("Encoding...")
	    for c in cat_features:
	        # Plain column assignment replaces the column. `df.loc[:, c] = codes`
	        # writes into the existing object column on pandas >= 2.0 and would
	        # leave the integer codes stored with object dtype.
	        df[c] = LabelEncoder().fit_transform(df[c].fillna('unknown'))

	    print("Imputing...") # outcomment if not needed
	    if num_features:
	        # Median imputation done with pandas, because sklearn's `Imputer` is gone
	        # from scikit-learn >= 0.22. The cast mirrors the float64 array an
	        # imputer hands back.
	        numeric = df[num_features].astype('float64')
	        numeric = numeric.fillna(numeric.median())
	        # A column that is entirely NaN has no median; it gets a sentinel value.
	        df[num_features] = numeric.fillna(-1000)

	    if do_outlier_removal:
	        # Only real numerical features are treated: label codes and targets
	        # have no meaningful distance to their median.
	        for col in num_features:
	            values = df[col].to_numpy()
	            outliers = _is_outlier(values) # refers to outlier function
	            if outliers.any():
	                df.loc[outliers, col] = np.median(values)

	    print("Dropping NaN prediction rows...")
	    # NOTE: despite the message above no rows are dropped; NaN targets are
	    # imputed with -1. Use df.dropna(subset=predict_cols) to drop them instead.
	    df[predict_cols] = df[predict_cols].fillna(-1)

	    return df

	def _is_outlier(points, thresh=3.5):
	    """
	    Flag points based on their "median absolute deviation".
	    Returns a boolean array with True if points are outliers and False
	    otherwise. Nothing is removed; the caller decides what to do with the mask.
	    Parameters:
	    -----------
	        points : An numobservations by numdimensions array of observations.
	                 A 1-d input (ndarray, list or pandas Series) is treated as
	                 numobservations by 1.
	        thresh : The modified z-score to use as a threshold. Observations with
	                 a modified z-score (based on the median absolute deviation) greater
	                 than this value will be classified as outliers.
	    Returns:
	    --------
	        mask : A numobservations-length boolean array.
	    References:
	    ----------
	        Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
	        Handle Outliers", The ASQC Basic References in Quality Control:
	        Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
	    """
	    # np.asarray lets lists and pandas Series in; `[:, None]` only works on ndarrays.
	    points = np.asarray(points)
	    if points.ndim == 1:
	        points = points[:, None]
	    median = np.median(points, axis=0)
	    # Euclidean distance of every observation to the per-dimension median.
	    diff = np.sqrt(np.sum((points - median)**2, axis=-1))
	    med_abs_deviation = np.median(diff)

	    # The MAD is 0 when at least half of the distances are 0. The division then
	    # yields nan (0/0, never an outlier) or inf (always an outlier); only the
	    # floating point warnings are silenced, the outcome is left as it is.
	    with np.errstate(divide='ignore', invalid='ignore'):
	        modified_z_score = 0.6745 * diff / med_abs_deviation # tweak if necessary
	        mask = modified_z_score > thresh

	    return mask

	# create datetime features
	def add_datetime_features(df, date_col=None):
	    """
	    Adds features that are derived from datetime:
	    hour, time_qhour, date, year, month, week number, dayofweek, dayofyear.
	    Input : dataframe and the column the date features should be extracted from.
	    Output : a new dataframe, sorted ascending on date_col and with a fresh
	    index (the merge does not keep the original one). The input frame is not
	    modified.
	    Raises ValueError when date_col is not given.
	    """
	    if date_col is None:
	        raise ValueError("date_col must name the column to derive the datetime features from")

	    # Work on a copy so the conversion below does not leak into the caller's frame.
	    df = df.copy()
	    df[date_col] = pd.to_datetime(df[date_col])
	    df = df.sort_values(date_col, ascending=True)

	    timestamps = df[date_col].dt
	    df['hour'] = timestamps.hour
	    # Quarter-hour slot within the day: truncated minute/15 plus four slots per hour.
	    df['time_qhour'] = (timestamps.minute/15).astype(int) + df['hour']*4
	    df['date'] = timestamps.date

	    # Compute day-like features once per distinct date and merge that back onto dataframe.
	    unique_dates = df['date'].unique()
	    date_features = pd.DataFrame({
	        'date': unique_dates,
	        'year': [d.year for d in unique_dates],
	        'month': [d.month for d in unique_dates],
	        'week': [d.isocalendar()[1] for d in unique_dates],
	        'dayofweek': [d.weekday() for d in unique_dates],
	        'dayofyear': [d.timetuple().tm_yday for d in unique_dates],
	    })

	    # Merge back onto dataframe. A column of df that clashes with a new
	    # feature name is kept with an '_old' suffix.
	    n_rows_before = df.shape[0]
	    df = pd.merge(df, date_features, on=['date'], suffixes=('_old', ''))

	    # Check that no rows are dropped.
	    assert df.shape[0] == n_rows_before

	    return df

	def add_holiday_features(df, holidays_file):
	    """
	    Adds holiday features by left-joining the holidays table onto df['date'].
	    Source of the holidays data: https://gist.github.com/rok?direction=asc&sort=updated
	    Per the original author, columns dc, dn and ds mean the days up until the
	    next vacation in the c (Central), n (North) and s (South) regions in Holland.
	    Input : dataframe with a 'date' column, and a csv (path or buffer) with at
	    least the columns 'dt' and 'regions'.
	    Output : the merged dataframe without the 'dt' and 'regions' columns.
	    """
	    holidays = pd.read_csv(holidays_file, parse_dates=['dt'])

	    # Reduce the parsed timestamps to plain dates so they can be matched to
	    # df['date']. Item assignment is used because attribute assignment
	    # (`holidays.dt = ...`) only works while the column already exists.
	    holidays['dt'] = holidays['dt'].dt.date

	    df = pd.merge(df, holidays, left_on=['date'], right_on=['dt'], how='left', suffixes=('old', ''))
	    # The join key and the region labels are not features.
	    df = df.drop(['dt', 'regions'], axis=1)

	    return df

	class Parameters:
	    """
	    class to define parameters for the ML model, notably on the creation of predefined train test sets and periods

	    Every period ('train' or 'test') starts on start_day/start_month of its
	    year and ends on the same day one year later.
	    Relies on the module-level `import datetime as dt`.
	    """
	    def __init__(self):
	        self.predict_cols = ['X']
	        self.feature_cols = [] # fill with columns
	        # possible train, validate , test splits
	        self.year_train = 2015
	        self.year_test = 2016
	        self.start_day = 1
	        self.start_month = 1

	    def _period_year(self, period):
	        """Return the start year of `period`; raise ValueError for an unknown period."""
	        if period == 'train':
	            return self.year_train
	        if period == 'test':
	            return self.year_test
	        # An explicit exception survives `python -O`, unlike `assert False`.
	        raise ValueError("period must be 'train' or 'test', got %r" % (period,))

	    def get_start_date(self, period):
	        """Return the first date of `period` as a datetime.date."""
	        return dt.date(self._period_year(period), self.start_month, self.start_day)

	    def get_end_date(self, period):
	        """Return the date one year after the start of `period` as a datetime.date."""
	        return dt.date(self._period_year(period) + 1, self.start_month, self.start_day)

	    def get_dates(self, period):
	        """Return [start date, end date] of `period`."""
	        return [self.get_start_date(period), self.get_end_date(period)]

	p = Parameters()

	# Example usage. `ml` and `history` are not defined in this file: presumably
	# `ml` is a project module whose select_period(frame, start, end) returns the
	# rows of that period and `history` is the full DataFrame -- TODO confirm.
	hist_train = ml.select_period(history, *p.get_dates('train'))
	hist_test = ml.select_period(history, *p.get_dates('test'))

	# Feature matrix and flattened target vector for the train period.
	X_train = hist_train[p.feature_cols].values
	y_train = hist_train[p.predict_cols].values.ravel()

	# Same for the test period.
	X_test = hist_test[p.feature_cols].values
	y_test = hist_test[p.predict_cols].values.ravel()