Hello Jupyter, Hello Pandas
Documentation of methodology while you code.
Notebooks render in github.
Large ecosystem & community.
Can use other kernels like R.
Can also use R within Jupyter instead of Python. See ProPublica's compas-analysis notebook.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import pandas as pd
total 27680
drwxr-xr-x 14 bmigliozzi staff 476 Aug 29 07:41 .
drwxr-xr-x 68 bmigliozzi staff 2312 Aug 28 16:50 ..
-rw-r--r--@ 1 bmigliozzi staff 6148 Aug 28 22:40 .DS_Store
drwxr-xr-x 3 bmigliozzi staff 102 Aug 28 17:06 .ipynb_checkpoints
-rw-r--r-- 1 bmigliozzi staff 8641539 Aug 29 07:41 Hello Pandas.ipynb
-rw-r--r--@ 1 bmigliozzi staff 438764 Aug 28 22:10 jupiter.gif
-rw-r--r--@ 1 bmigliozzi staff 1048297 Aug 28 22:11 panda.gif
lrwxr-xr-x 1 bmigliozzi staff 30 Aug 28 16:53 pyUtils -> /Users/bmigliozzi/code/pyUtils
-rw-r--r-- 1 bmigliozzi staff 2017268 Aug 29 06:24 tsne_points.html
-rw-r--r-- 1 bmigliozzi staff 770558 Aug 29 06:43 wine.pickle
-rw-r--r-- 1 bmigliozzi staff 874723 Aug 29 07:09 wine_std_tsne.pickle
-rw-r--r-- 1 bmigliozzi staff 84199 Aug 29 06:41 winequality-red.csv
-rw-r--r-- 1 bmigliozzi staff 264426 Aug 29 06:41 winequality-white.csv
-rw-r--r-- 1 bmigliozzi staff 3305 Aug 28 16:50 winequality.names.txt
P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
Modeling wine preferences by data mining from physicochemical properties.
In Decision Support Systems, Elsevier, 47(4):547-553. ISSN: 0167-9236.
Available at: [@Elsevier] http://dx.doi.org/10.1016/j.dss.2009.05.016
[Pre-press (pdf)] http://www3.dsi.uminho.pt/pcortez/winequality09.pdf
[bib] http://www3.dsi.uminho.pt/pcortez/dss09.bib
!head - 5 winequality - red .csv
!head - 5 winequality - white .csv
"fixed_acidity";"volatile_acidity";"citric_acid";"residual_sugar";"chlorides";"free_sulfur_dioxide";"total_sulfur_dioxide";"density";"pH";"sulphates";"alcohol";"quality"
7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;9.8;5
11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58;9.8;6
"fixed_acidity";"volatile_acidity";"citric_acid";"residual_sugar";"chlorides";"free_sulfur_dioxide";"total_sulfur_dioxide";"density";"pH";"sulphates";"alcohol";"quality"
7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9.5;6
8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;10.1;6
7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4;9.9;6
# Load both wine-quality CSV files; the fields are separated by ';'.
white = pd .read_csv ('winequality-white.csv' , sep = ';' )
white .head ()
red = pd .read_csv ('winequality-red.csv' , sep = ';' )
# Preview the first rows of the red-wine frame.
red .head ()
fixed_acidity
volatile_acidity
citric_acid
residual_sugar
chlorides
free_sulfur_dioxide
total_sulfur_dioxide
density
pH
sulphates
alcohol
quality
0
7.4
0.70
0.00
1.9
0.076
11
34
0.9978
3.51
0.56
9.4
5
1
7.8
0.88
0.00
2.6
0.098
25
67
0.9968
3.20
0.68
9.8
5
2
7.8
0.76
0.04
2.3
0.092
15
54
0.9970
3.26
0.65
9.8
5
3
11.2
0.28
0.56
1.9
0.075
17
60
0.9980
3.16
0.58
9.8
6
4
7.4
0.70
0.00
1.9
0.076
11
34
0.9978
3.51
0.56
9.4
5
from IPython .display import display
white ['type' ] = 'white'
print (white .shape )
display (white .head ())
red ['type' ] = 'red'
print (red .shape )
display (red .head ())
(4898, 13)
fixed_acidity
volatile_acidity
citric_acid
residual_sugar
chlorides
free_sulfur_dioxide
total_sulfur_dioxide
density
pH
sulphates
alcohol
quality
type
0
7.0
0.27
0.36
20.7
0.045
45
170
1.0010
3.00
0.45
8.8
6
white
1
6.3
0.30
0.34
1.6
0.049
14
132
0.9940
3.30
0.49
9.5
6
white
2
8.1
0.28
0.40
6.9
0.050
30
97
0.9951
3.26
0.44
10.1
6
white
3
7.2
0.23
0.32
8.5
0.058
47
186
0.9956
3.19
0.40
9.9
6
white
4
7.2
0.23
0.32
8.5
0.058
47
186
0.9956
3.19
0.40
9.9
6
white
(1599, 13)
fixed_acidity
volatile_acidity
citric_acid
residual_sugar
chlorides
free_sulfur_dioxide
total_sulfur_dioxide
density
pH
sulphates
alcohol
quality
type
0
7.4
0.70
0.00
1.9
0.076
11
34
0.9978
3.51
0.56
9.4
5
red
1
7.8
0.88
0.00
2.6
0.098
25
67
0.9968
3.20
0.68
9.8
5
red
2
7.8
0.76
0.04
2.3
0.092
15
54
0.9970
3.26
0.65
9.8
5
red
3
11.2
0.28
0.56
1.9
0.075
17
60
0.9980
3.16
0.58
9.8
6
red
4
7.4
0.70
0.00
1.9
0.076
11
34
0.9978
3.51
0.56
9.4
5
red
# Stack the white and red frames into a single table. ignore_index=True gives
# every row a unique label; without it each source frame's own 0..n-1 labels
# are repeated and label-based lookups become ambiguous.
df = pd.concat([white, red], ignore_index=True)
print(df.shape)
df.sample(10)
(6497, 13)
fixed_acidity
volatile_acidity
citric_acid
residual_sugar
chlorides
free_sulfur_dioxide
total_sulfur_dioxide
density
pH
sulphates
alcohol
quality
type
4871
5.0
0.20
0.40
1.9
0.015
20
98
0.98970
3.37
0.55
12.05
6
white
725
6.4
0.39
0.21
1.2
0.041
35
136
0.99225
3.15
0.46
10.20
5
white
1266
7.5
0.28
0.34
4.2
0.028
36
116
0.99100
2.99
0.41
12.30
8
white
4171
6.6
0.22
0.37
1.6
0.040
31
101
0.99009
3.15
0.66
12.00
5
white
2923
6.9
0.22
0.31
6.3
0.029
41
131
0.99326
3.08
0.49
10.80
6
white
2757
5.6
0.12
0.26
4.3
0.038
18
97
0.99477
3.36
0.46
9.20
5
white
2570
6.6
0.24
0.28
1.8
0.028
39
132
0.99182
3.34
0.46
11.40
5
white
2143
6.5
0.14
0.32
2.7
0.037
18
89
0.99240
3.40
0.74
11.50
7
white
143
6.3
0.39
0.08
1.7
0.066
3
20
0.99540
3.34
0.58
9.40
5
red
2482
6.2
0.26
0.37
7.1
0.047
54
201
0.99523
3.19
0.48
9.50
6
white
# df.type=='red'
# NOTE: this is the length of the boolean mask, which has one entry per row
# of df -- it is not the number of red wines. The commented line below filters
# the frame first and would count only the matching rows.
len (df .type == 'red' )
# len(df[df.type=='red'])
6497
# df[df.type=='red']
# df[(df.pH>3.8)]
df [(df .pH > 3.8 ) & (df .type == 'white' )]
fixed_acidity
volatile_acidity
citric_acid
residual_sugar
chlorides
free_sulfur_dioxide
total_sulfur_dioxide
density
pH
sulphates
alcohol
quality
type
1250
5.3
0.26
0.23
5.15
0.034
48
160
0.9952
3.82
0.51
10.5
7
white
1255
6.4
0.22
0.34
1.80
0.057
29
104
0.9959
3.81
0.57
10.3
6
white
import os
df .to_pickle (os .path .join (os .getcwd (),'wine.pickle' ))
df = pd .read_pickle (os .path .join (os .getcwd (),'wine.pickle' ))
# print(df.shape)
# display(df.info())
# display(df.sample(10))
fixed_acidity
volatile_acidity
citric_acid
residual_sugar
chlorides
free_sulfur_dioxide
total_sulfur_dioxide
density
pH
sulphates
alcohol
quality
count
6497.000000
6497.000000
6497.000000
6497.000000
6497.000000
6497.000000
6497.000000
6497.000000
6497.000000
6497.000000
6497.000000
6497.000000
mean
7.215307
0.339666
0.318633
5.443235
0.056034
30.525319
115.744574
0.994697
3.218501
0.531268
10.491801
5.818378
std
1.296434
0.164636
0.145318
4.757804
0.035034
17.749400
56.521855
0.002999
0.160787
0.148806
1.192712
0.873255
min
3.800000
0.080000
0.000000
0.600000
0.009000
1.000000
6.000000
0.987110
2.720000
0.220000
8.000000
3.000000
25%
6.400000
0.230000
0.250000
1.800000
0.038000
17.000000
77.000000
0.992340
3.110000
0.430000
9.500000
5.000000
50%
7.000000
0.290000
0.310000
3.000000
0.047000
29.000000
118.000000
0.994890
3.210000
0.510000
10.300000
6.000000
75%
7.700000
0.400000
0.390000
8.100000
0.065000
41.000000
156.000000
0.996990
3.320000
0.600000
11.300000
6.000000
max
15.900000
1.580000
1.660000
65.800000
0.611000
289.000000
440.000000
1.038980
4.010000
2.000000
14.900000
9.000000
df .groupby ('type' )['fixed_acidity' ].mean ()
type
red 8.262603
white 6.853359
Name: fixed_acidity, dtype: float64
df .groupby ('type' ).agg (['mean' ,'median' ,'std' ,'var' ]).T
type
red
white
fixed_acidity
mean
8.262603
6.853359
median
7.900000
6.800000
std
1.648422
0.837328
var
2.717296
0.701119
volatile_acidity
mean
0.524186
0.278241
median
0.520000
0.260000
std
0.172718
0.100795
var
0.029831
0.010160
citric_acid
mean
0.270976
0.333731
median
0.260000
0.320000
std
0.194801
0.118840
var
0.037947
0.014123
residual_sugar
mean
2.538806
6.368401
median
2.200000
5.200000
std
1.409928
4.974490
var
1.987897
24.745553
chlorides
mean
0.081566
0.045415
median
0.079000
0.043000
std
0.023362
0.019768
var
0.000546
0.000391
free_sulfur_dioxide
mean
15.874922
35.131124
median
14.000000
34.000000
std
10.460157
16.267170
var
109.414884
264.620825
total_sulfur_dioxide
mean
46.467792
138.294446
median
38.000000
134.000000
std
32.895324
42.280090
var
1082.102373
1787.606010
density
mean
0.996747
0.994012
median
0.996750
0.993740
std
0.001887
0.002903
var
0.000004
0.000008
pH
mean
3.311113
3.188267
median
3.310000
3.180000
std
0.154386
0.151001
var
0.023835
0.022801
sulphates
mean
0.647880
0.489847
median
0.620000
0.470000
std
0.141561
0.114126
var
0.020040
0.013025
alcohol
mean
10.422983
10.514267
median
10.200000
10.400000
std
1.065668
1.230621
var
1.135647
1.514427
quality
mean
5.636023
5.877909
median
6.000000
6.000000
std
0.807569
0.885639
var
0.652168
0.784356
tsne_x
mean
1.440543
-0.369693
median
0.695856
0.676443
std
3.396749
5.885888
var
11.537901
34.643679
tsne_y
mean
-1.675487
0.428198
median
-5.580957
0.844001
std
7.281451
4.696751
var
53.019528
22.059467
import matplotlib
% matplotlib inline
matplotlib .style .use ('ggplot' )
import matplotlib .pyplot as plt
import seaborn as sns
sns .set_context ("notebook" )
Pandas dataframes wrap Matplotlib
df .plot .scatter (x = 'free_sulfur_dioxide' , y = 'total_sulfur_dioxide' ); # alpha=.5,s=100,figsize=(10,5)
Seaborn wraps Pandas & Matplotlib
kws = {'alpha' :0.1 ,'s' :50 }
sns .lmplot (data = df , x = 'free_sulfur_dioxide' , y = 'total_sulfur_dioxide' , hue = 'type' , fit_reg = True , size = 7 , scatter_kws = kws );
Seaborn is all sorts of 'out of the box' fancy
sns .jointplot (data = df , x = 'free_sulfur_dioxide' , y = 'total_sulfur_dioxide' , kind = "kde" ,color = 'm' ,size = 7 );
cscale = sns .choose_diverging_palette ()
# cscale = sns.choose_colorbrewer_palette('diverging')
binary_colors = [cscale [0 ],cscale [- 1 ]]
cmap = matplotlib .colors .LinearSegmentedColormap .from_list ("custom" , cscale )
# Overlay both wine types on one axes. Each subset's legend label names the
# type it was filtered on; colour index 0 is used for white and 1 for red.
ax = df[df.type == 'white'].plot.scatter(
    x='free_sulfur_dioxide', y='total_sulfur_dioxide',
    color=binary_colors[0], label='White', figsize=(10, 5))
df[df.type == 'red'].plot.scatter(
    ax=ax, x='free_sulfur_dioxide', y='total_sulfur_dioxide',
    color=binary_colors[1], label='Red');
fixed_acidity
volatile_acidity
citric_acid
residual_sugar
chlorides
free_sulfur_dioxide
total_sulfur_dioxide
density
pH
sulphates
alcohol
quality
fixed_acidity
1.000000
0.219008
0.324436
-0.111981
0.298195
-0.282735
-0.329054
0.458910
-0.252700
0.299568
-0.095452
-0.076743
volatile_acidity
0.219008
1.000000
-0.377981
-0.196011
0.377124
-0.352557
-0.414476
0.271296
0.261454
0.225984
-0.037640
-0.265699
citric_acid
0.324436
-0.377981
1.000000
0.142451
0.038998
0.133126
0.195242
0.096154
-0.329808
0.056197
-0.010493
0.085532
residual_sugar
-0.111981
-0.196011
0.142451
1.000000
-0.128940
0.402871
0.495482
0.552517
-0.267320
-0.185927
-0.359415
-0.036980
chlorides
0.298195
0.377124
0.038998
-0.128940
1.000000
-0.195045
-0.279630
0.362615
0.044708
0.395593
-0.256916
-0.200666
free_sulfur_dioxide
-0.282735
-0.352557
0.133126
0.402871
-0.195045
1.000000
0.720934
0.025717
-0.145854
-0.188457
-0.179838
0.055463
total_sulfur_dioxide
-0.329054
-0.414476
0.195242
0.495482
-0.279630
0.720934
1.000000
0.032395
-0.238413
-0.275727
-0.265740
-0.041385
density
0.458910
0.271296
0.096154
0.552517
0.362615
0.025717
0.032395
1.000000
0.011686
0.259478
-0.686745
-0.305858
pH
-0.252700
0.261454
-0.329808
-0.267320
0.044708
-0.145854
-0.238413
0.011686
1.000000
0.192123
0.121248
0.019506
sulphates
0.299568
0.225984
0.056197
-0.185927
0.395593
-0.188457
-0.275727
0.259478
0.192123
1.000000
-0.003029
0.038485
alcohol
-0.095452
-0.037640
-0.010493
-0.359415
-0.256916
-0.179838
-0.265740
-0.686745
0.121248
-0.003029
1.000000
0.444319
quality
-0.076743
-0.265699
0.085532
-0.036980
-0.200666
0.055463
-0.041385
-0.305858
0.019506
0.038485
0.444319
1.000000
plt .figure (figsize = (15 ,15 ))
plt .title ('Correlation Matrix' )
sns .heatmap (df .corr (), annot = True , cmap = cmap );
sns .pairplot (df , hue = 'type' , size = 2 , palette = binary_colors );
def outliers(df, threshold, columns):
    """Replace high-side outliers in the given columns with the column mean.

    For each column, a value counts as an outlier when it is greater than
    ``mean + threshold * std`` of that column. Outliers are overwritten with
    the mean of the remaining (non-outlier) values of the column. Only values
    above the cutoff are treated; there is no lower cutoff.

    Args:
        df: DataFrame to clean. It is modified in place.
        threshold: Number of standard deviations above the mean at which the
            cutoff sits; converted with ``float``.
        columns: Iterable of column labels to process.

    Returns:
        The same ``df`` object, after modification.
    """
    for col in columns:
        values = df[col]
        cutoff = float(threshold) * values.std() + values.mean()
        # Plain boolean array, so selection below is positional and does not
        # depend on the index labels being unique.
        mask = (values > cutoff).values
        if not mask.any():
            continue
        # Mean of the values that are kept, so the outliers themselves do not
        # pull the replacement value upward.
        replacement = values[~mask].mean()
        df[col] = np.where(mask, replacement, values)
    return df
# Takes every column except the last one.
# NOTE(review): assumes 'type' is the last column of df at this point -- verify
# if df has been reloaded with extra columns appended.
col = df .columns [:- 1 ] # We want just the numerical columns
col
Index([u'fixed_acidity', u'volatile_acidity', u'citric_acid',
u'residual_sugar', u'chlorides', u'free_sulfur_dioxide',
u'total_sulfur_dioxide', u'density', u'pH', u'sulphates', u'alcohol',
u'quality'],
dtype='object')
# Cutoff, in standard deviations above the column mean, passed to outliers().
threshold = 5
# Work on a copy so df itself keeps its raw values.
df_clean = df .copy ()
df_clean = outliers (df_clean , threshold , col )
# Same pair plot as before, now drawn from the cleaned copy.
sns .pairplot (df_clean , hue = 'type' , size = 2 , palette = binary_colors );
Scikit Learn implementation of TSNE
# Feature matrix: the columns selected in `col`, taken from the cleaned frame.
X = df_clean[col]
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
# Standardise every feature to zero mean and unit variance so that columns
# with large raw ranges do not dominate the distances t-SNE works from.
X_std = StandardScaler().fit_transform(X)
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
tsne_out = model.fit_transform(X_std)
# Column slices of the two-component embedding; unlike map(), these are
# arrays on both Python 2 and Python 3.
df_clean['tsne_x'] = tsne_out[:, 0]
df_clean['tsne_y'] = tsne_out[:, 1]
# Round-trip through a pickle so later cells can start from the saved result.
df_clean.to_pickle(os.path.join(os.getcwd(), 'wine_tsne.pickle'))
df = pd.read_pickle(os.path.join(os.getcwd(), 'wine_tsne.pickle'))
# DataFrame.plot.scatter creates its own figure (sized by figsize), so no
# separate plt.figure() call is needed; one would only leave an empty figure.
df.plot.scatter(x='tsne_x', y='tsne_y', figsize=(10, 10), title='TSNE plot without outliers');
<matplotlib.figure.Figure at 0x123b3de90>
Bokeh plotting library.
Works well out of the box for plots with minimal editing.
Can generate a static html.
Unfortunately doesn't render directly in Github.
from bokeh.plotting import figure, output_file, show, reset_output
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
reset_output()
output_notebook()
# output to static HTML file
# output_file("tsne_points.html")
p = figure(title="TSNE example", tools="hover")
# colors: one hex string per row -- binary_colors[1] for red wines,
# binary_colors[0] for all other rows. int() works for both Python and NumPy
# booleans, and only the first three (RGB) components are converted.
colors = [
    matplotlib.colors.rgb2hex(list(binary_colors[int(is_red)][:3]))
    for is_red in df.type == 'red'
]
# Keep the colours inside the data source so that every glyph property refers
# to a source column by name instead of mixing raw arrays with `source`.
source = ColumnDataSource(data=df.assign(fill_color=colors))
p.circle('tsne_x', 'tsne_y', source=source, fill_color='fill_color', fill_alpha=0.6, line_alpha=0)
hover = p.select(dict(type=HoverTool))
# One tooltip row per original column; the helper colour column is left out.
hover.tooltips = [(c, '@' + c) for c in df.columns]
p.logo = None
p.toolbar_location = None
show(p)