# Source: GitHub Gist by @ESeufert (ESeufert/ac506bcb4ad3114423c61b5ea3fa0fb0),
# created February 11, 2019 04:30.
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.colors as pltcolors
import pandas as pd
import numpy as np
import random
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import colors as mcolors
from matplotlib import cm
from beautifultable import BeautifulTable
from datetime import datetime
from datetime import timedelta
from scipy.stats import linregress
def stacked_bar(data, series_labels, category_labels=None,
                show_values=False, value_format="{}", y_label=None,
                grid=True, reverse=False, show_totals_values=False,
                totals=None, colors=None):
    """Plots a stacked bar chart with the data and labels provided.

    A new 25x15 figure is created; the function does not call plt.show().

    Keyword arguments:
    data               -- 2-dimensional numpy array or nested list
                          containing data for each series in rows
    series_labels      -- list of series labels (these appear in
                          the legend)
    category_labels    -- list of category labels (these appear
                          on the x-axis)
    show_values        -- If True then numeric value labels will
                          be shown on each bar
    value_format       -- Format string for numeric value labels
                          (default is "{}")
    y_label            -- Label for y-axis (str)
    grid               -- If True display grid
    reverse            -- If True the columns of data (the categories)
                          are drawn in reverse order, together with
                          category_labels and totals
    show_totals_values -- If True, and totals holds one entry per
                          category, each total is written in red
                          above its stacked bar
    totals             -- list with the total of each category
    colors             -- list of bar colors, one per series; when
                          empty or None matplotlib picks the colors
    """
    data = np.array(data)
    ny = data.shape[1]
    ind = list(range(ny))

    # Work on private copies so that neither shared default values nor the
    # caller's lists are ever modified.
    totals = [] if totals is None else list(totals)
    colors = [] if colors is None else list(colors)
    if category_labels is not None:
        category_labels = list(category_labels)

    if reverse:
        data = np.flip(data, axis=1)
        # Labels and totals must follow the flipped columns. A real list is
        # needed (not a `reversed` iterator) because len() is taken below.
        if category_labels is not None:
            category_labels = category_labels[::-1]
        totals = totals[::-1]

    plt.figure(figsize=(25, 15))

    axes = []
    cum_size = np.zeros(ny)
    for i, row_data in enumerate(data):
        bar_kwargs = {"bottom": cum_size, "label": series_labels[i]}
        if colors:
            bar_kwargs["color"] = colors[i]
        axes.append(plt.bar(ind, row_data, **bar_kwargs))
        # Each series is stacked on top of everything drawn so far.
        cum_size = cum_size + row_data

    # Font sizes below shrink once there are more than 15 categories.
    n_categories = len(category_labels) if category_labels is not None else ny
    crowded = n_categories > 15

    if category_labels:
        plt.xticks(ind, category_labels, fontsize=16 if crowded else 20)
        plt.xticks(rotation=45)

    if y_label:
        plt.ylabel(y_label, fontsize=20)

    plt.yticks(fontsize=20)
    plt.legend(fontsize='xx-large')

    if grid:
        plt.grid()

    if show_values:
        for axis in axes:
            for bar in axis:
                w, h = bar.get_width(), bar.get_height()
                if h != 0:
                    # Label the centre of every non-empty segment.
                    plt.text(bar.get_x() + w / 2, bar.get_y() + h / 2,
                             value_format.format(h), ha="center",
                             va="center", fontsize=22)

    # Show the total for each stacked bar, i.e. the sum of the values for
    # any given category. Skipped unless there is one total per category.
    if show_totals_values and totals and len(totals) == n_categories:
        totals_font = 18 if crowded else 26
        totals_rotate = 45 if crowded else 0
        totals_height = 10 if crowded else 3
        # Lift the text above the bar by a percentage of the mean total.
        offset = totals_height / 100 * sum(totals) / len(totals)
        for index, total in enumerate(totals):
            plt.text(index, total + offset,
                     total, ha="center",
                     va="center", fontsize=totals_font, color="r",
                     weight='bold', rotation=totals_rotate)
def build_cohort_DAU_map(cohort, map_length):
    """Return the DAU of one cohort for each of its first map_length days.

    The value for day d is cohort['cohort_size'] scaled by
    exp(cohort['retention_profile'][1] * d) and truncated to an int.
    """
    return [
        int(cohort['cohort_size'] * np.exp(cohort['retention_profile'][1] * day))
        for day in range(map_length)
    ]
def build_cohort(cohorts, date, cohort_size, retention_profile, color=None):
    """Build a one-row DataFrame describing a single cohort.

    Arguments:
    cohorts           -- DataFrame of the cohorts created so far; only its
                         'color' column is read, and only when color is None
    date              -- datetime of the cohort; its .date() is stored
    cohort_size       -- number of users in the cohort
    retention_profile -- retention profile stored with the cohort
    color             -- bar color for the cohort; when None a random entry
                         of the module-level `colors` list is used,
                         preferring one that no existing cohort has

    Returns a DataFrame with the columns 'date', 'retention_profile',
    'cohort_size' and 'color' and a single row at index 0.
    """
    if color is None:
        taken = set() if cohorts.empty else set(cohorts['color'].tolist())
        unused = [candidate for candidate in colors if candidate not in taken]
        # Once every palette color is taken a repeat is unavoidable; fall
        # back to the full palette instead of searching forever.
        color = random.choice(unused or colors)

    return pd.DataFrame(
        {
            'date': [date.date()],
            'retention_profile': [retention_profile],
            'cohort_size': [cohort_size],
            'color': [color],
        },
        columns=['date', 'retention_profile', 'cohort_size', 'color'],
    )
def create_cohorts(cohorts_DNU):
    """Create one cohort per entry of cohorts_DNU on consecutive days.

    cohorts_DNU is a list of dicts with the keys 'DNU' (cohort size) and
    'color'. Entry i is dated i days after the module-level `this_date`, and
    every cohort gets retention_profiles[0]. Returns the cohorts stacked into
    one DataFrame (an empty DataFrame when cohorts_DNU is empty).
    """
    cohorts = pd.DataFrame()
    for offset, entry in enumerate(cohorts_DNU):
        cohort = build_cohort(cohorts, this_date + timedelta(days=offset),
                              entry['DNU'], retention_profiles[0],
                              color=entry['color'])
        # DataFrame.append() no longer exists in pandas 2.x; pd.concat stacks
        # the rows the same way. The first cohort is used as-is so the empty
        # starting frame never takes part in a concat.
        cohorts = cohort if cohorts.empty else pd.concat([cohorts, cohort])
    return cohorts
def add_cohort(cohorts, date, cohort_size, retention_profile):
    """Return cohorts with one newly built cohort added as the last row.

    The new cohort is created by build_cohort() without an explicit color.
    The cohorts frame passed in is not modified.
    """
    this_cohort = build_cohort(cohorts, date, cohort_size, retention_profile)
    if cohorts.empty:
        return this_cohort
    # DataFrame.append() no longer exists in pandas 2.x; pd.concat stacks the
    # rows the same way.
    return pd.concat([cohorts, this_cohort])
def plot_DAU(DAU):
    """Plot a DAU series as a line chart, then print its summary table.

    DAU is a sequence with one value per day. The first day, each multiple
    of 90 days below int((len(DAU) - 1) / 90) and the last day are annotated
    with their day number and DAU value. After the chart is shown the same
    checkpoints are printed by print_DAU_table() using DAU[0] as cohort size.
    """
    # The default figure size has to be set before the figure is created,
    # otherwise it only affects figures created later.
    plt.rcParams['figure.figsize'] = [10, 5]
    fig, ax1 = plt.subplots()

    ax1.set_ylabel('DAU')
    ax1.plot(np.arange(len(DAU)), DAU, label='DAU', color='#ff4d4d')
    ax1.legend(loc='center right')
    fig.suptitle('DAU', fontsize=14)

    gap_size = 90
    last_day = len(DAU) - 1
    annotated_days = [0]
    annotated_days += [x * gap_size for x in range(1, int(last_day / gap_size))]
    annotated_days.append(last_day)
    for day in annotated_days:
        value = int(DAU[day])
        ax1.annotate('(Day: %s, DAU: %s)' % (day, value),
                     xy=(day, value), textcoords='data')

    plt.grid()
    plt.show()
    print_DAU_table(DAU, DAU[0])
def print_DAU_table(DAU, cohort_size):
    """Print a table of DAU checkpoints and the users lost at each one.

    Rows are written for day 0, for each multiple of 90 days below
    int((len(DAU) - 1) / 90), and for the last day. "Needed Replacement"
    is cohort_size minus the (integer) DAU of that day.
    """
    table = BeautifulTable()
    table.column_headers = ["Day", "DAU", "Needed Replacement"]

    step = 90
    final_day = len(DAU) - 1
    checkpoints = [0]
    checkpoints.extend(k * step for k in range(1, int(final_day / step)))
    checkpoints.append(final_day)

    for day in checkpoints:
        active = int(DAU[day])
        table.append_row([day, active, cohort_size - active])

    print(table)
def plot_DNU(DNU):
    """Plot DNU (one value per cohort) as a bar chart with value labels.

    DNU is a sequence of numbers; bar i is drawn at x position i and is
    labelled with its value just above the bar.
    """
    # The default figure size has to be set before the figure is created,
    # otherwise it only affects figures created later.
    plt.rcParams['figure.figsize'] = [10, 5]
    fig, ax1 = plt.subplots()

    ax1.set_ylabel('DNU')
    ax1.bar(np.arange(len(DNU)), DNU, label='DNU', color='#ff4d4d')
    ax1.legend(loc='center right')
    fig.suptitle('DNU', fontsize=14)

    # add data labels to the bars
    for rect, label in zip(ax1.patches, DNU):
        ax1.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 5,
                 label, ha='center', va='bottom')

    plt.grid()
    plt.show()
def print_retention_table(retention_profile):
    """Print the retention percentage on days 0, 1, 30, 60 and 360.

    retention_profile is indexed by day and must therefore hold at least
    361 values; each value is truncated to an int for display.
    """
    table = BeautifulTable()
    table.column_headers = ["Day", "Retention %"]
    for day in (0, 1, 30, 60, 360):
        table.append_row([day, int(retention_profile[day])])
    print(table)
def plot_retention_profile(retention_profile):
    """Plot a retention curve over 364 days and print its summary table.

    The curve is exp(retention_profile[1] * day) * 100 for day 0..363, i.e.
    a percentage that starts at 100. After the chart is shown the curve is
    passed to print_retention_table().
    """
    x = np.arange(0, 364, 1)
    this_profile = np.exp(retention_profile[1] * x) * 100

    # The default figure size has to be set before the figure is created,
    # otherwise it only affects figures created later.
    plt.rcParams['figure.figsize'] = [10, 5]
    fig, ax1 = plt.subplots()

    ax1.set_ylabel('Retention')
    ax1.plot(this_profile, '-r', label='Retention %')
    ax1.legend(loc='center right')
    fig.suptitle('Retention Profile', fontsize=14)

    # A formatter keeps the percent labels attached to whatever ticks the
    # axis ends up with, instead of freezing labels for the current ticks.
    ax1.yaxis.set_major_formatter(ticker.FormatStrFormatter('%1.2f%%'))

    plt.grid()
    plt.show()
    print_retention_table(this_profile)
def build_forward_DAU(cohorts, map_length):
    """Project every cohort's DAU forward, one column per calendar day.

    Arguments:
    cohorts    -- DataFrame with the columns 'date' (datetime.date),
                  'cohort_size' and 'retention_profile'
    map_length -- number of days to project beyond the cohort days; the
                  number of cohorts is added to it, so 0 yields exactly one
                  column per cohort

    Returns (forward_DAU, dates). dates is the list of 'YYYY-MM-DD' strings
    starting at the earliest cohort date. forward_DAU has one row per cohort,
    indexed like cohorts, with a 'cohort_date' column followed by one column
    per entry of dates. A cell holds
    int(cohort_size * exp(retention_profile[1] * age_in_days)), and 0 on days
    before the cohort exists.
    """
    # map_length should include original cohort days, so add in the length
    # of the cohorts
    map_length += len(cohorts)
    start_date = min(cohorts['date'])
    end_date = start_date + timedelta(days=map_length)  # exclusive

    dates = [str(d.date()) for d in pd.date_range(start_date, periods=map_length)]

    rows = []
    for _, cohort in cohorts.iterrows():
        row = {'cohort_date': cohort['date']}
        row.update(dict.fromkeys(dates, 0))

        size = cohort['cohort_size']
        rate = cohort['retention_profile'][1]
        day = cohort['date']
        age = 0
        while day < end_date:
            row[str(day)] = int(size * np.exp(rate * age))
            day = day + timedelta(days=1)
            age += 1
        rows.append(row)

    # Building the frame once from complete rows avoids DataFrame.append()
    # (gone in pandas 2.x) and the NaN back-fill it required.
    forward_DAU = pd.DataFrame(rows, columns=['cohort_date'] + dates,
                               index=cohorts.index)
    return (forward_DAU, dates)
def plot_DNU_retention_map(cohorts):
    """Draw the forward DAU of all cohorts as a 3D bar chart.

    The cohorts are projected with build_forward_DAU(cohorts, len(cohorts)).
    Each bar is one (cohort, day) cell of that projection, colored with the
    cohort's 'color'. Where the row and column index of the projection
    coincide, the cell value is written in white above the bars.
    """
    forward_DAU, forward_DAU_dates = build_forward_DAU(cohorts, len(cohorts))

    # One row per date column (the leading 'cohort_date' column is skipped),
    # one entry per cohort within each row.
    cohort_data = np.array(
        [forward_DAU[column].tolist() for column in forward_DAU.columns[1:]]
    )

    column_names = forward_DAU_dates
    row_names = forward_DAU_dates

    fig = plt.figure(figsize=(25, 15))
    # add_subplot attaches the 3D axes to the figure; constructing
    # Axes3D(fig) directly no longer does so on current matplotlib.
    ax = fig.add_subplot(111, projection='3d')

    lx = len(cohort_data[0])  # Work out matrix dimensions
    ly = len(cohort_data[:, 0])
    xpos = np.arange(0, lx, 1)  # Set up a mesh of positions
    ypos = np.arange(0, ly, 1)
    xpos, ypos = np.meshgrid(xpos, ypos)

    xpos = xpos.flatten()  # Convert positions to 1D array
    ypos = ypos.flatten()
    zpos = np.zeros(lx * ly)

    dx = 0.5 * np.ones_like(zpos)
    dy = dx.copy()
    dz = cohort_data.flatten()

    # The cohort colors repeat once per row of bars.
    plot_colors = cohorts['color'].tolist() * ly
    ax.bar3d(xpos, ypos, zpos, dx, dy, dz, color=plot_colors, shade=True, alpha=0.4)

    # get the max value from the entire DAU map and use that for plotting
    # the text, otherwise the text gets obscured by the other graphs for
    # cohorts with low starting DAU
    max_value = cohort_data.max()

    # print the DNU values, eg. the first value of each series
    for (x, y), value in np.ndenumerate(cohort_data):
        if x == y:
            ax.text(x, y, (max_value / 2), value, color='#FFFFFF', fontsize=40)

    # xaxis / yaxis replace the w_xaxis / w_yaxis aliases, which newer
    # matplotlib releases no longer provide.
    ax.xaxis.set_ticklabels(column_names)
    ax.yaxis.set_ticklabels(row_names)
    ax.set_xlabel('Cohort Date', fontsize=40, labelpad=80)
    ax.set_ylabel('Day', fontsize=40, labelpad=80)
    ax.set_zlabel('DAU', fontsize=40, labelpad=60)
    plt.tick_params(axis='both', which='major', labelsize=20)

    plt.show()
def plot_forward_DAU_stacked(forward_DAU, forward_DAU_dates, show_values=False,
                             show_totals_values=False, cohorts=None):
    """Draw a forward DAU projection as a stacked bar chart.

    Arguments:
    forward_DAU        -- DataFrame whose first column is 'cohort_date' and
                          whose remaining columns hold the DAU per date
    forward_DAU_dates  -- list of date strings used as x-axis labels; its
                          first len(forward_DAU) entries label the series
    show_values        -- passed through to stacked_bar()
    show_totals_values -- passed through to stacked_bar()
    cohorts            -- DataFrame whose 'color' column provides the bar
                          colors; when missing or empty an error message is
                          printed and the chart uses default colors
    """
    # One list of DAU values per cohort, without the leading cohort date.
    transformed = [row[1:] for row in forward_DAU.values.tolist()]

    totals = [forward_DAU[column].sum() for column
              in forward_DAU.loc[:, forward_DAU.columns != 'cohort_date']]

    # Start from "no colors" so the name is always bound, even when one of
    # the error branches below is taken.
    colors = []
    if cohorts is None:
        print("error: cohorts weren't sent")
    elif cohorts.empty:
        print("error: cohorts empty")
    else:
        colors = cohorts['color'].tolist()

    stacked_bar(transformed, forward_DAU_dates[0: len(transformed)],
                category_labels=forward_DAU_dates,
                show_values=show_values, value_format="{}", y_label='DAU',
                grid=True, reverse=False,
                show_totals_values=show_totals_values, totals=totals,
                colors=colors)
def build_DAU_trajecory(start_DAU, end_DAU, periods):
    """Fit the straight line through (1, start_DAU) and (periods, end_DAU).

    Returns the scipy linregress result; element 0 is the slope and
    element 1 the intercept.
    """
    return linregress([1, periods], [start_DAU, end_DAU])
def plot_projections_example(start_DAU, end_DAU, periods, DAU_values=None):
    """Plot DAU per period as a labelled bar chart.

    When DAU_values is given (and not empty) it is plotted as-is. Otherwise
    a series of `periods` zeros is plotted with only the first value set to
    start_DAU and the last set to end_DAU. Periods are numbered from 1 on
    the x-axis.
    """
    # len() rather than truthiness so array-like values work as well.
    if DAU_values is None or len(DAU_values) == 0:
        y = periods * [0]
        y[0] = start_DAU
        y[-1] = end_DAU
    else:
        y = DAU_values

    x = [str(i) for i in range(1, len(y) + 1)]

    # The default figure size has to be set before the figure is created,
    # otherwise it only affects figures created later.
    plt.rcParams['figure.figsize'] = [10, 5]
    fig, ax1 = plt.subplots()

    ax1.set_ylabel('DAU')
    ax1.bar(x, y, label='DAU', color='#ff4d4d')
    ax1.legend(loc='center right')
    fig.suptitle('DAU', fontsize=14)

    # add data labels to the bars
    for rect, label in zip(ax1.patches, y):
        ax1.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 5,
                 int(label), ha='center', va='bottom')

    ax1.set_xticklabels(x)
    plt.grid()
    plt.show()
def print_trailing_cohort_DNU_table(cohorts, periods):
    """Print the date and size of the last `periods` cohorts as a table."""
    trailing = cohorts.iloc[-periods:]
    replacement_DNU = trailing[['date', 'cohort_size']]

    table = BeautifulTable()
    table.column_headers = list(replacement_DNU.columns)
    for _, cohort_row in replacement_DNU.iterrows():
        table.append_row(cohort_row)

    print(table)
def build_DAU_projection_map(cohorts, retention_profiles, forward_DAU, DAU_values):
    """Add one cohort per DAU target so total DAU follows DAU_values.

    Arguments:
    cohorts            -- DataFrame of the existing cohorts
    retention_profiles -- list of retention profiles; the first one is
                          given to every new cohort
    forward_DAU        -- forward DAU frame whose last column name
                          ('YYYY-MM-DD') is the day before the first new
                          cohort
    DAU_values         -- DAU targets, one per day to add

    For each target the existing cohorts are advanced by one day, and a new
    cohort is added that is sized to the gap between the target and the DAU
    those cohorts retain. A negative gap gives a cohort of size 0. Returns
    the extended cohorts; an empty DAU_values returns cohorts unchanged.
    """
    # A loop instead of recursion: no recursion-depth limit, no repeated
    # slicing of DAU_values, and an empty target list is simply a no-op.
    for this_DAU_value in DAU_values:
        last_date = forward_DAU.columns.tolist()[-1]
        this_date_value = datetime.strptime(last_date, '%Y-%m-%d') + timedelta(days=1)

        # advance the cohorts forward by one day to see what the natural DAU
        # from existing cohorts would be without any additions
        advanced_DAU, _ = build_forward_DAU(cohorts, 1)
        natural_DAU = advanced_DAU.iloc[:, -1].sum()

        # calculate replacement DAU needed to hit the DAU goal
        replacement_DAU = this_DAU_value - natural_DAU

        # add this new cohort on this day; its size is the replacement DAU
        # when that is positive and 0 otherwise
        cohorts = add_cohort(cohorts, this_date_value,
                             (0 if replacement_DAU < 0 else replacement_DAU),
                             retention_profiles[0])

        # rebuild the map including the new cohort; its last column is the
        # starting point for the next target
        forward_DAU, _ = build_forward_DAU(cohorts, 0)

    return cohorts
def run_simulation(forward_DAU, DAU_target, periods, cohorts):
    """Simulate the cohorts needed to move DAU linearly to DAU_target.

    The current DAU is the sum of the last column of forward_DAU. A straight
    line from it to DAU_target over `periods` periods gives the per-period
    targets. The function plots the endpoints and the full target series,
    builds the cohorts required to hit the targets, plots the resulting
    stacked DAU and prints the sizes of the added cohorts.
    """
    ###start projections
    current_DAU = forward_DAU.iloc[:, -1].sum()  # the current value of DAU
    trend = build_DAU_trajecory(current_DAU, DAU_target, periods)
    ###end projections

    ###plot a projection example
    plot_projections_example(current_DAU, DAU_target, periods)

    slope, intercept = trend[0], trend[1]
    targets = [slope * period + intercept for period in range(1, periods + 1)]
    plot_projections_example(current_DAU, DAU_target, periods, targets)

    # the first target is the line's value at period 1, which is the DAU the
    # existing cohorts already deliver, so new cohorts start at the second
    projected_cohorts = build_DAU_projection_map(
        cohorts, retention_profiles, forward_DAU, targets[1:]
    )

    projected_DAU, projected_dates = build_forward_DAU(projected_cohorts, 0)
    plot_forward_DAU_stacked(projected_DAU, projected_dates, show_values=True,
                             show_totals_values=True, cohorts=projected_cohorts)
    print_trailing_cohort_DNU_table(projected_cohorts, periods - 1)
###setup
# 100-entry lookup over the 'tab20' color map, converted to hex strings;
# build_cohort() draws random cohort colors from this list.
# plt.get_cmap is used because cm.get_cmap is gone from recent matplotlib.
cmap = plt.get_cmap('tab20', 100)
colors = [pltcolors.rgb2hex(cmap(i)[:3]) for i in range(cmap.N)]

retention_profiles = [[0, -.008, 1]]
#the retention profile that each cohort will take
#the larger the absolute value of the 2nd element, the steeper the curve

cohorts_DNU = [{"DNU": 5482, "color": "#E4814F"},
               {"DNU": 6812, "color": "#C1CB69"},
               {"DNU": 4938, "color": "#4D7F71"},
               {"DNU": 5028, "color": "#DFD59A"},
               {"DNU": 3058, "color": "#EDA646"}]

#create_cohorts() reads this_date, so it has to be set before the call
this_date = datetime.now()
base_cohorts = create_cohorts(cohorts_DNU)

#the initial forward_DAU calculation, it creates the stacked cohorts
#but doesn't advance them further than the number of cohorts
# (eg. only the original cohorts are included but they are all projected forward
# to the end of this cohort series)
base_forward_DAU, base_forward_DAU_dates = build_forward_DAU(base_cohorts, 0)

#an example DAU map for one cohort
cohort_DAU = build_cohort_DAU_map(base_cohorts.iloc[0], 365)
###end setup

###explanatory visualizations
plot_retention_profile(retention_profiles[0])
plot_DAU(cohort_DAU)
plot_DNU(list(base_cohorts['cohort_size']))
plot_DNU_retention_map(base_cohorts)
plot_forward_DAU_stacked(base_forward_DAU, base_forward_DAU_dates,
                         show_values=True, show_totals_values=True, cohorts=base_cohorts)
###end explanatory visualizations

###show what the base cohorts would evolve to over 15 periods without any NEW cohorts
example_forward_DAU, example_forward_DAU_dates = build_forward_DAU(base_cohorts, 15)
plot_forward_DAU_stacked(example_forward_DAU, example_forward_DAU_dates, show_values=True,
                         show_totals_values=True, cohorts=base_cohorts)

###simulations
#first simulation: growing to 50000 DAU after 15 periods
run_simulation(base_forward_DAU, 50000, 15, base_cohorts)
#second simulation: maintaining the same DAU over 15 periods
run_simulation(base_forward_DAU, base_forward_DAU.iloc[:, -1].sum(), 15, base_cohorts)
#third simulation: DAU declines to 22500 over 15 periods
run_simulation(base_forward_DAU, 22500, 15, base_cohorts)
###end simulations
# (End of gist; GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")