ken333135 · May 8, 2019 07:57 · paulrougieux · Sep 23, 2019 · ken333135 · Sep 24, 2019
diff --git a/genSankey b/genSankey
 def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
    # maximum of 6 value cols -> 6 colors
    colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
    labelList = []
    colorNumList = []
    for catCol in cat_cols:
        labelListTemp =  list(set(df[catCol].values))
        colorNumList.append(len(labelListTemp))
        labelList = labelList + labelListTemp
        
    # remove duplicates from labelList
    labelList = list(dict.fromkeys(labelList))
    
    # define colors based on number of levels
    colorList = []
    for idx, colorNum in enumerate(colorNumList):
        colorList = colorList + [colorPalette[idx]]*colorNum
        
    # transform df into a source-target pair
    for i in range(len(cat_cols)-1):
        if i==0:
            sourceTargetDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
            sourceTargetDf.columns = ['source','target','count']
        else:
            tempDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
            tempDf.columns = ['source','target','count']
            sourceTargetDf = pd.concat([sourceTargetDf,tempDf])
        sourceTargetDf = sourceTargetDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()
        
    # add index for source-target pair
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(lambda x: labelList.index(x))
    sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(lambda x: labelList.index(x))
    
    # creating the sankey diagram
    data = dict(
        type='sankey',
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label = labelList,
          color = colorList
        ),
        link = dict(
          source = sourceTargetDf['sourceID'],
          target = sourceTargetDf['targetID'],
          value = sourceTargetDf['count']
        )
      )
    
    layout =  dict(
        title = title,
        font = dict(
          size = 10
        )
    )
       
    fig = dict(data=[data], layout=layout)
    return fig
	def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
	# maximum of 6 value cols -> 6 colors
	colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
	labelList = []
	colorNumList = []
	for catCol in cat_cols:
	labelListTemp = list(set(df[catCol].values))
	colorNumList.append(len(labelListTemp))
	labelList = labelList + labelListTemp

	# remove duplicates from labelList
	labelList = list(dict.fromkeys(labelList))

	# define colors based on number of levels
	colorList = []
	for idx, colorNum in enumerate(colorNumList):
	colorList = colorList + [colorPalette[idx]]*colorNum

	# transform df into a source-target pair
	for i in range(len(cat_cols)-1):
	if i==0:
	sourceTargetDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
	sourceTargetDf.columns = ['source','target','count']
	else:
	tempDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
	tempDf.columns = ['source','target','count']
	sourceTargetDf = pd.concat([sourceTargetDf,tempDf])
	sourceTargetDf = sourceTargetDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()

	# add index for source-target pair
	sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(lambda x: labelList.index(x))
	sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(lambda x: labelList.index(x))

	# creating the sankey diagram
	data = dict(
	type='sankey',
	node = dict(
	pad = 15,
	thickness = 20,
	line = dict(
	color = "black",
	width = 0.5
	),
	label = labelList,
	color = colorList
	),
	link = dict(
	source = sourceTargetDf['sourceID'],
	target = sourceTargetDf['targetID'],
	value = sourceTargetDf['count']
	)
	)

	layout = dict(
	title = title,
	font = dict(
	size = 10
	)
	)

	fig = dict(data=[data], layout=layout)
	return fig