Interact with mutations found in cancers, and explore the drugs that target them


Author: Brin Rosenthal ([email protected])


In [135]:
# import some useful packages
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import networkx as nx
import pandas as pd
import random
import community
import json
import os

# latex rendering of text in graphs
import matplotlib as mpl
mpl.rc('text', usetex = False)
mpl.rc('font', family = 'serif')


% matplotlib inline

# import visJS_2_jupyter 
import sys
code_path = '/Users/brin/CCBB_internal/Brin/source/visJS_2_jupyter/'
sys.path.append(code_path)
sys.path.append('/Users/brin/Google_Drive/UCSD/cluster_code/network_propagation/')
import visJS_module as visJS_module
import network_prop

import imp
imp.reload(visJS_module)
Out[135]:
<module 'visJS_module' from '/Users/brin/CCBB_internal/Brin/source/visJS_2_jupyter/visJS_module.pyc'>
In [ ]:
 

TOC

Load drug-gene information

  • Load the DrugBank drug-gene file and build pandas Series that map each gene target to its drugs (and each drug to its gene targets)
  • Run heat-prop simulation from a set of mutations in a single cluster
  • What are the nearby druggable genes?
In [3]:
# Load the drug records.
# NOTE(review): `DGH` is not imported in any cell shown here -- it has to be
# in the kernel namespace already for this cell to run.
DBdict = DGH.load_DB_data('/Users/brin/Google_Drive/UCSD/TCGA_data/drug_gene_heatprop/drugbank.0.json.new')

# Flatten the records into two parallel lists holding one entry per
# (drug, gene target) pair: every element of a drug's 'node_list' contributes
# its 'name' to gene_list and the drug key to drug_list.
drug_list, gene_list = [], []
for drug_name, drug_record in DBdict.items():
    target_names = [target['name'] for target in drug_record['node_list']]
    gene_list += target_names
    drug_list += [drug_name] * len(target_names)

# Series used as lookups in both directions; an index label repeats whenever
# a gene has several drugs (or a drug has several targets).
gene_to_drug = pd.Series(drug_list, index=gene_list)
drug_to_gene = pd.Series(gene_list, index=drug_list)
    

TOC

Get a list of mutations

  • Many commonly mutated genes are found across lots of diseases. Are these known oncogenes?
  • Build a bipartite graph linking the most commonly mutated genes (the top `num_mutations` in each disease, 25 in the code below) to the diseases in which they are found
In [98]:
# TCGA cohort codes; each one names a sub-directory holding that cohort's
# mutation_matrix.txt, and each becomes a disease node in the graph below.
d_list = [
    'ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL',
    'COAD', 'COADREAD', 'DLBC', 'ESCA', 'GBM',
    'GBMLGG', 'HNSC', 'KICH', 'KIPAN', 'KIRC',
    'KIRP', 'LAML', 'LGG', 'LIHC', 'LUAD',
    'LUSC', 'OV', 'PAAD', 'PCPG', 'PRAD',
    'READ', 'SARC', 'SKCM', 'STAD', 'STES',
    'TGCT', 'THCA', 'UCEC', 'UCS', 'UVM',
]
# number of cohorts (displayed as the cell output)
len(d_list)
Out[98]:
35
In [49]:
# For every TCGA cohort, rank genes by their mean value across the mutation
# matrix and keep the top `num_mutations` as (gene, disease, mean) triples.
# The accumulated list is the weighted edge list of the gene-disease graph.
mutation_EL = []
num_mutations = 25  # number of top-ranked genes kept per disease
# NOTE(review): absolute path on the author's machine -- adjust before re-running
mutation_data_dir = '/Users/brin/Documents/TCGA_data/expression_files/'
for d1 in d_list:
    print(d1)

    mutation_df = pd.read_csv(os.path.join(mutation_data_dir, d1, 'mutation_matrix.txt'), sep='\t')
    # index rows by gene name with its last two characters removed
    # (presumably a suffix attached to the gene symbol -- TODO confirm against the files)
    mutation_df.index = [gene[:-2] for gene in mutation_df['gene_name']]

    # per-gene mean over the numeric columns; numeric_only keeps the string
    # 'gene_name' column out of the average explicitly
    mutation_df['row_average'] = mutation_df.mean(axis=1, numeric_only=True)

    # DataFrame.sort no longer exists in pandas; sort_values is its replacement.
    # To rank by a single sample instead, sort on that sample's column name here.
    mutation_df = mutation_df.sort_values('row_average', ascending=False)

    # reorder the columns: first column, then row_average, then everything else
    cols = list(mutation_df.columns)
    mutation_df = mutation_df[[cols[0], cols[-1]] + cols[1:-1]]

    # one (gene, disease, mean) triple per retained gene
    top_genes = mutation_df.head(num_mutations)
    mutation_EL_temp = zip(list(top_genes.index), [d1]*len(top_genes), list(top_genes['row_average']))
    mutation_EL.extend(mutation_EL_temp)
ACC
BLCA
BRCA
CESC
CHOL
COAD
COADREAD
DLBC
ESCA
GBM
GBMLGG
HNSC
KICH
KIPAN
KIRC
KIRP
LAML
LGG
LIHC
LUAD
LUSC
OV
PAAD
PCPG
PRAD
READ
SARC
SKCM
STAD
STES
TGCT
THCA
UCEC
UCS
UVM
In [53]:
# Turn the (gene, disease, value) triples in mutation_EL into an undirected
# graph; the third element of each triple is stored as the edge's 'weight'.
G_mutation = nx.Graph()
G_mutation.add_weighted_edges_from(mutation_EL, weight='weight')
In [88]:
# Save the edge list to disk so the graph can be rebuilt later without
# re-reading every mutation matrix. Written with to_csv defaults, so the file
# carries a header row and the row index as its first column.
mutation_DF = pd.DataFrame(data=mutation_EL)
mutation_DF.to_csv(path_or_buf='mutation_EL.csv')
In [ ]:
 
In [95]:
# load graph from pre-saved edgelist
# The saved file's header row is skipped and the columns are renamed here;
# 'gene2' actually holds the disease code (see the preview below).
G_mut_DF = pd.read_csv('mutation_EL.csv',names=['gene1','gene2','weight',],skiprows=1)

# networkx renamed from_pandas_dataframe to from_pandas_edgelist in the 2.x
# series; use whichever one the installed version provides.
_edgelist_to_graph = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe
G_mutation = _edgelist_to_graph(G_mut_DF, source='gene1', target='gene2', edge_attr='weight')
G_mut_DF.head()
Out[95]:
gene1 gene2 weight
0 ZFPM1 ACC 2.333333
1 MUC5B ACC 1.622222
2 CRIPAK ACC 1.577778
3 GARS ACC 1.577778
4 ZNF517 ACC 1.466667
In [96]:
# prep graph for visJS_2_jupyter visualization
# add BC and CC as attributes
# create the per-node lookups (size, shape, highlight, label, title) that the
# drawing cell turns into nodes_dict / edges_dict

# node positions from a force-directed layout
pos = nx.spring_layout(G_mutation,k=.2)

# Materialise nodes/edges as lists: the drawing cell indexes into them, which
# works for networkx 1.x lists but not for the views returned by networkx 2.x.
nodes = list(G_mutation.nodes())
numnodes = len(nodes)
edges = list(G_mutation.edges())
edges_with_data = list(G_mutation.edges(data=True))
numedges = len(edges)

# add node attributes to color-code by
cc = nx.clustering(G_mutation)
degree = dict(G_mutation.degree())  # plain dict under both networkx 1.x and 2.x
bc = nx.betweenness_centrality(G_mutation)
# keyword arguments: the positional order of name/values differs between
# networkx 1.x and 2.x, the parameter names do not
nx.set_node_attributes(G_mutation, name='clustering_coefficient', values=cc)
nx.set_node_attributes(G_mutation, name='degree', values=degree)
nx.set_node_attributes(G_mutation, name='betweenness_centrality', values=bc)


# set node_size to degree, rescaled to an integer between 1 and 26
max_degree = max(degree.values())
node_size = [int(float(deg)/max_degree*25+1) for deg in degree.values()]
node_to_nodeSize = dict(zip(degree.keys(),node_size))


# add nodes to highlight (druggable genes): 1 if the node is a key of
# gene_to_drug, else 0
druggable_genes = set(gene_to_drug.index)
nodes_HL = {node: (1 if node in druggable_genes else 0) for node in nodes}

# disease nodes are drawn as triangles, gene nodes as dots
nodes_shape=[]  # not used in this cell; kept because later cells may read it
disease_nodes = set(d_list)
node_shape = ['triangle' if (node in disease_nodes) else 'dot' for node in nodes]
node_to_nodeShape=dict(zip(nodes,node_shape))

# add a field for node labels (HTML): disease nodes keep their bare name,
# gene nodes get their disease count and the drugs that target them
node_labels_temp = []
list_of_genes = list(np.setdiff1d(nodes,d_list))  # every node that is not a disease code
for node in nodes:
    label_temp = node

    if node not in disease_nodes:
        label_temp+= ': <br/>'

        label_temp+='mutation found in = ' + str(degree[node]) + ' out of '+str(len(d_list))+' diseases<br/><br/>'
        label_temp+='Drugs targeting this gene: <br/>'

        if node in druggable_genes:
            drugs_temp = gene_to_drug[node]
            # a label that occurs once in the index yields a single drug name,
            # a repeated label yields a Series of names; normalise to an iterable
            if not isinstance(drugs_temp, pd.Series):
                drugs_temp = [drugs_temp]
            for d in drugs_temp:
                label_temp+=d + '<br/>'
        else:
            label_temp+='None'

    node_labels_temp.append(label_temp)

node_labels = dict(zip(nodes,node_labels_temp))

# titles are simply the node names
node_titles = dict(zip(nodes,nodes))
    
    

TOC

Draw the bipartite mutation graph

  • Circles are commonly mutated genes
  • Triangles are the TCGA diseases these mutations are found in
In [138]:
# draw the graph here

# node colors are mapped from the 'degree' node attribute, edge colors from
# the 'weight' edge attribute
node_to_color = visJS_module.return_node_to_color(G_mutation,field_to_map='degree',cmap=mpl.cm.jet,alpha = 1,
                                                  color_vals_transform=None,
                                                 color_max_frac = .9,color_min_frac = .2)

edge_to_color = visJS_module.return_edge_to_color(G_mutation,field_to_map = 'weight',cmap=mpl.cm.Greys,alpha=.6,vmin=0,vmax=1)

# one record per node; layout coordinates are scaled by 1000 for the canvas
nodes_dict = [{"id":n,"degree":G_mutation.degree(n),"color":node_to_color[n],
              "node_size":node_to_nodeSize[n],'border_width':nodes_HL[n],
              "node_label":node_labels[n],
              "edge_label":'',
              "title":node_labels[n],
              "node_shape":node_to_nodeShape[n],
              "x":pos[n][0]*1000,
              "y":pos[n][1]*1000} for n in nodes
              ]
node_map = dict(zip(nodes,range(numnodes)))  # map to indices for source/target in edges

# one record per edge; walk `edges` and `edges_with_data` in step rather than
# indexing them, so this also works when they are not plain lists
edges_dict = [{"source":node_map[edge[0]], "target":node_map[edge[1]],
              "color":edge_to_color[edge],"title":edge_data[2]['weight']}
              for edge, edge_data in zip(edges, edges_with_data)]

visJS_module.visjs_network(nodes_dict,edges_dict,
                            node_highlight_color="black",
                            node_hover_color = 'orange',
                            node_border_color='black',
                            node_size_field='node_size',
                            node_size_transform='Math.sqrt',
                            node_size_multiplier=4,
                            node_border_width=2,
                            hover = False,
                            label_field='id',
                            edge_width=.2,
                            hover_connected_edges = False,
                            physics_enabled=False,
                            min_velocity=.5,
                            max_velocity=16,
                            draw_threshold=20,
                            min_label_size=12,
                            max_label_size=25,
                            max_visible=10,
                            edge_label_size=50,
                            edge_title_field='title',
                            graph_title = 'mutation graph- top '+str(num_mutations)+' mutations in each disease')
Out[138]:
Network | Basic usage
In [ ]: