Created
October 15, 2021 05:43
-
-
Save lmeyerov/423df6b3b5bd85d12fd74b85eca4a17a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def validate(g):
    """Check that a graph's node and edge tables are mutually consistent.

    The checks run in order and stop at the first failure:

    1. no NA values in the edge source column, the edge destination column,
       or the node id column;
    2. every id used as an edge source/destination appears in the node table;
    3. every node id appears as a source or destination of some edge.

    A short diagnostic is printed for the failing check.

    Args:
        g: Graph-like object exposing ``_edges`` and ``_nodes`` (dataframes)
            plus ``_source``, ``_destination`` and ``_node`` (the column
            names of the edge source, edge destination and node id).

    Returns:
        A dataframe of offending rows. For an NA failure these are the rows
        of the edge/node table whose id is NA. For a membership failure it
        is a two-column frame ``[g._node, 'hit']`` (``hit`` is 0 for each
        unmatched id). When every check passes, the empty
        ``[g._node, 'hit']`` frame is returned.
    """
    # Function-scope import: the visible file has no module-level import
    # block, yet the body needs pandas.
    import pandas as pd

    edges, nodes = g._edges, g._nodes
    src_col, dst_col, node_col = g._source, g._destination, g._node

    # Ensure no NA IDs
    na_checks = (
        (edges, src_col, 'edge src has na()'),
        (edges, dst_col, 'edge dst has na()'),
        (nodes, node_col, 'node id has na()'),
    )
    for frame, col, message in na_checks:
        is_na = frame[col].isna()
        if is_na.any():
            print(message)
            return frame[is_na]

    # Distinct ids referenced by edges, sources first, then destinations.
    # The columns are read directly (instead of renaming them to the node
    # column name) so an edge table that already has a column named like
    # the node id column does not produce duplicate labels.
    edge_ids = pd.concat(
        [edges[src_col], edges[dst_col]],
        ignore_index=True,
    ).drop_duplicates()
    e_nodes = pd.DataFrame({node_col: edge_ids})

    # Ensure all edge src/dst IDs are in nodes
    e_nodes_hits = e_nodes.merge(
        nodes[[node_col]].assign(hit=1),
        on=node_col,
        how='left',
    )
    # Unmatched ids come back from the left join with hit=NaN; mark them 0.
    e_nodes_hits = e_nodes_hits.assign(hit=e_nodes_hits['hit'].fillna(0))
    e_nodes_misses = e_nodes_hits[e_nodes_hits['hit'] != 1]
    if len(e_nodes_misses) > 0:
        print('edge src/dst IDs not in nodes')
        return e_nodes_misses[[node_col, 'hit']]

    # Ensure all nodes in edge src or dst
    n_e_hits = nodes[[node_col]].merge(
        e_nodes.assign(hit=1),
        on=node_col,
        how='left',
    )
    n_e_hits = n_e_hits.assign(hit=n_e_hits['hit'].fillna(0))
    n_e_misses = n_e_hits[n_e_hits['hit'] != 1]
    if len(n_e_misses) > 0:
        print('node IDs not in edges')
        return n_e_misses[[node_col, 'hit']]

    # All checks passed: hand back the (empty) miss table.
    return e_nodes_misses[[node_col, 'hit']]
def remove_singletons(g):
    """Drop nodes that are not the source or destination of any edge.

    The graph is first passed through ``g.materialize_nodes()``. The node
    table is then filtered to the ids that occur in the edge source or
    destination column. If any rows were removed, the before/after row
    counts are printed.

    Args:
        g: Graph-like object exposing ``materialize_nodes()``, ``nodes(df)``,
            the ``_edges`` / ``_nodes`` dataframes, and the ``_source``,
            ``_destination`` and ``_node`` column names.

    Returns:
        The result of ``g.nodes(filtered_nodes)``. ``filtered_nodes`` has the
        node id as its first column and a fresh default index.
    """
    # Function-scope import: the visible file has no module-level import
    # block, yet the body needs pandas.
    import pandas as pd

    g = g.materialize_nodes()
    node_col = g._node

    # Find all src/dst IDs used in edges. The columns are read directly
    # (instead of renaming them to the node column name) so an edge table
    # that already has a column named like the node id column does not
    # produce duplicate labels.
    edge_ids = pd.concat(
        [g._edges[g._source], g._edges[g._destination]],
        ignore_index=True,
    ).drop_duplicates()
    e_nodes = pd.DataFrame({node_col: edge_ids})

    # Flag each node row: True when its id is used by some edge. The left
    # join keeps one row per node in the original order; unmatched rows get
    # hit=NaN, which becomes False.
    flagged = g._nodes[[node_col]].merge(
        e_nodes.assign(hit=1),
        on=node_col,
        how='left',
    )
    in_edges = (
        flagged
        .assign(hit=flagged['hit'].fillna(0).astype('bool'))
        .set_index(node_col)['hit']
    )

    all_nodes = g._nodes.set_index(node_col)
    kept_nodes = all_nodes[in_edges]
    if len(all_nodes) != len(kept_nodes):
        print('slimmed', len(all_nodes), '=>', len(kept_nodes))
    return g.nodes(kept_nodes.reset_index())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment