Skip to content

Instantly share code, notes, and snippets.

@lmeyerov
Created October 15, 2021 05:43
Show Gist options
  • Save lmeyerov/423df6b3b5bd85d12fd74b85eca4a17a to your computer and use it in GitHub Desktop.
Save lmeyerov/423df6b3b5bd85d12fd74b85eca4a17a to your computer and use it in GitHub Desktop.
def validate(g):
    """Check that a graph's node and edge tables reference each other consistently.

    The checks run in order and stop at the first failure, printing a short
    message and returning the offending rows:

    1. no NA values in the edge source column, the edge destination column,
       or the node ID column (returns the matching rows of the edge/node table);
    2. every ID used as an edge source/destination appears in the node table;
    3. every node ID appears as some edge's source or destination.

    For checks 2 and 3 the returned frame has the node ID column plus a
    ``hit`` column that is 0 for every reported row.

    Args:
        g: object exposing ``_edges`` and ``_nodes`` tables and the column
            names ``_source``, ``_destination`` and ``_node``.
            Assumes both tables are pandas DataFrames -- TODO confirm.

    Returns:
        The offending rows for the first failed check, or an empty
        ``[node ID, 'hit']`` frame when every check passes.
    """
    # Function-scope import so the function does not depend on a
    # module-level `pd` that the surrounding file never defines.
    import pandas as pd

    edges, nodes = g._edges, g._nodes
    src, dst, node = g._source, g._destination, g._node

    # Ensure no NA IDs
    for frame, column, message in (
        (edges, src, 'edge src has na()'),
        (edges, dst, 'edge dst has na()'),
        (nodes, node, 'node id has na()'),
    ):
        na_mask = frame[column].isna()
        if na_mask.any():
            print(message)
            return frame[na_mask]

    # Unique IDs referenced by edges. The endpoint columns are read directly:
    # renaming them to the node column name first breaks when the edge table
    # already has a column with that name (duplicate labels make the selection
    # a DataFrame, which has no .unique()).
    e_nodes = pd.DataFrame({
        node: pd.concat([edges[src], edges[dst]], ignore_index=True).drop_duplicates(),
    })

    # Ensure all edge src/dst IDs are in nodes
    e_nodes_hits = e_nodes.merge(nodes[[node]].assign(hit=1), how='left', on=node)
    e_nodes_hits = e_nodes_hits.assign(hit=e_nodes_hits['hit'].fillna(0))
    e_nodes_misses = e_nodes_hits[e_nodes_hits['hit'] != 1]
    if len(e_nodes_misses) > 0:
        print('edge src/dst IDs not in nodes')
        return e_nodes_misses[[node, 'hit']]

    # Ensure all nodes in edge src or dst
    n_e_hits = nodes[[node]].merge(e_nodes.assign(hit=1), how='left', on=node)
    n_e_hits = n_e_hits.assign(hit=n_e_hits['hit'].fillna(0))
    n_e_misses = n_e_hits[n_e_hits['hit'] != 1]
    if len(n_e_misses) > 0:
        print('node IDs not in edges')
        return n_e_misses[[node, 'hit']]

    # Everything is consistent: hand back the (empty) miss table so callers
    # can keep testing len(result) == 0.
    return e_nodes_misses[[node, 'hit']]
def remove_singletons(g):
    """Drop nodes that are not the source or destination of any edge.

    Calls ``g.materialize_nodes()`` first (presumably so a node table exists
    even when only edges were bound -- verify against the graph class), keeps
    the rows of the node table whose ID occurs in the edge source or
    destination column, prints a one-line summary when rows were removed, and
    returns ``g.nodes(...)`` applied to the filtered table.

    The filtered table has the node ID column first and a fresh 0..k-1 index.

    Args:
        g: object exposing ``materialize_nodes()``, ``nodes(df)``, the
            ``_edges`` / ``_nodes`` tables and the column names ``_source``,
            ``_destination`` and ``_node``.

    Returns:
        Whatever ``g.nodes`` returns for the filtered node table.
    """
    g = g.materialize_nodes()
    edges = g._edges

    n2 = g._nodes.set_index(g._node)

    # Membership is tested against the endpoint columns directly; renaming
    # them to the node column name (as an intermediate step) fails when the
    # edge table already carries a column with that name.
    in_edges = (
        n2.index.isin(edges[g._source])
        | n2.index.isin(edges[g._destination])
    )
    n3 = n2[in_edges]

    if len(n2) != len(n3):
        print('slimmed', len(n2), '=>', len(n3))
    return g.nodes(n3.reset_index())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment