# Gist by @mapmeld (mapmeld/eea9051410e8adb41c93b6623ad1cdf2),
# forked from jexp/bulk-neo4j-import-original.sh.
# Last active May 26, 2016 08:01.
#
# Panama Papers Import Scripts for Neo4j and Docker
# Neo4j installation prefix; the import and shell tools are invoked from
# $NEO4J_HOME/bin further down in this script.
export NEO4J_HOME="/usr/local"

# Fetch the CSV archive only when it is not already in the current directory.
#   -f  makes curl fail on an HTTP error instead of saving the error page as
#       data-csv.zip (which the -f test above would then accept as a valid
#       cached download on every later run).
# The transfer goes to a .part file first so an interrupted download is never
# mistaken for a complete archive.
if [ ! -f data-csv.zip ]; then
  if ! curl -fL -o data-csv.zip.part \
    https://cloudfront-files-1.publicintegrity.org/offshoreleaks/data-csv.zip; then
    rm -f -- data-csv.zip.part
    echo "error: could not download data-csv.zip" >&2
    exit 1
  fi
  mv -- data-csv.zip.part data-csv.zip || exit 1
fi

# Working directory that receives the extracted CSV files.
export DATA="${PWD}/import"

# Start from an empty directory. The expansion is quoted so a working
# directory containing spaces or glob characters cannot make rm touch other
# paths, and ${DATA:?} aborts rather than expanding to an empty string.
rm -rf -- "${DATA:?}"

# -j discards the directory structure stored in the archive so every CSV lands
# directly in $DATA; -o overwrites without prompting.
unzip -o -j data-csv.zip -d "$DATA"
unzip_status=$?
# unzip exits with 1 for mere warnings (extraction still completed); anything
# above that means the archive could not be unpacked properly.
if [ "$unzip_status" -gt 1 ]; then
  echo "error: could not extract data-csv.zip (unzip exit status $unzip_status)" >&2
  exit 1
fi

# Show the line count of every extracted CSV file.
wc -l "$DATA"/*.csv
# sed_in_place SCRIPT FILE
# Apply the sed SCRIPT to FILE and replace FILE with the result.
# `sed -i ''` is the BSD spelling only (GNU sed takes the '' as an input file
# name), so the edit goes through a sibling temporary file instead.
# Returns non-zero and leaves FILE untouched when sed fails.
sed_in_place() {
  if sed -e "$1" "$2" > "$2.tmp"; then
    mv -- "$2.tmp" "$2"
  else
    rm -f -- "$2.tmp"
    return 1
  fi
}

# clean_edges
# Filter: reads relationship rows on stdin, writes normalised rows to stdout.
#   1. upper-case everything;
#   2. drop every character other than A-Z, 0-9, comma, underscore and space
#      (this also removes the CSV quotes);
#   3. turn each run of ONE OR MORE spaces into a single underscore. The
#      interval \{1,\} is deliberate: the former pattern ' *' also matched
#      the empty string, which made sed insert '_' between every character;
#   4. collapse ',_' to '_', so a comma followed by a space inside a formerly
#      quoted value no longer splits the value into two columns.
# LC_ALL=C keeps the character classes byte-wise and locale-independent.
clean_edges() {
  LC_ALL=C tr '[:lower:]' '[:upper:]' \
    | LC_ALL=C sed -e 's/[^A-Z0-9,_ ]//g' -e 's/ \{1,\}/_/g' -e 's/,_/_/g'
}

# Write a copy of the address data with every backslash removed.
# NOTE(review): presumably because the importer would treat them as escape
# characters - confirm.
tr -d '\\' < "$DATA/Addresses.csv" > "$DATA/Addresses_fixed.csv"

# In the header (first line) of every node file, rename the first node_id
# occurrence to node_id:ID. The glob covers the files whose names start with
# A, I, E or O.
for csv in "$DATA"/[AIEO]*.csv; do
  [ -e "$csv" ] || continue # glob matched nothing
  printf '%s\n' "$csv"
  sed_in_place '1 s/node_id/node_id:ID/' "$csv"
done

# Remove the header line of the relationship file; a replacement header is
# written to its own file below.
sed_in_place '1 d' "$DATA/all_edges.csv"

clean_edges < "$DATA/all_edges.csv" > "$DATA/all_edges_cleaned.csv"

echo 'node_id:START_ID,rel_type:TYPE,node_id:END_ID' > "$DATA/all_edges_header.csv"

# Remove any database left over from an earlier run.
rm -rf -- "$DATA/panama.db"

# Print the first line of every CSV so the headers can be checked by eye.
head -1 "$DATA"/*.csv
# Bulk-load the prepared CSV files into a new store at $DATA/panama.db.
# Each node file is loaded under its own label. The relationship rows come
# from all_edges_cleaned.csv, with their column header supplied by the
# separate all_edges_header.csv file.
# All paths are quoted so a working directory containing spaces is passed
# through as a single argument.
# If the import fails the script stops here, because nothing that follows in
# this script is meaningful without the imported store.
"$NEO4J_HOME/bin/neo4j-import" \
  --into "$DATA/panama.db" \
  --nodes:Address "$DATA/Addresses_fixed.csv" \
  --nodes:Entity "$DATA/Entities.csv" \
  --nodes:Intermediary "$DATA/Intermediaries.csv" \
  --nodes:Officer "$DATA/Officers.csv" \
  --relationships "$DATA/all_edges_header.csv,$DATA/all_edges_cleaned.csv" \
  --ignore-empty-strings true \
  --skip-duplicate-nodes true \
  --skip-bad-relationships true \
  --bad-tolerance 1000000 \
  --multiline-fields=true \
  || { echo "error: neo4j-import failed" >&2; exit 1; }
# run_cypher QUERY
# Run one Cypher statement through neo4j-shell against the store at
# $DATA/panama.db. Paths are quoted so directories containing spaces work.
run_cypher() {
  "$NEO4J_HOME/bin/neo4j-shell" -path "$DATA/panama.db" -c "$1"
}

# Total number of nodes.
run_cypher 'MATCH (n) RETURN count(*) as nodes;'
# Node count per label combination, most frequent first.
run_cypher 'MATCH (n) RETURN labels(n),count(*) ORDER BY count(*) DESC;'
# NOTE(review): same statement as the first query; kept so the output of the
# script is unchanged - confirm whether a relationship count was intended.
run_cypher 'MATCH (n) RETURN count(*) as nodes;'
# Relationship count per type and detail property.
run_cypher 'MATCH ()-[r]->() RETURN type(r),r.detail,count(*) ORDER BY count(*) DESC;'
# Relationship counts broken down by start/end labels, from coarse (both ends
# collected per type) to fine (one row per start-label/type/end-label).
run_cypher 'MATCH (n)-[r]->(m) RETURN collect(distinct labels(n)),type(r),collect(distinct labels(m)),count(*) ORDER BY count(*) DESC;'
run_cypher 'MATCH (n)-[r]->(m) RETURN collect(distinct labels(n)),type(r),labels(m),count(*) ORDER BY count(*) DESC;'
run_cypher 'MATCH (n)-[r]->(m) RETURN labels(n),type(r),collect(distinct labels(m)),count(*) ORDER BY count(*) DESC;'
run_cypher 'MATCH (n)-[r]->(m) RETURN labels(n),type(r),labels(m),count(*) ORDER BY count(*) DESC;'
# IMPORT DONE in 23s 391ms. Imported:
# 839434 nodes
# 1269796 relationships
# 8211010 properties
# (GitHub page footer) Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment