Skip to content

Instantly share code, notes, and snippets.

@pkpp1233
Last active August 29, 2015 14:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pkpp1233/5ac93730c60e029c24b4 to your computer and use it in GitHub Desktop.
Save pkpp1233/5ac93730c60e029c24b4 to your computer and use it in GitHub Desktop.
fuzzy matcher block
import blockspring
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import os
import pandas as pd
import numpy as np
def block(request, response):
    """Fuzzy-match every proper id against the fuzzy ids and emit a CSV.

    Reads three entries from ``request.params``:

    * ``proper_ids`` -- rows whose first cell is a reference id. The first
      row is treated as a header and dropped.
    * ``fuzzy_ids`` -- rows whose first cell is a candidate string. The first
      row is likewise treated as a header and dropped.
    * ``count_matches`` -- how many matches to report per id. A missing or
      falsy value falls back to 2.

    Writes ``output.csv`` with an ``ids`` column followed by ``Match 1`` ..
    ``Match N`` columns, attaches that file to ``response`` under the name
    ``fuzzy_matched`` and ends the response. Each match cell is the fields of
    one ``process.extract`` result joined with ", ".
    """
    # Slice off the header row instead of pop(0): an empty input then simply
    # produces no data rows rather than raising IndexError.
    primary_ids = [str(row[0]) for row in request.params["proper_ids"]][1:]
    for_match_ids = [str(row[0]) for row in request.params["fuzzy_ids"]][1:]

    # An absent key is treated the same as an empty value: default to 2.
    try:
        requested = request.params["count_matches"]
    except KeyError:
        requested = None
    # Clamp at zero so a negative request cannot yield rows wider than the
    # header.
    fuzzy_matches = max(int(requested or 2), 0)

    headers = ["ids"] + ["Match " + str(i + 1) for i in range(fuzzy_matches)]
    width = len(headers)

    rows = []
    for primary_id in primary_ids:
        matches = process.extract(primary_id, for_match_ids, limit=fuzzy_matches)
        row = [primary_id] + [", ".join(str(part) for part in match) for match in matches]
        # With fewer candidates than requested matches the row comes up
        # short; pad it so pandas does not reject the row/header mismatch.
        row.extend([""] * (width - len(row)))
        rows.append(row)

    df = pd.DataFrame(rows, columns=headers)
    df.to_csv("output.csv")
    response.addFileOutput("fuzzy_matched", "output.csv")
    response.end()
# Pass `block` to blockspring.define; presumably this registers it as the
# handler blockspring invokes for each request -- TODO confirm against the
# blockspring library documentation.
blockspring.define(block)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment