Skip to content

Instantly share code, notes, and snippets.

@njvack
Created April 25, 2019 15:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save njvack/8b1c6d1ffc829bde337f6610db5ccf4a to your computer and use it in GitHub Desktop.
Save njvack/8b1c6d1ffc829bde337f6610db5ccf4a to your computer and use it in GitHub Desktop.
Match distributions of two pandas dataframes
# -*- coding: utf-8 -*-
# Written by Nate Vack <njvack@wisc.edu> at the Center for Healthy Minds
# Copyright 2019 Board of Regents of the University of Wisconsin System
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import numpy as np
def distance(point1, point2):
    """
    Return the euclidean distance between point1 and point2.

    Both arguments must support elementwise subtraction and `** 2`
    (for example numpy arrays of equal length).
    """
    offset = point1 - point2
    squared_total = np.sum(offset ** 2)
    return np.sqrt(squared_total)
def euclidian_distribution_match(
        distribution_data, sample_data, fields, debug=False):
    """
    Given distribution_data, sample_data, and a list of fields, return a new
    dataframe, sampled from sample_data, of the same size as distribution_data.

    For each row of distribution_data (in iteration order), the not yet used
    row of sample_data with the smallest euclidean distance over `fields` is
    selected. sample_data is sampled without replacement and is not modified;
    the work happens on a copy.

    The returned dataframe carries two helper columns:
      * '_match_index': the index label of the distribution_data row that
        the sample row was matched to.
      * '_difference': the distance to the *last* distribution_data row
        processed (the rows are returned sorted by this column).

    `fields` must be a list of column names present in both dataframes, and
    their values must be convertible to float. If debug is true, every match
    is printed.

    Raises IndexError (from `.iloc[0]`) when sample_data has fewer rows than
    distribution_data, because no unmatched row is left to pick.

    NOTE(review): matches are recorded by index label, so this presumably
    expects sample_data to have a unique index -- confirm with callers.
    """
    samp = sample_data.copy()
    samp['_match_index'] = None
    for index, row in distribution_data.iterrows():
        point1 = row[fields]
        # Compute all distances in one vectorised step instead of calling a
        # Python function per sample row. The coordinates are re-read every
        # iteration because the in-place sort below reorders samp.
        target = np.asarray(point1, dtype=float)
        candidates = np.asarray(samp[fields], dtype=float)
        samp['_difference'] = np.sqrt(
            np.sum((candidates - target) ** 2, axis=1))
        samp.sort_values(by='_difference', inplace=True)
        # The closest row that has not been handed out yet.
        match = samp.loc[samp['_match_index'].isnull()].iloc[0]
        match_index = match.name
        if debug:
            print(f'Search {point1}, found {match[fields]} at {match_index}')
        samp.loc[match_index, '_match_index'] = index
    return samp.loc[~samp['_match_index'].isnull()]
def match_distributions(distribution_data, sample_data, field_name):
    """
    Given distribution_data, sample_data, and a field_name, return a new
    dataframe, sampled from sample_data, of the same size as distribution_data
    and as closely matched on the distribution of `field_name` as possible.

    For each row of distribution_data (in iteration order), the not yet used
    row of sample_data whose `field_name` value has the smallest absolute
    difference is selected. sample_data is sampled without replacement and is
    not modified; the work happens on a copy.

    The returned dataframe carries two helper columns:
      * '_match_index': the index label of the distribution_data row that
        the sample row was matched to.
      * '_difference': the absolute difference to the *last*
        distribution_data row processed (rows are returned sorted by it).

    This is a single-field special case of euclidean matching. It re-sorts
    the whole sample for every distribution row, so it is not efficient.

    Raises IndexError (from `.iloc[0]`) when sample_data has fewer rows than
    distribution_data, because no unmatched row is left to pick.

    NOTE(review): matches are recorded by index label, so this presumably
    expects sample_data to have a unique index -- confirm with callers.
    """
    samp = sample_data.copy()
    samp['_match_index'] = None
    for index, row in distribution_data.iterrows():
        val = row[field_name]
        samp['_difference'] = (samp[field_name] - val).abs()
        samp.sort_values(by=['_difference'], inplace=True)
        # The closest row that has not been handed out yet.
        unmatched = samp.loc[samp['_match_index'].isnull()]
        match_index = unmatched.iloc[0].name
        samp.loc[match_index, '_match_index'] = index
    # Select on "has a match", not "> 0": a row matched to index label 0
    # (the first label of a default RangeIndex) must be kept, and index
    # labels are not necessarily numeric.
    return samp.loc[samp['_match_index'].notnull()]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment