njvack · April 25, 2019 15:05
diff --git a/distribution_match.py b/distribution_match.py
 # -*- coding: utf-8 -*-

 # Written by Nate Vack <njvack@wisc.edu> at the Center for Healthy Minds

 # Copyright 2019 Board of Regents of the University of Wisconsin System
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:

 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.

 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.

 import numpy as np


 def distance(point1, point2):
    """
    Return the Euclidean distance between point1 and point2.

    The two arguments are subtracted, the difference is squared, summed with
    np.sum and square-rooted, so anything supporting that arithmetic
    (scalars, equal-length numpy arrays) is accepted.
    """
    offset = point1 - point2
    squared_total = np.sum(offset ** 2)
    return np.sqrt(squared_total)


 def euclidian_distribution_match(
        distribution_data, sample_data, fields, debug=False):
    """
    Given distribution_data, sample_data, and a list of fields, return a new
    dataframe, sampled from sample_data, of the same size as distribution_data.

    Rows of distribution_data are processed in iteration order. Each one
    claims the closest row of sample_data (Euclidean distance over `fields`)
    that has not been claimed yet, so sample_data is sampled without
    replacement. The matching is greedy: an earlier row may take the point a
    later row would have preferred.

    Parameters:
        distribution_data: dataframe whose rows are the points to match.
        sample_data: dataframe to draw matches from; it is copied, never
            modified.
        fields: list of column names, present in both frames, that span the
            space distances are measured in. Values are converted to float.
        debug: when true, print every search point and the row found for it.

    Returns:
        The matched rows of the sample_data copy, with two helper columns:
        `_match_index` holds the distribution_data index label the row was
        matched to; `_difference` holds the distance to the last
        distribution_data row processed, and rows are ordered by it.

    Raises:
        IndexError: sample_data has fewer rows than distribution_data.
    """
    samp = sample_data.copy()
    samp['_match_index'] = None

    # Sampling is without replacement, so the pool must be at least as large
    # as the target. IndexError is the type an exhausted pool has always
    # produced here; it is raised up front with a readable message.
    if len(distribution_data) > len(samp):
        raise IndexError(
            'sample_data has fewer rows than distribution_data; '
            'cannot sample without replacement')

    match_col = samp.columns.get_loc('_match_index')

    for index, row in distribution_data.iterrows():
        point1 = row[fields]
        target = point1.to_numpy(dtype=float)
        # Re-read the pool every pass: the in-place sort below reorders it.
        candidates = samp[fields].to_numpy(dtype=float)

        # One vectorised pass over the pool rather than a Python call per row.
        samp['_difference'] = np.sqrt(
            np.sum((candidates - target) ** 2, axis=1))
        samp.sort_values(by='_difference', inplace=True)

        # After sorting, the first unclaimed row is the nearest free one.
        unclaimed = samp['_match_index'].isnull().to_numpy()
        position = int(unclaimed.argmax())
        if debug:
            match = samp.iloc[position]
            match_index = match.name
            print(f'Search {point1}, found {match[fields]} at {match_index}')

        # Claim by position, not by label: sample_data may carry duplicate
        # index labels, and a label-based write would claim all of them.
        samp.iloc[position, match_col] = index
    return samp.loc[samp['_match_index'].notnull()]


 def match_distributions(distribution_data, sample_data, field_name):
    """
    Given distribution_data, sample_data, and a field_name, return a new
    dataframe, sampled from sample_data, of the same size as distribution_data
    and as closely matched on the distribution of `field_name` as possible.

    Rows of distribution_data are processed in iteration order. Each one
    claims the not-yet-claimed row of sample_data whose `field_name` value has
    the smallest absolute difference from its own, so sample_data is sampled
    without replacement. This is the single-column special case of the
    Euclidean match; the pool is re-sorted for every row, so it is still not
    an efficient algorithm.

    Parameters:
        distribution_data: dataframe whose `field_name` values are the
            targets to match.
        sample_data: dataframe to draw matches from; it is copied, never
            modified.
        field_name: name of a numeric column present in both frames.

    Returns:
        The matched rows of the sample_data copy, with two helper columns:
        `_match_index` holds the distribution_data index label the row was
        matched to; `_difference` holds the absolute difference from the last
        distribution_data row processed, and rows are ordered by it.

    Raises:
        IndexError: sample_data has fewer rows than distribution_data.
    """
    samp = sample_data.copy()
    samp['_match_index'] = None

    # Sampling is without replacement, so the pool must be at least as large
    # as the target. IndexError is the type an exhausted pool has always
    # produced here; it is raised up front with a readable message.
    if len(distribution_data) > len(samp):
        raise IndexError(
            'sample_data has fewer rows than distribution_data; '
            'cannot sample without replacement')

    match_col = samp.columns.get_loc('_match_index')

    for index, row in distribution_data.iterrows():
        val = row[field_name]
        samp['_difference'] = (samp[field_name] - val).abs()
        samp.sort_values(by=['_difference'], inplace=True)

        # After sorting, the first unclaimed row is the nearest free one.
        unclaimed = samp['_match_index'].isnull().to_numpy()
        position = int(unclaimed.argmax())

        # Claim by position, not by label: sample_data may carry duplicate
        # index labels, and a label-based write would claim all of them.
        samp.iloc[position, match_col] = index

    # Select on "has a match", not on "label > 0": index label 0 (the first
    # row of a default index) and non-numeric labels are valid matches too.
    return samp.loc[samp['_match_index'].notnull()]
	# -*- coding: utf-8 -*-

	# Written by Nate Vack <njvack@wisc.edu> at the Center for Healthy Minds

	# Copyright 2019 Board of Regents of the University of Wisconsin System
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:

	# The above copyright notice and this permission notice shall be included in
	# all copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	# THE SOFTWARE.

	import numpy as np


	def distance(point1, point2):
	    """
	    Return the Euclidean distance between point1 and point2.

	    The two arguments are subtracted, the difference is squared, summed with
	    np.sum and square-rooted, so anything supporting that arithmetic
	    (scalars, equal-length numpy arrays) is accepted.
	    """
	    return np.sqrt(np.sum((point1 - point2) ** 2))


	def euclidian_distribution_match(
	        distribution_data, sample_data, fields, debug=False):
	    """
	    Given distribution_data, sample_data, and a list of fields, return a new
	    dataframe, sampled from sample_data, of the same size as distribution_data.

	    Rows of distribution_data are processed in iteration order. Each one
	    claims the closest row of sample_data (Euclidean distance over `fields`)
	    that has not been claimed yet, so sample_data is sampled without
	    replacement. The matching is greedy: an earlier row may take the point a
	    later row would have preferred.

	    Parameters:
	        distribution_data: dataframe whose rows are the points to match.
	        sample_data: dataframe to draw matches from; it is copied, never
	            modified.
	        fields: list of column names, present in both frames, that span the
	            space distances are measured in. Values are converted to float.
	        debug: when true, print every search point and the row found for it.

	    Returns:
	        The matched rows of the sample_data copy, with two helper columns:
	        `_match_index` holds the distribution_data index label the row was
	        matched to; `_difference` holds the distance to the last
	        distribution_data row processed, and rows are ordered by it.

	    Raises:
	        IndexError: sample_data has fewer rows than distribution_data.
	    """
	    samp = sample_data.copy()
	    samp['_match_index'] = None

	    # Sampling is without replacement, so the pool must be at least as large
	    # as the target. IndexError is the type an exhausted pool has always
	    # produced here; it is raised up front with a readable message.
	    if len(distribution_data) > len(samp):
	        raise IndexError(
	            'sample_data has fewer rows than distribution_data; '
	            'cannot sample without replacement')

	    match_col = samp.columns.get_loc('_match_index')

	    for index, row in distribution_data.iterrows():
	        point1 = row[fields]
	        target = point1.to_numpy(dtype=float)
	        # Re-read the pool every pass: the in-place sort below reorders it.
	        candidates = samp[fields].to_numpy(dtype=float)

	        # One vectorised pass over the pool rather than a Python call per row.
	        samp['_difference'] = np.sqrt(
	            np.sum((candidates - target) ** 2, axis=1))
	        samp.sort_values(by='_difference', inplace=True)

	        # After sorting, the first unclaimed row is the nearest free one.
	        unclaimed = samp['_match_index'].isnull().to_numpy()
	        position = int(unclaimed.argmax())
	        if debug:
	            match = samp.iloc[position]
	            match_index = match.name
	            print(f'Search {point1}, found {match[fields]} at {match_index}')

	        # Claim by position, not by label: sample_data may carry duplicate
	        # index labels, and a label-based write would claim all of them.
	        samp.iloc[position, match_col] = index
	    return samp.loc[samp['_match_index'].notnull()]


	def match_distributions(distribution_data, sample_data, field_name):
	    """
	    Given distribution_data, sample_data, and a field_name, return a new
	    dataframe, sampled from sample_data, of the same size as distribution_data
	    and as closely matched on the distribution of `field_name` as possible.

	    Rows of distribution_data are processed in iteration order. Each one
	    claims the not-yet-claimed row of sample_data whose `field_name` value has
	    the smallest absolute difference from its own, so sample_data is sampled
	    without replacement. This is the single-column special case of the
	    Euclidean match; the pool is re-sorted for every row, so it is still not
	    an efficient algorithm.

	    Parameters:
	        distribution_data: dataframe whose `field_name` values are the
	            targets to match.
	        sample_data: dataframe to draw matches from; it is copied, never
	            modified.
	        field_name: name of a numeric column present in both frames.

	    Returns:
	        The matched rows of the sample_data copy, with two helper columns:
	        `_match_index` holds the distribution_data index label the row was
	        matched to; `_difference` holds the absolute difference from the last
	        distribution_data row processed, and rows are ordered by it.

	    Raises:
	        IndexError: sample_data has fewer rows than distribution_data.
	    """
	    samp = sample_data.copy()
	    samp['_match_index'] = None

	    # Sampling is without replacement, so the pool must be at least as large
	    # as the target. IndexError is the type an exhausted pool has always
	    # produced here; it is raised up front with a readable message.
	    if len(distribution_data) > len(samp):
	        raise IndexError(
	            'sample_data has fewer rows than distribution_data; '
	            'cannot sample without replacement')

	    match_col = samp.columns.get_loc('_match_index')

	    for index, row in distribution_data.iterrows():
	        val = row[field_name]
	        samp['_difference'] = (samp[field_name] - val).abs()
	        samp.sort_values(by=['_difference'], inplace=True)

	        # After sorting, the first unclaimed row is the nearest free one.
	        unclaimed = samp['_match_index'].isnull().to_numpy()
	        position = int(unclaimed.argmax())

	        # Claim by position, not by label: sample_data may carry duplicate
	        # index labels, and a label-based write would claim all of them.
	        samp.iloc[position, match_col] = index

	    # Select on "has a match", not on "label > 0": index label 0 (the first
	    # row of a default index) and non-numeric labels are valid matches too.
	    return samp.loc[samp['_match_index'].notnull()]