Skip to content

Instantly share code, notes, and snippets.

@jfreels
Created November 30, 2021 00:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jfreels/d53ec92784193ee757c86117a4ef3ae6 to your computer and use it in GitHub Desktop.
Save jfreels/d53ec92784193ee757c86117a4ef3ae6 to your computer and use it in GitHub Desktop.
Determine average duration for each step across all sessions
""" Determine average duration for each step across all sessions """
import sys
# Sample event stream consumed by main(). Each row is
# [session, step, timestamp]; the trailing comment on a row states the
# duration that row contributes to its own step (timestamp of the session's
# next step minus this row's timestamp), or None when it contributes nothing.
STREAM = [
    # session, step, timestamp
    [1001, 1, 100000010],  # duration = 11
    [1001, 2, 100000021],  # duration = 12
    [1001, 3, 100000033],  # duration = 13
    [1001, 4, 100000046],  # duration = None (there's no next step)
    [1002, 1, 100000010],  # duration = 10
    [1002, 2, 100000020],  # duration = None (there's no next session+step)
    [1002, 2, 100000030],  # duration = None (this is not the first occurrence of this session+step)
]
# for any given session+step combo, keep only the first occurrence
# Expected average duration per step for STREAM, keyed by step number:
#   step 1: (11 + 10) / 2 = 10.5   (sessions 1001 and 1002)
#   step 2: 12 / 1 = 12.0          (session 1001 only)
#   step 3: 13 / 1 = 13.0          (session 1001 only)
EXPECTED_RESULTS = {
    1: 10.5,
    2: 12.0,
    3: 13.0
}
def main(stream=None, expected=None):
    """Compute the average duration of each step and check it against expectations.

    A session's duration for a step is the time between the first row seen
    for that step and the first row seen for a later step of the same
    session. Rows whose step is not greater than the session's latest
    accepted step (duplicates, replays of finished sessions) are ignored.
    Both the running totals and the resulting averages are printed.

    Args:
        stream: Iterable of ``[session, step, timestamp]`` rows in arrival
            order. Defaults to the module-level ``STREAM``.
        expected: Mapping of step -> expected average duration. Defaults to
            the module-level ``EXPECTED_RESULTS``.

    Returns:
        None, so ``sys.exit(main())`` exits with status 0 on success.

    Raises:
        AssertionError: If the computed averages differ from ``expected``
            (the check is an ``assert``, so it is skipped under ``python -O``).
    """
    if stream is None:
        stream = STREAM
    if expected is None:
        expected = EXPECTED_RESULTS

    # step -> running totals. Entries are created on demand, so any step
    # number is accepted and a step nobody converted from never reaches the
    # division below (no KeyError, no ZeroDivisionError).
    step_metrics = {}
    # session -> (step, timestamp) of the most advanced row accepted so far.
    # Records are kept even after the last step so that late duplicates of a
    # finished session are not mistaken for a brand-new session.
    latest_by_session = {}

    for row in stream:
        session_id, step, timestamp = row[:3]
        latest = latest_by_session.get(session_id)
        if latest is not None:
            previous_step, previous_timestamp = latest
            if step <= previous_step:
                # Not the first occurrence of this session+step (or an
                # out-of-order replay): contributes nothing.
                continue
            # Credit the step the session was actually in; for consecutive
            # steps this is the same as ``step - 1``.
            metrics = step_metrics.setdefault(
                previous_step,
                {"total_converted_sessions": 0, "total_converted_duration": 0},
            )
            metrics["total_converted_sessions"] += 1
            metrics["total_converted_duration"] += timestamp - previous_timestamp
        latest_by_session[session_id] = (step, timestamp)

    print(step_metrics)
    avg_step_durations = {
        step: totals["total_converted_duration"] / totals["total_converted_sessions"]
        for step, totals in step_metrics.items()
    }
    print(avg_step_durations)
    assert avg_step_durations == expected, (
        f"average step durations {avg_step_durations} != expected {expected}"
    )
# Script entry point: run the self-check and hand its result to the
# interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment