Skip to content

Instantly share code, notes, and snippets.

javascript: Promise.all([import('https://unpkg.com/turndown@6.0.0?module'), import('https://unpkg.com/@tehshrike/readability@0.2.0'),]).then(async ([{
default: Turndown
}, {
default: Readability
}]) => {
/* Optional vault name */
const vault = "";
/* Optional folder name such as "Clippings/" */
@asifr
asifr / consecutive_group_ids.py
Created December 6, 2020 17:00
Create unique IDs from a run of 0s and 1s
def consecutive_group_ids(df, id_col: str, time_col: str, value_col: str, event_id_col: str="EventID"):
w1 = Window.partitionBy(id_col).orderBy(time_col)
wcumsum = (
Window
.partitionBy(id_col)
.orderBy(time_col)
.rangeBetween(Window.unboundedPreceding, 0)
)
res = (
@asifr
asifr / generate_ids.py
Created December 6, 2020 06:18
Group consecutive values and return unique ids
import numpy as np
def generate_ids(x):
    """Assign a run id to every element of a 1-D sequence.

    Consecutive equal values share an id; the id grows by one each time
    the value differs from the element before it.

    Args:
        x: 1-D array-like (NumPy array, list, tuple, ...).

    Returns:
        Float ``np.ndarray`` with ``len(x)`` ids, starting at 0.
    """
    # Coerce first: on a plain Python list ``x[1:] != x[:-1]`` collapses to
    # a single bool instead of comparing element by element.
    values = np.asarray(x)
    ids = np.zeros(len(values))
    # True at position i means values[i + 1] opens a new run, so the running
    # count of changes is the run id of every element after the first.
    ids[1:] = np.cumsum(values[1:] != values[:-1])
    return ids
@asifr
asifr / resample.py
Last active December 6, 2020 06:19
Resample a numpy array
import numpy as np
def resample(x, t, start, end, step):
    """Place irregularly timed samples onto a regular time grid.

    The grid is ``np.arange(start, end + step, step)``. Column ``j`` of the
    result receives the samples whose time falls in
    ``[bins[j], bins[j] + step)``.

    Args:
        x: 2-D array-like of sample values, one series per row.
        t: 2-D array-like with the same shape as ``x`` giving each
            sample's time.
        start: Left edge of the first bin.
        end: Upper limit used to build the grid (see above).
        step: Bin width; assumed positive.

    Returns:
        Tuple ``(y, bins)``. ``y`` is a float array of shape
        ``(n_rows, len(bins))`` holding NaN wherever no sample fell in the
        bin; ``bins`` holds the left edge of every bin. Samples outside
        the grid (or with a NaN time) are dropped.
    """
    x = np.asarray(x)
    t = np.asarray(t)
    bins = np.arange(start, end + step, step)
    y = np.full((x.shape[0], len(bins)), np.nan)
    if bins.size == 0:
        return y, bins

    # np.digitize is 1-based (0 means "before the first edge"), so shift by
    # one to get the column whose left edge is bins[col].
    cols = np.digitize(t, bins) - 1
    # Drop samples before the grid (col == -1) and past the last bin; the
    # upper comparison is also False for NaN times, which digitize would
    # otherwise sort into the last bin.
    in_grid = (cols >= 0) & (t < bins[-1] + step)
    rows = np.nonzero(in_grid)[0]
    y[rows, cols[in_grid]] = x[in_grid]
    return y, bins
import numpy as np
def pad_sequences(
sequences, maxlen=None, dtype="int32", padding="pre", truncating="pre", value=0.0
):
if not hasattr(sequences, "__len__"):
raise ValueError("`sequences` must be iterable.")
lengths = []
for x in sequences:
if not hasattr(x, "__len__"):
import numpy as np
def outlier_detect(data, threshold=1, method="IQR"):
assert method in ["IQR", "STD", "MAD"], "Method must be one of IQR|STD|MAD"
if method == "IQR":
IQR = np.quantile(data, 0.75) - np.quantile(data, 0.25)
lower = np.quantile(data, 0.25) - (IQR * threshold)
upper = np.quantile(data, 0.75) + (IQR * threshold)
if method == "STD":
from typing import Dict, List, Tuple, Optional
import numpy as np
import pandas as pd
def ffill(arr: np.ndarray):
arr = arr.T
mask = np.isnan(arr)
idx = np.where(~mask, np.arange(mask.shape[1]), 0)
np.maximum.accumulate(idx, axis=1, out=idx)
import numpy as np
def ffill(arr: np.ndarray):
    """Forward-fill NaNs down the first axis of a 2-D array.

    Each NaN is replaced by the most recent non-NaN value above it in the
    same column. NaNs with no earlier value in their column are left as
    they are.
    """
    transposed = arr.T
    is_missing = np.isnan(transposed)
    positions = np.arange(is_missing.shape[1])
    # Index of the current element where it is valid, 0 where it is missing;
    # the running maximum then points at the latest valid element so far.
    last_valid = np.where(is_missing, 0, positions)
    last_valid = np.maximum.accumulate(last_valid, axis=1)
    row_selector = np.arange(last_valid.shape[0])[:, None]
    return transposed[row_selector, last_valid].T
"""
Creates a new connection to Spark and makes the following available in the
global namespace: `spark`, `sq` (`SQLContext`), `F`, and `Window`.
"""
from textwrap import dedent
import findspark
import os
def _formulate_pyspark_submit_args(submit_args=None):
pass
@asifr
asifr / event_label.py
Last active December 3, 2020 16:48
Label each row of a dataframe with EventXHoursFromNow or EventWithinXHours
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.column import Column
def overlaps(start_first, end_first, start_second, end_second):
    """Tell whether two closed intervals share at least one point.

    The result is built with ``>=`` and ``&``, so it works for any operands
    supporting those operators; intervals that only touch at an endpoint
    count as overlapping.
    """
    first_reaches_second = end_first >= start_second
    second_reaches_first = end_second >= start_first
    return first_reaches_second & second_reaches_first
def eventXHrFromNow(hours: int, time_col: str, start_col: str, end_col: str) -> Column: