Skip to content

Instantly share code, notes, and snippets.

@roveo
Created October 18, 2022 18:00
Show Gist options
  • Save roveo/facb1857cb252761e35c0218bb5539bd to your computer and use it in GitHub Desktop.
Save roveo/facb1857cb252761e35c0218bb5539bd to your computer and use it in GitHub Desktop.
Data Quality
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Качество данных\n",
"\n",
"Прежде чем работать с данными, нужно проверить, что данные соответствуют нашим ожиданиям\n",
"и описанию/документации. В реальном мире проверка происходит постоянно по мере появления\n",
"новых записей, с помощью инструментов типа [Great Expectations](https://greatexpectations.io/),\n",
"но для статических данных можно воспользоваться Pandas.\n",
"\n",
"Каждое наше \"ожидание\" — один `pandas.Series` c булевыми значениями, `True` означает\n",
"что записи соответствует ожиданию, `False` — что не соответствует."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7213891, 17)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"\n",
"df = pd.read_csv(\"test_task_BI_analyst.csv\", low_memory=False)\n",
"df[\"pickup_datetime\"] = pd.to_datetime(df[\"pickup_datetime\"])\n",
"df[\"dropoff_datetime\"] = pd.to_datetime(df[\"dropoff_datetime\"])\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"class ExpectationsSuite:\n",
" def __init__(self, data: pd.DataFrame):\n",
" self.expectations = pd.DataFrame()\n",
" self.data = data\n",
"\n",
" def expect(self, expectation_name: str, series: pd.Series):\n",
" self.expectations[expectation_name] = series\n",
" \n",
" def expect_not_empty(self, column: str):\n",
" self.expect(f\"{column} is not empty\", ~self.data[column].isna())\n",
" \n",
" def expect_positive(self, column: str):\n",
" self.expect(f\"{column} is positive\", self.data[column] > 0)\n",
"\n",
" def expect_not_negative(self, column: str):\n",
" self.expect(f\"{column} is not negative\", ~self.data[column].lt(0))\n",
" \n",
" def expect_in(self, column: str, values):\n",
" self.expect(f\"{column} is in {values}\", self.data[column].isin(values))\n",
"\n",
" def report(self):\n",
" total_rows = self.expectations.shape[0]\n",
" valid_rows = self.expectations.all(axis=1).sum()\n",
" valid_percentage = valid_rows / total_rows\n",
" print(f\"{len(self.expectations.columns)} expectations for {df.shape[0]} rows\")\n",
" print(f\"Valid rows: {valid_rows}/{total_rows} ({valid_percentage:.2%})\", end=\"\\n\\n\")\n",
" result = pd.DataFrame(e.expectations.T.sum(axis=1).rename(\"valid_rows\"))\n",
" result[\"invalid_rows\"] = (~e.expectations).sum()\n",
" result[\"invalid_rate\"] = result[\"invalid_rows\"] / df.shape[0]\n",
" return result\n",
" \n",
" def combinations(self) -> pd.DataFrame:\n",
" return (\n",
" self.expectations.groupby(self.expectations.columns.tolist())\n",
" .size()\n",
" .rename(\"rows\")\n",
" .reset_index()\n",
" .sort_values(by=\"rows\", ascending=False)\n",
" )\n",
"\n",
"\n",
"e = ExpectationsSuite(df)\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# expectations\n",
"# unused expectations, used for data exploration, but not for actual validation,\n",
"# are commented out for clarity\n",
"\n",
"e.expect_not_empty(\"VendorID\")\n",
"e.expect_in(\"VendorID\", {1, 2})\n",
"\n",
"e.expect(\"Pickup is before dropoff\", df[\"pickup_datetime\"] < df[\"dropoff_datetime\"])\n",
"e.expect(\"Pickup year is 2077\", df[\"pickup_datetime\"].dt.year == 2077)\n",
"e.expect(\"Pickup month is October\", df[\"pickup_datetime\"].dt.month == 10)\n",
"\n",
"e.expect(\"Dropoff year is 2077\", df[\"dropoff_datetime\"].dt.year == 2077)\n",
"e.expect(\"Dropoff month is October\", df[\"dropoff_datetime\"].dt.month == 10)\n",
"\n",
"# e.expect_positive(\"passenger_count\")\n",
"e.expect_positive(\"trip_distance\")\n",
"\n",
"e.expect_in(\"RatecodeID\", set(range(1, 7)))\n",
"e.expect_in(\"str_and_fwd_flag\", {\"Y\", \"N\"})\n",
"\n",
"e.expect_not_empty(\"FLocationID\")\n",
"e.expect_not_empty(\"TLocationID\")\n",
"\n",
"e.expect_in(\"payment_type\", set(range(1, 7)))\n",
"\n",
"money_columns = [\n",
" \"fare_amount\",\n",
" \"extra\",\n",
" \"xcorp_tax\",\n",
" \"tip_amount\",\n",
" \"tolls_amount\",\n",
" \"improvement_surcharge\",\n",
" \"total_amount\",\n",
"]\n",
"for col in money_columns:\n",
" e.expect_not_empty(col)\n",
" # e.expect_not_negative(col)\n",
"\n",
"# e.expect_in(\"extra\", {0, 0.5, 1, 1.5})\n",
"\n",
"# total_columns = money_columns[:-1]\n",
"# e.expect(\n",
"# \"total_amount is a sum of all money columns\",\n",
"# df[\"total_amount\"] == df[total_columns].sum(axis=1),\n",
"# )\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20 expectations for 7213891 rows\n",
"Valid rows: 7093618/7213891 (98.33%)\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>valid_rows</th>\n",
" <th>invalid_rows</th>\n",
" <th>invalid_rate</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>VendorID is not empty</th>\n",
" <td>7167168</td>\n",
" <td>46723</td>\n",
" <td>0.006477</td>\n",
" </tr>\n",
" <tr>\n",
" <th>VendorID is in {1, 2}</th>\n",
" <td>7167168</td>\n",
" <td>46723</td>\n",
" <td>0.006477</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Pickup is before dropoff</th>\n",
" <td>7208672</td>\n",
" <td>5219</td>\n",
" <td>0.000723</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Pickup year is 2077</th>\n",
" <td>7213814</td>\n",
" <td>77</td>\n",
" <td>0.000011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Pickup month is October</th>\n",
" <td>7213588</td>\n",
" <td>303</td>\n",
" <td>0.000042</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dropoff year is 2077</th>\n",
" <td>7213814</td>\n",
" <td>77</td>\n",
" <td>0.000011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dropoff month is October</th>\n",
" <td>7210453</td>\n",
" <td>3438</td>\n",
" <td>0.000477</td>\n",
" </tr>\n",
" <tr>\n",
" <th>trip_distance is positive</th>\n",
" <td>7143978</td>\n",
" <td>69913</td>\n",
" <td>0.009691</td>\n",
" </tr>\n",
" <tr>\n",
" <th>RatecodeID is in {1, 2, 3, 4, 5, 6}</th>\n",
" <td>7166737</td>\n",
" <td>47154</td>\n",
" <td>0.006537</td>\n",
" </tr>\n",
" <tr>\n",
" <th>str_and_fwd_flag is in {'Y', 'N'}</th>\n",
" <td>7167168</td>\n",
" <td>46723</td>\n",
" <td>0.006477</td>\n",
" </tr>\n",
" <tr>\n",
" <th>FLocationID is not empty</th>\n",
" <td>7213891</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TLocationID is not empty</th>\n",
" <td>7213891</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>payment_type is in {1, 2, 3, 4, 5, 6}</th>\n",
" <td>7167168</td>\n",
" <td>46723</td>\n",
" <td>0.006477</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fare_amount is not empty</th>\n",
" <td>7213891</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>extra is not empty</th>\n",
" <td>7213891</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>xcorp_tax is not empty</th>\n",
" <td>7213891</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tip_amount is not empty</th>\n",
" <td>7213891</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>tolls_amount is not empty</th>\n",
" <td>7213891</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>improvement_surcharge is not empty</th>\n",
" <td>7213891</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>total_amount is not empty</th>\n",
" <td>7213891</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" valid_rows invalid_rows invalid_rate\n",
"VendorID is not empty 7167168 46723 0.006477\n",
"VendorID is in {1, 2} 7167168 46723 0.006477\n",
"Pickup is before dropoff 7208672 5219 0.000723\n",
"Pickup year is 2077 7213814 77 0.000011\n",
"Pickup month is October 7213588 303 0.000042\n",
"Dropoff year is 2077 7213814 77 0.000011\n",
"Dropoff month is October 7210453 3438 0.000477\n",
"trip_distance is positive 7143978 69913 0.009691\n",
"RatecodeID is in {1, 2, 3, 4, 5, 6} 7166737 47154 0.006537\n",
"str_and_fwd_flag is in {'Y', 'N'} 7167168 46723 0.006477\n",
"FLocationID is not empty 7213891 0 0.000000\n",
"TLocationID is not empty 7213891 0 0.000000\n",
"payment_type is in {1, 2, 3, 4, 5, 6} 7167168 46723 0.006477\n",
"fare_amount is not empty 7213891 0 0.000000\n",
"extra is not empty 7213891 0 0.000000\n",
"xcorp_tax is not empty 7213891 0 0.000000\n",
"tip_amount is not empty 7213891 0 0.000000\n",
"tolls_amount is not empty 7213891 0 0.000000\n",
"improvement_surcharge is not empty 7213891 0 0.000000\n",
"total_amount is not empty 7213891 0 0.000000"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"e.report()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Выводы\n",
"\n",
"Большинство значений (кроме тех про которые написано ниже) соответствуют ожиданиям.\n",
"В реальном мире требуется работа с разработчиками/стейкхолдерами, чтобы поправить ошибки\n",
"и понять, как считать метрики по невалидным данным (например, учитываются ли в выручке\n",
"поездки, начавшиеся позже того как закончились). Здесь для простоты мы их просто исключаем.\n",
"\n",
"## `passenger_count`\n",
"\n",
"Количество пассажиров вводится водителями, а не системой, поэтому ошибок довольно много,\n",
"кроме того, мы можем поставить себе цель улучшить процент поездок с известным количеством\n",
"пассажиров, поэтому такие строчки оставляем.\n",
"\n",
"## Деньги\n",
"\n",
"- Непонятно, как считается `total_amount`. Примерно в 33% случаев это сумма всех «денежных»\n",
" колонок, но часто появляются какие-то лишние суммы, чаще всего $2,50. В норме надо идти\n",
" к бизнесу и задавать много вопросов, здесь придётся просто «поверить» в `total_amount`.\n",
"- `extra` — надбавка за спрос/ночь, в документации $0,5 или $1, можно предположить что\n",
" ещё может быть $1,5, когда одновременно ночью и высокий спрос. В реальности там куча\n",
" других вариантов.\n",
"- Во всех денежных колонках встречаются отрицательные значения. В биллингах так часто\n",
" обозначаются возвраты, но это не биллинг, а лог поездок. Кроме того, если это отмены,\n",
" мы бы ожидали `payment_type = 6`, а у «отрицательных» поездок все варианты кроме `6`.\n",
" Без уникальных идентификаторов поездок непонятно, что это значит, но будем считать,\n",
" что это возвраты.\n",
"- Про `tolls_amount` написано, что это «стоимость поездки без учета супернадбавки»,\n",
" ожидание было такое, что это поле будет равно `total_amount - improvement_surcharge`,\n",
" но судя по сумме, это просто отдельная составляющая `total_amount`.\n",
"\n",
"Поскольку в данных есть информация о поставщике данных, сама невалидность данных может\n",
"иметь бизнес-смысл, поэтому просто добавляем в таблицу флаг `is_valid` на основании проверок\n",
"для всех колонок, кроме `passenger_count` и проверки суммы `total_amount`."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"df[\"is_valid\"] = e.expectations.iloc[:, :-2].all(axis=1)\n",
"\n",
"# fix an artefact of Pandas not supporting missing ints by default\n",
"df[[\"VendorID\", \"passenger_count\", \"RatecodeID\", \"payment_type\"]] = df[\n",
" [\"VendorID\", \"passenger_count\", \"RatecodeID\", \"payment_type\"]\n",
"].astype(\"Int32\")\n",
"\n",
"# add known literal values for IDs\n",
"df[\"Vendor\"] = df[\"VendorID\"].map({1: \"SkyFort Mobile Technologies\", 2: \"Micron Inc.\"})\n",
"df[\"Rate\"] = df[\"RatecodeID\"].map(\n",
" {\n",
" 1: \"Standard\",\n",
" 2: \"Rate 2\",\n",
" 3: \"Rate 3\",\n",
" 4: \"Rate 4\",\n",
" 5: \"Rate 5\",\n",
" 6: \"Rate 6\",\n",
" }\n",
")\n",
"df[\"DataSent\"] = df[\"str_and_fwd_flag\"].map(\n",
" {\"Y\": \"Delayed\", \"N\": \"Immediate\"}\n",
")\n",
"df[\"PaymentType\"] = df[\"payment_type\"].map({\n",
" 1: \"Credit card\",\n",
" 2: \"Cash\",\n",
" 3: \"Free\",\n",
" 4: \"Not paid\",\n",
" 5: \"N/A\",\n",
" 5: \"Cancelled\",\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"rides.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.12 64-bit ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "d1c01c1aef5bf133c5cb3f2cf1dcdb7912309035452812152dfa61d47145039e"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment