This notebook provides more examples of how to identify missingness mechanisms from data.
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.figure_factory as ff
pd.options.plotting.backend = 'plotly'
from scipy.stats import ks_2samp
# Used for plotting examples.
def create_kde_plotly(df, group_col, group1, group2, vals_col, title=''):
    fig = ff.create_distplot(
        hist_data=[df.loc[df[group_col] == group1, vals_col],
                   df.loc[df[group_col] == group2, vals_col]],
        group_labels=[group1, group2],
        show_rug=False, show_hist=False,
        colors=['#ef553b', '#636efb'],
    )
    return fig.update_layout(title=title)
The dataset contains each car's 'vin' number, 'car_make', 'car_year', and 'car_color'.

Is 'car_color' missing at random, dependent on 'car_year'? That is, is the distribution of 'car_year' similar when 'car_color' is missing vs. not missing? Let's use a permutation test!
cars = pd.read_csv(os.path.join('data', 'cars.csv'))
cars.head()
|   | vin | car_make | car_year | car_color |
|---|---|---|---|---|
| 0 | 3D7TT2CT8BG121773 | Audi | 2008.0 | Teal |
| 1 | SCBZB25E62C073475 | Audi | 1996.0 | Mauv |
| 2 | 1FT7W2A69EE682086 | NaN | NaN | Turquoise |
| 3 | 1B3AZ6JZ7AV582128 | Ford | 2010.0 | Goldenrod |
| 4 | 1GYUCGEF4AR632425 | Mazda | 1996.0 | Purple |
# Proportion of car colors missing.
cars['car_color'].isna().mean()
0.1542
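As a quick aside, we can check the fraction of missing values in every column at once, not just 'car_color'. A minimal sketch:

# Proportion of missing values in each column of cars.
cars.isna().mean()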
cars['color_missing'] = cars['car_color'].isna()
cars.head()
|   | vin | car_make | car_year | car_color | color_missing |
|---|---|---|---|---|---|
| 0 | 3D7TT2CT8BG121773 | Audi | 2008.0 | Teal | False |
| 1 | SCBZB25E62C073475 | Audi | 1996.0 | Mauv | False |
| 2 | 1FT7W2A69EE682086 | NaN | NaN | Turquoise | False |
| 3 | 1B3AZ6JZ7AV582128 | Ford | 2010.0 | Goldenrod | False |
| 4 | 1GYUCGEF4AR632425 | Mazda | 1996.0 | Purple | False |
(
    cars
    .pivot_table(index='car_year', columns='color_missing', values=None, aggfunc='size')
    .fillna(0)
    .apply(lambda x: x / x.sum())
    .plot(title='Distribution of Car Years by Missingness of Color')
)
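Since 'car_year' is numeric, we can also quantify the difference between the two distributions above with a two-sample Kolmogorov-Smirnov test. A minimal sketch (the variable names below are ours, and missing years are dropped first):

# Compare the distribution of 'car_year' when 'car_color' is missing vs. not missing.
year_when_missing = cars.loc[cars['color_missing'], 'car_year'].dropna()
year_when_present = cars.loc[~cars['color_missing'], 'car_year'].dropna()
ks_2samp(year_when_missing, year_when_present)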
It looks like the missingness of 'car_color' depends on 'car_year'. Note that to establish that 'car_color' is MCAR, we'd need to do a similar analysis for all other columns.

### Missingness of 'car_color' on 'car_make'

Let's test whether the missingness of 'car_color' is dependent on 'car_make'.
cars.head()
|   | vin | car_make | car_year | car_color | color_missing |
|---|---|---|---|---|---|
| 0 | 3D7TT2CT8BG121773 | Audi | 2008.0 | Teal | False |
| 1 | SCBZB25E62C073475 | Audi | 1996.0 | Mauv | False |
| 2 | 1FT7W2A69EE682086 | NaN | NaN | Turquoise | False |
| 3 | 1B3AZ6JZ7AV582128 | Ford | 2010.0 | Goldenrod | False |
| 4 | 1GYUCGEF4AR632425 | Mazda | 1996.0 | Purple | False |
emp_distributions = (
    cars
    .pivot_table(index='car_make', columns='color_missing', values=None, aggfunc='size')
    .fillna(0)
    .apply(lambda x: x / x.sum())
)
# There are too many makes to plot them all at once! Instead, we'll plot the first 20.
emp_distributions.iloc[:20].plot(kind='barh', title='Distribution of Makes by Missingness of Color',
                                 barmode='group')
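As an aside, .iloc[:20] keeps the first 20 makes in the pivot table's (alphabetical) index order. A sketch of an alternative that plots the 20 most common makes instead, reusing the same emp_distributions:

# Restrict the plot to the 20 most frequent makes overall.
top_makes = cars['car_make'].value_counts().head(20).index
emp_distributions.loc[top_makes].plot(kind='barh',
                                      title='Distribution of the 20 Most Common Makes by Missingness of Color',
                                      barmode='group')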
observed_tvd = emp_distributions.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd
0.10371381974098398
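The statistic above is the total variation distance (TVD) between the two distributions of 'car_make': half the sum of the absolute differences in proportions. An equivalent, more explicit sketch (the helper name is ours; the column labels of emp_distributions are the booleans False and True from 'color_missing'):

# Half the sum of absolute differences between two categorical distributions.
def total_variation_distance(dist1, dist2):
    return np.abs(dist1 - dist2).sum() / 2

total_variation_distance(emp_distributions[False], emp_distributions[True])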
shuffled = cars.copy()[['car_make', 'color_missing']]

n_repetitions = 500
tvds = []
for _ in range(n_repetitions):
    # Shuffle the makes; this simulates data under the null hypothesis
    # that 'car_make' and the missingness of 'car_color' are unrelated.
    shuffled['car_make'] = np.random.permutation(shuffled['car_make'])
    pivoted = (
        shuffled
        .pivot_table(index='car_make', columns='color_missing', values=None, aggfunc='size')
        .fillna(0)
        .apply(lambda x: x / x.sum())
    )
    tvd = pivoted.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvds.append(tvd)
fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=50, histnorm='probability',
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd, line_color='red')
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd, 2)}</span>',
                   x=1.08 * observed_tvd, showarrow=False, y=0.1)
fig.update_layout(yaxis_range=[0, 0.15])
np.mean(np.array(tvds) >= observed_tvd)
0.672
Here, we fail to reject the null hypothesis that the distribution of 'car_make' is the same whether or not 'car_color' is missing.

Next, let's look at a payments dataset and ask whether the missingness of 'credit_card_number' depends on 'credit_card_type'.
payments = pd.read_csv(os.path.join('data', 'payment.csv'))
payments['cc_isnull'] = payments['credit_card_number'].isna()
payments.head()
|   | id | credit_card_type | credit_card_number | date_of_birth | cc_isnull |
|---|---|---|---|---|---|
| 0 | 1 | diners-club-enroute | 2.018706e+14 | 25-Sep-1982 | False |
| 1 | 2 | americanexpress | 3.737511e+14 | 08-Jan-1946 | False |
| 2 | 3 | jcb | 3.570296e+15 | NaN | False |
| 3 | 4 | mastercard | 5.318327e+15 | NaN | False |
| 4 | 5 | maestro | 6.759827e+17 | 20-Apr-1975 | False |
emp_distributions = (
    payments
    .pivot_table(columns='cc_isnull', index='credit_card_type', aggfunc='size')
    .fillna(0)
    .apply(lambda x: x / x.sum())
)
emp_distributions.plot(kind='barh', title='Distribution of Card Types', barmode='group')
observed_tvd = emp_distributions.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd
0.08546365914786964
shuffled = payments.copy()[['credit_card_type', 'cc_isnull']]

n_repetitions = 500
tvds = []
for _ in range(n_repetitions):
    # Shuffle the card types to simulate the null hypothesis that
    # 'credit_card_type' and the missingness of 'credit_card_number' are unrelated.
    shuffled['credit_card_type'] = np.random.permutation(shuffled['credit_card_type'])
    pivoted = (
        shuffled
        .pivot_table(index='credit_card_type', columns='cc_isnull', values=None, aggfunc='size')
        .fillna(0)
        .apply(lambda x: x / x.sum())
    )
    tvd = pivoted.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvds.append(tvd)
fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=50, histnorm='probability',
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd, line_color='red')
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd, 2)}</span>',
                   x=0.06, showarrow=False, y=0.08)
fig.update_layout(xaxis_range=[0, 0.25])
# Same as np.mean(np.array(tvds) >= observed_tvd).
np.count_nonzero(np.array(tvds) >= observed_tvd) / len(tvds)
0.942

Again, we fail to reject the null: the distribution of 'credit_card_type' looks the same whether or not 'credit_card_number' is missing. What about 'age'?
payments['date_of_birth'] = pd.to_datetime(payments['date_of_birth'])
payments['age'] = (2023 - payments.date_of_birth.dt.year)
Note that the 'age' column itself has missing values (wherever 'date_of_birth' is missing), so we drop those rows before plotting.
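For reference, we can check what fraction of ages is missing (a quick check on the 'age' column computed above):

# Fraction of rows in payments with a missing age.
payments['age'].isna().mean()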
create_kde_plotly(payments[['cc_isnull', 'age']].dropna(), 'cc_isnull', True, False, 'age')
ks_2samp(
    payments.groupby('cc_isnull')['age'].get_group(True),
    payments.groupby('cc_isnull')['age'].get_group(False)
)
KstestResult(statistic=0.12699202780883062, pvalue=0.03445181524401586)

Since the p-value is below 0.05, we reject the null hypothesis that the distribution of 'age' is the same whether or not 'credit_card_number' is missing. This suggests that the missingness of 'credit_card_number' depends on 'age', i.e. that 'credit_card_number' is missing at random (MAR).
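As a sanity check, we could also approximate the p-value by permutation rather than relying on scipy's asymptotic formula. A minimal sketch, using the KS statistic as the test statistic and dropping missing ages first (500 repetitions is an arbitrary choice):

# Permutation test with the KS statistic as the test statistic.
ages = payments[['cc_isnull', 'age']].dropna().copy()
observed_ks = ks_2samp(
    ages.loc[ages['cc_isnull'], 'age'],
    ages.loc[~ages['cc_isnull'], 'age']
).statistic

n_repetitions = 500
ks_stats = []
for _ in range(n_repetitions):
    # Shuffle the missingness labels to simulate the null hypothesis.
    ages['cc_isnull'] = np.random.permutation(ages['cc_isnull'])
    stat = ks_2samp(
        ages.loc[ages['cc_isnull'], 'age'],
        ages.loc[~ages['cc_isnull'], 'age']
    ).statistic
    ks_stats.append(stat)

# Approximate p-value: proportion of simulated statistics at least as large as observed.
np.mean(np.array(ks_stats) >= observed_ks)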