This notebook provides more examples of how to identify missingness mechanisms from data.
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.figure_factory as ff
pd.options.plotting.backend = 'plotly'
from scipy.stats import ks_2samp
# Used for plotting examples.
def create_kde_plotly(df, group_col, group1, group2, vals_col, title=''):
    fig = ff.create_distplot(
        hist_data=[df.loc[df[group_col] == group1, vals_col],
                   df.loc[df[group_col] == group2, vals_col]],
        group_labels=[group1, group2],
        show_rug=False, show_hist=False,
        colors=['#ef553b', '#636efb'],
    )
    return fig.update_layout(title=title)
The dataset contains each car's 'vin' number, 'car_make', 'car_year', and 'car_color'.

Is 'car_color' missing at random, dependent on 'car_year'? That is, is the distribution of 'car_year' similar when 'car_color' is missing vs. not missing? Let's use a permutation test!
cars = pd.read_csv(os.path.join('data', 'cars.csv'))
cars.head()
|   | vin | car_make | car_year | car_color |
|---|---|---|---|---|
| 0 | 3D7TT2CT8BG121773 | Audi | 2008.0 | Teal |
| 1 | SCBZB25E62C073475 | Audi | 1996.0 | Mauv |
| 2 | 1FT7W2A69EE682086 | NaN | NaN | Turquoise |
| 3 | 1B3AZ6JZ7AV582128 | Ford | 2010.0 | Goldenrod |
| 4 | 1GYUCGEF4AR632425 | Mazda | 1996.0 | Purple |
# Proportion of car colors missing.
cars['car_color'].isna().mean()
0.1542
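As a quick aside, we can check the fraction of missing values in every column at once, not just 'car_color'. A minimal sketch:

# Proportion of missing values in each column of cars.
cars.isna().mean()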
cars['color_missing'] = cars['car_color'].isna()
cars.head()
|   | vin | car_make | car_year | car_color | color_missing |
|---|---|---|---|---|---|
| 0 | 3D7TT2CT8BG121773 | Audi | 2008.0 | Teal | False |
| 1 | SCBZB25E62C073475 | Audi | 1996.0 | Mauv | False |
| 2 | 1FT7W2A69EE682086 | NaN | NaN | Turquoise | False |
| 3 | 1B3AZ6JZ7AV582128 | Ford | 2010.0 | Goldenrod | False |
| 4 | 1GYUCGEF4AR632425 | Mazda | 1996.0 | Purple | False |
(
    cars
    .pivot_table(index='car_year', columns='color_missing', values=None, aggfunc='size')
    .fillna(0)
    .apply(lambda x: x / x.sum())
    .plot(title='Distribution of Car Years by Missingness of Color')
)
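Since 'car_year' is numeric, we can also quantify the difference between the two distributions above with a two-sample Kolmogorov-Smirnov test. A minimal sketch (the variable names below are ours, and missing years are dropped first):

# Compare the distribution of 'car_year' when 'car_color' is missing vs. not missing.
year_when_missing = cars.loc[cars['color_missing'], 'car_year'].dropna()
year_when_present = cars.loc[~cars['color_missing'], 'car_year'].dropna()
ks_2samp(year_when_missing, year_when_present)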
It looks like the missingness of 'car_color' depends on 'car_year'. Note that to establish that 'car_color' is MCAR, we'd need to do a similar analysis for all other columns.

### Missingness of 'car_color' on 'car_make'

Let's test whether the missingness of 'car_color' is dependent on 'car_make'.
cars.head()
|   | vin | car_make | car_year | car_color | color_missing |
|---|---|---|---|---|---|
| 0 | 3D7TT2CT8BG121773 | Audi | 2008.0 | Teal | False |
| 1 | SCBZB25E62C073475 | Audi | 1996.0 | Mauv | False |
| 2 | 1FT7W2A69EE682086 | NaN | NaN | Turquoise | False |
| 3 | 1B3AZ6JZ7AV582128 | Ford | 2010.0 | Goldenrod | False |
| 4 | 1GYUCGEF4AR632425 | Mazda | 1996.0 | Purple | False |
emp_distributions = (
    cars
    .pivot_table(index='car_make', columns='color_missing', values=None, aggfunc='size')
    .fillna(0)
    .apply(lambda x: x / x.sum())
)
# There are too many makes to plot them all at once! Instead, we'll plot the first 20.
emp_distributions.iloc[:20].plot(kind='barh', title='Distribution of Makes by Missingness of Color',
                                 barmode='group')
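As an aside, .iloc[:20] keeps the first 20 makes in the pivot table's (alphabetical) index order. A sketch of an alternative that plots the 20 most common makes instead, reusing the same emp_distributions:

# Restrict the plot to the 20 most frequent makes overall.
top_makes = cars['car_make'].value_counts().head(20).index
emp_distributions.loc[top_makes].plot(kind='barh',
                                      title='Distribution of the 20 Most Common Makes by Missingness of Color',
                                      barmode='group')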
observed_tvd = emp_distributions.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd
0.10371381974098398
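The statistic above is the total variation distance (TVD) between the two distributions of 'car_make': half the sum of the absolute differences in proportions. An equivalent, more explicit sketch (the helper name is ours; the column labels of emp_distributions are the booleans False and True from 'color_missing'):

# Half the sum of absolute differences between two categorical distributions.
def total_variation_distance(dist1, dist2):
    return np.abs(dist1 - dist2).sum() / 2

total_variation_distance(emp_distributions[False], emp_distributions[True])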
shuffled = cars.copy()[['car_make', 'color_missing']]

n_repetitions = 500
tvds = []
for _ in range(n_repetitions):
    # Shuffle the makes; this simulates data under the null hypothesis
    # that 'car_make' and the missingness of 'car_color' are unrelated.
    shuffled['car_make'] = np.random.permutation(shuffled['car_make'])
    pivoted = (
        shuffled
        .pivot_table(index='car_make', columns='color_missing', values=None, aggfunc='size')
        .fillna(0)
        .apply(lambda x: x / x.sum())
    )
    tvd = pivoted.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvds.append(tvd)
fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=50, histnorm='probability',
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd, line_color='red')
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd, 2)}</span>',
                   x=1.08 * observed_tvd, showarrow=False, y=0.1)
fig.update_layout(yaxis_range=[0, 0.15])
np.mean(np.array(tvds) >= observed_tvd)
0.672
Here, we fail to reject the null hypothesis that the distribution of 'car_make' is the same whether or not 'car_color' is missing.

Next, let's look at a payments dataset and ask whether the missingness of 'credit_card_number' depends on 'credit_card_type'.
payments = pd.read_csv(os.path.join('data', 'payment.csv'))
payments['cc_isnull'] = payments['credit_card_number'].isna()
payments.head()
|   | id | credit_card_type | credit_card_number | date_of_birth | cc_isnull |
|---|---|---|---|---|---|
| 0 | 1 | diners-club-enroute | 2.018706e+14 | 25-Sep-1982 | False |
| 1 | 2 | americanexpress | 3.737511e+14 | 08-Jan-1946 | False |
| 2 | 3 | jcb | 3.570296e+15 | NaN | False |
| 3 | 4 | mastercard | 5.318327e+15 | NaN | False |
| 4 | 5 | maestro | 6.759827e+17 | 20-Apr-1975 | False |
emp_distributions = (
    payments
    .pivot_table(columns='cc_isnull', index='credit_card_type', aggfunc='size')
    .fillna(0)
    .apply(lambda x: x / x.sum())
)
emp_distributions.plot(kind='barh', title='Distribution of Card Types', barmode='group')
observed_tvd = emp_distributions.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd
0.08546365914786964
shuffled = payments.copy()[['credit_card_type', 'cc_isnull']]

n_repetitions = 500
tvds = []
for _ in range(n_repetitions):
    # Shuffle the card types to simulate the null hypothesis that
    # 'credit_card_type' and the missingness of 'credit_card_number' are unrelated.
    shuffled['credit_card_type'] = np.random.permutation(shuffled['credit_card_type'])
    pivoted = (
        shuffled
        .pivot_table(index='credit_card_type', columns='cc_isnull', values=None, aggfunc='size')
        .fillna(0)
        .apply(lambda x: x / x.sum())
    )
    tvd = pivoted.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvds.append(tvd)
fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=50, histnorm='probability',
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd, line_color='red')
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd, 2)}</span>',
                   x=0.06, showarrow=False, y=0.08)
fig.update_layout(xaxis_range=[0, 0.25])
# Same as np.mean(np.array(tvds) >= observed_tvd).
np.count_nonzero(np.array(tvds) >= observed_tvd) / len(tvds)
0.942

Again, we fail to reject the null: the distribution of 'credit_card_type' looks the same whether or not 'credit_card_number' is missing. What about 'age'?
payments['date_of_birth'] = pd.to_datetime(payments['date_of_birth'])
payments['age'] = (2023 - payments.date_of_birth.dt.year)
Note that the 'age' column itself has missing values (wherever 'date_of_birth' is missing), so we drop those rows before plotting.
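For reference, we can check what fraction of ages is missing (a quick check on the 'age' column computed above):

# Fraction of rows in payments with a missing age.
payments['age'].isna().mean()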
create_kde_plotly(payments[['cc_isnull', 'age']].dropna(), 'cc_isnull', True, False, 'age')
ks_2samp(
    payments.groupby('cc_isnull')['age'].get_group(True),
    payments.groupby('cc_isnull')['age'].get_group(False)
)
KstestResult(statistic=0.12699202780883062, pvalue=0.03445181524401586)

Since the p-value is below 0.05, we reject the null hypothesis that the distribution of 'age' is the same whether or not 'credit_card_number' is missing. This suggests that the missingness of 'credit_card_number' depends on 'age', i.e. that 'credit_card_number' is missing at random (MAR).
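As a sanity check, we could also approximate the p-value by permutation rather than relying on scipy's asymptotic formula. A minimal sketch, using the KS statistic as the test statistic and dropping missing ages first (500 repetitions is an arbitrary choice):

# Permutation test with the KS statistic as the test statistic.
ages = payments[['cc_isnull', 'age']].dropna().copy()
observed_ks = ks_2samp(
    ages.loc[ages['cc_isnull'], 'age'],
    ages.loc[~ages['cc_isnull'], 'age']
).statistic

n_repetitions = 500
ks_stats = []
for _ in range(n_repetitions):
    # Shuffle the missingness labels to simulate the null hypothesis.
    ages['cc_isnull'] = np.random.permutation(ages['cc_isnull'])
    stat = ks_2samp(
        ages.loc[ages['cc_isnull'], 'age'],
        ages.loc[~ages['cc_isnull'], 'age']
    ).statistic
    ks_stats.append(stat)

# Approximate p-value: proportion of simulated statistics at least as large as observed.
np.mean(np.array(ks_stats) >= observed_ks)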