import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')
plt.rc('figure', dpi=100, figsize=(10, 5))
plt.rc('font', size=12)
Remember: today's lecture is in scope for the Midterm Exam!
A good strategy is to assess missingness in the following order: missing by design (MD) first, then NMAR, then MAR, and finally MCAR.
In each of the following examples, decide whether the missing data are MD, NMAR, MAR, or MCAR:
- A table has columns 'gender' and 'age'; 'age' has missing values.
- A table has a column 'self-reported education level', which contains missing values.
- A table of grades has columns 'Version 1', 'Version 2', and 'Version 3'; $\frac{2}{3}$ of the entries in the table are NaN.
Suppose we have:
- A dataset $Y$, split into observed values $Y_{obs}$ and missing values $Y_{mis}$.
- A parameter $\psi$ that represents all relevant information that is not part of the dataset.
Data is missing completely at random (MCAR) if
$$\text{P}(\text{data is present} \: | \: Y_{obs}, Y_{mis}, \psi) = \text{P}(\text{data is present} \: | \: \psi)$$
That is, adding information about the dataset doesn't change the likelihood that data is missing!
Suppose we have the same setup. Data is missing at random (MAR) if
$$\text{P}(\text{data is present} \: | \: Y_{obs}, Y_{mis}, \psi) = \text{P}(\text{data is present} \: | \: Y_{obs}, \psi)$$
That is, MAR data is actually MCAR, conditional on $Y_{obs}$.
Suppose we have the same setup. Data is not missing at random (NMAR) if
$$\text{P}(\text{data is present} \: | \: Y_{obs}, Y_{mis}, \psi)$$
cannot be simplified. That is, in NMAR data, missingness is dependent on the missing value itself.
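To make these three mechanisms concrete, here is a minimal simulation sketch. This is our own illustration, not the lecture's code; the column names, sizes, and probabilities are invented.
np.random.seed(42)
ages = pd.Series(np.random.randint(18, 80, size=1000)).astype(float)
group = pd.Series(np.random.choice(['A', 'B'], size=1000))
# MCAR: every value is deleted with the same probability, regardless of anything
ages_mcar = ages.mask(np.random.uniform(size=1000) < 0.2)
# MAR: the deletion probability depends on an *observed* column ('group')
ages_mar = ages.mask((group == 'A') & (np.random.uniform(size=1000) < 0.4))
# NMAR: the deletion probability depends on the (possibly unobserved) value itself
ages_nmar = ages.mask(np.random.uniform(size=1000) < (ages - 18) / 100)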
For example, consider this table of phones, in which some prices are missing:

| Phone | Screen Size (inches) | Price (USD) |
|---|---|---|
| iPhone 13 | 6.06 | 999 |
| Galaxy Z Fold 3 | 7.6 | NaN |
| OnePlus 9 Pro | 6.7 | 799 |
| iPhone 12 Pro Max | 6.68 | NaN |
Suppose you have a DataFrame with columns named 'col_1', 'col_2', ..., 'col_k', and want to test whether values in 'col_X' are MCAR.
The following pseudocode describes an algorithm for testing whether 'col_X''s missingness is independent of all other columns in the DataFrame:
for i = 1, 2, ..., k, where i != X:
look at the distribution of col_i when col_X is missing
look at distribution of col_i when col_X is not missing
check if these two distributions are the same
if so, then col_X's missingness doesn't depend on col_i
if not, col_X is MAR dependent on col_i
if all pairs of distributions were the same,
then col_X is MCAR
We need to make precise what we mean by "the same"!
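Before we do, here's roughly what the recipe above looks like as runnable Python. This is our own sketch: compare_distributions is a hypothetical placeholder for the hypothesis tests we build in the rest of the lecture.
def mcar_diagnostics(df, col_X, compare_distributions):
    # compare_distributions(a, b) should return True if a and b look "the same"
    is_missing = df[col_X].isna()
    results = {}
    for col in df.columns.drop(col_X):
        results[col] = compare_distributions(df.loc[is_missing, col],
                                             df.loc[~is_missing, col])
    # col_X is consistent with MCAR only if every comparison came back True
    return results, all(results.values())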
heights = pd.read_csv('data/midparent.csv')
heights = heights.rename(columns={'childHeight': 'child'})
heights = heights[['father', 'mother', 'gender', 'child']]
heights.head()
|   | father | mother | gender | child |
|---|---|---|---|---|
| 0 | 78.5 | 67.0 | male | 73.2 |
| 1 | 78.5 | 67.0 | female | 69.2 |
| 2 | 78.5 | 67.0 | female | 69.0 |
| 3 | 78.5 | 67.0 | female | 69.0 |
| 4 | 75.5 | 66.5 | male | 73.5 |
Note that there currently aren't any missing values in heights.
heights.isna().mean()
father    0.0
mother    0.0
gender    0.0
child     0.0
dtype: float64
We have three numerical columns – 'father', 'mother', and 'child'. Let's visualize them simultaneously.
pd.plotting.scatter_matrix(heights.drop('gender', axis=1));
Simulating MCAR data

We'll make 'child' MCAR by taking a random subset of heights and setting the corresponding 'child' heights to np.NaN. Under this scheme, every row is equally likely to be missing its 'child' height.
np.random.seed(42) # So that we get the same results each time (for lecture)
heights_mcar = heights.copy()
idx = heights_mcar.sample(frac=0.3).index
heights_mcar.loc[idx, 'child'] = np.NaN
heights_mcar.head(10)
|   | father | mother | gender | child |
|---|---|---|---|---|
| 0 | 78.5 | 67.0 | male | 73.2 |
| 1 | 78.5 | 67.0 | female | 69.2 |
| 2 | 78.5 | 67.0 | female | NaN |
| 3 | 78.5 | 67.0 | female | 69.0 |
| 4 | 75.5 | 66.5 | male | 73.5 |
| 5 | 75.5 | 66.5 | male | NaN |
| 6 | 75.5 | 66.5 | female | 65.5 |
| 7 | 75.5 | 66.5 | female | NaN |
| 8 | 75.0 | 64.0 | male | 71.0 |
| 9 | 75.0 | 64.0 | female | 68.0 |
heights_mcar.isna().mean()
father    0.000000
mother    0.000000
gender    0.000000
child     0.299786
dtype: float64
Aside: Why is the value for 'child' in the above Series not exactly 0.3?
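The reason: .sample(frac=0.3) must select a whole number of rows, so the realized proportion is the closest achievable fraction to 0.3. A quick check (our own addition, not in the original notebook):
n_missing = heights_mcar['child'].isna().sum()
n_missing, n_missing / len(heights_mcar)  # an integer count of rows; its proportion matches 0.299786 above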
Verifying that 'child' is MCAR in heights_mcar

Each row of heights_mcar belongs to one of two groups:
- rows where 'child' is missing, and
- rows where 'child' is not missing.
We need to look at the distribution of every other column – 'gender', 'mother', and 'father' – separately for these two groups, and check whether they are similar.
heights_mcar.head()
|   | father | mother | gender | child |
|---|---|---|---|---|
| 0 | 78.5 | 67.0 | male | 73.2 |
| 1 | 78.5 | 67.0 | female | 69.2 |
| 2 | 78.5 | 67.0 | female | NaN |
| 3 | 78.5 | 67.0 | female | 69.0 |
| 4 | 75.5 | 66.5 | male | 73.5 |
Null and non-null 'child' distributions for 'gender'

gender_dist = (
    heights_mcar
    .assign(child_missing=heights_mcar['child'].isna())
    .pivot_table(index='gender', columns='child_missing', aggfunc='size')
)
gender_dist = gender_dist / gender_dist.sum()
gender_dist
| gender | child_missing=False | child_missing=True |
|---|---|---|
| female | 0.487768 | 0.478571 |
| male | 0.512232 | 0.521429 |
It appears that 'child''s missingness does not depend on 'gender': knowing that the child is 'female' doesn't make it any more or less likely that their height is missing than knowing that the child is 'male'.

Null and non-null 'child' distributions for 'gender', visualized

The distribution of 'gender' looks similar whether or not 'child' is missing. To compare the two distributions side by side, we plot:
- the distribution of 'gender' when 'child' is missing, and
- the distribution of 'gender' when 'child' is not missing.
gender_dist.plot(kind='barh', figsize=(5, 4), title='Gender by Missingness of Child Height');
Question: How do we decide whether these two categorical distributions are "close enough"? Answer: run a permutation test, using the total variation distance (TVD) as the test statistic.
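To see what the TVD measures here, we can compute it by hand from the gender_dist table above (the hard-coded proportions below are copied from that table):
present = np.array([0.487768, 0.512232])  # 'gender' distribution when 'child' is present
missing = np.array([0.478571, 0.521429])  # 'gender' distribution when 'child' is missing
np.abs(present - missing).sum() / 2       # total variation distance, about 0.0092, matching obs_tvd below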
The code to run our simulation largely looks the same as in previous permutation tests.
shuffled = heights_mcar.copy()
shuffled['child_missing'] = shuffled['child'].isna()

n_repetitions = 500
tvds = []
for _ in range(n_repetitions):
    # Shuffling genders and assigning them back to the DataFrame
    shuffled['gender'] = np.random.permutation(shuffled['gender'])
    # Computing and storing the TVD
    pivoted = (
        shuffled
        .pivot_table(index='child_missing', columns='gender', aggfunc='size')
        .apply(lambda x: x / x.sum(), axis=1)
    )
    # The last row of diff() is (row True) minus (row False), one entry per gender
    tvd = pivoted.diff().iloc[-1].abs().sum() / 2
    tvds.append(tvd)

obs_tvd = gender_dist.diff(axis=1).iloc[:, -1].abs().sum() / 2
obs_tvd
0.009196155526430771
pval = np.mean(np.array(tvds) >= obs_tvd)
pd.Series(tvds).plot(kind='hist', density=True, ec='w', bins=10, title=f'p-value: {pval}', label='Simulated TVDs')
plt.axvline(x=obs_tvd, color='red', linewidth=4, label='Observed TVD')
plt.legend();
We fail to reject the null, which stated that the distribution of 'gender' when 'child' is missing is the same as the distribution of 'gender' when 'child' is not missing. Hence, it looks like the missingness of the 'child' column is not dependent on 'gender'.

Null and non-null 'child' distributions for 'father'

Next, we compare:
- the distribution of 'father' when 'child' is missing, and
- the distribution of 'father' when 'child' is not missing.
If these two distributions look similar, the missingness of 'child' is not dependent on the height of the 'father'.
.(
heights_mcar
.assign(child_missing=heights_mcar['child'].isna())
.groupby('child_missing')['father']
.plot(kind='hist', density=True, alpha=0.75, ec='w', bins=20, legend=True,
title="Father's Height by Missingness of Child Height")
);
(
    heights_mcar
    .assign(child_missing=heights_mcar['child'].isna())
    .groupby('child_missing')['father']
    .plot(kind='kde', legend=True, title="Father's Height by Missingness of Child Height")
);
Concluding whether 'child' is MCAR

We'd need to run a similar test for every column in heights_mcar other than 'child'. If, for each such column, the distribution of the column when 'child' is missing is the same as the distribution of the column when 'child' is not missing, then we can conclude 'child' is MCAR.

Simulating MAR data

Now, we will make 'child' heights MAR by deleting 'child' heights according to a random procedure that depends on other columns.
np.random.seed(42) # So that we get the same results each time (for lecture)
def make_missing(r):
    rand = np.random.uniform()  # Random real number between 0 and 1
    if r['father'] > 72 and rand < 0.5:
        return np.NaN
    elif r['gender'] == 'female' and rand < 0.3:
        return np.NaN
    else:
        return r['child']
heights_mar = heights.copy()
heights_mar['child'] = heights_mar.apply(make_missing, axis=1)
heights_mar.head()
|   | father | mother | gender | child |
|---|---|---|---|---|
| 0 | 78.5 | 67.0 | male | NaN |
| 1 | 78.5 | 67.0 | female | 69.2 |
| 2 | 78.5 | 67.0 | female | 69.0 |
| 3 | 78.5 | 67.0 | female | 69.0 |
| 4 | 75.5 | 66.5 | male | NaN |
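Aside: the same MAR deletion procedure can be written without .apply, which is much faster on large DataFrames. This vectorized version is our own sketch; because the random draws happen in a different order, the exact rows deleted won't match the .apply version.
rand = np.random.uniform(size=len(heights))
delete = (((heights['father'] > 72) & (rand < 0.5))
          | ((heights['gender'] == 'female') & (rand < 0.3)))
heights_mar_vec = heights.assign(child=heights['child'].mask(delete))  # mask() sets NaN where delete is True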
Null and non-null 'child' distributions for 'gender', again

This time, the distribution of 'gender' in the two groups is very different.
gender_dist = (
    heights_mar
    .assign(child_missing=heights_mar['child'].isna())
    .pivot_table(index='gender', columns='child_missing', aggfunc='size')
)
gender_dist = gender_dist / gender_dist.sum()
gender_dist
| gender | child_missing=False | child_missing=True |
|---|---|---|
| female | 0.397386 | 0.881657 |
| male | 0.602614 | 0.118343 |
gender_dist.plot(kind='barh', figsize=(5, 4), title='Gender by Missingness of Child Height');
Question: If we take the average of the 'child' heights that remain in the dataset with missing values, will it be less than or greater than the average 'child' height in the full dataset?

Answer: 'child' height tends to be missing much more frequently when the child is 'female', and female children tend to be shorter on average. So the average 'child' height in the dataset with missing values will be greater than the average 'child' height in the full dataset.
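We could sanity-check this claim directly (a one-liner of our own, not from the lecture):
heights_mar['child'].mean(), heights['child'].mean()  # we'd expect the first mean to be larger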
Null and non-null 'child' distributions for 'father', again

(
    heights_mar
    .assign(child_missing=heights_mar['child'].isna())
    .groupby('child_missing')['father']
    .plot(kind='kde', legend=True, title="Father's Height by Missingness of Child Height")
);
Observation: This time, the missingness of 'child' clearly depends on both 'father' and 'gender'.

The difference in means works well in some cases. Let's look at one such case.
Below, we artificially generate two numerical datasets.
np.random.seed(42) # So that we get the same results each time (for lecture)
N = 1000 # number of samples for each distribution
# Distribution 'A'
distr1 = pd.Series(np.random.normal(0, 1, size=N//2))
# Distribution 'B'
distr2 = pd.Series(np.random.normal(3, 1, size=N//2))
data = pd.concat([distr1, distr2], axis=1, keys=['A', 'B']).unstack().reset_index().drop('level_1', axis=1)
data = data.rename(columns={'level_0': 'group', 0: 'data'})
meanA, meanB = data.groupby('group')['data'].mean().round(7).tolist()
title = f'mean of A: {meanA}\n mean of B: {meanB}'
data.groupby('group')['data'].plot(kind='kde', legend=True, title=title);
Let's generate two distributions that look very different but have the same mean.
np.random.seed(42) # So that we get the same results each time (for lecture)
N = 1000 # number of samples for each distribution
# Distribution 'A'
a = pd.Series(np.random.normal(0, 1, size=N//2))
b = pd.Series(np.random.normal(4, 1, size=N//2))
distr1 = pd.concat([a,b], ignore_index=True)
# Distribution 'B'
distr2 = pd.Series(np.random.normal(distr1.mean(), distr1.std(), size=N))
data = pd.concat([distr1, distr2], axis=1, keys=['A', 'B']).unstack().reset_index().drop('level_1', axis=1)
data = data.rename(columns={'level_0': 'group', 0: 'data'})
meanA, meanB = data.groupby('group')['data'].mean().round(7).tolist()
title = f'mean of A: {meanA}\n mean of B: {meanB}'
data.groupby('group')['data'].plot(kind='kde', legend=True, title=title);
In this case, if we use the difference in means as our test statistic in a permutation test, we will fail to reject the null hypothesis that the two distributions are the same, even though they clearly differ.
n_repetitions = 500

shuffled = data.copy()
diff_means = []
for _ in range(n_repetitions):
    # Shuffling the data and assigning it back to the DataFrame
    shuffled['data'] = np.random.permutation(shuffled['data'])
    # Computing and storing the absolute difference in means
    diff_mean = shuffled.groupby('group')['data'].mean().diff().abs().iloc[-1]
    diff_means.append(diff_mean)
diff_means[:10]
[0.06814385394584077, 0.01012235918252502, 0.20021636977312052, 0.24281549039243333, 0.010075853494654119, 0.07500611481190322, 0.022733963103508614, 0.0016829212124789272, 0.015376481518938334, 0.0859578721994847]
obs_diff = data.groupby('group')['data'].mean().diff().abs().iloc[-1]
pval = np.mean(np.array(diff_means) >= obs_diff)
pd.Series(diff_means).plot(kind='hist', density=True, ec='w', bins=20, title=f'p-value: {pval}', label='Simulated Absolute Differences in Means')
plt.axvline(obs_diff, color='red', label='Observed Difference in Means')
plt.legend();
data.groupby('group')['data'].plot(kind='kde', legend=True);
Let's look at the CDFs of our two synthetic distributions.
# Original data
data.groupby('group')['data'].plot(kind='kde', legend=True);
# Try and figure out how this code works!
gpA = data.loc[data['group'] == 'A', 'data']
gpB = data.loc[data['group'] == 'B', 'data']
plt.plot(gpA.value_counts(normalize=True).sort_index().cumsum(), label='CDF of A')
plt.plot(gpB.value_counts(normalize=True).sort_index().cumsum(), label='CDF of B')
plt.legend();
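The K-S (Kolmogorov-Smirnov) statistic is the largest vertical distance between the two empirical CDFs above. For intuition, here's a minimal from-scratch implementation; ks_statistic is our own helper, not part of the lecture.
def ks_statistic(a, b):
    a, b = np.sort(a), np.sort(b)
    grid = np.concatenate([a, b])  # evaluate both CDFs at every observed value
    cdf_a = np.searchsorted(a, grid, side='right') / len(a)
    cdf_b = np.searchsorted(b, grid, side='right') / len(b)
    return np.abs(cdf_a - cdf_b).max()  # the largest vertical gap between the CDFs
ks_statistic(gpA, gpB)  # should agree with scipy's ks_2samp below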
Fortunately, we don't need to calculate the K-S statistic ourselves! Python can do it for us (and you can use this pre-built version in all assignments).
from scipy.stats import ks_2samp
ks_2samp?
obs_ks = ks_2samp(gpA, gpB).statistic
obs_ks
0.14
We don't know if this number is big or small. We need to run a permutation test!
n_repetitions = 500

shuffled = data.copy()
ks_stats = []
for _ in range(n_repetitions):
    # Shuffling the data and assigning it back to the DataFrame
    shuffled['data'] = np.random.permutation(shuffled['data'])
    # Computing and storing the K-S statistic
    groups = shuffled.groupby('group')['data']
    ks_stat = ks_2samp(groups.get_group('A'), groups.get_group('B')).statistic
    ks_stats.append(ks_stat)
ks_stats[:10]
[0.037, 0.048, 0.04, 0.068, 0.045, 0.04, 0.042, 0.052, 0.019, 0.029]
pval = np.mean(np.array(ks_stats) >= obs_ks)
pd.Series(ks_stats).plot(kind='hist', density=True, ec='w', title=f'p-value: {pval}', label='Simulated K-S Statistics')
plt.axvline(obs_ks, color='red', label='Observed K-S Statistic')
plt.legend();
We were able to differentiate between the two distributions using the K-S test statistic!
ks_2samp

scipy.stats.ks_2samp actually returns both the statistic and a p-value.
ks_2samp(gpA, gpB)
KstestResult(statistic=0.14, pvalue=5.822752148022591e-09)
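Since the result is a named-tuple-like object, both fields can be pulled out directly:
res = ks_2samp(gpA, gpB)
res.statistic, res.pvalue  # the same two numbers as above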
Note: We are not going to get to these slides in class. They're just here to provide more examples of missingness mechanisms.
We have a dataset of cars; each row contains a car's 'vin' number, 'car_make', 'car_year', and 'car_color'.
- Is 'car_color' missing at random, dependent on 'car_year'?
- Equivalently: is the distribution of 'car_year' similar when color is missing vs. not missing?
Let's use a permutation test!
cars = pd.read_csv('data/cars.csv')
cars.head()
|   | vin | car_make | car_year | car_color |
|---|---|---|---|---|
| 0 | 3D7TT2CT8BG121773 | Audi | 2008.0 | Teal |
| 1 | SCBZB25E62C073475 | Audi | 1996.0 | Mauv |
| 2 | 1FT7W2A69EE682086 | NaN | NaN | Turquoise |
| 3 | 1B3AZ6JZ7AV582128 | Ford | 2010.0 | Goldenrod |
| 4 | 1GYUCGEF4AR632425 | Mazda | 1996.0 | Purple |
# Proportion of car colors missing
cars['car_color'].isna().mean()
0.1542
cars['color_missing'] = cars['car_color'].isna()
cars.head()
|   | vin | car_make | car_year | car_color | color_missing |
|---|---|---|---|---|---|
| 0 | 3D7TT2CT8BG121773 | Audi | 2008.0 | Teal | False |
| 1 | SCBZB25E62C073475 | Audi | 1996.0 | Mauv | False |
| 2 | 1FT7W2A69EE682086 | NaN | NaN | Turquoise | False |
| 3 | 1B3AZ6JZ7AV582128 | Ford | 2010.0 | Goldenrod | False |
| 4 | 1GYUCGEF4AR632425 | Mazda | 1996.0 | Purple | False |
(
    cars
    .pivot_table(index='car_year', columns='color_missing', aggfunc='size')
    .fillna(0)
    .apply(lambda x: x / x.sum())
    .plot(title='Distribution of Car Years by Missingness of Color')
);
The two distributions look different, which suggests that the missingness of 'car_color' depends on 'car_year'. To conclude that 'car_color' is MCAR, we'd need to do a similar analysis for all other columns.

Dependency of 'car_color' on 'car_make'

Let's test whether the missingness of 'car_color' is dependent on 'car_make'.
cars.head()
vin | car_make | car_year | car_color | color_missing | |
---|---|---|---|---|---|
0 | 3D7TT2CT8BG121773 | Audi | 2008.0 | Teal | False |
1 | SCBZB25E62C073475 | Audi | 1996.0 | Mauv | False |
2 | 1FT7W2A69EE682086 | NaN | NaN | Turquoise | False |
3 | 1B3AZ6JZ7AV582128 | Ford | 2010.0 | Goldenrod | False |
4 | 1GYUCGEF4AR632425 | Mazda | 1996.0 | Purple | False |
emp_distributions = (
    cars
    .pivot_table(index='car_make', columns='color_missing', aggfunc='size')
    .fillna(0)
    .apply(lambda x: x / x.sum())
)

# There are too many makes to plot them all at once!
emp_distributions.iloc[:20].plot(kind='barh', title='Distribution of Makes by Missingness of Color');
observed_tvd = emp_distributions.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd
0.10371381974098398
shuffled = cars[['car_make', 'color_missing']].copy()

n_repetitions = 500
tvds = []
for _ in range(n_repetitions):
    # Shuffling the makes and assigning them back to the DataFrame
    shuffled['car_make'] = np.random.permutation(shuffled['car_make'])
    # Computing and storing the TVD
    pivoted = (
        shuffled
        .pivot_table(index='car_make', columns='color_missing', aggfunc='size')
        .fillna(0)
        .apply(lambda x: x / x.sum())
    )
    tvd = pivoted.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvds.append(tvd)
pval = np.mean(np.array(tvds) >= observed_tvd)
pd.Series(tvds).plot(kind='hist', density=True, ec='w', bins=10, title=f'p-value: {pval}', label='Simulated TVDs')
plt.axvline(x=observed_tvd, color='red', linewidth=4, label='Observed TVD')
plt.legend();
Here, we fail to reject the null that the distribution of 'car_make' is the same whether or not 'car_color' is missing.

One more example: in the payments dataset below, we'll ask whether missing credit card numbers depend on the card type or the customer's age.
payments = pd.read_csv('data/payment.csv')
payments['cc_isnull'] = payments['credit_card_number'].isna()
payments.head()
|   | id | credit_card_type | credit_card_number | date_of_birth | cc_isnull |
|---|---|---|---|---|---|
| 0 | 1 | diners-club-enroute | 2.018706e+14 | 25-Sep-1982 | False |
| 1 | 2 | americanexpress | 3.737511e+14 | 08-Jan-1946 | False |
| 2 | 3 | jcb | 3.570296e+15 | NaN | False |
| 3 | 4 | mastercard | 5.318327e+15 | NaN | False |
| 4 | 5 | maestro | 6.759827e+17 | 20-Apr-1975 | False |
emp_distributions = (
    payments
    .pivot_table(index='credit_card_type', columns='cc_isnull', aggfunc='size')
    .fillna(0)
    .apply(lambda x: x / x.sum())
)
emp_distributions.plot(kind='barh', title='distribution of card types');
observed_tvd = np.sum(np.abs(emp_distributions.diff(axis=1).iloc[:,-1])) / 2
observed_tvd
0.08546365914786964
n_repetitions = 500

payments_type = payments[['credit_card_type', 'cc_isnull']].copy()
tvds = []
for _ in range(n_repetitions):
    # Shuffle the card types and assign them to a new column
    shuffled_types = (
        payments_type['credit_card_type']
        .sample(replace=False, frac=1)
        .reset_index(drop=True)
    )
    shuffled = (
        payments_type
        .assign(**{'Shuffled Types': shuffled_types})
    )
    # Compute the TVD
    shuffled_emp_distributions = (
        shuffled
        .pivot_table(index='Shuffled Types', columns='cc_isnull', aggfunc='size')
        .fillna(0)
        .apply(lambda x: x / x.sum())
    )
    tvd = np.sum(np.abs(shuffled_emp_distributions.diff(axis=1).iloc[:, -1])) / 2
    # Add it to the list of results
    tvds.append(tvd)
# Visualize the simulated TVDs along with the observed TVD
pd.Series(tvds).plot(kind='hist', density=True, alpha=0.8)
plt.scatter(observed_tvd, 0, color='red', s=40);

# p-value: the proportion of simulated TVDs at least as large as the observed TVD
np.count_nonzero(np.array(tvds) >= observed_tvd) / len(tvds)
0.058
payments['date_of_birth'] = pd.to_datetime(payments.date_of_birth)
payments['age'] = (2019 - payments.date_of_birth.dt.year)
# are the distributions similar?
# Where are the differences? Are they noise, or real?
payments.groupby('cc_isnull').age.plot(kind='kde', title='distribution of ages by missingness of CC', legend=True);
ks_2samp(
    payments.groupby('cc_isnull')['age'].get_group(True),
    payments.groupby('cc_isnull')['age'].get_group(False)
)
KstestResult(statistic=0.12699202780883062, pvalue=0.03445181524401586)
We can use permutation tests to assess whether a column's missingness depends on other columns – that is, to distinguish MAR from MCAR.
Next time: Using missingness types to impute data (not on the Midterm Exam).