import pandas as pd
import numpy as np
import os
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'
# Used for plotting examples.
def create_kde_plotly(df, group_col, group1, group2, vals_col, title=''):
    fig = ff.create_distplot(
        hist_data=[df.loc[df[group_col] == group1, vals_col], df.loc[df[group_col] == group2, vals_col]],
        group_labels=[group1, group2],
        show_rug=False, show_hist=False,
        colors=['#ef553b', '#636efb'],
    )
    return fig.update_layout(title=title)
Remember to run conda activate dsc80 before working on assignments!

A good strategy is to assess missingness in the following order.
In each of the following examples, decide whether the missing data are likely to be MD, NMAR, MAR, or MCAR:

- A dataset with columns 'gender' and 'age', in which 'age' has missing values.
- A column 'self-reported education level', which contains missing values.
- A table with columns 'Version 1', 'Version 2', and 'Version 3', in which $\frac{2}{3}$ of the entries in the table are NaN.
The following formal definitions won't come up often in lecture, but you may find them helpful.

Suppose we have:

- a dataset with observed values $Y_{obs}$ and missing values $Y_{mis}$, and
- a set of parameters $\psi$ that describe the missingness mechanism.
Data is missing completely at random (MCAR) if

$$\text{P}(\text{data is present} \: | \: Y_{obs}, Y_{mis}, \psi) = \text{P}(\text{data is present} \: | \: \psi)$$

That is, adding information about the dataset doesn't change the likelihood that data is missing!
With the same setup as above, data is missing at random (MAR) if

$$\text{P}(\text{data is present} \: | \: Y_{obs}, Y_{mis}, \psi) = \text{P}(\text{data is present} \: | \: Y_{obs}, \psi)$$

That is, MAR data is actually MCAR, conditional on $Y_{obs}$.
With the same setup as above, data is not missing at random (NMAR) if

$$\text{P}(\text{data is present} \: | \: Y_{obs}, Y_{mis}, \psi)$$

cannot be simplified further. That is, in NMAR data, missingness is dependent on the missing value itself.
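To make this concrete, here is a minimal sketch of an NMAR mechanism (not from the lecture, and using the imports at the top of the notebook); the column name, distribution, and deletion probability are all made up for illustration. Larger values are more likely to go missing, and nothing else in the data explains which values are gone.

# Hypothetical NMAR illustration: the chance a value is missing depends on the value itself.
np.random.seed(1)
incomes = pd.Series(np.random.exponential(50000, size=1000))

# Hide each income above the 75th percentile with probability 0.8.
is_high = incomes > incomes.quantile(0.75)
hide = is_high & (np.random.uniform(size=1000) < 0.8)
incomes_nmar = incomes.mask(hide)

# The observed mean understates the true mean, and no other column can explain the missingness.
incomes.mean(), incomes_nmar.mean()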
Phone | Screen Size (inches) | Price ($) |
---|---|---|
iPhone 14 | 6.06 | 999 |
Galaxy Z Fold 4 | 7.6 | NaN |
OnePlus 9 Pro | 6.7 | 799 |
iPhone 13 Pro Max | 6.68 | NaN |
Suppose you have a DataFrame with columns named $\text{col}_1$, $\text{col}_2$, ..., $\text{col}_k$, and want to test whether values in $\text{col}_X$ are MCAR. To test whether $\text{col}_X$'s missingness is independent of all other columns in the DataFrame:
For each $i = 1, 2, ..., k$, where $i \neq X$, use a permutation test to compare the distribution of $\text{col}_i$ when $\text{col}_X$ is missing to the distribution of $\text{col}_i$ when $\text{col}_X$ is not missing.

If all of these pairs of distributions are the same, then $\text{col}_X$ is MCAR.
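Here is a rough sketch of that procedure, assuming a hypothetical DataFrame df whose column col_x has missing values. For simplicity it uses the absolute difference in group means as the test statistic, so it only handles numeric columns; for categorical columns you'd swap in the TVD (used later in this lecture).

# A sketch of testing whether col_x is MCAR with respect to the other (numeric) columns.
# `df` and `col_x` are hypothetical names; this function is not used later in the lecture.
def mcar_p_values(df, col_x, n_repetitions=500):
    missing = df[col_x].isna().to_numpy()
    p_values = {}
    for col in df.columns.drop(col_x):
        observed = abs(df.loc[missing, col].mean() - df.loc[~missing, col].mean())
        stats = []
        for _ in range(n_repetitions):
            # Shuffle the missingness labels and recompute the statistic.
            shuffled = np.random.permutation(missing)
            stats.append(abs(df.loc[shuffled, col].mean() - df.loc[~shuffled, col].mean()))
        p_values[col] = np.mean(np.array(stats) >= observed)
    # Large p-values for every column are consistent with col_x being MCAR.
    return pd.Series(p_values)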
heights = pd.read_csv('data/midparent.csv')
heights = heights.rename(columns={'childHeight': 'child'})
heights = heights[['father', 'mother', 'gender', 'child']]
heights.head()
| | father | mother | gender | child |
---|---|---|---|---|
0 | 78.5 | 67.0 | male | 73.2 |
1 | 78.5 | 67.0 | female | 69.2 |
2 | 78.5 | 67.0 | female | 69.0 |
3 | 78.5 | 67.0 | female | 69.0 |
4 | 75.5 | 66.5 | male | 73.5 |
Proof that there aren't currently any missing values in heights:
heights.isna().sum()
father 0 mother 0 gender 0 child 0 dtype: int64
We have three numerical columns – 'father', 'mother', and 'child'. Let's visualize them simultaneously.
fig = px.scatter_matrix(heights.drop(columns=['gender']))
fig
First, we'll make 'child' heights MCAR by taking a random subset of heights and setting the corresponding 'child' heights to np.NaN. Under this procedure, every row is equally likely to be missing its 'child' height.

np.random.seed(42) # So that we get the same results each time (for lecture).
heights_mcar = heights.copy()
idx = heights_mcar.sample(frac=0.3).index
heights_mcar.loc[idx, 'child'] = np.NaN
heights_mcar.head(10)
| | father | mother | gender | child |
---|---|---|---|---|
0 | 78.5 | 67.0 | male | 73.2 |
1 | 78.5 | 67.0 | female | 69.2 |
2 | 78.5 | 67.0 | female | NaN |
3 | 78.5 | 67.0 | female | 69.0 |
4 | 75.5 | 66.5 | male | 73.5 |
5 | 75.5 | 66.5 | male | NaN |
6 | 75.5 | 66.5 | female | 65.5 |
7 | 75.5 | 66.5 | female | NaN |
8 | 75.0 | 64.0 | male | 71.0 |
9 | 75.0 | 64.0 | female | 68.0 |
heights_mcar.isna().mean()
father 0.000000 mother 0.000000 gender 0.000000 child 0.299786 dtype: float64
Aside: Why is the value for 'child' in the above Series not exactly 0.3?
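A hint, sketched in code: sample(frac=0.3) keeps a whole number of rows — pandas rounds frac times the number of rows to the nearest integer — so the achieved proportion is that integer divided by the number of rows, which is rarely exactly 0.3.

# sample(frac=0.3) keeps round(0.3 * len(heights)) rows, a whole number,
# so the proportion of missing 'child' values is that integer divided by len(heights).
n = len(heights)
n_missing = round(0.3 * n)
n_missing / n   # Should match the proportion printed above.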
Verifying that 'child' is MCAR in heights_mcar

Every row of heights_mcar belongs to one of two groups:

- Group 1: 'child' is missing.
- Group 2: 'child' is not missing.

heights_mcar['child_missing'] = heights_mcar['child'].isna()
heights_mcar.head()
| | father | mother | gender | child | child_missing |
---|---|---|---|---|---|
0 | 78.5 | 67.0 | male | 73.2 | False |
1 | 78.5 | 67.0 | female | 69.2 | False |
2 | 78.5 | 67.0 | female | NaN | True |
3 | 78.5 | 67.0 | female | 69.0 | False |
4 | 75.5 | 66.5 | male | 73.5 | False |
To decide whether 'child' is MCAR, we need to look at the distributions of the other columns – 'gender', 'mother', and 'father' – separately for these two groups, and check to see if they are similar.

Comparing null and non-null 'child' distributions for 'gender'

gender_dist = (
    heights_mcar
    .assign(child_missing=heights_mcar['child'].isna())
    .pivot_table(index='gender', columns='child_missing', aggfunc='size')
)
# Added just to make the resulting pivot table easier to read.
gender_dist.columns = ['child_missing = False', 'child_missing = True']
gender_dist = gender_dist / gender_dist.sum()
gender_dist
gender | child_missing = False | child_missing = True |
---|---|---|
female | 0.487768 | 0.478571 |
male | 0.512232 | 0.521429 |
The two columns above are similar, which is evidence that the missingness of 'child' does not depend on 'gender'. Knowing that the child is 'female' doesn't make it any more or less likely that their height is missing than knowing that the child is 'male'.

Visualizing null and non-null 'child' distributions for 'gender'

Below, we plot the distribution of 'gender' when 'child' is missing next to the distribution of 'gender' when 'child' is not missing. If 'child' is MCAR, the distribution of 'gender' should look similar whether or not 'child' is missing.

gender_dist.plot(kind='barh', title='Gender by Missingness of Child Height', barmode='group')
To measure the "distance" between two categorical distributions, we use the total variation distance.
Note that with only two categories, the TVD is the same as the absolute difference in proportions for either category.
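As a quick worked example with made-up proportions, the TVD between two categorical distributions is half the sum of the absolute differences in their proportions:

# Two hypothetical distributions of 'gender' (proportions sum to 1 in each).
dist1 = pd.Series([0.49, 0.51], index=['female', 'male'])
dist2 = pd.Series([0.48, 0.52], index=['female', 'male'])

# TVD: half the sum of absolute differences in proportions.
(dist1 - dist2).abs().sum() / 2   # 0.01 -- with only two categories, this equals |0.49 - 0.48|.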
The code to run our simulation largely looks the same as in previous permutation tests.
n_repetitions = 500
shuffled = heights_mcar.copy()
tvds = []
for _ in range(n_repetitions):
    # Shuffling genders.
    # Note that we are assigning back to the same DataFrame for performance reasons;
    # see https://dsc80.com/resources/lectures/lec11/lec11-fast-permutation-tests.html.
    shuffled['gender'] = np.random.permutation(shuffled['gender'])

    # Computing and storing the TVD.
    pivoted = (
        shuffled
        .pivot_table(index='gender', columns='child_missing', aggfunc='size')
        .apply(lambda x: x / x.sum())
    )

    tvd = pivoted.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvds.append(tvd)
observed_tvd = gender_dist.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd
0.009196155526430771
fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=50, histnorm='probability',
title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd, line_color='red')
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd, 2)}</span>',
x=2.3 * observed_tvd, showarrow=False, y=0.16)
fig.update_layout(yaxis_range=[0, 0.2])
np.mean(np.array(tvds) >= observed_tvd)
0.838
The p-value is large, so we fail to reject the null hypothesis: the distribution of 'gender' when 'child' is missing is plausibly the same as the distribution of 'gender' when 'child' is not missing. In other words, the missingness of the 'child' column does not appear to depend on 'gender'.

Comparing null and non-null 'child' distributions for 'father'

Next, we compare the distribution of 'father' when 'child' is missing to the distribution of 'father' when 'child' is not missing. If the two distributions are similar, that's evidence that the missingness of 'child' does not depend on the height of the 'father'.

px.histogram(heights_mcar, x='father', color='child_missing', histnorm='probability', marginal='box',
             title="Father's Height by Missingness of Child Height", barmode='overlay', opacity=0.7)
We can visualize numerical distributions with histograms, or with kernel density estimates. (See the definition of create_kde_plotly at the top of the notebook if you're curious as to how these are created.)
create_kde_plotly(heights_mcar, 'child_missing', True, False, 'father',
"Father's Height by Missingness of Child Height")
Concluding whether 'child' is MCAR

To fully conclude that 'child' is MCAR, we'd need to run a similar permutation test for every column in heights_mcar other than 'child'. If, for each of those columns, the distribution of the column when 'child' is missing is the same as the distribution of the column when 'child' is not missing, then we can conclude 'child' is MCAR.

Now, we will make 'child' heights MAR by deleting 'child' heights according to a random procedure that depends on other columns.
np.random.seed(42) # So that we get the same results each time (for lecture).
def make_missing(r):
    rand = np.random.uniform() # Random real number between 0 and 1.
    if r['father'] > 72 and rand < 0.5:
        return np.NaN
    elif r['gender'] == 'female' and rand < 0.3:
        return np.NaN
    else:
        return r['child']
heights_mar = heights.copy()
heights_mar['child'] = heights_mar.apply(make_missing, axis=1)
heights_mar['child_missing'] = heights_mar['child'].isna()
heights_mar.head()
| | father | mother | gender | child | child_missing |
---|---|---|---|---|---|
0 | 78.5 | 67.0 | male | NaN | True |
1 | 78.5 | 67.0 | female | 69.2 | False |
2 | 78.5 | 67.0 | female | 69.0 | False |
3 | 78.5 | 67.0 | female | 69.0 | False |
4 | 75.5 | 66.5 | male | NaN | True |
Comparing null and non-null 'child' distributions for 'gender', again

This time, the distribution of 'gender' in the two groups is very different.
gender_dist = (
    heights_mar
    .assign(child_missing=heights_mar['child'].isna())
    .pivot_table(index='gender', columns='child_missing', aggfunc='size')
)
# Added just to make the resulting pivot table easier to read.
gender_dist.columns = ['child_missing = False', 'child_missing = True']
gender_dist = gender_dist / gender_dist.sum()
gender_dist
gender | child_missing = False | child_missing = True |
---|---|---|
female | 0.397386 | 0.881657 |
male | 0.602614 | 0.118343 |
gender_dist.plot(kind='barh', title='Gender by Missingness of Child Height', barmode='group')
Question: If we compute the average of the 'child' heights that remain in the dataset with missing values, will it be less than or greater than the average 'child' height in the full dataset?

Since 'child' heights tend to be missing much more frequently when the child is 'female', and female children tend to be shorter on average, the average of the observed 'child' heights in the dataset with missing values will be greater than the average 'child' height in the full dataset.

# When child heights are MAR:
heights_mar['child'].mean()
67.10339869281046
# When child heights are not missing:
heights['child'].mean()
66.74593147751605
# When child heights are MCAR:
heights_mcar['child'].mean()
66.7256880733945
Comparing null and non-null 'child' distributions for 'father', again

create_kde_plotly(heights_mar, 'child_missing', True, False, 'father',
                  "Father's Height by Missingness of Child Height")

The two distributions of 'father' are also different, since our procedure for deleting 'child' heights depended on both 'father' and 'gender'.

heights_mar.groupby('child_missing')['father'].mean().diff().iloc[-1]
1.0055466604787853
The difference in means works well in some cases. Let's look at one such case.
Below, we artificially generate two numerical datasets.
np.random.seed(42) # So that we get the same results each time (for lecture).
N = 1000 # Number of samples for each distribution.
# Distribution 'A'.
distr1 = pd.Series(np.random.normal(0, 1, size=N//2))
# Distribution 'B'.
distr2 = pd.Series(np.random.normal(3, 1, size=N//2))
data = pd.concat([distr1, distr2], axis=1, keys=['A', 'B']).unstack().reset_index().drop('level_1', axis=1)
data = data.rename(columns={'level_0': 'group', 0: 'data'})
meanA, meanB = data.groupby('group')['data'].mean().round(7).tolist()
create_kde_plotly(data, 'group', 'A', 'B', 'data', f'mean of A: {meanA}<br>mean of B: {meanB}')
Let's generate two distributions that look very different but have the same mean.
np.random.seed(42) # So that we get the same results each time (for lecture).
N = 1000 # Number of samples for each distribution.
# Distribution 'A'.
a = pd.Series(np.random.normal(0, 1, size=N//2))
b = pd.Series(np.random.normal(4, 1, size=N//2))
distr1 = pd.concat([a,b], ignore_index=True)
# Distribution 'B'.
distr2 = pd.Series(np.random.normal(distr1.mean(), distr1.std(), size=N))
data = pd.concat([distr1, distr2], axis=1, keys=['A', 'B']).unstack().reset_index().drop('level_1', axis=1)
data = data.rename(columns={'level_0': 'group', 0: 'data'})
meanA, meanB = data.groupby('group')['data'].mean().round(7).tolist()
create_kde_plotly(data, 'group', 'A', 'B', 'data', f'mean of A: {meanA}<br>mean of B: {meanB}')
In this case, if we use the difference in means as our test statistic in a permutation test, we will fail to reject the null hypothesis that the two distributions come from the same population – even though they look very different.
n_repetitions = 500
shuffled = data.copy()
diff_means = []
for _ in range(n_repetitions):
    # Shuffling the values, while keeping the group labels in place.
    shuffled['data'] = np.random.permutation(shuffled['data'])

    # Computing and storing the absolute difference in means.
    diff_mean = shuffled.groupby('group')['data'].mean().diff().abs().iloc[-1]
    diff_means.append(diff_mean)
diff_means[:10]
[0.06814385394584033, 0.010122359182524576, 0.2002163697731203, 0.2428154903924331, 0.010075853494653675, 0.07500611481190367, 0.02273396310350906, 0.0016829212124784831, 0.015376481518938778, 0.08595787219948381]
observed_diff = data.groupby('group')['data'].mean().diff().abs().iloc[-1]
fig = px.histogram(pd.DataFrame(diff_means), x=0, nbins=50, histnorm='probability',
title='Empirical Distribution of the Absolute Difference in Means')
fig.add_vline(x=observed_diff, line_color='red')
fig.add_annotation(text=f'<span style="color:red">Observed Absolute Difference in Means = {round(observed_diff, 2)}</span>',
x=1.45 * observed_diff, showarrow=False, y=0.07)
# The computed p-value is fairly large.
np.mean(np.array(diff_means) >= observed_diff)
0.108
create_kde_plotly(data, 'group', 'A', 'B', 'data', f'mean of A: {meanA}<br>mean of B: {meanB}')
Let's look at the CDFs of our two synthetic distributions.
# Think about what this function is doing!
def create_cdf(group):
    return data.loc[data['group'] == group, 'data'].value_counts(normalize=True).sort_index().cumsum()
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=create_cdf('A').index, y=create_cdf('A'), name='CDF of A')
)
fig.add_trace(
    go.Scatter(x=create_cdf('B').index, y=create_cdf('B'), name='CDF of B')
)
fig.update_layout(title='CDFs of A and B')
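For intuition, the K-S statistic is the largest vertical gap between the two CDFs above. Here's a rough sketch of computing it directly from the data by evaluating both empirical CDFs at every observed value; it's not scipy's implementation, just a way to see what the number means.

# The K-S statistic: the maximum vertical distance between the two empirical CDFs.
a = data.loc[data['group'] == 'A', 'data']
b = data.loc[data['group'] == 'B', 'data']
points = np.sort(data['data'].unique())

cdf_a = np.array([(a <= p).mean() for p in points])
cdf_b = np.array([(b <= p).mean() for p in points])
np.abs(cdf_a - cdf_b).max()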
Fortunately, we don't need to calculate the K-S statistic ourselves! Python can do it for us (and you can use this pre-built version in all assignments).
from scipy.stats import ks_2samp
ks_2samp?
observed_ks = ks_2samp(data.loc[data['group'] == 'A', 'data'], data.loc[data['group'] == 'B', 'data']).statistic
observed_ks
0.14
We don't know if this number is big or small. We need to run a permutation test!
n_repetitions = 500
shuffled = data.copy()
ks_stats = []
for _ in range(n_repetitions):
    # Shuffling the data.
    shuffled['data'] = np.random.permutation(shuffled['data'])

    # Computing and storing the K-S statistic.
    groups = shuffled.groupby('group')['data']
    ks_stat = ks_2samp(groups.get_group('A'), groups.get_group('B')).statistic
    ks_stats.append(ks_stat)
ks_stats[:10]
[0.037, 0.048, 0.04, 0.068, 0.045, 0.04, 0.042, 0.052, 0.019, 0.029]
fig = px.histogram(pd.DataFrame(ks_stats), x=0, nbins=50, histnorm='probability',
title='Empirical Distribution of the K-S Statistic')
fig.add_vline(x=observed_ks, line_color='red')
fig.add_annotation(text=f'<span style="color:red">Observed KS = {round(observed_ks, 2)}</span>',
x=0.85 * observed_ks, showarrow=False, y=0.16)
fig.update_layout(xaxis_range=[0, 0.2])
fig.update_layout(yaxis_range=[0, 0.2])
np.mean(np.array(ks_stats) >= observed_ks)
0.0
We were able to differentiate between the two distributions using the K-S test statistic!
The p-value returned by ks_2samp

scipy.stats.ks_2samp actually returns both the statistic and a p-value.

ks_2samp(data.loc[data['group'] == 'A', 'data'], data.loc[data['group'] == 'B', 'data'])
KstestResult(statistic=0.14, pvalue=5.822752148022591e-09)