In [1]:

```
from dsc80_utils import *
```

In [2]:

```
import lec08_utils as util
```

## 📣 Announcements 📣¶

- Good job on Project 2 checkpoint!
- Lab 4 due Monday.
- The midterm exam is next week, on Thursday, Nov 2.

## 📝 Midterm Exam¶

- Thurs, Nov 2 from 3:30-4:50pm in WLH 2005.
- Pen and paper only. No calculators, phones, or watches allowed.
- You are allowed to bring one double-sided 8.5" x 11" sheet of handwritten notes.
- No reference sheet given, unlike DSC 10!

- We will display clarifications and the time remaining during the exam.
- Covers Lectures 1-8, Labs 1-4, and Projects 1-2.
- To review problems from old exams, go to practice.dsc80.com.
- Also look at the Resources tab on the course website.

## 📆 Agenda¶

- Review of missingness mechanisms
- Deciding between MCAR and MAR using a hypothesis test
- The Kolmogorov-Smirnov test statistic

- Mean Imputation
- Probabilistic Imputation
- Multiple Imputation

## Review: Missingness mechanisms¶

A good strategy is to assess missingness in the following order.

**Missing by design (MD)**

*Can I determine the missing value exactly by looking at the other columns?* 🤔

**Not missing at random (NMAR)**

*Is there a good reason why the missingness depends on the values themselves?* 🤔

**Missing at random (MAR)**

*Do other columns tell me anything about the likelihood that a value is missing?* 🤔

**Missing completely at random (MCAR)**

*The missingness must not depend on other columns or the values themselves.* 😄

## Review: Assessing missingness through data¶

### Example: Heights¶

- Let's load in Galton's dataset containing the heights of adult children and their parents (which you may have seen in DSC 10).
- The dataset does not contain any missing values – we will **artificially introduce missing values** such that the values are MCAR, for illustration.

In [3]:

```
heights = pd.read_csv('data/midparent.csv')
heights = heights.rename(columns={'childHeight': 'child'})
heights = heights[['father', 'mother', 'gender', 'child']]
heights.head()
```

Out[3]:

|   | father | mother | gender | child |
|---|--------|--------|--------|-------|
| 0 | 78.5 | 67.0 | male | 73.2 |
| 1 | 78.5 | 67.0 | female | 69.2 |
| 2 | 78.5 | 67.0 | female | 69.0 |
| 3 | 78.5 | 67.0 | female | 69.0 |
| 4 | 75.5 | 66.5 | male | 73.5 |

### Simulating MCAR data¶

- We will make `'child'` MCAR by taking a random subset of `heights` and setting the corresponding `'child'` heights to `np.nan`.
- This is equivalent to flipping a (biased) coin for each row. If heads, we delete the `'child'` height.
- **You will not do this in practice!**

In [4]:

```
np.random.seed(42) # So that we get the same results each time (for lecture).
heights_mcar = heights.copy()
idx = heights_mcar.sample(frac=0.3).index
heights_mcar.loc[idx, 'child'] = np.nan  # np.NaN was removed in NumPy 2.0.
```
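The coin-flip framing above can also be written directly. A minimal sketch on a hypothetical five-row frame (the data values are made up for illustration); note that unlike `sample(frac=0.3)`, flipping independent coins makes the *number* of missing values itself random:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

# Hypothetical five-row frame standing in for heights.
df = pd.DataFrame({'child': [73.2, 69.2, 69.0, 69.0, 73.5]})

# Flip a biased coin (heads with probability 0.3) for each row,
# and delete the 'child' height wherever it lands heads.
heads = rng.random(len(df)) < 0.3
df.loc[heads, 'child'] = np.nan
print(df)
```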

In [5]:

```
heights_mcar.head(10)
```

Out[5]:

|   | father | mother | gender | child |
|---|--------|--------|--------|-------|
| 0 | 78.5 | 67.0 | male | 73.2 |
| 1 | 78.5 | 67.0 | female | 69.2 |
| 2 | 78.5 | 67.0 | female | NaN |
| ... | ... | ... | ... | ... |
| 7 | 75.5 | 66.5 | female | NaN |
| 8 | 75.0 | 64.0 | male | 71.0 |
| 9 | 75.0 | 64.0 | female | 68.0 |

10 rows × 4 columns

In [6]:

```
heights_mcar.isna().mean()
```

Out[6]:

```
father    0.0
mother    0.0
gender    0.0
child     0.3
dtype: float64
```

Aside: Why is the value for `'child'` in the above Series not exactly 0.3?
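A small sketch of why: `sample(frac=0.3)` selects `round(0.3 * n)` rows, which is rarely exactly 30% of `n`. On a hypothetical 7-row frame:

```python
import numpy as np
import pandas as pd

# Hypothetical 7-row frame: 0.3 * 7 = 2.1, which pandas rounds to 2 rows.
df = pd.DataFrame({'x': np.arange(7.0)})
idx = df.sample(frac=0.3, random_state=0).index
df.loc[idx, 'x'] = np.nan

# The realized missing fraction is 2/7 ≈ 0.286, not 0.3.
print(df['x'].isna().mean())
```

The displayed `0.3` above is just rounding in the printed output.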

### Verifying that child heights are MCAR in `heights_mcar`¶

- Each row of `heights_mcar` belongs to one of two **groups**:
  - Group 1: `'child'` is missing.
  - Group 2: `'child'` is not missing.

In [7]:

```
heights_mcar['child_missing'] = heights_mcar['child'].isna()
heights_mcar.head()
```

Out[7]:

|   | father | mother | gender | child | child_missing |
|---|--------|--------|--------|-------|---------------|
| 0 | 78.5 | 67.0 | male | 73.2 | False |
| 1 | 78.5 | 67.0 | female | 69.2 | False |
| 2 | 78.5 | 67.0 | female | NaN | True |
| 3 | 78.5 | 67.0 | female | 69.0 | False |
| 4 | 75.5 | 66.5 | male | 73.5 | False |

- We need to look at the distributions of every other column – `'gender'`, `'mother'`, and `'father'` – separately for these two groups, and check to see if they are similar.

### Comparing null and non-null `'child'` distributions for `'gender'`¶

In [8]:

```
gender_dist = (
heights_mcar
.assign(child_missing=heights_mcar['child'].isna())
.pivot_table(index='gender', columns='child_missing', aggfunc='size')
)
# Added just to make the resulting pivot table easier to read.
gender_dist.columns = ['child_missing = False', 'child_missing = True']
gender_dist = gender_dist / gender_dist.sum()
gender_dist
```

Out[8]:

| gender | child_missing = False | child_missing = True |
|--------|-----------------------|----------------------|
| female | 0.49 | 0.48 |
| male | 0.51 | 0.52 |

Note that here, each column is a separate distribution that adds to 1.

- The two columns look similar, which is evidence that `'child'`'s missingness does not depend on `'gender'`.
  - Knowing that the child is `'female'` doesn't make it any more or less likely that their height is missing than knowing if the child is `'male'`.

### Comparing null and non-null `'child'` distributions for `'gender'`¶

- In the previous slide, we saw that the distribution of `'gender'` is similar whether or not `'child'` is missing.
- To make precise what we mean by "similar", we can run a **permutation test**. We are comparing two distributions:
  - The distribution of `'gender'` when `'child'` is missing.
  - The distribution of `'gender'` when `'child'` is not missing.
- What test statistic do we use to compare categorical distributions?

In [9]:

```
gender_dist.plot(kind='barh', title='Gender by Missingness of Child Height', barmode='group')
```
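One natural choice for comparing two categorical distributions (as in DSC 10) is the **total variation distance (TVD)**: half the sum of the absolute differences between the two distributions. A minimal permutation-test sketch, using synthetic stand-in data (the real analysis would use `heights_mcar` itself):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

# Synthetic stand-in for heights_mcar: a gender column and a missingness flag,
# drawn independently so the missingness is MCAR by construction.
df = pd.DataFrame({
    'gender': rng.choice(['female', 'male'], size=500),
    'child_missing': rng.choice([True, False], size=500, p=[0.3, 0.7]),
})

def tvd_of_groups(df):
    # Distribution of 'gender' within each missingness group (columns sum to 1).
    dist = df.pivot_table(index='gender', columns='child_missing',
                          aggfunc='size', fill_value=0)
    dist = dist / dist.sum()
    # TVD: half the sum of absolute differences between the two columns.
    return dist.diff(axis=1).iloc[:, -1].abs().sum() / 2

observed = tvd_of_groups(df)

# Permutation test: shuffle the missingness labels, recompute the TVD.
shuffled_tvds = []
for _ in range(500):
    shuffled = df.assign(child_missing=rng.permutation(df['child_missing']))
    shuffled_tvds.append(tvd_of_groups(shuffled))

# Large p-value => no evidence against MCAR (w.r.t. this column).
p_value = (np.array(shuffled_tvds) >= observed).mean()
print(observed, p_value)
```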