from dsc80_utils import *

# Read the heights CSV, rename 'childHeight' to 'child', and keep only the
# four columns selected below.
heights_path = Path('data', 'midparent.csv')
heights = (
    pd.read_csv(heights_path)
    .rename(columns={'childHeight': 'child'})
    .loc[:, ['father', 'mother', 'gender', 'child']]
)
heights.head()

np.random.seed(42) # So that we get the same results each time (for lecture).

# Simulate MCAR missingness: pick 30% of the rows uniformly at random (without
# looking at any column's values) and blank out their 'child' entry.
heights_mcar = heights.copy()
idx = heights_mcar.sample(frac=0.3).index
# Use np.nan: the np.NaN alias was removed in NumPy 2.0.
heights_mcar.loc[idx, 'child'] = np.nan

heights_mcar.head(10)

heights_mcar.isna().mean()

father    0.0
mother    0.0
gender    0.0
child     0.3
dtype: float64

heights_mcar['child_missing'] = heights_mcar['child'].isna()
heights_mcar.head()

# Count the rows in each (gender, child_missing) cell.
gender_dist = pd.pivot_table(
    heights_mcar.assign(child_missing=heights_mcar['child'].isna()),
    index='gender',
    columns='child_missing',
    aggfunc='size',
)

# Relabel the column headers so the resulting table is easier to read.
gender_dist.columns = ['child_missing = False', 'child_missing = True']

# Divide each column by its total, so every column becomes a distribution
# over gender that sums to 1.
gender_dist = gender_dist.div(gender_dist.sum(), axis='columns')
gender_dist

gender_dist.plot(kind='barh', title='Gender by Missingness of Child Height (MCAR Example)', barmode='group')

create_kde_plotly(heights_mcar, 'child_missing', True, False, 'father', 
                  "Father's Height by Missingness of Child Height (MCAR Example)")

create_kde_plotly(heights_mcar, 'child_missing', True, False, 'mother', 
                  "Mother's Height by Missingness of Child Height (MCAR Example)")

np.random.seed(42) # So that we get the same results each time (for lecture).

def make_missing(r):
    """Return the row's 'child' value, or NaN with a row-dependent probability.

    A single uniform draw on [0, 1) is made per call and shared by both checks:

    - if ``r['father'] > 72``, the value is dropped when the draw is below 0.5;
    - otherwise (or if that check did not fire), a row whose ``r['gender']``
      is ``'female'`` is dropped when the draw is below 0.3;
    - in every other case ``r['child']`` is returned unchanged.

    Parameters
    ----------
    r : mapping-like row
        Must support ``r['father']``, ``r['gender']`` and ``r['child']``
        (e.g. a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    float
        ``np.nan`` if the value is made missing, else ``r['child']``.
    """
    rand = np.random.uniform() # Random real number between 0 and 1.
    if r['father'] > 72 and rand < 0.5:
        # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
        return np.nan
    elif r['gender'] == 'female' and rand < 0.3:
        return np.nan
    else:
        return r['child']
    
# Pass every row of the complete data through make_missing to decide whether
# its 'child' value is dropped, then flag the rows that ended up missing.
heights_mar = heights.assign(child=heights.apply(make_missing, axis=1))
heights_mar = heights_mar.assign(child_missing=heights_mar['child'].isna())

heights_mar.head()

# Count the rows in each (gender, child_missing) cell.
gender_dist = pd.pivot_table(
    heights_mar.assign(child_missing=heights_mar['child'].isna()),
    index='gender',
    columns='child_missing',
    aggfunc='size',
)

# Relabel the column headers so the resulting table is easier to read.
gender_dist.columns = ['child_missing = False', 'child_missing = True']

# Divide each column by its total, so every column becomes a distribution
# over gender that sums to 1.
gender_dist = gender_dist.div(gender_dist.sum(), axis='columns')
gender_dist

gender_dist.plot(kind='barh', title='Gender by Missingness of Child Height (MAR Example)', barmode='group')

create_kde_plotly(heights_mar, 'child_missing', True, False, 'father', 
                  "Father's Height by Missingness of Child Height (MAR Example)")

# Mean father's height per missingness group (False, then True); the last
# entry of the successive differences is (missing mean) - (non-missing mean).
heights_mar.groupby('child_missing')['father'].mean().diff().iloc[-1]

1.0055466604787853

np.random.seed(42) # So that we get the same results each time (for lecture).

N = 1000 # Total number of samples; each distribution below gets N // 2 of them.

# Distribution 'A': N // 2 draws from a normal distribution with mean 0 and SD 1.
distr1 = pd.Series(np.random.normal(0, 1, size=N // 2))

# Distribution 'B': N // 2 draws from a normal distribution with mean 3 and SD 1.
distr2 = pd.Series(np.random.normal(3, 1, size=N // 2))

# Reshape to long format: the two samples are placed side by side as columns
# 'A' and 'B', stacked into one column, and the leftover positional index
# ('level_1') is discarded. The result has one row per sample, with the
# source distribution in 'group' and the sampled value in 'data'.
data = pd.concat([distr1, distr2], axis=1, keys=['A', 'B']).unstack().reset_index().drop('level_1', axis=1)
data = data.rename(columns={'level_0': 'group', 0: 'data'})

# Group means, rounded to 7 decimals, unpacked in group order ('A', then 'B')
# and shown in the plot title.
meanA, meanB = data.groupby('group')['data'].mean().round(7).tolist()
create_kde_plotly(data, 'group', 'A', 'B', 'data', f'mean of A: {meanA}<br>mean of B: {meanB}')

np.random.seed(42) # So that we get the same results each time (for lecture).

N = 1000 # Number of samples for each distribution.

# Distribution 'A': a mixture made by concatenating N // 2 draws from a normal
# with mean 0 and SD 1 with N // 2 draws from a normal with mean 4 and SD 1.
a = pd.Series(np.random.normal(0, 1, size=N//2))
b = pd.Series(np.random.normal(4, 1, size=N//2))
distr1 = pd.concat([a,b], ignore_index=True)

# Distribution 'B': N draws from a single normal whose mean and SD are set to
# the sample mean and sample SD of distribution 'A'.
distr2 = pd.Series(np.random.normal(distr1.mean(), distr1.std(), size=N))

# Reshape to long format: one row per sample, with the source distribution in
# 'group' and the sampled value in 'data' (the leftover positional index,
# 'level_1', is discarded).
data = pd.concat([distr1, distr2], axis=1, keys=['A', 'B']).unstack().reset_index().drop('level_1', axis=1)
data = data.rename(columns={'level_0': 'group', 0: 'data'})

# Group means, rounded to 7 decimals, unpacked in group order ('A', then 'B')
# and shown in the plot title.
meanA, meanB = data.groupby('group')['data'].mean().round(7).tolist()
create_kde_plotly(data, 'group', 'A', 'B', 'data', f'mean of A: {meanA}<br>mean of B: {meanB}')

# Permutation test using the absolute difference in group means as the
# test statistic.
n_repetitions = 500
# Work on a copy so that `data` itself keeps its original value/label pairing.
shuffled = data.copy()

diff_means = []
for _ in range(n_repetitions):
    
    # Shuffling the values, while keeping the group labels in place.
    # Each iteration re-shuffles the already-shuffled column of `shuffled`.
    shuffled['data'] = np.random.permutation(shuffled['data'])
    
    # Computing and storing the absolute difference in means.
    diff_mean = shuffled.groupby('group')['data'].mean().diff().abs().iloc[-1]
    diff_means.append(diff_mean)

# The same statistic, computed on the original (unshuffled) data.
observed_diff = data.groupby('group')['data'].mean().diff().abs().iloc[-1]
fig = px.histogram(pd.DataFrame(diff_means), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the Absolute Difference in Means')
fig.add_vline(x=observed_diff, line_color='red', line_width=1, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed Absolute Difference in Means = {round(observed_diff, 2)}</span>',
                   x=2 * observed_diff, showarrow=False, y=0.07)

# The computed p-value is fairly large.
np.mean(np.array(diff_means) >= observed_diff)

0.108

create_kde_plotly(data, 'group', 'A', 'B', 'data', f'mean of A: {meanA}<br>mean of B: {meanB}')

fig1 = create_kde_plotly(data, 'group', 'A', 'B', 'data', f'Distributions of A and B')

# Think about what this function is doing!
def create_cdf(group):
    """Return the empirical CDF of the 'data' values for one group.

    Reads the module-level `data` DataFrame, keeps the rows whose 'group'
    equals `group`, and returns a Series indexed by the distinct values in
    ascending order whose entries are the cumulative proportion of rows at or
    below each value.
    """
    in_group = data['group'] == group
    proportions = data.loc[in_group, 'data'].value_counts(normalize=True)
    return proportions.sort_index().cumsum()

fig2 = go.Figure()

fig2.add_trace(
    go.Scatter(x=create_cdf('A').index, y=create_cdf('A'), name='CDF of A')
)

fig2.add_trace(
    go.Scatter(x=create_cdf('B').index, y=create_cdf('B'), name='CDF of B')
)

fig2.update_layout(title='CDFs of A and B')

from plotly.subplots import make_subplots

for i in range(2):
    fig2.data[i]['marker']['color'] = fig1.data[i]['marker']['color']
    fig2.data[i]['showlegend'] = False
    
fig = make_subplots(rows=1, cols=2, subplot_titles=['Distributions', 'CDFs'])
fig.add_trace(fig1.data[0], row=1, col=1)
fig.add_trace(fig1.data[1], row=1, col=1)
fig.add_trace(fig2.data[0], row=1, col=2)
fig.add_trace(fig2.data[1], row=1, col=2)
fig.update_layout(width=1000, height=400);

fig

from scipy.stats import ks_2samp

ks_2samp?

observed_ks = ks_2samp(data.loc[data['group'] == 'A', 'data'], data.loc[data['group'] == 'B', 'data']).statistic
observed_ks

0.14

# Permutation test using the K-S statistic as the test statistic.
n_repetitions = 500
shuffled = data.copy()

# The group labels never move, so the row masks can be built once up front.
in_a = shuffled['group'] == 'A'
in_b = shuffled['group'] == 'B'

ks_stats = []
for _ in range(n_repetitions):
    # Shuffle the values while the group labels stay in place.
    shuffled['data'] = np.random.permutation(shuffled['data'])

    # Compute and store the K-S statistic between the two shuffled groups.
    ks_stat = ks_2samp(shuffled.loc[in_a, 'data'], shuffled.loc[in_b, 'data']).statistic
    ks_stats.append(ks_stat)

ks_stats[:10]

[0.037, 0.048, 0.04, 0.068, 0.045, 0.04, 0.042, 0.052, 0.019, 0.029]

fig = px.histogram(pd.DataFrame(ks_stats), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the K-S Statistic')
fig.add_vline(x=observed_ks, line_color='red', line_width=1, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed KS = {round(observed_ks, 2)}</span>',
                   x=0.8 * observed_ks, showarrow=False, y=0.16)

fig.update_layout(xaxis_range=[0, 0.2])
fig.update_layout(yaxis_range=[0, 0.2])

np.mean(np.array(ks_stats) >= observed_ks)

0.0

ks_2samp(data.loc[data['group'] == 'A', 'data'], data.loc[data['group'] == 'B', 'data'])

KstestResult(statistic=0.14, pvalue=5.822752148022591e-09, statistic_location=0.9755451271223592, statistic_sign=1)

heights_mar['child_missing'] = heights_mar['child'].isna()
create_kde_plotly(heights_mar[['child_missing', 'father']], 'child_missing', True, False, 'father',
                       "Father's Height by Missingness of Child Height (MAR example)")

heights_mar

ks_2samp(heights_mar.query('child_missing')['father'], heights_mar.query('not child_missing')['father'])

KstestResult(statistic=0.20676025834396874, pvalue=1.1424922868036869e-05, statistic_location=72.0, statistic_sign=-1)

np.random.seed(42) # So that we get the same results each time (for lecture).
heights_mcar = make_mcar(heights, 'child', pct=0.5)
heights_mar = make_mar_on_cat(heights, 'child', 'gender', pct=0.5)

multiple_describe({
    'Original': heights,
    'MCAR': heights_mcar,
    'MAR': heights_mar
})

# Look in util.py to see how multiple_kdes is defined.
multiple_kdes({'Original': heights, 'MCAR, Unfilled': heights_mcar})

heights_mcar['child'].head()

0    73.2
1    69.2
2     NaN
3     NaN
4    73.5
Name: child, dtype: float64

# Mean imputation of 'child' only. Passing a {column: value} mapping restricts
# the fill to that column; a bare scalar would also overwrite NaNs in every
# other column (including non-numeric ones) with the child mean.
heights_mcar_mfilled = heights_mcar.fillna({'child': heights_mcar['child'].mean()})
heights_mcar_mfilled['child'].head()

0    73.20
1    69.20
2    66.64
3    66.64
4    73.50
Name: child, dtype: float64

df_map = {'Original': heights, 'MCAR, Unfilled': heights_mcar, 'MCAR, Mean Imputed': heights_mcar_mfilled}
multiple_describe(df_map)

multiple_kdes(df_map)

multiple_kdes({'Original': heights, 'MAR, Unfilled': heights_mar})

heights_mar['child'].head()

0    73.2
1    69.2
2     NaN
3     NaN
4    73.5
Name: child, dtype: float64

# Mean imputation of 'child' only. Passing a {column: value} mapping restricts
# the fill to that column; a bare scalar would also overwrite NaNs in every
# other column (including non-numeric ones) with the child mean.
heights_mar_mfilled = heights_mar.fillna({'child': heights_mar['child'].mean()})
heights_mar_mfilled['child'].head()

0    73.20
1    69.20
2    68.52
3    68.52
4    73.50
Name: child, dtype: float64

df_map = {'Original': heights, 'MAR, Unfilled': heights_mar, 'MAR, Mean Imputed': heights_mar_mfilled}
multiple_describe(df_map)

multiple_kdes(df_map)

pd.concat([
    heights.groupby('gender')['child'].mean().rename('Original'),
    heights_mar.groupby('gender')['child'].mean().rename('MAR, Unfilled'),
    heights_mar_mfilled.groupby('gender')['child'].mean().rename('MAR, Mean Imputed')
], axis=1).T

def mean_impute(s):
    """Return a copy of Series `s` with its missing values replaced by the mean of `s`.

    The mean is computed from the non-missing entries only (pandas skips NaN
    by default), and the non-missing entries are left unchanged.
    """
    observed_mean = s.mean()
    return s.fillna(observed_mean)

heights_mar_cond = heights_mar.groupby('gender')['child'].transform(mean_impute).to_frame()
heights_mar_cond['child'].head()

0    73.20
1    69.20
2    64.22
3    64.22
4    73.50
Name: child, dtype: float64

df_map['MAR, Conditional Mean Imputed'] = heights_mar_cond
multiple_kdes(df_map)

def prob_impute(s):
    """Fill the missing values of `s` by sampling from its observed values.

    Parameters
    ----------
    s : pd.Series
        Values to impute (in the lecture, the child heights for one gender).

    Returns
    -------
    pd.Series
        A copy of `s` in which each missing entry is replaced by a value drawn
        at random, with replacement, from the non-missing entries of `s`.
        If `s` has no non-missing entries there is nothing to sample from, so
        the copy is returned with its missing values left in place.
    """
    s = s.copy()

    # Step 1: Find the missing entries and count them.
    is_null = s.isna()
    num_null = is_null.sum()

    # Guard: np.random.choice raises on an empty population when asked for
    # one or more samples, which would happen for a group with no observed values.
    observed = s.dropna()
    if observed.empty:
        return s

    # Step 2: Sample num_null observed values (with replacement).
    fill_values = np.random.choice(observed, num_null)

    # Step 3: Fill in the missing values and return s.
    s[is_null] = fill_values
    return s

# Apply prob_impute separately within each gender, so each missing 'child'
# value is filled using only the 'child' values from the same gender group.
heights_mar_pfilled = heights_mar.assign(
    child=heights_mar.groupby('gender')['child'].transform(prob_impute)
)
heights_mar_pfilled['child'].head()

0    73.2
1    69.2
2    62.0
3    62.5
4    73.5
Name: child, dtype: float64

df_map['MAR, Conditionally Probabilistically Imputed'] = heights_mar_pfilled
multiple_kdes(df_map)

# NOTE(review): `df`, 'c1' and 'c2' are not defined anywhere in the visible
# file -- presumably this is an illustrative/exercise snippet rather than
# runnable lecture code; confirm before relying on it.
# Assuming `df` is a DataFrame, `means` is a nested dict of the form
# {column name: {'c2' group label: group mean}}.
means = df.groupby('c2').mean().to_dict()
# The lambda only looks up means[x] when x is NaN, i.e. it uses the missing
# value itself as the dictionary key rather than the row's 'c2' group.
# NOTE(review): that lookup looks like it cannot succeed as written -- verify
# whether the flaw is intentional (e.g. a "what's wrong here?" question).
imputed = df['c1'].apply(lambda x: means[x] if np.isnan(x) else x)

	father	mother	gender	child
0	78.5	67.0	male	73.2
1	78.5	67.0	female	69.2
2	78.5	67.0	female	NaN
...	...	...	...	...
7	75.5	66.5	female	NaN
8	75.0	64.0	male	71.0
9	75.0	64.0	female	68.0

	Mean	Standard Deviation
Dataset
Original	66.75	3.58
MCAR, Unfilled	66.64	3.56
MCAR, Mean Imputed	66.64	2.52

gender	female	male
Original	64.10	69.23
MAR, Unfilled	64.22	69.28
MAR, Mean Imputed	67.85	69.14

	child_missing = False	child_missing = True
gender
female	0.49	0.48
male	0.51	0.52

	child_missing = False	child_missing = True
gender
female	0.4	0.88
male	0.6	0.12

Lecture 8 – Imputation¶

DSC 80, Spring 2024¶

Announcements 📣¶

Midterm Exam 📝¶

Agenda 📆¶

Review: Missingness mechanisms¶

Flowchart¶

Question 🤔 (Answer at q.dsc80.com)

Question 🤔 (Answer at q.dsc80.com)

Question 🤔 (Answer at q.dsc80.com)

Identifying missingness mechanisms in data¶

Example: Heights¶

Simulating MCAR data¶

Verifying that child heights are MCAR in heights_mcar¶

Concluding that 'child' is MCAR¶

Simulating MAR data¶

Comparing null and non-null 'child' distributions for 'gender', again¶

Comparing null and non-null 'child' distributions for 'father', again¶

The Kolmogorov-Smirnov test statistic¶

Recap: Permutation tests¶

Difference in means¶

Different distributions with the same mean¶

Telling numerical distributions apart¶

The Kolmogorov-Smirnov test statistic¶

Aside: Cumulative distribution functions¶

Aside: Cumulative distribution functions¶

The K-S statistic in Python¶

ks_2samp¶

Difference in means vs. K-S statistic¶

Back to our Example: Missingness of 'child' heights on 'father''s heights (MAR)¶

Performing the test¶

Handling missing values¶

What do we do with missing data?¶

Solution 1: Dropping missing values¶

Listwise deletion¶

Listwise deletion¶

Solution 2: Imputation¶

Kinds of imputation¶

Mean imputation¶

Mean imputation¶

Example: Mean imputation in the MCAR heights dataset¶

Mean imputation of MCAR data¶

Mean imputation of MCAR data¶

Example: Mean imputation in the MAR heights dataset¶

Mean imputation of MAR data¶

Mean imputation of MAR data¶

Within-group (conditional) mean imputation¶

transform returns!¶

Conclusion: Imputation with single values¶

Probabilistic imputation¶

Imputing missing values using distributions¶

Example: Probabilistic imputation in the MAR heights dataset¶

Observations¶

Randomness¶

Summary, next time¶

Summary of imputation techniques¶

Summary: Listwise deletion¶

Summary: Mean imputation¶

Summary: Conditional mean imputation¶

Summary: Probabilistic imputation¶

Summary: Multiple imputation¶

Next time¶

Verifying that child heights are MCAR in `heights_mcar`¶

Concluding that `'child'` is MCAR¶

Comparing null and non-null `'child'` distributions for `'gender'`, again¶

Comparing null and non-null `'child'` distributions for `'father'`, again¶

`ks_2samp`¶

Back to our Example: Missingness of `'child'` heights on `'father'`'s heights (MAR)¶

Example: Mean imputation in the MCAR `heights` dataset¶

Example: Mean imputation in the MAR `heights` dataset¶

`transform` returns!¶

Example: Probabilistic imputation in the MAR `heights` dataset¶