Lecture 13 – Imputation

DSC 80, Winter 2023

📣 Announcements

Midterm Exam Logistics

Agenda

Recap: Identifying missingness mechanisms

Review: Missingness mechanisms

Deciding between MAR and MCAR

Recall, the "missing value flowchart" says that we should:

To decide between MAR and MCAR, we can look at the data itself.

Deciding between MAR and MCAR

Example: Heights

Today, we'll use the same heights dataset as we did last time.

Example: Missingness of 'child' heights on 'father''s heights (MCAR)

Aside: In util.py, there are several functions that we've created to help us with this lecture.

Example: Missingness of 'child' heights on 'father''s heights (MCAR)

Difference in means vs. K-S statistic

Example: Missingness of 'child' heights on 'father''s heights (MCAR)

The ks_2samp function from scipy.stats can do the entire permutation test for us, if we want to use the K-S statistic!

(If we want to use the difference of means, we'd have to run a for-loop.)
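As a sketch of what `ks_2samp` does for us here: below, we simulate an MCAR version of a heights dataset and compare the distribution of `'father'` heights where `'child'` is missing vs. not missing. The dataset itself is synthetic (the column names and parameters are assumptions for illustration); in lecture, we'd use the real heights data.

```python
# Sketch: K-S test comparing 'father' heights in rows where 'child' is
# missing vs. present. Synthetic data; column names are assumptions.
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

rng = np.random.default_rng(42)
heights = pd.DataFrame({
    'father': rng.normal(69, 3, 500),
    'child': rng.normal(67, 3, 500),
})

# Make 'child' MCAR: every row is equally likely to be missing.
heights.loc[rng.random(500) < 0.3, 'child'] = np.nan

missing = heights.loc[heights['child'].isna(), 'father']
present = heights.loc[heights['child'].notna(), 'father']

stat, p_value = ks_2samp(missing, present)
# Under MCAR, the two 'father' distributions should look similar,
# so we expect a small K-S statistic and a large p-value.
```

Note that `ks_2samp` computes the p-value analytically (or via permutation, if you pass `method='exact'` style options in newer SciPy versions), so no explicit `for`-loop is needed.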

Discussion Question

In this MCAR example, if we were to take the mean of the 'child' column that contains missing values, is the result likely to:

  1. Overestimate the true mean?
  2. Underestimate the true mean?
  3. Be accurate?

Example: Missingness of 'child' heights on 'father''s heights (MAR)

Example: Missingness of 'child' heights on 'father''s heights (MAR)

Discussion Question

In this MAR example, if we were to take the mean of the 'child' column that contains missing values, is the result likely to:

  1. Overestimate the true mean?
  2. Underestimate the true mean?
  3. Be accurate?

Handling missing values

What do we do with missing data?

Solution 1: Dropping missing values

Listwise deletion

To illustrate, let's generate two datasets with missing 'child' heights – one in which the heights are MCAR, and one in which they are MAR dependent on 'gender' (not 'father', as in our previous example).

In practice, you'll have to run permutation tests to determine the likely missingness mechanism first!
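One way to generate such datasets is sketched below. This is a synthetic stand-in for the lecture's real heights data (column names, means, and missingness rates are assumptions): `heights_mcar` deletes values uniformly at random, while `heights_mar` deletes `'child'` heights at a much higher rate for one `'gender'` group.

```python
# Sketch: simulating MCAR and MAR missingness in 'child' heights.
# All names and parameters here are illustrative assumptions.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 1000
heights = pd.DataFrame({
    'gender': rng.choice(['male', 'female'], n),
    'child': rng.normal(67, 3, n),
})
# Make female heights shorter on average, so MAR deletion biases the mean.
heights.loc[heights['gender'] == 'female', 'child'] -= 5

# MCAR: every row is missing with the same probability.
heights_mcar = heights.copy()
heights_mcar.loc[rng.random(n) < 0.3, 'child'] = np.nan

# MAR dependent on 'gender': females are far more likely to be missing.
heights_mar = heights.copy()
p_miss = np.where(heights['gender'] == 'female', 0.5, 0.05)
heights_mar.loc[rng.random(n) < p_miss, 'child'] = np.nan
```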

Listwise deletion

Below, we compute the means and standard deviations of the 'child' column in all three datasets. Remember, .mean() and .std() ignore missing values.
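As a minimal illustration of that skipping behavior (toy numbers, not the heights data):

```python
# pandas aggregation methods skip NaN by default, so computing the mean
# of a column with missing values is equivalent to listwise deletion
# on that single column.
import numpy as np
import pandas as pd

s = pd.Series([60.0, np.nan, 70.0, np.nan, 65.0])
s.mean()  # 65.0 — the two NaNs are ignored, same as s.dropna().mean()
```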

Observations:

Solution 2: Imputation

Imputation is the act of filling in missing data with plausible values. Ideally, imputation:

These are hard to do at the same time!

Kinds of imputation

Mean imputation

Mean imputation

Example: Mean imputation in the MCAR heights dataset

Let's look at two distributions:

Mean imputation of MCAR data

Let's fill in missing values in heights_mcar['child'] with the mean of the observed 'child' heights in heights_mcar['child'].
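A minimal sketch of mean imputation (toy values, standing in for `heights_mcar['child']`):

```python
# Mean imputation: fill every missing value with the observed mean.
import numpy as np
import pandas as pd

child = pd.Series([60.0, np.nan, 70.0, np.nan, 65.0])
imputed = child.fillna(child.mean())
# The mean is unchanged, but the variance shrinks, since we added
# several copies of the same value at the center of the distribution.
```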

Observations:

Mean imputation of MCAR data

Let's visualize all three distributions: the original, the MCAR heights with missing values, and the mean-imputed MCAR heights.

Takeaway: When data are MCAR and you impute with the mean:

Example: Mean imputation in the MAR heights dataset

The distributions are not very similar!

Remember that in reality, you won't get to see the turquoise distribution, which has no missing values – instead, you'll try to recreate it, using your sample with missing values.

Mean imputation of MAR data

Let's fill in missing values in heights_mar['child'] with the mean of the observed 'child' heights in heights_mar['child'] and see what happens.

Note that the latter two means are biased high.

Mean imputation of MAR data

Let's visualize all three distributions: the original, the MAR heights with missing values, and the mean-imputed MAR heights.

Since the sample with MAR values was already biased high, mean imputation kept the sample biased – it did not bring the data closer to the data generating process.

With our single mean imputation strategy, the resulting female mean height is biased quite high.

Within-group (conditional) mean imputation

transform returns!

The pink distribution does a better job of approximating the turquoise distribution than the purple distribution.
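A sketch of within-group mean imputation using `transform` (toy data; the column names mirror the MAR example, where missingness depends on `'gender'`):

```python
# Conditional mean imputation: fill each missing 'child' height with
# the mean of the observed heights in that row's 'gender' group.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'gender': ['female', 'female', 'female', 'male', 'male', 'male'],
    'child': [62.0, np.nan, 64.0, 70.0, np.nan, 72.0],
})

df['child'] = (
    df.groupby('gender')['child']
      .transform(lambda s: s.fillna(s.mean()))
)
# Missing female height -> 63.0 (female mean); missing male -> 71.0.
```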

Conclusion: Imputation with single values

Probabilistic imputation

Imputing missing values using distributions

Example: Probabilistic imputation in the MCAR heights dataset

Step 1: Determine the number of missing values in the column of interest.

Step 2: Sample that number of values from the observed values in the column of interest.

Step 3: Fill in the missing values with the sample from Step 2.
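The three steps above can be sketched as follows (synthetic data standing in for `heights_mcar['child']`):

```python
# Probabilistic imputation: fill missing values with a random sample
# drawn from the observed values. Data here is synthetic.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
child = pd.Series(rng.normal(67, 3, 200))
child[rng.random(200) < 0.25] = np.nan

num_missing = child.isna().sum()                       # Step 1
fill = rng.choice(child.dropna(), num_missing)         # Step 2
filled = child.copy()
filled[filled.isna()] = fill                           # Step 3
# Because we sample from the observed distribution, the imputed column
# preserves the spread of the data — no spike at the mean.
```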

Let's look at the results.

Variance is preserved!

No spikes!

Observations

Randomness

Multiple imputation

Steps:

  1. Start with observed and incomplete data.
  2. Create $m$ imputed versions of the data through a probabilistic procedure.
    • The imputed datasets are identical for the observed data entries.
    • They differ in the imputed values.
    • The differences reflect our uncertainty about what value to impute.
  3. Then, compute parameter estimates on each imputed dataset.
    • For instance, the mean, standard deviation, median, etc.
  4. Finally, pool the $m$ parameter estimates into one estimate.
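The steps above can be sketched as follows, using probabilistic imputation as the underlying procedure and the mean as the parameter of interest (synthetic data; in lecture we'd use `heights_mcar`):

```python
# Multiple imputation: impute m times probabilistically, estimate the
# mean on each imputed dataset, then pool the estimates. Synthetic data.
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
child = pd.Series(rng.normal(67, 3, 300))
child[rng.random(300) < 0.3] = np.nan

def probabilistic_impute(s, rng):
    """Fill NaNs in s with values sampled from s's observed values."""
    s = s.copy()
    observed = s.dropna()
    s[s.isna()] = rng.choice(observed, s.isna().sum())
    return s

m = 100
estimates = [probabilistic_impute(child, rng).mean() for _ in range(m)]
pooled = np.mean(estimates)
# The spread of `estimates` reflects our uncertainty about the
# missing values; `pooled` is the final estimate of the mean.
```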

Multiple imputation

Let's try this procedure out on the heights_mcar dataset.

Each time we run the following cell, it generates a new imputed version of the 'child' column.

Let's run the above procedure 100 times.

Let's plot some of the imputed columns on the previous slide.

Let's look at the distribution of means across the imputed columns.

Summary, next time

Summary of imputation techniques

Summary: Listwise deletion

Summary: Mean imputation

Summary: Conditional mean imputation

# Fill missing values in 'c1' with the mean of 'c1' within each 'c2' group.
means = df.groupby('c2')['c1'].transform('mean')
imputed = df['c1'].fillna(means)

Summary: Probabilistic imputation

Summary: Multiple imputation

Next time