In [1]:

```
import pandas as pd
import numpy as np
from scipy import stats
import os
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-white')
plt.rc('figure', dpi=100, figsize=(10, 5))
plt.rc('font', size=12)
import util
```

- Lab 4 is due **tonight at 11:59PM**.
- Solutions will be released tomorrow.

- The Midterm Exam is **on Wednesday in-person during lecture**.
- See this post for all the details.
- Check the Resources tab for past exams. There's now a walkthrough video of the Spring 2021 Midterm Exam.
- **Important:** Bring official photo ID (UCSD ID preferred); we will be checking.

- Project 2 is due on **Saturday, April 30th at 11:59PM**.
- 🚨 If 80% of the class fills out the **Mid-Quarter Survey**, then everyone will receive an extra point on the Midterm Exam. 🚨
- We've surpassed 80% – but still fill it out if you haven't!

- Review: Missingness mechanisms.
- Overview of imputation.
- Mean imputation.
- Probabilistic imputation.

- **Missing by design (MD):** Whether or not a value is missing depends entirely on the data in other columns. In other words, if we can always predict if a value will be missing given the other columns, the data is MD.
- **Not missing at random (NMAR, also called NI):** The chance that a value is missing **depends on the actual missing value**!
- **Missing at random (MAR):** The chance that a value is missing **depends on other columns**, but **not** the actual missing value itself.
- **Missing completely at random (MCAR):** The chance that a value is missing is **completely independent** of other columns and the actual missing value.

- Recall, the "missing value flowchart" says that we should:
  - First, determine whether values are **missing by design (MD)**.
  - Then, reason about whether values are **not missing at random (NMAR)**.
  - Finally, decide whether values are **missing at random (MAR)** or **missing completely at random (MCAR)**.
- To decide between MAR and MCAR, we can look at the data itself.

In [2]:

```
heights = pd.read_csv(os.path.join('data', 'heights.csv'))
heights = (
heights
.rename(columns={'childHeight': 'child', 'childNum': 'number'})
.drop('midparentHeight', axis=1)
)
heights.head()
```

Out[2]:

|   | family | father | mother | children | number | gender | child |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 78.5 | 67.0 | 4 | 1 | male | 73.2 |
| 1 | 1 | 78.5 | 67.0 | 4 | 2 | female | 69.2 |
| 2 | 1 | 78.5 | 67.0 | 4 | 3 | female | 69.0 |
| 3 | 1 | 78.5 | 67.0 | 4 | 4 | female | 69.0 |
| 4 | 2 | 75.5 | 66.5 | 4 | 1 | male | 73.5 |

### Missingness of `'child'` heights on `'father'`'s heights (MCAR)

**Question:** Is the missingness of `'child'` heights dependent on the `'father'` column?

- To answer, we can look at two distributions:
  - The distribution of `'father'` when `'child'` is missing.
  - The distribution of `'father'` when `'child'` is not missing.
- If the two distributions look similar, then the missingness of `'child'` is not dependent on `'father'`.
- To test whether two distributions look similar, we use a permutation test.

- In `util.py`, there are several functions that we've created to help us with this lecture.
- `make_mcar` takes in a dataset and intentionally drops values from a column such that they are MCAR.
  - You wouldn't actually do this in practice!
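The internals of `util.make_mcar` aren't shown here, but a minimal sketch of such a helper might look like the following (the real implementation may differ; `make_mcar_sketch` is a hypothetical name):

```python
import numpy as np
import pandas as pd

def make_mcar_sketch(df, col, pct=0.5, seed=None):
    """Return a copy of df with pct of `col` set to NaN, completely at random."""
    rng = np.random.default_rng(seed)
    out = df.copy()
    # Choose rows to blank out uniformly at random, independent of all values
    drop_idx = rng.choice(out.index, size=int(pct * len(out)), replace=False)
    out.loc[drop_idx, col] = np.nan
    return out
```

Because the rows are chosen independently of every column, the resulting missingness is MCAR by construction.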

In [3]:

```
# Generating MCAR data
heights_mcar = util.make_mcar(heights, 'child', pct=0.5)
heights_mcar.isna().mean()
```

Out[3]:

family 0.0 father 0.0 mother 0.0 children 0.0 number 0.0 gender 0.0 child 0.5 dtype: float64

In [4]:

```
heights_mcar['child_missing'] = heights_mcar['child'].isna()
(
heights_mcar
.groupby('child_missing')['father']
.plot(kind='kde', legend=True, title="Father's Height by Missingness of Child Height (MCAR example)")
);
```

- To test whether the two distributions are similar, we can use a permutation test.
- The `ks_2samp` function from `scipy.stats` can do the entire permutation test for us, if we want to use the Kolmogorov-Smirnov test statistic!
- If we want to use the difference of means, we'd have to run a `for`-loop (see Lectures 10 and 12 for examples).
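Such a `for`-loop might look like the following sketch, run here on synthetic stand-ins for the two `'father'` distributions (the names `group_mis` and `group_not_mis` are illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)
# Synthetic stand-ins for 'father' heights, split by whether 'child' is missing
group_mis = rng.normal(69, 2.5, size=200)
group_not_mis = rng.normal(69, 2.5, size=300)

observed = abs(group_mis.mean() - group_not_mis.mean())

pooled = np.concatenate([group_mis, group_not_mis])
n_mis = len(group_mis)
diffs = []
for _ in range(1_000):
    # Shuffle the pooled data and re-split into groups of the original sizes
    shuffled = rng.permutation(pooled)
    diffs.append(abs(shuffled[:n_mis].mean() - shuffled[n_mis:].mean()))

# p-value: fraction of shuffled differences at least as large as the observed one
p_value = np.mean(np.array(diffs) >= observed)
```

Since both synthetic groups come from the same distribution here, we'd expect a large p-value.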

In [5]:

```
# 'father' when 'child' is missing
father_ch_mis = heights_mcar.loc[heights_mcar['child_missing'], 'father']
# 'father' when 'child' is not missing
father_ch_not_mis = heights_mcar.loc[~heights_mcar['child_missing'], 'father']
```

In [6]:

```
stats.ks_2samp(father_ch_mis, father_ch_not_mis)
```

Out[6]:

KstestResult(statistic=0.03640256959314775, pvalue=0.9168468032193087)

- This states that if the missingness of `'child'` is truly unrelated to the distribution of `'father'`, then the chance of seeing two `'father'` distributions that are as or more different than our two observed `'father'` distributions is about 92%.
- We fail to reject the null – it looks like the missingness of `'child'` is likely unrelated to the distribution of `'father'`.

In this MCAR example, if we were to take the mean of the `'child'` column that contains missing values, is the result likely to:

- Overestimate the true mean?
- Underestimate the true mean?
- Be accurate?

In [7]:

```
(
heights_mcar
.groupby('child_missing')['father']
.plot(kind='kde', legend=True, title="Father's Height by Missingness of Child Height (MCAR example)")
);
```

### Missingness of `'child'` heights on `'father'`'s heights (MAR)

**Question:** Is the missingness of `'child'` heights dependent on the `'father'` column?

- We will follow the same procedure as before. The only difference is that the missing values in our simulated data are MAR.

In [8]:

```
# Generating MAR data
heights_mar = util.make_mar_on_num(heights, 'child', 'father', pct=0.75)
heights_mar.isna().mean()
```

Out[8]:

family 0.000000 father 0.000000 mother 0.000000 children 0.000000 number 0.000000 gender 0.000000 child 0.749465 dtype: float64

In [9]:

```
heights_mar['child_missing'] = heights_mar['child'].isna()
(
heights_mar
.groupby('child_missing')['father']
.plot(kind='kde', legend=True, title="Father's Height by Missingness of Child Height (MAR example)")
);
```

- The above picture shows us that missing `'child'` heights tend to correspond to taller `'father'` heights.
- Let's again use a permutation test.

In [10]:

```
# 'father' when 'child' is missing
father_ch_mis = heights_mar.loc[heights_mar['child_missing'], 'father']
# 'father' when 'child' is not missing
father_ch_not_mis = heights_mar.loc[~heights_mar['child_missing'], 'father']
```

In [11]:

```
stats.ks_2samp(father_ch_mis, father_ch_not_mis)
```

Out[11]:

KstestResult(statistic=0.49496947496947497, pvalue=5.551115123125783e-16)

- The p-value of our permutation test is essentially 0.
- We reject the null that the missingness of the `'child'` column is independent of the `'father'` column, and we conclude that `'child'` is MAR dependent on `'father'`.

In this MAR example, if we were to take the mean of the `'child'` column that contains missing values, is the result likely to:

- Overestimate the true mean?
- Underestimate the true mean?
- Be accurate?

In [12]:

```
(
heights_mar
.groupby('child_missing')['father']
.plot(kind='kde', legend=True, title="Father's Height by Missingness of Child Height (MAR example)")
);
```

- Suppose we are interested in a dataset $Y$.
- We get to **observe** $Y_{obs}$, while the rest of the dataset, $Y_{mis}$, is **missing**.
- Issue: $Y_{obs}$ may look quite different than $Y$.
  - The mean and other measures of central tendency may be different.
  - The variance may be different.
  - Correlations between variables may be different.

- Consider a survey with an optional question: "How much do you give to charity?"
- People who give little are less likely to respond.
- Therefore, the average response is **biased high**.

- If the data are MCAR (missing completely at random), then dropping the missing values entirely doesn't significantly change the data.
  - For instance, the mean of the dataset post-dropping is an unbiased estimate of the true mean.
- This is because MCAR data is a **random sample** of the full dataset.
  - From DSC 10, we know that random samples tend to resemble the larger populations they are drawn from.
- **If the data are not MCAR, then dropping the missing values will introduce bias.**
  - MCAR is rare!

- *Listwise deletion* is the act of dropping entire rows that contain missing values.
- Issue: This can delete perfectly good data in other columns for a given row.
- Improvement: Drop missing data only when working with the column that contains missing data.
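A tiny illustration of the difference, on hypothetical data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, np.nan]})

# Listwise deletion: any row with a missing value anywhere is dropped
listwise = df.dropna()  # only row 0 survives

# Improvement: drop missing values only in the column under analysis
a_mean = df['a'].dropna().mean()  # uses rows 0 and 2
```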

To illustrate, let's generate another dataset with missing values.

In [13]:

```
np.random.seed(42) # So that we get the same results each time (for lecture)
heights_mcar = util.make_mcar(heights, 'child', pct=0.50)
heights_mar = util.make_mar_on_cat(heights, 'child', 'gender', pct=0.50)
```

The true `'child'` mean with all of the data is as follows.

In [14]:

```
heights['child'].mean()
```

Out[14]:

66.74593147751597

The `'child'` mean in the MCAR dataset is very close to the true `'child'` mean:

In [15]:

```
heights_mcar['child'].dropna().mean()
```

Out[15]:

66.64068522483944

In [16]:

```
# Note that .mean() automatically drops nulls, so this expression is the same as the one above
heights_mcar['child'].mean()
```

Out[16]:

66.64068522483944

The `'child'` mean in the MAR dataset is quite biased. Note that this is not the same example as before.

In [17]:

```
heights_mar['child'].mean()
```

Out[17]:

68.51884368308356

**Imputation** is the act of filling in missing data with plausible values. Ideally, imputation:

- is quick and easy to do.
- shouldn't introduce bias into the dataset.

These are hard to satisfy!

There are three main types of imputation, two of which we will focus on today:

- **Imputation with a single value: mean, median, mode.**
- Imputation with a single value, using a model: regression, kNN.
- **Probabilistic imputation by drawing from a distribution.**

Each has upsides and downsides, and **each works differently with different types of missingness**.

- Mean imputation is the act of filling in missing values in a column with the mean of the observed values in that column.
- This strategy:
- 👍 Preserves the mean of the observed data, for all types of missingness.
- 👎 Decreases the variance of the data, for all types of missingness.
- 👎 Creates a biased estimate of the true mean when the data are not MCAR.

### Mean imputation in the `heights` dataset (MCAR)

Let's look at two distributions:

- The distribution of **all** `'child'` heights.
- The distribution of the `'child'` heights that have MCAR values.

In [18]:

```
heights_mcar['child'].head()
```

Out[18]:

0 73.2 1 69.2 2 NaN 3 NaN 4 73.5 Name: child, dtype: float64

In [19]:

```
# Note that this is **not** a density histogram!
plt.hist([heights['child'], heights_mcar['child'].dropna()])
plt.legend(['full data', 'missing (mcar)']);
```

- Since the data is MCAR, the blue distribution has the same shape as the orange distribution.
- Let's fill in missing values with the mean of the observed `'child'` heights.

In [20]:

```
heights_mcar['child'].head()
```

Out[20]:

0 73.2 1 69.2 2 NaN 3 NaN 4 73.5 Name: child, dtype: float64

In [21]:

```
heights_mcar_mfilled = heights_mcar.fillna(heights_mcar['child'].mean())
heights_mcar_mfilled['child'].head()
```

Out[21]:

0 73.200000 1 69.200000 2 66.640685 3 66.640685 4 73.500000 Name: child, dtype: float64

- Note that the mean of the full set of heights is very close to the mean of the subset of heights that weren't missing.
- Also note that the mean of the imputed dataset is the same as the mean of the subset of heights that weren't missing.

In [22]:

```
print(
'mean (original): %f' % heights['child'].mean(),
'mean (missing): %f' % heights_mcar['child'].mean(),
'mean (mean imp): %f' % heights_mcar_mfilled['child'].mean(),
sep='\n'
)
```

mean (original): 66.745931 mean (missing): 66.640685 mean (mean imp): 66.640685

- Why is the standard deviation of the imputed dataset smaller than either of the other two?

In [23]:

```
print(
'std (original): %f' % heights['child'].std(),
'std (missing): %f' % heights_mcar['child'].std(),
'std (mean imp): %f' % heights_mcar_mfilled['child'].std(),
sep='\n'
)
```

std (original): 3.579251 std (missing): 3.563299 std (mean imp): 2.518282

Let's take a look at all three distributions: the original, the MCAR heights with missing values, and the imputed MCAR heights.

In [24]:

```
plt.hist([heights['child'], heights_mcar['child'].dropna(), heights_mcar_mfilled['child']])
plt.legend(['full data', 'missing (mcar)', 'imputed']);
```

**Takeaway:** When data are MCAR and you impute with the mean:

- The mean of the imputed dataset is an unbiased estimator of the true mean.
- The variance of the imputed dataset is smaller than the variance of the full dataset.
- Mean imputation tricks you into thinking your data are more reliable than they are!

### Mean imputation in the `heights` dataset (MAR)

- When data are MAR, mean imputation leads to biased estimates of the mean across groups.
- The bias may be different in different groups.
  - For example: If the missingness depends on gender, then different genders will have differently-biased means.
  - The overall mean will be biased towards one group.

In [25]:

```
np.random.seed(42) # So that we get the same results each time (for lecture)
heights_mar_cat = util.make_mar_on_cat(heights, 'child', 'gender', pct=0.50)
heights_mar_cat['child'].head()
```

Out[25]:

0 NaN 1 69.2 2 69.0 3 69.0 4 NaN Name: child, dtype: float64

Again, let's look at two distributions:

- The distribution of **all** `'child'` heights.
- The distribution of the `'child'` heights that have MAR values.

In [26]:

```
# The observed vs true distribution
plt.hist([heights['child'], heights_mar_cat['child']])
plt.legend(['full data', 'missing (mar)']);
```

- Let's impute with the mean.

In [27]:

```
heights_mar_cat_mfilled = heights_mar_cat.fillna(heights_mar_cat['child'].mean())
heights_mar_cat_mfilled
```

Out[27]:

|   | family | father | mother | children | number | gender | child |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 78.5 | 67.0 | 4 | 1 | male | 64.999358 |
| 1 | 1 | 78.5 | 67.0 | 4 | 2 | female | 69.200000 |
| 2 | 1 | 78.5 | 67.0 | 4 | 3 | female | 69.000000 |
| 3 | 1 | 78.5 | 67.0 | 4 | 4 | female | 69.000000 |
| 4 | 2 | 75.5 | 66.5 | 4 | 1 | male | 64.999358 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 929 | 203 | 62.0 | 66.0 | 3 | 1 | male | 64.999358 |
| 930 | 203 | 62.0 | 66.0 | 3 | 2 | female | 62.000000 |
| 931 | 203 | 62.0 | 66.0 | 3 | 3 | female | 64.999358 |
| 932 | 204 | 62.5 | 63.0 | 2 | 1 | male | 64.999358 |
| 933 | 204 | 62.5 | 63.0 | 2 | 2 | female | 57.000000 |

934 rows × 7 columns

- Note that the latter two means are biased **low**.

In [28]:

```
print(
'mean (original): %f' % heights['child'].mean(),
'mean (missing): %f' % heights_mar_cat['child'].mean(),
'mean (mean imp): %f' % heights_mar_cat_mfilled['child'].mean(),
sep='\n'
)
```

mean (original): 66.745931 mean (missing): 64.999358 mean (mean imp): 64.999358

In [29]:

```
print(
'std (original): %f' % heights.child.std(),
'std (missing): %f' % heights_mar_cat.child.std(),
'std (mean imp): %f' % heights_mar_cat_mfilled.child.std(),
sep='\n'
)
```

std (original): 3.579251 std (missing): 3.166655 std (mean imp): 2.237963

In [30]:

```
plt.hist([heights['child'], heights_mar_cat['child'], heights_mar_cat_mfilled['child']]);
plt.legend(['full data', 'missing (mar)', 'imputed']);
```

- Since the sample with MAR values was already biased low, mean imputation kept the sample biased – it did not bring the data **closer to the data generating process**.

- **Improvement:** Since MAR data are MCAR within each group, we can perform group-wise mean imputation.
  - In our case, since the missingness of `'child'` is dependent on `'gender'`, we can impute separately for each `'gender'`.
  - For instance, if there is a missing `'child'` height for a `'female'` child, impute their height with the mean observed `'female'` height.
- With this technique, the overall mean remains unbiased, as do the within-group means.
- Like single mean imputation, the variance of the dataset is reduced.

In [31]:

```
pd.concat([
heights.groupby('gender')['child'].mean().rename('full'),
heights_mar_cat.groupby('gender')['child'].mean().rename('missing (mar)'),
heights_mar_cat_mfilled.groupby('gender')['child'].mean().rename('naively imputed')
], axis=1)
```

Out[31]:

| gender | full | missing (mar) | naively imputed |
|---|---|---|---|
| female | 64.103974 | 64.011024 | 64.168110 |
| male | 69.234096 | 69.377907 | 65.782217 |

Note that with our single mean imputation strategy, the resulting male mean height is biased quite low.

- In MAR data, imputation by the overall mean gives a biased estimate of the mean of each group.
- To obtain an unbiased estimate of the mean within each group, impute using the mean within each group.
- How do we implement this?
  - Remember, our setting is that `'child'` heights are MAR dependent on `'gender'`.
  - Remember that unconditional mean imputation is implemented with `heights['child'].fillna(heights['child'].mean())`.

In [32]:

```
def mean_impute(ser):
return ser.fillna(ser.mean())
heights_mar_cat.groupby('gender')['child'].transform(mean_impute)
```

Out[32]:

0 69.377907 1 69.200000 2 69.000000 3 69.000000 4 69.377907 ... 929 69.377907 930 62.000000 931 64.011024 932 69.377907 933 57.000000 Name: child, Length: 934, dtype: float64

Imputing missing data in a column with the mean of the column:

- faithfully reproduces the mean of the observed dataset,
- reduces the variance, and
- biases relationships between the column and other columns if the data are not MCAR.

The same is true with other statistics (e.g. median and mode).
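For instance, median and mode imputation are the same one-liner with a different statistic (a sketch on toy data):

```python
import numpy as np
import pandas as pd

ser = pd.Series([60.0, 62.0, np.nan, 70.0, np.nan])

# Median imputation: fill every missing value with the observed median
median_filled = ser.fillna(ser.median())

# Mode imputation (also works for categorical columns)
mode_filled = ser.fillna(ser.mode().iloc[0])
```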

- The US Census asks individuals for their salaries, and some individuals don't respond.
- Suppose we impute missing salaries with the mean overall salary.
- Is there more bias in:
- (low-paying) service jobs or
- (high-paying) executive jobs?

**Hint:** What does the distribution of incomes look like? Where is the mean/median?

- So far, each missing value in a column has been filled in with a constant value.
  - This creates "spikes" in the imputed distributions.
- **Idea:** We can **probabilistically** impute missing data from a distribution.
  - We can fill in missing data by drawing from the distribution of the **non-missing** data.
  - There are 5 missing values? Pick 5 values from the data that aren't missing.
  - How? Using `.sample`.

### Probabilistic imputation in the `heights` dataset

Steps:

- Figure out the number of missing values.
- Sample that number of values from the observed dataset.
- Fill in the missing values with the sample from Step 2.

Step 1: Figure out the number of missing values.

In [33]:

```
num_null = heights_mcar['child'].isna().sum()
num_null
```

Out[33]:

467

Step 2: Sample that number of values from the observed dataset.

In [34]:

```
fill_values = heights_mcar.child.dropna().sample(num_null, replace=True)
fill_values
```

Out[34]:

253 73.0 640 66.0 50 70.5 288 67.0 27 64.0 ... 655 73.5 855 61.0 930 62.0 16 69.0 691 64.0 Name: child, Length: 467, dtype: float64

Step 3: Fill in the missing values with the sample from Step 2.

In [35]:

```
# Find the positions where values in heights_mcar are missing
fill_values.index = heights_mcar.loc[heights_mcar['child'].isna()].index
# Fill in the missing values
heights_mcar_dfilled = heights_mcar.fillna({'child': fill_values.to_dict()}) # fill the vals
```

Let's look at the results.

In [36]:

```
print(
'mean (original): %f' % heights['child'].mean(),
'mean (missing): %f' % heights_mcar['child'].mean(),
'mean (distr imp): %f' % heights_mcar_dfilled['child'].mean(),
sep='\n'
)
```

mean (original): 66.745931 mean (missing): 66.640685 mean (distr imp): 66.659529

In [37]:

```
print(
'std (original): %f' % heights['child'].std(),
'std (missing): %f' % heights_mcar['child'].std(),
'std (distr imp): %f' % heights_mcar_dfilled['child'].std(),
sep='\n'
)
```

std (original): 3.579251 std (missing): 3.563299 std (distr imp): 3.501084

Variance is preserved!

In [38]:

```
plt.hist([heights['child'], heights_mcar['child'], heights_mcar_dfilled['child']], density=True);
plt.legend(['full data','missing (mcar)', 'distr imputed']);
```

No spikes!

- With this technique, the missing values were filled in with observed values in the dataset.
- If a value was never observed in the dataset, it will never be used to fill in a missing value.
  - For instance, if the observed heights were 68, 69, and 69.5 inches, we will never fill a missing value with 68.5 inches even though it's a perfectly reasonable height.
- Solution? Create a histogram (with `np.histogram`) to bin the data, then sample from the histogram.
  - See Lab 5, Question 4.

**Question:** How would we generalize this process for MAR data?
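One answer, sketched on hypothetical data: since MAR data are MCAR within each group, sample from the observed values **within each group** using `groupby` and `transform` (the helper name `prob_impute` is illustrative):

```python
import numpy as np
import pandas as pd

def prob_impute(ser, seed=0):
    """Fill missing values in ser by sampling from its observed values."""
    rng = np.random.default_rng(seed)
    out = ser.copy()
    observed = out.dropna().to_numpy()
    # Draw one observed value per missing entry, with replacement
    out[out.isna()] = rng.choice(observed, size=out.isna().sum())
    return out

# Hypothetical MAR-on-'gender' data: impute 'child' within each 'gender' group
df = pd.DataFrame({
    'gender': ['f', 'f', 'f', 'm', 'm', 'm'],
    'child':  [62.0, np.nan, 63.0, 70.0, np.nan, 71.0],
})
filled = df.groupby('gender')['child'].transform(prob_impute)
```

Each missing `'female'` height is drawn only from observed `'female'` heights, and likewise for `'male'`.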

- Unlike mean imputation, probabilistic imputation is **random** – each time you run the cell in which imputation is performed, the results could be different.
- **Multiple imputation:** Generate multiple imputed datasets and aggregate the results!
  - Similar to bootstrapping.

Steps:

- Start with observed and incomplete data.
- Create several **imputed** versions of the data through a probabilistic procedure.
  - The imputed datasets are identical for the observed data entries.
  - They differ in the imputed values.
  - The differences reflect our **uncertainty** about what value to impute.
- Then, estimate the parameters of interest for **each** imputed dataset.
  - For instance, the mean, standard deviation, median, etc.
- Finally, pool the $m$ parameter estimates into one estimate.

Let's try this procedure out on the `heights_mcar` dataset.

In [39]:

```
heights_mcar.head()
```

Out[39]:

|   | family | father | mother | children | number | gender | child |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 78.5 | 67.0 | 4 | 1 | male | 73.2 |
| 1 | 1 | 78.5 | 67.0 | 4 | 2 | female | 69.2 |
| 2 | 1 | 78.5 | 67.0 | 4 | 3 | female | NaN |
| 3 | 1 | 78.5 | 67.0 | 4 | 4 | female | NaN |
| 4 | 2 | 75.5 | 66.5 | 4 | 1 | male | 73.5 |

In [40]:

```
# This function implements the 3-step process we studied earlier
def create_imputed(col):
num_null = col.isna().sum()
fill_values = col.dropna().sample(num_null, replace=True)
fill_values.index = col.loc[col.isna()].index
return col.fillna(fill_values.to_dict())
```

Each time we run the following cell, it generates a new imputed version of the `'child'` column.

In [41]:

```
create_imputed(heights_mcar['child']).head()
```

Out[41]:

0 73.2 1 69.2 2 62.0 3 65.0 4 73.5 Name: child, dtype: float64

Let's run the above procedure 100 times.

In [42]:

```
mult_imp = pd.concat([create_imputed(heights_mcar['child']).rename(k) for k in range(100)], axis=1)
mult_imp.head()
```

Out[42]:

|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | ... | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 | 73.2 |
| 1 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | ... | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 | 69.2 |
| 2 | 66.0 | 68.0 | 65.0 | 63.0 | 70.0 | 68.5 | 74.0 | 70.0 | 71.0 | 69.0 | ... | 68.7 | 68.0 | 70.0 | 62.0 | 65.0 | 67.0 | 65.5 | 71.0 | 70.2 | 72.7 |
| 3 | 70.5 | 64.0 | 62.0 | 70.0 | 69.0 | 63.0 | 64.5 | 63.0 | 64.5 | 66.5 | ... | 72.0 | 67.0 | 69.0 | 60.0 | 65.0 | 66.0 | 73.0 | 72.0 | 60.0 | 64.0 |
| 4 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | ... | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 | 73.5 |

5 rows × 100 columns

Let's plot some of the imputed columns above.

In [43]:

```
# Random sample of 15 imputed columns
mult_imp.sample(15, axis=1).plot(kind='kde', alpha=0.5, legend=False);
```
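The final, pooling step isn't shown above; on a synthetic stand-in for `mult_imp`, it might look like this:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
# Synthetic stand-in for mult_imp: 100 imputed versions of a 934-row column
mult_imp = pd.DataFrame(rng.normal(66.7, 3.6, size=(934, 100)))

# Estimate the parameter of interest (here, the mean) on each imputed dataset...
per_dataset_means = mult_imp.mean(axis=0)

# ...then pool the per-dataset estimates into a single estimate
pooled_mean = per_dataset_means.mean()
```

The spread of `per_dataset_means` also gives a sense of our uncertainty due to imputation.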