# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")  # Render plots as crisp SVG rather than raster images.
plt.style.use('ggplot')  # Consistent plot styling for the whole lecture.
# Abbreviate long arrays and round floats so printed output stays readable.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Animations
from IPython.display import display, IFrame
def show_permutation_testing_summary():
    """Embed the permutation-testing summary slide deck in the notebook."""
    deck_url = "https://docs.google.com/presentation/d/e/2PACX-1vSovXDonR6EmjrT45h4pY1mwmcKFMWVSdgpbKHC5HNTm9sbG7dojvvCDEQCjuk2dk1oA4gmwMogr8ZL/embed?start=false&loop=false&delayms=3000"
    # 960x569 matches the slide deck's aspect ratio.
    display(IFrame(deck_url, 960, 569))
def show_bootstrapping_slides():
    """Embed the bootstrapping slide deck in the notebook."""
    deck_url = "https://docs.google.com/presentation/d/e/2PACX-1vS_iYHJYXSVMMZ-YQVFwMEFR6EFN3FDSAvaMyUm-YJfLQgRMTHm3vI-wWJJ5999eFJq70nWp2hyItZg/embed?start=false&loop=false&delayms=3000"
    # 960x509 matches the slide deck's aspect ratio.
    display(IFrame(deck_url, 960, 509))
Permutation tests help answer questions of the form:
I have two samples, but no information about any population distributions. Do these samples look like they were drawn from the same population?
# Load the baby-weight data, keeping only the two columns the test needs.
babies = bpd.read_csv('data/baby.csv').get(['Maternal Smoker', 'Birth Weight'])
babies
Maternal Smoker | Birth Weight | |
---|---|---|
0 | False | 120 |
1 | False | 113 |
2 | True | 128 |
... | ... | ... |
1171 | True | 130 |
1172 | False | 125 |
1173 | False | 117 |
1174 rows × 2 columns
The means of the two groups in our sample are different.
babies.groupby('Maternal Smoker').mean()
Birth Weight | |
---|---|
Maternal Smoker | |
False | 123.09 |
True | 113.82 |
# Attach a randomly shuffled copy of the smoking labels as a new column;
# the shuffled labels define random groups that mimic the null hypothesis.
babies_with_shuffled = babies.assign(
    Shuffled_Labels=np.random.permutation(babies.get('Maternal Smoker'))
)
babies_with_shuffled
Maternal Smoker | Birth Weight | Shuffled_Labels | |
---|---|---|---|
0 | False | 120 | True |
1 | False | 113 | True |
2 | True | 128 | False |
... | ... | ... | ... |
1171 | True | 130 | False |
1172 | False | 125 | False |
1173 | False | 117 | False |
1174 rows × 3 columns
The 'Maternal Smoker' column defines the original groups. The 'Shuffled_Labels' column defines the random groups.
For the original groups:
# Group means using the ORIGINAL (unshuffled) labels.
original_groups = babies.groupby('Maternal Smoker').mean()
original_groups
Birth Weight | |
---|---|
Maternal Smoker | |
False | 123.09 |
True | 113.82 |
# Observed test statistic: non-smokers' mean birth weight minus smokers'.
original_means = original_groups.get('Birth Weight')
observed_difference = original_means.loc[False] - original_means.loc[True]
observed_difference
9.266142572024918
For the random groups:
def difference_in_group_means(weights_df):
    """Test statistic: mean 'Birth Weight' of the rows labeled False minus
    the mean 'Birth Weight' of the rows labeled True, grouping by the
    'Shuffled_Labels' column of weights_df."""
    means_by_label = weights_df.groupby('Shuffled_Labels').mean().get('Birth Weight')
    return means_by_label.loc[False] - means_by_label.loc[True]
# Shuffling the labels again.
babies_with_shuffled = babies.assign(Shuffled_Labels=np.random.permutation(babies.get('Maternal Smoker')))
# One simulated value of the test statistic under the null hypothesis.
difference_in_group_means(babies_with_shuffled)
-0.27796212502094875
n_repetitions = 500  # The dataset is large, so it takes too long to run if we use 5000 or 10000.

differences = np.array([])
for i in np.arange(n_repetitions):
    # Step 1: Shuffle the labels.
    shuffled_labels = np.random.permutation(babies.get('Maternal Smoker'))
    # Step 2: Put them in a DataFrame. Assign onto the original `babies`
    # frame (assigning onto `babies_with_shuffled` gave the same result,
    # since the column is overwritten, but obscured that only `babies`
    # is needed — this also matches the footballs example below).
    shuffled = babies.assign(Shuffled_Labels=shuffled_labels)
    # Step 3: Compute the difference in group means and add the result to the differences array.
    difference = difference_in_group_means(shuffled)
    differences = np.append(differences, difference)
differences
array([-0.53, 0.8 , -0.99, ..., 0.47, 0.15, 0.08])
# Histogram of the simulated differences: the empirical null distribution.
(bpd.DataFrame()
.assign(simulated_diffs=differences)
.plot(kind='hist', bins=20, density=True, ec='w', figsize=(10, 5))
);
# Same histogram, with the observed difference marked for comparison.
(bpd.DataFrame()
.assign(simulated_diffs=differences)
.plot(kind='hist', bins=20, density=True, ec='w', figsize=(10, 5))
);
plt.axvline(observed_difference, color='black', linewidth=4, label='observed difference in means')
plt.legend();
# p-value: the fraction of simulated differences at least as large as the
# observed one. Divide by n_repetitions rather than the hard-coded 500 so
# this stays correct if the number of repetitions changes (the later
# footballs example already does it this way).
smoker_p_value = np.count_nonzero(differences >= observed_difference) / n_repetitions
smoker_p_value
0.0
show_permutation_testing_summary()
Recall, babies
has two columns.
babies.take(np.arange(3))
Maternal Smoker | Birth Weight | |
---|---|---|
0 | False | 120 |
1 | False | 113 |
2 | True | 128 |
To randomly assign weights to groups, we shuffled 'Maternal Smoker'
column. Could we have shuffled the 'Birth Weight'
column instead?
# Load the football pressure data (Deflategate).
footballs = bpd.read_csv('data/footballs.csv')
footballs
Team | Pressure | PressureDrop | |
---|---|---|---|
0 | Patriots | 11.65 | 0.85 |
1 | Patriots | 11.03 | 1.48 |
2 | Patriots | 10.85 | 1.65 |
... | ... | ... | ... |
11 | Colts | 12.53 | 0.47 |
12 | Colts | 12.72 | 0.28 |
13 | Colts | 12.35 | 0.65 |
14 rows × 3 columns
The 'Pressure' column records the average of the two officials' measurements at halftime. The 'PressureDrop' column records the difference between the estimated starting pressure and the average recorded 'Pressure' of each football. Did the Patriots' footballs drop in pressure more than the Colts'?
Similar to the baby weights example, our test statistic will be the difference between the teams' average pressure drops. We'll calculate the mean drop for the 'Patriots'
minus the mean drop for the 'Colts'
.
# Mean pressure drop for each team.
means = footballs.groupby('Team').mean().get('PressureDrop')
means
Team Colts 0.47 Patriots 1.21 Name: PressureDrop, dtype: float64
# Calculate the observed statistic: Patriots' mean drop minus Colts'.
observed_difference = means.loc['Patriots'] - means.loc['Colts']
observed_difference
0.7362500000000001
The average pressure drop for the Patriots was about 0.74 psi more than the Colts.
We'll run a permutation test to see if 0.74 psi is a significant difference.
To run the permutation test, we can shuffle either the 'Team' column or the 'PressureDrop' column. This time, we'll shuffle the 'PressureDrop' column, and then repeat the process many times with a for-loop.
# For simplicity, keep only the columns that are necessary for the test:
# one column of group labels and one column of numerical values.
footballs = footballs.get(['Team', 'PressureDrop'])
footballs
Team | PressureDrop | |
---|---|---|
0 | Patriots | 0.85 |
1 | Patriots | 1.48 |
2 | Patriots | 1.65 |
... | ... | ... |
11 | Colts | 0.47 |
12 | Colts | 0.28 |
13 | Colts | 0.65 |
14 rows × 2 columns
# Shuffle one column.
# We chose to shuffle the numerical data (pressure drops), but we could have shuffled the group labels (team names) instead.
shuffled_drops = np.random.permutation(footballs.get('PressureDrop'))
shuffled_drops
array([0.47, 1.8 , 1.65, 1.23, 0.28, 0.42, 0.65, 1.18, 1.48, 0.72, 1.38, 1.35, 0.47, 0.85])
# Add the shuffled column back to the DataFrame.
# Each row keeps its team label but gets a randomly reassigned pressure drop.
shuffled = footballs.assign(Shuffled_Drops=shuffled_drops)
shuffled
Team | PressureDrop | Shuffled_Drops | |
---|---|---|---|
0 | Patriots | 0.85 | 0.47 |
1 | Patriots | 1.48 | 1.80 |
2 | Patriots | 1.65 | 1.65 |
... | ... | ... | ... |
11 | Colts | 0.47 | 1.35 |
12 | Colts | 0.28 | 0.47 |
13 | Colts | 0.65 | 0.85 |
14 rows × 3 columns
# Calculate the group means for the two randomly created groups.
team_means = shuffled.groupby('Team').mean().get('Shuffled_Drops')
team_means
Team Colts 1.01 Patriots 0.99 Name: Shuffled_Drops, dtype: float64
# Calculate the difference in group means (Patriots minus Colts) for the randomly created groups.
team_means.loc['Patriots'] - team_means.loc['Colts']
-0.02499999999999991
Next, we define a function that computes the test statistic, so we can call it repeatedly inside a for-loop.
def difference_in_mean_pressure_drops(pressures_df):
    """Test statistic: the Patriots' mean value of 'Shuffled_Drops' minus
    the Colts' mean value, grouping pressures_df by 'Team'."""
    drop_means = pressures_df.groupby('Team').mean().get('Shuffled_Drops')
    return drop_means.loc['Patriots'] - drop_means.loc['Colts']
n_repetitions = 5000  # The dataset is much smaller than in the baby weights example, so a larger number of repetitions will still run quickly.

differences = np.array([])
for i in np.arange(n_repetitions):
    # Step 1: Shuffle the pressure drops.
    shuffled_drops = np.random.permutation(footballs.get('PressureDrop'))
    # Step 2: Put them in a DataFrame.
    shuffled = footballs.assign(Shuffled_Drops=shuffled_drops)
    # Step 3: Compute the difference in group means and add the result to the differences array.
    difference = difference_in_mean_pressure_drops(shuffled)
    differences = np.append(differences, difference)
differences
array([-0.38, -0.22, 0.11, ..., -0.15, -0.02, -0.2 ])
# Histogram of the simulated differences, with the observed difference marked.
bpd.DataFrame().assign(SimulatedDifferenceInMeans=differences).plot(kind='hist', bins=20, density=True, ec='w', figsize=(10, 5))
plt.axvline(observed_difference, color='black', linewidth=4, label='observed difference in means')
plt.legend();
It doesn't look good for the Patriots. What is the p-value?
np.count_nonzero(differences >= observed_difference) / n_repetitions
0.0034
This p-value is low enough to consider this result to be highly statistically significant ($p<0.01$).
Quote from an investigative report commissioned by the NFL:
“[T]he average pressure drop of the Patriots game balls exceeded the average pressure drop of the Colts balls by 0.45 to 1.02 psi, depending on various possible assumptions regarding the gauges used, and assuming an initial pressure of 12.5 psi for the Patriots balls and 13.0 for the Colts balls.”
# Load the full 2021 San Diego city employee salary data (the population).
population = bpd.read_csv('data/2021_salaries.csv')
population
Year | EmployerType | EmployerName | DepartmentOrSubdivision | ... | EmployerCounty | SpecialDistrictActivities | IncludesUnfundedLiability | SpecialDistrictType | |
---|---|---|---|---|---|---|---|---|---|
0 | 2021 | City | San Diego | Police | ... | San Diego | NaN | False | NaN |
1 | 2021 | City | San Diego | Police | ... | San Diego | NaN | False | NaN |
2 | 2021 | City | San Diego | Police | ... | San Diego | NaN | False | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
12302 | 2021 | City | San Diego | Fire-Rescue | ... | San Diego | NaN | False | NaN |
12303 | 2021 | City | San Diego | Fleet Operations | ... | San Diego | NaN | False | NaN |
12304 | 2021 | City | San Diego | Fire-Rescue | ... | San Diego | NaN | False | NaN |
12305 rows × 29 columns
When you load in a dataset that has so many columns that you can't see them all, it's a good idea to look at the column names.
population.columns
Index(['Year', 'EmployerType', 'EmployerName', 'DepartmentOrSubdivision', 'Position', 'ElectedOfficial', 'Judicial', 'OtherPositions', 'MinPositionSalary', 'MaxPositionSalary', 'ReportedBaseWage', 'RegularPay', 'OvertimePay', 'LumpSumPay', 'OtherPay', 'TotalWages', 'DefinedBenefitPlanContribution', 'EmployeesRetirementCostCovered', 'DeferredCompensationPlan', 'HealthDentalVision', 'TotalRetirementAndHealthContribution', 'PensionFormula', 'EmployerURL', 'EmployerPopulation', 'LastUpdatedDate', 'EmployerCounty', 'SpecialDistrictActivities', 'IncludesUnfundedLiability', 'SpecialDistrictType'], dtype='object')
We only need the 'TotalWages'
column, so let's get
just that column.
# Keep only the 'TotalWages' column.
population = population.get(['TotalWages'])
population
TotalWages | |
---|---|
0 | 359138 |
1 | 345336 |
2 | 336250 |
... | ... |
12302 | 9 |
12303 | 9 |
12304 | 4 |
12305 rows × 1 columns
# Distribution of total wages across the full population.
population.plot(kind='hist', bins=np.arange(0, 400000, 10000), density=True, ec='w', figsize=(10, 5),
title='Distribution of Total Wages of San Diego City Employees in 2021');
Consider the question
What is the median salary of all San Diego city employees?
What is the right tool to answer this question?
We can use .median() to find the median salary of all city employees.
population_median = population.get('TotalWages').median()
population_median
74441.0
Let's survey 500 employees at random. To do so, we can use the .sample
method.
np.random.seed(38) # Magic to ensure that we get the same results every time this code is run.
# Take a sample of size 500.
my_sample = population.sample(500)
my_sample
TotalWages | |
---|---|
599 | 167191 |
10595 | 18598 |
837 | 157293 |
... | ... |
2423 | 122785 |
7142 | 62808 |
5792 | 78093 |
500 rows × 1 columns
We won't reassign my_sample
at any point in this notebook, so it will always refer to this particular sample.
# Compute the sample median — our estimate of the population median.
sample_median = my_sample.get('TotalWages').median()
sample_median
72016.0
# Draw 1000 independent samples of 500 from the population and record
# each sample's median 'TotalWages'. (In reality we couldn't do this —
# we'd only ever have one sample.)
sample_medians = np.array([])
for i in np.arange(1000):
    median = population.sample(500).get('TotalWages').median()
    sample_medians = np.append(sample_medians, median)
sample_medians
array([81062.5, 77915.5, 70419.5, ..., 71840. , 73618.5, 79238. ])
# Distribution of the sample median across the 1000 samples.
(bpd.DataFrame()
.assign(SampleMedians=sample_medians)
.plot(kind='hist', density=True,
bins=30, ec='w', figsize=(8, 5),
title='Distribution of the Sample Median of 1000 Samples from the Population\nSample Size = 500')
);
The distribution of our sample, my_sample, looks a lot like the distribution of the population.
fig, ax = plt.subplots(figsize=(10, 5))
bins=np.arange(10_000, 300_000, 10_000)
# Overlay the population and the single sample on shared axes, using
# the same bins so the two histograms are directly comparable.
population.plot(kind='hist', y='TotalWages', ax=ax, density=True, alpha=.75, bins=bins, ec='w')
my_sample.plot(kind='hist', y='TotalWages', ax=ax, density=True, alpha=.75, bins=bins, ec='w')
plt.legend(['Population', 'My Sample']);
Note that unlike the previous histogram we saw, this is depicting the distribution of the population and of one particular sample (my_sample
), not the distribution of sample medians for 1000 samples.
show_bootstrapping_slides()
original = [1, 2, 3]
for i in np.arange(10):
    # Sampling WITHOUT replacement just reorders the original list,
    # so every resample has the same median as the original.
    resample = np.random.choice(original, 3, replace=False)
    print("Resample: ", resample, " Median: ", np.median(resample))
Resample: [1 3 2] Median: 2.0 Resample: [1 3 2] Median: 2.0 Resample: [1 3 2] Median: 2.0 Resample: [2 3 1] Median: 2.0 Resample: [1 2 3] Median: 2.0 Resample: [3 2 1] Median: 2.0 Resample: [1 2 3] Median: 2.0 Resample: [3 1 2] Median: 2.0 Resample: [1 3 2] Median: 2.0 Resample: [2 1 3] Median: 2.0
original = [1, 2, 3]
for i in np.arange(10):
    # Sampling WITH replacement can repeat elements, so a resample's
    # median (and mean, max, min) can differ from the original's.
    resample = np.random.choice(original, 3, replace=True)
    print("Resample: ", resample, " Median: ", np.median(resample))
Resample: [2 3 3] Median: 3.0 Resample: [3 1 3] Median: 3.0 Resample: [2 2 3] Median: 2.0 Resample: [2 3 1] Median: 2.0 Resample: [3 3 3] Median: 3.0 Resample: [1 3 2] Median: 2.0 Resample: [1 2 1] Median: 1.0 Resample: [3 3 2] Median: 3.0 Resample: [3 3 1] Median: 3.0 Resample: [1 1 3] Median: 1.0
When we resample without replacement, resamples look just like the original samples.
When we resample with replacement, resamples can have a different mean, median, max, and min than the original sample.
We can simulate the act of collecting new samples by sampling with replacement from our original sample, my_sample
.
# Note that the population DataFrame, population, doesn't appear anywhere here.
# This is all based on one sample, my_sample.
n_resamples = 5000
boot_medians = np.array([])
for i in range(n_resamples):
    # Resample from my_sample WITH REPLACEMENT.
    resample = my_sample.sample(500, replace=True)
    # Compute the median.
    median = resample.get('TotalWages').median()
    # Store it in our array of medians.
    boot_medians = np.append(boot_medians, median)
boot_medians
# Histogram of the 5000 bootstrapped medians, with the true population median marked.
bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(kind='hist', density=True, bins=np.arange(60000, 85000, 1000), ec='w', figsize=(10, 5))
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median').set_zorder(2)
plt.legend();
The population median (blue dot) is near the middle.
In reality, we'd never get to see this!
We have a sample median wage:
my_sample.get('TotalWages').median()
72016.0
With it, we can say that the population median wage is approximately \$72,016, and not much else.
But by bootstrapping our one sample, we can generate an empirical distribution of the sample median:
# The bootstrapped distribution of the sample median, on its own.
(bpd.DataFrame()
.assign(BootstrapMedians=boot_medians)
.plot(kind='hist', density=True, bins=np.arange(60000, 85000, 1000), ec='w', figsize=(10, 5))
)
plt.legend();
which allows us to say things like
We think the population median wage is between \$67,000 and \\$77,000.
Question: We could also say that we think the population median wage is between \$70,000 and \\$75,000, or between \$60,000 and \\$80,000. What range should we pick?