# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")  # Render plots as crisp SVG rather than raster images.
plt.style.use('ggplot')  # Consistent plot styling for the whole lecture.
# Abbreviate long arrays and round floats so printed output stays readable.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Animations
from IPython.display import display, IFrame
def show_permutation_testing_summary():
    """Embed the permutation-testing summary slide deck in the notebook."""
    deck_url = "https://docs.google.com/presentation/d/e/2PACX-1vSovXDonR6EmjrT45h4pY1mwmcKFMWVSdgpbKHC5HNTm9sbG7dojvvCDEQCjuk2dk1oA4gmwMogr8ZL/embed?start=false&loop=false&delayms=3000"
    # 960x569 matches the slide deck's aspect ratio.
    display(IFrame(deck_url, 960, 569))
def show_bootstrapping_slides():
    """Embed the bootstrapping slide deck in the notebook."""
    deck_url = "https://docs.google.com/presentation/d/e/2PACX-1vS_iYHJYXSVMMZ-YQVFwMEFR6EFN3FDSAvaMyUm-YJfLQgRMTHm3vI-wWJJ5999eFJq70nWp2hyItZg/embed?start=false&loop=false&delayms=3000"
    # 960x509 matches the slide deck's aspect ratio.
    display(IFrame(deck_url, 960, 509))
Permutation tests help answer questions of the form:
I have two samples, but no information about any population distributions. Do these samples look like they were drawn from the same population?
# Load the baby-weight data, keeping only the two columns the test needs.
babies = bpd.read_csv('data/baby.csv').get(['Maternal Smoker', 'Birth Weight'])
babies
Maternal Smoker | Birth Weight | |
---|---|---|
0 | False | 120 |
1 | False | 113 |
2 | True | 128 |
... | ... | ... |
1171 | True | 130 |
1172 | False | 125 |
1173 | False | 117 |
1174 rows × 2 columns
The means of the two groups in our sample are different.
babies.groupby('Maternal Smoker').mean()
Birth Weight | |
---|---|
Maternal Smoker | |
False | 123.09 |
True | 113.82 |
# Attach a randomly shuffled copy of the smoking labels as a new column;
# the shuffled labels define random groups that mimic the null hypothesis.
babies_with_shuffled = babies.assign(
    Shuffled_Labels=np.random.permutation(babies.get('Maternal Smoker'))
)
babies_with_shuffled
Maternal Smoker | Birth Weight | Shuffled_Labels | |
---|---|---|---|
0 | False | 120 | True |
1 | False | 113 | True |
2 | True | 128 | False |
... | ... | ... | ... |
1171 | True | 130 | False |
1172 | False | 125 | False |
1173 | False | 117 | False |
1174 rows × 3 columns
The 'Maternal Smoker' column defines the original groups. The 'Shuffled_Labels' column defines the random groups.
For the original groups:
# Group means using the ORIGINAL (unshuffled) labels.
original_groups = babies.groupby('Maternal Smoker').mean()
original_groups
Birth Weight | |
---|---|
Maternal Smoker | |
False | 123.09 |
True | 113.82 |
# Observed test statistic: non-smokers' mean birth weight minus smokers'.
original_means = original_groups.get('Birth Weight')
observed_difference = original_means.loc[False] - original_means.loc[True]
observed_difference
9.266142572024918
For the random groups:
def difference_in_group_means(weights_df):
    """Test statistic: mean 'Birth Weight' of the rows labeled False minus
    the mean 'Birth Weight' of the rows labeled True, grouping by the
    'Shuffled_Labels' column of weights_df."""
    means_by_label = weights_df.groupby('Shuffled_Labels').mean().get('Birth Weight')
    return means_by_label.loc[False] - means_by_label.loc[True]
# Shuffling the labels again.
babies_with_shuffled = babies.assign(Shuffled_Labels=np.random.permutation(babies.get('Maternal Smoker')))
# One simulated value of the test statistic under the null hypothesis.
difference_in_group_means(babies_with_shuffled)
-0.27796212502094875
n_repetitions = 500  # The dataset is large, so it takes too long to run if we use 5000 or 10000.

differences = np.array([])
for i in np.arange(n_repetitions):
    # Step 1: Shuffle the labels.
    shuffled_labels = np.random.permutation(babies.get('Maternal Smoker'))
    # Step 2: Put them in a DataFrame. Assign onto the original `babies`
    # frame (assigning onto `babies_with_shuffled` gave the same result,
    # since the column is overwritten, but obscured that only `babies`
    # is needed — this also matches the footballs example below).
    shuffled = babies.assign(Shuffled_Labels=shuffled_labels)
    # Step 3: Compute the difference in group means and add the result to the differences array.
    difference = difference_in_group_means(shuffled)
    differences = np.append(differences, difference)
differences
array([-0.53, 0.8 , -0.99, ..., 0.47, 0.15, 0.08])
# Histogram of the simulated differences: the empirical null distribution.
(bpd.DataFrame()
.assign(simulated_diffs=differences)
.plot(kind='hist', bins=20, density=True, ec='w', figsize=(10, 5))
);
# Same histogram, with the observed difference marked for comparison.
(bpd.DataFrame()
.assign(simulated_diffs=differences)
.plot(kind='hist', bins=20, density=True, ec='w', figsize=(10, 5))
);
plt.axvline(observed_difference, color='black', linewidth=4, label='observed difference in means')
plt.legend();
# p-value: the fraction of simulated differences at least as large as the
# observed one. Divide by n_repetitions rather than the hard-coded 500 so
# this stays correct if the number of repetitions changes (the later
# footballs example already does it this way).
smoker_p_value = np.count_nonzero(differences >= observed_difference) / n_repetitions
smoker_p_value
0.0
show_permutation_testing_summary()
Recall, babies
has two columns.
babies.take(np.arange(3))
Maternal Smoker | Birth Weight | |
---|---|---|
0 | False | 120 |
1 | False | 113 |
2 | True | 128 |
To randomly assign weights to groups, we shuffled 'Maternal Smoker'
column. Could we have shuffled the 'Birth Weight'
column instead?
# Load the football pressure data (Deflategate).
footballs = bpd.read_csv('data/footballs.csv')
footballs
Team | Pressure | PressureDrop | |
---|---|---|---|
0 | Patriots | 11.65 | 0.85 |
1 | Patriots | 11.03 | 1.48 |
2 | Patriots | 10.85 | 1.65 |
... | ... | ... | ... |
11 | Colts | 12.53 | 0.47 |
12 | Colts | 12.72 | 0.28 |
13 | Colts | 12.35 | 0.65 |
14 rows × 3 columns
The 'Pressure' column records the average of the two officials' measurements at halftime. The 'PressureDrop' column records the difference between the estimated starting pressure and the average recorded 'Pressure' of each football. Did the Patriots' footballs drop in pressure more than the Colts'?
Similar to the baby weights example, our test statistic will be the difference between the teams' average pressure drops. We'll calculate the mean drop for the 'Patriots'
minus the mean drop for the 'Colts'
.
# Mean pressure drop for each team.
means = footballs.groupby('Team').mean().get('PressureDrop')
means
Team Colts 0.47 Patriots 1.21 Name: PressureDrop, dtype: float64
# Calculate the observed statistic: Patriots' mean drop minus Colts'.
observed_difference = means.loc['Patriots'] - means.loc['Colts']
observed_difference
0.7362500000000001
The average pressure drop for the Patriots was about 0.74 psi more than the Colts.
We'll run a permutation test to see if 0.74 psi is a significant difference.
To run the permutation test, we can shuffle either the 'Team' column or the 'PressureDrop' column. This time, we'll shuffle the 'PressureDrop' column, and then repeat the process many times with a for-loop.
# For simplicity, keep only the columns that are necessary for the test:
# one column of group labels and one column of numerical values.
footballs = footballs.get(['Team', 'PressureDrop'])
footballs
Team | PressureDrop | |
---|---|---|
0 | Patriots | 0.85 |
1 | Patriots | 1.48 |
2 | Patriots | 1.65 |
... | ... | ... |
11 | Colts | 0.47 |
12 | Colts | 0.28 |
13 | Colts | 0.65 |
14 rows × 2 columns
# Shuffle one column.
# We chose to shuffle the numerical data (pressure drops), but we could have shuffled the group labels (team names) instead.
shuffled_drops = np.random.permutation(footballs.get('PressureDrop'))
shuffled_drops
array([0.47, 1.8 , 1.65, 1.23, 0.28, 0.42, 0.65, 1.18, 1.48, 0.72, 1.38, 1.35, 0.47, 0.85])
# Add the shuffled column back to the DataFrame.
# Each row keeps its team label but gets a randomly reassigned pressure drop.
shuffled = footballs.assign(Shuffled_Drops=shuffled_drops)
shuffled
Team | PressureDrop | Shuffled_Drops | |
---|---|---|---|
0 | Patriots | 0.85 | 0.47 |
1 | Patriots | 1.48 | 1.80 |
2 | Patriots | 1.65 | 1.65 |
... | ... | ... | ... |
11 | Colts | 0.47 | 1.35 |
12 | Colts | 0.28 | 0.47 |
13 | Colts | 0.65 | 0.85 |
14 rows × 3 columns
# Calculate the group means for the two randomly created groups.
team_means = shuffled.groupby('Team').mean().get('Shuffled_Drops')
team_means
Team Colts 1.01 Patriots 0.99 Name: Shuffled_Drops, dtype: float64
# Calculate the difference in group means (Patriots minus Colts) for the randomly created groups.
team_means.loc['Patriots'] - team_means.loc['Colts']
-0.02499999999999991
Next, we define a function that computes the test statistic, so we can call it repeatedly inside a for-loop.
def difference_in_mean_pressure_drops(pressures_df):
    """Test statistic: the Patriots' mean value of 'Shuffled_Drops' minus
    the Colts' mean value, grouping pressures_df by 'Team'."""
    drop_means = pressures_df.groupby('Team').mean().get('Shuffled_Drops')
    return drop_means.loc['Patriots'] - drop_means.loc['Colts']
n_repetitions = 5000  # The dataset is much smaller than in the baby weights example, so a larger number of repetitions will still run quickly.

differences = np.array([])
for i in np.arange(n_repetitions):
    # Step 1: Shuffle the pressure drops.
    shuffled_drops = np.random.permutation(footballs.get('PressureDrop'))
    # Step 2: Put them in a DataFrame.
    shuffled = footballs.assign(Shuffled_Drops=shuffled_drops)
    # Step 3: Compute the difference in group means and add the result to the differences array.
    difference = difference_in_mean_pressure_drops(shuffled)
    differences = np.append(differences, difference)
differences
array([-0.38, -0.22, 0.11, ..., -0.15, -0.02, -0.2 ])
# Histogram of the simulated differences, with the observed difference marked.
bpd.DataFrame().assign(SimulatedDifferenceInMeans=differences).plot(kind='hist', bins=20, density=True, ec='w', figsize=(10, 5))
plt.axvline(observed_difference, color='black', linewidth=4, label='observed difference in means')
plt.legend();
It doesn't look good for the Patriots. What is the p-value?
np.count_nonzero(differences >= observed_difference) / n_repetitions
0.0034
This p-value is low enough to consider this result to be highly statistically significant ($p<0.01$).
Quote from an investigative report commissioned by the NFL:
“[T]he average pressure drop of the Patriots game balls exceeded the average pressure drop of the Colts balls by 0.45 to 1.02 psi, depending on various possible assumptions regarding the gauges used, and assuming an initial pressure of 12.5 psi for the Patriots balls and 13.0 for the Colts balls.”
# Load the full 2021 San Diego city employee salary data (the population).
population = bpd.read_csv('data/2021_salaries.csv')
population
Year | EmployerType | EmployerName | DepartmentOrSubdivision | ... | EmployerCounty | SpecialDistrictActivities | IncludesUnfundedLiability | SpecialDistrictType | |
---|---|---|---|---|---|---|---|---|---|
0 | 2021 | City | San Diego | Police | ... | San Diego | NaN | False | NaN |
1 | 2021 | City | San Diego | Police | ... | San Diego | NaN | False | NaN |
2 | 2021 | City | San Diego | Police | ... | San Diego | NaN | False | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
12302 | 2021 | City | San Diego | Fire-Rescue | ... | San Diego | NaN | False | NaN |
12303 | 2021 | City | San Diego | Fleet Operations | ... | San Diego | NaN | False | NaN |
12304 | 2021 | City | San Diego | Fire-Rescue | ... | San Diego | NaN | False | NaN |
12305 rows × 29 columns
When you load in a dataset that has so many columns that you can't see them all, it's a good idea to look at the column names.
population.columns
Index(['Year', 'EmployerType', 'EmployerName', 'DepartmentOrSubdivision', 'Position', 'ElectedOfficial', 'Judicial', 'OtherPositions', 'MinPositionSalary', 'MaxPositionSalary', 'ReportedBaseWage', 'RegularPay', 'OvertimePay', 'LumpSumPay', 'OtherPay', 'TotalWages', 'DefinedBenefitPlanContribution', 'EmployeesRetirementCostCovered', 'DeferredCompensationPlan', 'HealthDentalVision', 'TotalRetirementAndHealthContribution', 'PensionFormula', 'EmployerURL', 'EmployerPopulation', 'LastUpdatedDate', 'EmployerCounty', 'SpecialDistrictActivities', 'IncludesUnfundedLiability', 'SpecialDistrictType'], dtype='object')
We only need the 'TotalWages'
column, so let's get
just that column.
# Keep only the 'TotalWages' column.
population = population.get(['TotalWages'])
population
TotalWages | |
---|---|
0 | 359138 |
1 | 345336 |
2 | 336250 |
... | ... |
12302 | 9 |
12303 | 9 |
12304 | 4 |
12305 rows × 1 columns
# Distribution of total wages across the full population.
population.plot(kind='hist', bins=np.arange(0, 400000, 10000), density=True, ec='w', figsize=(10, 5),
title='Distribution of Total Wages of San Diego City Employees in 2021');
Consider the question
What is the median salary of all San Diego city employees?
What is the right tool to answer this question?
We can use .median() to find the median salary of all city employees.
population_median = population.get('TotalWages').median()
population_median
74441.0
Let's survey 500 employees at random. To do so, we can use the .sample
method.
np.random.seed(38) # Magic to ensure that we get the same results every time this code is run.
# Take a sample of size 500.
my_sample = population.sample(500)
my_sample
TotalWages | |
---|---|
599 | 167191 |
10595 | 18598 |
837 | 157293 |
... | ... |
2423 | 122785 |
7142 | 62808 |
5792 | 78093 |
500 rows × 1 columns
We won't reassign my_sample
at any point in this notebook, so it will always refer to this particular sample.
# Compute the sample median — our estimate of the population median.
sample_median = my_sample.get('TotalWages').median()
sample_median
72016.0
# Draw 1000 independent samples of 500 from the population and record
# each sample's median 'TotalWages'. (In reality we couldn't do this —
# we'd only ever have one sample.)
sample_medians = np.array([])
for i in np.arange(1000):
    median = population.sample(500).get('TotalWages').median()
    sample_medians = np.append(sample_medians, median)
sample_medians
array([81062.5, 77915.5, 70419.5, ..., 71840. , 73618.5, 79238. ])
# Distribution of the sample median across the 1000 samples.
(bpd.DataFrame()
.assign(SampleMedians=sample_medians)
.plot(kind='hist', density=True,
bins=30, ec='w', figsize=(8, 5),
title='Distribution of the Sample Median of 1000 Samples from the Population\nSample Size = 500')
);
The distribution of our sample, my_sample, looks a lot like the distribution of the population.
fig, ax = plt.subplots(figsize=(10, 5))
bins=np.arange(10_000, 300_000, 10_000)
# Overlay the population and the single sample on shared axes, using
# the same bins so the two histograms are directly comparable.
population.plot(kind='hist', y='TotalWages', ax=ax, density=True, alpha=.75, bins=bins, ec='w')
my_sample.plot(kind='hist', y='TotalWages', ax=ax, density=True, alpha=.75, bins=bins, ec='w')
plt.legend(['Population', 'My Sample']);
Note that unlike the previous histogram we saw, this is depicting the distribution of the population and of one particular sample (my_sample
), not the distribution of sample medians for 1000 samples.
show_bootstrapping_slides()
original = [1, 2, 3]
for i in np.arange(10):
    # Sampling WITHOUT replacement just reorders the original list,
    # so every resample has the same median as the original.
    resample = np.random.choice(original, 3, replace=False)
    print("Resample: ", resample, " Median: ", np.median(resample))
Resample: [1 3 2] Median: 2.0 Resample: [1 3 2] Median: 2.0 Resample: [1 3 2] Median: 2.0 Resample: [2 3 1] Median: 2.0 Resample: [1 2 3] Median: 2.0 Resample: [3 2 1] Median: 2.0 Resample: [1 2 3] Median: 2.0 Resample: [3 1 2] Median: 2.0 Resample: [1 3 2] Median: 2.0 Resample: [2 1 3] Median: 2.0
original = [1, 2, 3]
for i in np.arange(10):
    # Sampling WITH replacement can repeat elements, so a resample's
    # median (and mean, max, min) can differ from the original's.
    resample = np.random.choice(original, 3, replace=True)
    print("Resample: ", resample, " Median: ", np.median(resample))
Resample: [2 3 3] Median: 3.0 Resample: [3 1 3] Median: 3.0 Resample: [2 2 3] Median: 2.0 Resample: [2 3 1] Median: 2.0 Resample: [3 3 3] Median: 3.0 Resample: [1 3 2] Median: 2.0 Resample: [1 2 1] Median: 1.0 Resample: [3 3 2] Median: 3.0 Resample: [3 3 1] Median: 3.0 Resample: [1 1 3] Median: 1.0
When we resample without replacement, resamples look just like the original samples.
When we resample with replacement, resamples can have a different mean, median, max, and min than the original sample.
We can simulate the act of collecting new samples by sampling with replacement from our original sample, my_sample
.
# Note that the population DataFrame, population, doesn't appear anywhere here.
# This is all based on one sample, my_sample.
n_resamples = 5000
boot_medians = np.array([])
for i in range(n_resamples):
    # Resample from my_sample WITH REPLACEMENT.
    resample = my_sample.sample(500, replace=True)
    # Compute the median.
    median = resample.get('TotalWages').median()
    # Store it in our array of medians.
    boot_medians = np.append(boot_medians, median)
boot_medians
# Histogram of the 5000 bootstrapped medians, with the true population median marked.
bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(kind='hist', density=True, bins=np.arange(60000, 85000, 1000), ec='w', figsize=(10, 5))
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median').set_zorder(2)
plt.legend();
The population median (blue dot) is near the middle.
In reality, we'd never get to see this!
We have a sample median wage:
my_sample.get('TotalWages').median()
72016.0
With it, we can say that the population median wage is approximately \$72,016, and not much else.
But by bootstrapping our one sample, we can generate an empirical distribution of the sample median:
# The bootstrapped distribution of the sample median, on its own.
(bpd.DataFrame()
.assign(BootstrapMedians=boot_medians)
.plot(kind='hist', density=True, bins=np.arange(60000, 85000, 1000), ec='w', figsize=(10, 5))
)
plt.legend();
which allows us to say things like
We think the population median wage is between \$67,000 and \\$77,000.
Question: We could also say that we think the population median wage is between \$70,000 and \\$75,000, or between \$60,000 and \\$80,000. What range should we pick?