# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("png")
plt.style.use('ggplot')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
from IPython.display import display, IFrame, HTML
def show_permutation_testing_intro():
src="https://docs.google.com/presentation/d/e/2PACX-1vT3IfZAbqXtscEPu-nTl6lWZcXh6AWfjKsXZpWDNc0UhueXsOYQ7ivShlwbn-PW1EZm7CunTLtq7rmt/embed?start=false&loop=false&delayms=60000&rm=minimal"
width = 965
height = 635
display(IFrame(src, width, height))
def show_permutation_testing_summary():
src = "https://docs.google.com/presentation/d/e/2PACX-1vSovXDonR6EmjrT45h4pY1mwmcKFMWVSdgpbKHC5HNTm9sbG7dojvvCDEQCjuk2dk1oA4gmwMogr8ZL/embed?start=false&loop=false&delayms=60000&rm=minimal"
width = 960
height = 569
display(IFrame(src, width, height))
I have a population distribution, and I have one sample. Does this sample look like it was drawn from the population?
I have two samples, but no information about any population distributions. Do these samples look like they were drawn from the same population?
babies = bpd.read_csv('data/baby.csv').get(['Maternal Smoker', 'Birth Weight'])
babies
Maternal Smoker | Birth Weight | |
---|---|---|
0 | False | 120 |
1 | False | 113 |
2 | True | 128 |
... | ... | ... |
1171 | True | 130 |
1172 | False | 125 |
1173 | False | 117 |
1174 rows × 2 columns
babies.groupby('Maternal Smoker').mean()
Birth Weight | |
---|---|
Maternal Smoker | |
False | 123.09 |
True | 113.82 |
diff_in_means = (babies.groupby('Maternal Smoker').mean().get('Birth Weight').loc[False] -
babies.groupby('Maternal Smoker').mean().get('Birth Weight').loc[True])
diff_in_means
9.266142572024918
show_permutation_testing_intro()
'BAC'
and 'CAB'
are both permutations of the string 'ABC'
.'Maternal Smoker'
column of babies
!A permutation test is a type of A/B test (and a type of hypothesis test). It tests whether two samples come from the same population distribution. To conduct a permutation test:
True
s and False
s) to generate two new samples under the null. These two new samples have the same sizes as the original samples.'Maternal Smoker'
column in the babies
DataFrame.df.sample
returns a random sample of the rows in a DataFrame, but we want to shuffle one column independently.data = bpd.DataFrame().assign(x=['a', 'b', 'c', 'd', 'e'], y=[1, 2, 3, 4, 5])
data
x | y | |
---|---|---|
0 | a | 1 |
1 | b | 2 |
2 | c | 3 |
3 | d | 4 |
4 | e | 5 |
# The order of the rows are different,
# but each x is still in a row with the same y.
# This is NOT what we want.
data.sample(data.shape[0])
x | y | |
---|---|---|
4 | e | 5 |
2 | c | 3 |
3 | d | 4 |
0 | a | 1 |
1 | b | 2 |
np.random.permutation
, which takes in a sequence and returns a shuffled version of it, as an array.# Random!
np.random.permutation(data.get('x'))
array(['c', 'a', 'b', 'e', 'd'], dtype=object)
data.assign(shuffled_x=np.random.permutation(data.get('x')))
x | y | shuffled_x | |
---|---|---|---|
0 | a | 1 | d |
1 | b | 2 | c |
2 | c | 3 | b |
3 | d | 4 | a |
4 | e | 5 | e |
As mentioned before, we'll shuffle the 'Maternal Smoker'
column.
babies_with_shuffled = babies.assign(
Shuffled_Labels=np.random.permutation(babies.get('Maternal Smoker'))
)
babies_with_shuffled
Maternal Smoker | Birth Weight | Shuffled_Labels | |
---|---|---|---|
0 | False | 120 | True |
1 | False | 113 | False |
2 | True | 128 | False |
... | ... | ... | ... |
1171 | True | 130 | True |
1172 | False | 125 | False |
1173 | False | 117 | False |
1174 rows × 3 columns
Let's look at the distributions of the two new samples we just generated.
fig, ax = plt.subplots()
baby_bins = np.arange(50, 200, 5)
smokers = babies_with_shuffled[babies_with_shuffled.get('Shuffled_Labels')]
non_smokers = babies_with_shuffled[babies_with_shuffled.get('Shuffled_Labels') == False]
non_smokers.plot(kind='hist', y='Birth Weight', density=True, ax=ax, alpha=0.75, bins=baby_bins, ec='w', figsize=(10, 5))
smokers.plot(kind='hist',y='Birth Weight', density=True, ax=ax, alpha=0.75, bins=baby_bins)
plt.legend(['Maternal Smoker = False', 'Maternal Smoker = True'])
plt.xlabel('Birth Weight');
What do you notice? 👀
babies_with_shuffled.groupby('Shuffled_Labels').mean().get(['Birth Weight'])
Birth Weight | |
---|---|
Shuffled_Labels | |
False | 119.01 |
True | 120.16 |
group_means = babies_with_shuffled.groupby('Shuffled_Labels').mean().get('Birth Weight')
group_means.loc[False] - group_means.loc[True]
-1.1472340295869685
This is the test statistic for one experiment (one "shuffle"). Let's write a function that can compute this test statistic for any shuffle.
def difference_in_group_means(weights_df):
group_means = weights_df.groupby('Shuffled_Labels').mean().get('Birth Weight')
return group_means.loc[False] - group_means.loc[True]
difference_in_group_means(babies_with_shuffled)
-1.1472340295869685
n_repetitions = 500 # The dataset is large, so it takes too long to run if we use 5000 or 10000
differences = np.array([])
for i in np.arange(n_repetitions):
# Step 1: Shuffle the labels to create two new samples.
shuffled_labels = np.random.permutation(babies.get('Maternal Smoker'))
# Step 2: Add them as a column to the DataFrame.
shuffled = babies_with_shuffled.assign(Shuffled_Labels=shuffled_labels)
# Step 3: Compute the difference in group means in the two new samples and store the result.
difference = difference_in_group_means(shuffled)
differences = np.append(differences, difference)
differences
array([ 1.53, 0.77, 1.67, ..., -0.17, -0.11, 0.12])
(bpd.DataFrame()
.assign(simulated_diffs=differences)
.plot(kind='hist', bins=20, density=True, ec='w', figsize=(10, 5))
);
Where does our observed statistic lie?
(bpd.DataFrame()
.assign(simulated_diffs=differences)
.plot(kind='hist', bins=20, density=True, ec='w', figsize=(10, 5))
);
plt.axvline(diff_in_means, color='black', linewidth=4, label='observed difference in means')
plt.legend();
smoker_p_value = np.count_nonzero(differences >= diff_in_means) / n_repetitions
smoker_p_value
0.0
show_permutation_testing_summary()
Recall, babies
has two columns.
babies.take(np.arange(3))
Maternal Smoker | Birth Weight | |
---|---|---|
0 | False | 120 |
1 | False | 113 |
2 | True | 128 |
To randomly assign weights to groups, we shuffled 'Maternal Smoker'
column. Could we have shuffled the 'Birth Weight'
column instead?
footballs = bpd.read_csv('data/footballs.csv')
footballs
Team | Pressure | PressureDrop | |
---|---|---|---|
0 | Patriots | 11.65 | 0.85 |
1 | Patriots | 11.03 | 1.48 |
2 | Patriots | 10.85 | 1.65 |
... | ... | ... | ... |
11 | Colts | 12.53 | 0.47 |
12 | Colts | 12.72 | 0.28 |
13 | Colts | 12.35 | 0.65 |
14 rows × 3 columns
'Pressure'
column records the average of the two officials' measurements at halftime.'PressureDrop'
column records the difference between the estimated starting pressure and the average recorded 'Pressure'
of each football.Did the Patriots' footballs drop in pressure more than the Colts'?
Similar to the baby weights example, our test statistic will be the difference between the teams' average pressure drops. We'll calculate the mean drop for the 'Patriots'
minus the mean drop for the 'Colts'
.
means = footballs.groupby('Team').mean().get('PressureDrop')
means
Team Colts 0.47 Patriots 1.21 Name: PressureDrop, dtype: float64
# Calculate the observed statistic.
observed_difference = means.loc['Patriots'] - means.loc['Colts']
observed_difference
0.7362500000000001
The average pressure drop for the Patriots was about 0.74 psi more than the Colts.
We'll run a permutation test to see if 0.74 psi is a significant difference.
'Team'
or the 'PressureDrop'
column.'PressureDrop'
column.for
-loop.# For simplicity, keep only the columns that are necessary for the test:
# one column of group labels and one column of numerical values.
footballs = footballs.get(['Team', 'PressureDrop'])
footballs
Team | PressureDrop | |
---|---|---|
0 | Patriots | 0.85 |
1 | Patriots | 1.48 |
2 | Patriots | 1.65 |
... | ... | ... |
11 | Colts | 0.47 |
12 | Colts | 0.28 |
13 | Colts | 0.65 |
14 rows × 2 columns
# Shuffle one column.
# We chose to shuffle the numerical data (pressure drops), but we could have shuffled the group labels (team names) instead.
shuffled_drops = np.random.permutation(footballs.get('PressureDrop'))
shuffled_drops
array([1.18, 0.28, 0.47, 0.47, 1.38, 1.8 , 0.72, 1.48, 1.35, 0.42, 1.23, 0.85, 1.65, 0.65])
# Add the shuffled column back to the DataFrame.
shuffled = footballs.assign(Shuffled_Drops=shuffled_drops)
shuffled
Team | PressureDrop | Shuffled_Drops | |
---|---|---|---|
0 | Patriots | 0.85 | 1.18 |
1 | Patriots | 1.48 | 0.28 |
2 | Patriots | 1.65 | 0.47 |
... | ... | ... | ... |
11 | Colts | 0.47 | 0.85 |
12 | Colts | 0.28 | 1.65 |
13 | Colts | 0.65 | 0.65 |
14 rows × 3 columns
# Calculate the group means for the two randomly created groups.
team_means = shuffled.groupby('Team').mean().get('Shuffled_Drops')
team_means
Team Colts 1.09 Patriots 0.96 Name: Shuffled_Drops, dtype: float64
# Calcuate the difference in group means (Patriots minus Colts) for the randomly created groups.
team_means.loc['Patriots'] - team_means.loc['Colts']
-0.13874999999999993
for
-loop.def difference_in_mean_pressure_drops(pressures_df):
team_means = pressures_df.groupby('Team').mean().get('Shuffled_Drops')
return team_means.loc['Patriots'] - team_means.loc['Colts']
n_repetitions = 5000 # The dataset is much smaller than in the baby weights example, so a larger number of repetitions will still run quickly.
differences = np.array([])
for i in np.arange(n_repetitions):
# Step 1: Shuffle the pressure drops.
shuffled_drops = np.random.permutation(footballs.get('PressureDrop'))
# Step 2: Put them in a DataFrame.
shuffled = footballs.assign(Shuffled_Drops=shuffled_drops)
# Step 3: Compute the difference in group means and add the result to the differences array.
difference = difference_in_mean_pressure_drops(shuffled)
differences = np.append(differences, difference)
differences
array([-0.12, 0.82, 0.68, ..., -0.2 , -0.39, 0.12])
bpd.DataFrame().assign(SimulatedDifferenceInMeans=differences).plot(kind='hist', bins=20, density=True, ec='w', figsize=(10, 5))
plt.axvline(observed_difference, color='black', linewidth=4, label='observed difference in means')
plt.legend();
It doesn't look good for the Patriots. What is the p-value?
np.count_nonzero(differences >= observed_difference) / n_repetitions
0.0046
This p-value is low enough to consider this result to be highly statistically significant ($p<0.01$).
Quote from an investigative report commissioned by the NFL:
“[T]he average pressure drop of the Patriots game balls exceeded the average pressure drop of the Colts balls by 0.45 to 1.02 psi, depending on various possible assumptions regarding the gauges used, and assuming an initial pressure of 12.5 psi for the Patriots balls and 13.0 for the Colts balls.”