# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")  # Render plots as crisp SVG instead of raster images.
plt.style.use('ggplot')  # Use ggplot-like styling for all matplotlib figures.
# Abbreviate long arrays (show at most 20 entries) and print floats with 2 decimals.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# Keep DataFrame displays compact: at most 7 rows, 8 columns, 2 decimal places.
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Animations
from IPython.display import display, IFrame
def show_bootstrapping_slides():
    """Embed the lecture's bootstrapping slide deck inline in the notebook output."""
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vS_iYHJYXSVMMZ-YQVFwMEFR6EFN3FDSAvaMyUm-YJfLQgRMTHm3vI-wWJJ5999eFJq70nWp2hyItZg/embed?start=false&loop=false&delayms=3000"
    # 960x509 keeps the deck's aspect ratio at a readable size.
    display(IFrame(slides_url, 960, 509))
You may have noticed that we've quickly moved into much more theoretical material.
Remember to read the textbook for more context and examples.
Additionally, this site contains a helpful visual explanation of permutation testing, and the Diagrams page contains all of the interactive diagrams we've seen so far in lecture.
# Load the full population: one row per San Diego city employee in 2021.
population = bpd.read_csv('data/2021_salaries.csv')
population
Year | EmployerType | EmployerName | DepartmentOrSubdivision | ... | EmployerCounty | SpecialDistrictActivities | IncludesUnfundedLiability | SpecialDistrictType | |
---|---|---|---|---|---|---|---|---|---|
0 | 2021 | City | San Diego | Police | ... | San Diego | NaN | False | NaN |
1 | 2021 | City | San Diego | Police | ... | San Diego | NaN | False | NaN |
2 | 2021 | City | San Diego | Police | ... | San Diego | NaN | False | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
12302 | 2021 | City | San Diego | Fire-Rescue | ... | San Diego | NaN | False | NaN |
12303 | 2021 | City | San Diego | Fleet Operations | ... | San Diego | NaN | False | NaN |
12304 | 2021 | City | San Diego | Fire-Rescue | ... | San Diego | NaN | False | NaN |
12305 rows × 29 columns
We only need the `'TotalWages'` column, so let's get just that column.
# Keep only the 'TotalWages' column (passing a list keeps it as a DataFrame).
population = population.get(['TotalWages'])
population
TotalWages | |
---|---|
0 | 359138 |
1 | 345336 |
2 | 336250 |
... | ... |
12302 | 9 |
12303 | 9 |
12304 | 4 |
12305 rows × 1 columns
# Visualize the population distribution of total wages.
# density=True puts the histogram on a density scale, so bar areas sum to 1.
population.plot(kind='hist', bins=np.arange(0, 400000, 10000), density=True, ec='w', figsize=(10, 5),
title='Distribution of Total Wages of San Diego City Employees in 2021');
We can use `.median()` to find the median salary of all city employees.

population_median = population.get('TotalWages').median()
population_median
74441.0
Let's survey 500 employees at random. To do so, we can use the .sample
method.
np.random.seed(38) # Magic to ensure that we get the same results every time this code is run.
# Take a sample of size 500.
# .sample draws 500 rows from the population, without replacement by default.
my_sample = population.sample(500)
my_sample
TotalWages | |
---|---|
599 | 167191 |
10595 | 18598 |
837 | 157293 |
... | ... |
2423 | 122785 |
7142 | 62808 |
5792 | 78093 |
500 rows × 1 columns
We won't reassign my_sample
at any point in this notebook, so it will always refer to this particular sample.
# Compute the sample median.
# This is our estimate of the population median, which in practice we can't see.
sample_median = my_sample.get('TotalWages').median()
sample_median
72016.0
# Draw 1000 fresh samples of 500 from the population and record each sample's median.
sample_medians = np.array([])
for _ in range(1000):
    new_median = population.sample(500).get('TotalWages').median()
    sample_medians = np.append(sample_medians, new_median)
sample_medians
array([81062.5, 77915.5, 70419.5, ..., 71840. , 73618.5, 79238. ])
# Plot the empirical distribution of the 1000 sample medians.
(bpd.DataFrame()
.assign(SampleMedians=sample_medians)
.plot(kind='hist', density=True,
bins=30, ec='w', figsize=(8, 5),
title='Distribution of the Sample Median of 1000 Samples from the Population')
);
The distribution of our sample, `my_sample`, looks a lot like the population.

fig, ax = plt.subplots(figsize=(10, 5))
# Overlay the population and sample distributions on the same axes, using shared bins
# so the two histograms are directly comparable. alpha=.75 makes the bars translucent.
bins=np.arange(10_000, 300_000, 10_000)
population.plot(kind='hist', y='TotalWages', ax=ax, density=True, alpha=.75, bins=bins, ec='w')
my_sample.plot(kind='hist', y='TotalWages', ax=ax, density=True, alpha=.75, bins=bins, ec='w')
plt.legend(['Population', 'My Sample']);
Note that unlike the previous histogram we saw, this is depicting the distribution of the population and of one particular sample (my_sample
), not the distribution of sample medians for 1000 samples.
show_bootstrapping_slides()
original = [1, 2, 3]
# Resampling WITHOUT replacement just reorders the data,
# so every resample has the same median as the original.
for _ in range(10):
    shuffled = np.random.choice(original, 3, replace=False)
    print("Resample: ", shuffled, " Median: ", np.median(shuffled))
Resample: [1 3 2] Median: 2.0 Resample: [1 3 2] Median: 2.0 Resample: [1 3 2] Median: 2.0 Resample: [2 3 1] Median: 2.0 Resample: [1 2 3] Median: 2.0 Resample: [3 2 1] Median: 2.0 Resample: [1 2 3] Median: 2.0 Resample: [3 1 2] Median: 2.0 Resample: [1 3 2] Median: 2.0 Resample: [2 1 3] Median: 2.0
original = [1, 2, 3]
# Resampling WITH replacement can repeat or omit values,
# so a resample's median can differ from the original's.
for _ in range(10):
    drawn = np.random.choice(original, 3, replace=True)
    print("Resample: ", drawn, " Median: ", np.median(drawn))
Resample: [2 3 3] Median: 3.0 Resample: [3 1 3] Median: 3.0 Resample: [2 2 3] Median: 2.0 Resample: [2 3 1] Median: 2.0 Resample: [3 3 3] Median: 3.0 Resample: [1 3 2] Median: 2.0 Resample: [1 2 1] Median: 1.0 Resample: [3 3 2] Median: 3.0 Resample: [3 3 1] Median: 3.0 Resample: [1 1 3] Median: 1.0
When we resample without replacement, resamples look just like the original samples.
When we resample with replacement, resamples can have a different mean, median, max, and min than the original sample.
We can simulate the act of collecting new samples by sampling with replacement from our original sample, my_sample
.
# The population DataFrame, population, never appears below.
# Everything is computed from the one sample we collected, my_sample.
n_resamples = 5000
boot_medians = np.array([])
for _ in range(n_resamples):
    # Resample 500 rows from my_sample WITH replacement,
    # then record the resample's median.
    bootstrap_median = my_sample.sample(500, replace=True).get('TotalWages').median()
    boot_medians = np.append(boot_medians, bootstrap_median)
boot_medians
array([72538. , 70989.5, 71874. , ..., 71372. , 69750. , 71486.5])
# Plot the 5000 bootstrapped medians, with the true population median drawn on top.
bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(kind='hist', density=True, bins=np.arange(60000, 85000, 1000), ec='w', figsize=(10, 5))
# .set_zorder(2) draws the blue dot on top of the histogram bars.
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median').set_zorder(2)
plt.legend();
The population median (blue dot) is near the middle.
In reality, we'd never get to see this!
We have a sample median wage:
my_sample.get('TotalWages').median()
72016.0
With it, we can say that the population median wage is approximately \$72,016, and not much else.
But by bootstrapping our one sample, we can generate an empirical distribution of the sample median:
# The bootstrap distribution again, this time without the population median marked —
# in practice, this histogram is all we would be able to see.
(bpd.DataFrame()
.assign(BootstrapMedians=boot_medians)
.plot(kind='hist', density=True, bins=np.arange(60000, 85000, 1000), ec='w', figsize=(10, 5))
)
plt.legend();
which allows us to say things like
We think the population median wage is between \$67,000 and \\$77,000.
Question: We could also say that we think the population median wage is between \$70,000 and \\$75,000, or between \$60,000 and \\$80,000. What range should we pick?
Let $p$ be a number between 0 and 100. The $p$th percentile of a collection is the smallest value in the collection that is at least as large as $p$% of all the values.
By this definition, any percentile between 0 and 100 can be computed for any collection of values and is always an element of the collection.
Suppose there are $n$ elements in the collection. To find the $p$th percentile:
What is the 25th percentile of the array np.array([4, 10, 15, 21, 100])
?
25% of 5 is 1.25, which rounds up to 2; the 2nd value in sorted order is 10, so the 25th percentile is 10.

Consider the array from the previous slide, np.array([4, 10, 15, 21, 100]). Here's how our percentile formula works:
value | 4 | 10 | 15 | 21 | 100 |
---|---|---|---|---|---|
percentile | [0, 20] | (20, 40] | (40, 60] | (60, 80] | (80, 100] |
For instance:
- The 20th percentile is 4.
- The 50th percentile is 15.
- The 80th percentile is 21, but the 80.001st percentile is 100.

Notice that in the table above, each of the 5 values owns an equal percentage (20\%) of the range 0-100.
What is the 70th percentile of the array np.array([70, 18, 56, 89, 55, 35, 10, 45])
?
✅ Click here to see the solution after you've tried it yourself.

In sorted order, the array is np.array([10, 18, 35, 45, 55, 56, 70, 89]). 70% of 8 is 5.6, which rounds up to 6; the 6th value in sorted order is 56, so the 70th percentile is 56.
def percentile(data, p):
    """Return the pth percentile of data: the smallest value in the
    collection that is at least as large as p% of all the values.

    Parameters:
    - data: a sequence of numbers (list or array); not modified.
    - p: the percentile to compute, a number between 0 and 100.
    """
    data = np.sort(data)  # Returns a sorted copy of data.
    n = len(data)
    h = (p / 100) * n
    # If h is an integer, k is h; otherwise it rounds up. Clamp k to at
    # least 1: without the clamp, p = 0 gives k = 0 and data[k - 1] is
    # data[-1], wrongly returning the maximum instead of the minimum.
    k = max(int(np.ceil(h)), 1)
    return data[k - 1]  # - 1 because Python is 0-indexed but regular math is 1-indexed.
example = np.array([70, 18, 56, 89, 55, 35, 10, 45])
# 50% of 8 is 4, so this takes the 4th-smallest value.
percentile(example, 50)
45
percentile(example, 70)
56
The `numpy` package provides a function to calculate percentiles, `np.percentile(array, p)`, which returns the `p`th percentile of `array`. `np.percentile` doesn't implement our version of percentile exactly, but for large arrays the two definitions are nearly the same. In practice, we'll use `np.percentile` since it's faster.

percentile(example, 50)
45
np.percentile(example, 50)
50.0
Earlier in the lecture, we generated a bootstrapped distribution of sample medians.
# Re-plot the bootstrapped distribution of sample medians.
bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(kind='hist', density=True, bins=np.arange(60000, 85000, 1000), ec='w', figsize=(10, 5))
# .set_zorder(2) draws the blue dot on top of the histogram bars.
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median').set_zorder(2)
plt.legend();
What can we do with this distribution, now that we know about percentiles?
Let's be a bit more precise.
We think the population parameter is close to our sample statistic, $x$.
We think the population parameter is between $a$ and $b$.
boot_medians
array([72538. , 70989.5, 71874. , ..., 71372. , 69750. , 71486.5])
# Left endpoint.
left = np.percentile(boot_medians, 2.5)
left
67081.0
# Right endpoint.
right = np.percentile(boot_medians, 97.5)
right
76271.0
# Therefore, our interval is:
[left, right]
[67081.0, 76271.0]
You will use the code above very frequently moving forward!
# Draw the confidence interval on top of the bootstrap distribution.
# zorder controls stacking: histogram (1) below the gold interval line (2),
# with the population-median dot (3) on top.
bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(kind='hist', density=True, bins=np.arange(60000, 85000, 1000), ec='w', figsize=(10, 5), zorder=1)
plt.plot([left, right], [0, 0], color='gold', linewidth=12, label='95% confidence interval', zorder=2);
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median', zorder=3)
plt.legend();
We computed the following 95% confidence interval:
print('Interval:', [left, right])
print('Width:', right - left)
Interval: [67081.0, 76271.0] Width: 9190.0
If we instead computed an 80% confidence interval, would it be wider or narrower?
Now, instead of saying
We think the population median is close to our sample median, \$72,016.
We can say:
A 95% confidence interval for the population median is \$67,081 to \\$76,271.
Some lingering questions: What does 95% confidence mean? What are we confident about? Is this technique always "good"?
We will: