# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
# Render inline figures as SVG and apply the ggplot theme to every plot below.
set_matplotlib_formats("svg")
plt.style.use('ggplot')

# Keep printed output compact: NumPy summarizes arrays with more than 20
# elements using "...", shows 2 decimal places, and prefers fixed-point over
# scientific notation.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# Likewise, pandas displays at most 7 rows and 8 columns, with 2 decimal places.
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

# Animations
from IPython.display import display, IFrame

def show_bootstrapping_slides():
    """Display the embedded bootstrapping slide deck in the notebook output.

    The deck is shown in a 960 x 509 inline frame. Nothing is returned.
    """
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vS_iYHJYXSVMMZ-YQVFwMEFR6EFN3FDSAvaMyUm-YJfLQgRMTHm3vI-wWJJ5999eFJq70nWp2hyItZg/embed?start=false&loop=false&delayms=3000"
    frame = IFrame(slides_url, width=960, height=509)
    display(frame)


# Load the 2021 salary data. In this lecture, this table plays the role of the
# full population.
population = bpd.read_csv('data/2021_salaries.csv')
population


# Keep only the 'TotalWages' column. Passing a list to .get keeps the result a
# DataFrame, so the DataFrame methods used below (.plot, .sample) still apply.
population = population.get(['TotalWages'])
population


# Density histogram of total wages, in bins of width 10,000 from 0 up to 390,000.
population.plot(kind='hist', bins=np.arange(0, 400000, 10000), density=True, ec='w', figsize=(10, 5),
                title='Distribution of Total Wages of San Diego City Employees in 2021');


# The population median: the quantity the samples below are used to estimate.
population_median = population.get('TotalWages').median()
population_median

74441.0


# Seed NumPy's global random number generator so that we get the same results
# every time this code is run.
np.random.seed(38)

# Take a sample of size 500 from the population. No replace=True is passed
# here, unlike in the bootstrap resampling later on.
my_sample = population.sample(500)
my_sample


# Compute the sample median, our single estimate of the population median.
sample_median = my_sample.get('TotalWages').median()
sample_median

72016.0


# Draw 1000 fresh samples of size 500 from the population and record the median
# of each one. The medians are gathered in a list and converted to an array
# once, instead of calling np.append on every iteration (np.append copies the
# whole array each time it is called).
sample_medians = np.array(
    [population.sample(500).get('TotalWages').median() for _ in range(1000)],
    dtype=float,
)
sample_medians

array([81062.5, 77915.5, 70419.5, ..., 71840. , 73618.5, 79238. ])


# Density histogram (30 bins) of the sample medians collected in sample_medians.
# The trailing semicolon suppresses the text representation of the plot object.
(bpd.DataFrame()
 .assign(SampleMedians=sample_medians)
 .plot(kind='hist', density=True,
       bins=30, ec='w', figsize=(8, 5),
       title='Distribution of the Sample Median of 1000 Samples from the Population')
);


# Overlay the population and the single sample on the same axes, using the same
# bins, so the two distributions can be compared directly.
fig, ax = plt.subplots(figsize=(10, 5))
# Bin edges run from 10,000 to 290,000 in steps of 10,000; wages outside that
# range fall outside every bin and are not drawn.
bins=np.arange(10_000, 300_000, 10_000)
population.plot(kind='hist', y='TotalWages', ax=ax, density=True, alpha=.75, bins=bins, ec='w')
my_sample.plot(kind='hist', y='TotalWages', ax=ax, density=True, alpha=.75, bins=bins, ec='w')
# Legend labels are given in plotting order: population first, then the sample.
plt.legend(['Population', 'My Sample']);


# Show the embedded bootstrapping slide deck (helper defined near the top of the file).
show_bootstrapping_slides()


# Drawing all 3 values WITHOUT replacement can only reorder them, so each of
# the 10 resamples printed here has the same median as the original.
original = [1, 2, 3]
for _ in range(10):
    resample = np.random.choice(original, size=3, replace=False)
    resample_median = np.median(resample)
    print("Resample: ", resample, "    Median: ", resample_median)

Resample:  [1 3 2]     Median:  2.0
Resample:  [1 3 2]     Median:  2.0
Resample:  [1 3 2]     Median:  2.0
Resample:  [2 3 1]     Median:  2.0
Resample:  [1 2 3]     Median:  2.0
Resample:  [3 2 1]     Median:  2.0
Resample:  [1 2 3]     Median:  2.0
Resample:  [3 1 2]     Median:  2.0
Resample:  [1 3 2]     Median:  2.0
Resample:  [2 1 3]     Median:  2.0


# Drawing 3 values WITH replacement allows repeats, so the 10 resamples printed
# here can differ from the original and their medians can vary.
original = [1, 2, 3]
for _ in range(10):
    resample = np.random.choice(original, size=3, replace=True)
    resample_median = np.median(resample)
    print("Resample: ", resample, "    Median: ", resample_median)

Resample:  [2 3 3]     Median:  3.0
Resample:  [3 1 3]     Median:  3.0
Resample:  [2 2 3]     Median:  2.0
Resample:  [2 3 1]     Median:  2.0
Resample:  [3 3 3]     Median:  3.0
Resample:  [1 3 2]     Median:  2.0
Resample:  [1 2 1]     Median:  1.0
Resample:  [3 3 2]     Median:  3.0
Resample:  [3 3 1]     Median:  3.0
Resample:  [1 1 3]     Median:  1.0


# Note that the population DataFrame doesn't appear anywhere here.
# This is all based on one sample.

n_resamples = 5000

# A bootstrap resample must be the same size as the original sample, so read
# the size from my_sample rather than hard-coding it.
sample_size = my_sample.shape[0]

# Collect the medians in a list and convert to an array once at the end;
# calling np.append inside the loop would copy the whole array every iteration.
resample_medians = []

for _ in range(n_resamples):

    # Resample from my_sample WITH REPLACEMENT.
    resample = my_sample.sample(sample_size, replace=True)

    # Compute the median and store it in our list of medians.
    resample_medians.append(resample.get('TotalWages').median())

# One bootstrapped median per resample, as a float array.
boot_medians = np.array(resample_medians, dtype=float)


boot_medians

array([72538. , 70989.5, 71874. , ..., 71372. , 69750. , 71486.5])


# Density histogram of the bootstrapped medians, in bins of width 1,000 from
# 60,000 to 84,000.
bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(kind='hist', density=True, bins=np.arange(60000, 85000, 1000), ec='w', figsize=(10, 5))
# Mark the population median with a blue dot just above the x-axis; the zorder
# is raised so that the dot is drawn over the histogram bars.
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median').set_zorder(2)
plt.legend();


# For comparison: the median of the one sample that every resample was drawn from.
my_sample.get('TotalWages').median()

72016.0


# Density histogram of the bootstrapped medians on its own (no population
# median marker), in bins of width 1,000 from 60,000 to 84,000.
(bpd.DataFrame()
 .assign(BootstrapMedians=boot_medians)
 .plot(kind='hist', density=True, bins=np.arange(60000, 85000, 1000), ec='w', figsize=(10, 5))
)
plt.legend();


def percentile(data, p):
    """Return the p-th percentile of data, using the mathematical definition.

    Under this definition the p-th percentile is the smallest value in the
    data that is at least as large as p% of the values. It is found by sorting
    the data, computing h = (p / 100) * n, rounding h up to the integer k, and
    taking the k-th smallest value. The result is always an element of data;
    no interpolation is performed.

    Parameters
    ----------
    data : array-like
        A non-empty collection of numbers. It is not modified.
    p : number
        The percentage, between 0 and 100 inclusive.

    Returns
    -------
    The element of data at (1-indexed) position k in sorted order. For p = 0
    this is the smallest element.

    Raises
    ------
    ValueError
        If data is empty or p is not between 0 and 100.
    """
    data = np.sort(data) # Returns a sorted copy of data.
    n = len(data)
    if n == 0:
        raise ValueError("cannot compute a percentile of empty data")
    if not 0 <= p <= 100:
        raise ValueError(f"p must be between 0 and 100, got {p}")
    # Multiply before dividing. (70 / 100) * 10 evaluates to 7.000000000000001
    # in floating point, which would round up to 8 instead of 7.
    h = p * n / 100
    # If h is an integer, the ceiling is h; otherwise it rounds up. The position
    # is at least 1 so that p = 0 gives the smallest value rather than wrapping
    # around to data[-1], the largest.
    k = max(int(np.ceil(h)), 1)
    return data[k - 1] # - 1 because Python is 0-indexed but regular math is 1-indexed.


# In sorted order, example is [10, 18, 35, 45, 55, 56, 70, 89]. With n = 8 and
# p = 50, h = 4, so the 50th percentile is the 4th smallest value.
example = np.array([70, 18, 56, 89, 55, 35, 10, 45])
percentile(example, 50)

45


# With n = 8 and p = 70, h = 5.6 rounds up to k = 6: the 6th smallest value.
percentile(example, 70)

56


# Our percentile function always returns one of the values in the data.
percentile(example, 50)

45


# np.percentile interpolates between data points by default, so it can return a
# value that is not in the data (here, halfway between 45 and 55).
np.percentile(example, 50)

50.0


# Density histogram of the bootstrapped medians (bins of width 1,000 from
# 60,000 to 84,000), with the population median marked as a blue dot just
# above the x-axis.
bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(kind='hist', density=True, bins=np.arange(60000, 85000, 1000), ec='w', figsize=(10, 5))
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median').set_zorder(2)
plt.legend();


# The array of bootstrapped medians that the interval endpoints are computed from.
boot_medians

array([72538. , 70989.5, 71874. , ..., 71372. , 69750. , 71486.5])


# Left endpoint: the 2.5th percentile of the bootstrapped medians.
left = np.percentile(boot_medians, 2.5)
left

67081.0


# Right endpoint: the 97.5th percentile of the bootstrapped medians.
right = np.percentile(boot_medians, 97.5)
right

76271.0


# Therefore, our interval is the range between the 2.5th and 97.5th
# percentiles, which covers the middle 95% of the bootstrapped medians:
[left, right]

[67081.0, 76271.0]


# zorder sets the drawing order: the histogram (1) at the back, the gold
# interval bar along the x-axis (2) over it, and the population median dot (3)
# on top.
bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(kind='hist', density=True, bins=np.arange(60000, 85000, 1000), ec='w', figsize=(10, 5), zorder=1)
plt.plot([left, right], [0, 0], color='gold', linewidth=12, label='95% confidence interval', zorder=2);
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median', zorder=3)
plt.legend();


# Report the interval endpoints and the interval's width (right minus left).
print('Interval:', [left, right])
print('Width:', right - left)

Interval: [67081.0, 76271.0]
Width: 9190.0

	TotalWages
0	359138
1	345336
2	336250
...	...
12302	9
12303	9
12304	4

	TotalWages
599	167191
10595	18598
837	157293
...	...
2423	122785
7142	62808
5792	78093

Lecture 19 – Bootstrapping, Percentiles, and Confidence Intervals¶

DSC 10, Fall 2022¶

Announcements¶

Agenda¶

Resources¶

Bootstrapping 🥾¶

City of San Diego employee salary data¶

The median salary¶

Let's be realistic...¶

In the language of statistics¶

The sample median¶

How confident are we that this is a good estimate?¶

The sample median is random¶

An impractical approach¶

The problem¶

The bootstrap¶

To replace or not replace?¶

Running the bootstrap¶

Bootstrap distribution of the sample median¶

What's the point of bootstrapping?¶

Percentiles¶

Mathematical definition¶

How to calculate percentiles using mathematical definition¶

Example¶

Reflection¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Calculating the percentile using our mathematical definition¶

Another definition of percentile¶

Confidence intervals¶

Using the bootstrapped distribution of sample medians¶

Using the bootstrapped distribution of sample medians¶

Confidence intervals¶

Finding endpoints¶

Computing a confidence interval¶

Visualizing our 95% confidence interval¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Reflection¶

Summary, next time¶

Summary¶

Next time¶

	Year	EmployerType	EmployerName	DepartmentOrSubdivision	...	EmployerCounty	SpecialDistrictActivities	IncludesUnfundedLiability	SpecialDistrictType
0	2021	City	San Diego	Police	...	San Diego	NaN	False	NaN
1	2021	City	San Diego	Police	...	San Diego	NaN	False	NaN
2	2021	City	San Diego	Police	...	San Diego	NaN	False	NaN
...	...	...	...	...	...	...	...	...	...
12302	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
12303	2021	City	San Diego	Fleet Operations	...	San Diego	NaN	False	NaN
12304	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN

Lecture 19 – Bootstrapping, Percentiles, and Confidence Intervals¶

DSC 10, Fall 2022¶

Announcements¶

Agenda¶

Resources¶

Bootstrapping 🥾¶

City of San Diego employee salary data¶

The median salary¶

Let's be realistic...¶

In the language of statistics¶

The sample median¶

How confident are we that this is a good estimate?¶

The sample median is random¶

An impractical approach¶

The problem¶

The bootstrap¶

To replace or not replace?¶

Running the bootstrap¶

Bootstrap distribution of the sample median¶

What's the point of bootstrapping?¶

Percentiles¶

Mathematical definition¶

How to calculate percentiles using mathematical definition¶

Example¶

Reflection¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Calculating the percentile using our mathematical definition¶

Another definition of percentile¶

Confidence intervals¶

Using the bootstrapped distribution of sample medians¶

Using the bootstrapped distribution of sample medians¶

Confidence intervals¶

Finding endpoints¶

Computing a confidence interval¶

Visualizing our 95% confidence interval¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Reflection¶

Summary, next time¶

Summary¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Concept Check ✅ – Answer at cc.dsc10.com ¶