# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')

np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

# Animations
from IPython.display import display, IFrame

def show_bootstrapping_slides():
    """Display the bootstrapping slide deck as an inline frame in the notebook."""
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vS_iYHJYXSVMMZ-YQVFwMEFR6EFN3FDSAvaMyUm-YJfLQgRMTHm3vI-wWJJ5999eFJq70nWp2hyItZg/embed?start=false&loop=false&delayms=3000&rm=minimal"
    # Build the frame with its fixed width and height, then render it.
    frame = IFrame(slides_url, width=960, height=509)
    display(frame)


population = bpd.read_csv('data/2022_salaries.csv')
population


population.columns

Index(['Year', 'EmployerType', 'EmployerName', 'DepartmentOrSubdivision',
       'Position', 'ElectedOfficial', 'Judicial', 'OtherPositions',
       'MinPositionSalary', 'MaxPositionSalary', 'ReportedBaseWage',
       'RegularPay', 'OvertimePay', 'LumpSumPay', 'OtherPay', 'TotalWages',
       'DefinedBenefitPlanContribution', 'EmployeesRetirementCostCovered',
       'DeferredCompensationPlan', 'HealthDentalVision',
       'TotalRetirementAndHealthContribution', 'PensionFormula', 'EmployerURL',
       'EmployerPopulation', 'LastUpdatedDate', 'EmployerCounty',
       'SpecialDistrictActivities', 'IncludesUnfundedLiability',
       'SpecialDistrictType'],
      dtype='object')


population = population.get(['TotalWages'])
population


population.plot(kind='hist', bins=np.arange(0, 400000, 10000), density=True, ec='w', figsize=(10, 5),
                title='Distribution of Total Wages of San Diego City Employees in 2022');


population_median = population.get('TotalWages').median()
population_median

78136.0


np.random.seed(38) # Magic to ensure that we get the same results every time this code is run.

# Take a sample of size 500.
my_sample = population.sample(500)
my_sample


# Compute the sample median.
sample_median = my_sample.get('TotalWages').median()
sample_median

76237.0


# Draw 1000 fresh samples of size 500 from the population and record the
# median of each. The medians are collected in a Python list and converted to
# an array once at the end, because np.append copies the whole array on every
# call.
sample_medians = []
for i in np.arange(1000):
    median = population.sample(500).get('TotalWages').median()
    sample_medians.append(median)
sample_medians = np.array(sample_medians)
sample_medians

array([81686.5, 79641. , 75592. , ..., 79350. , 78826.5, 78459.5])


(bpd.DataFrame()
 .assign(SampleMedians=sample_medians)
 .plot(kind='hist', density=True,
       bins=30, ec='w', figsize=(8, 5),
       title='Distribution of the Sample Median of 1000 Samples from the Population\nSample Size = 500')
);


# Overlay the population and sample wage histograms on one set of axes,
# using the same bins and styling for both so they are directly comparable.
fig, ax = plt.subplots(figsize=(10, 5))
bins = np.arange(10_000, 300_000, 10_000)
shared_style = dict(kind='hist', y='TotalWages', ax=ax, density=True, alpha=.75, bins=bins, ec='w')
population.plot(**shared_style)
my_sample.plot(**shared_style)
plt.legend(['Population', 'My Sample']);


show_bootstrapping_slides()


# Resample WITHOUT replacement: drawing 3 of the 3 elements with no repeats
# can only reorder the original values, so every resample has the same median.
original = [1, 2, 3]
for i in np.arange(10):
    resample = np.random.choice(original, 3, replace=False)
    print("Resample: ", resample, "    Median: ", np.median(resample))

Resample:  [2 1 3]     Median:  2.0
Resample:  [1 2 3]     Median:  2.0
Resample:  [1 2 3]     Median:  2.0
Resample:  [3 1 2]     Median:  2.0
Resample:  [1 3 2]     Median:  2.0
Resample:  [1 3 2]     Median:  2.0
Resample:  [3 1 2]     Median:  2.0
Resample:  [3 2 1]     Median:  2.0
Resample:  [1 2 3]     Median:  2.0
Resample:  [3 2 1]     Median:  2.0


# Resample WITH replacement: values may be repeated or left out, so the
# median can differ from one resample to the next.
original = [1, 2, 3]
for i in np.arange(10):
    resample = np.random.choice(original, 3, replace=True)
    print("Resample: ", resample, "    Median: ", np.median(resample))

Resample:  [3 2 1]     Median:  2.0
Resample:  [1 1 3]     Median:  1.0
Resample:  [3 2 1]     Median:  2.0
Resample:  [1 1 2]     Median:  1.0
Resample:  [2 1 3]     Median:  2.0
Resample:  [3 3 3]     Median:  3.0
Resample:  [1 1 1]     Median:  1.0
Resample:  [2 2 3]     Median:  2.0
Resample:  [2 3 2]     Median:  2.0
Resample:  [3 3 2]     Median:  3.0


# Note that the population DataFrame, population, doesn't appear anywhere here.
# This is all based on one sample, my_sample.

n_resamples = 5000
# Accumulate in a list and convert once at the end; np.append would copy
# the entire array on every iteration.
boot_medians = []

for i in range(n_resamples):

    # Resample from my_sample WITH REPLACEMENT, drawing as many rows as
    # my_sample has so each resample is the same size as the original sample.
    resample = my_sample.sample(my_sample.shape[0], replace=True)

    # Compute the median.
    median = resample.get('TotalWages').median()

    # Store it in our list of medians.
    boot_medians.append(median)

# Convert the collected medians to a NumPy array.
boot_medians = np.array(boot_medians)


boot_medians

array([75592., 77408., 71970., ..., 80386., 77134., 78893.])


bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(kind='hist', density=True, bins=np.arange(63000, 88000, 1000), ec='w', figsize=(10, 5))
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median').set_zorder(2)
plt.legend();


my_sample.get('TotalWages').median()

76237.0


(bpd.DataFrame()
 .assign(BootstrapMedians=boot_medians)
 .plot(kind='hist', density=True, bins=np.arange(63000, 88000, 1000), ec='w', figsize=(10, 5))
)
plt.legend();


np.percentile([4, 6, 9, 2, 7], 50)

6.0


np.percentile([2, 4, 6, 7, 9], 50)

6.0


bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(kind='hist', density=True, bins=np.arange(63000, 88000, 1000), ec='w', figsize=(10, 5))
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median').set_zorder(2)
plt.legend();


boot_medians

array([75592., 77408., 71970., ..., 80386., 77134., 78893.])


# Left endpoint.
left = np.percentile(boot_medians, 2.5)
left

68515.3125


# Right endpoint.
right = np.percentile(boot_medians, 97.5)
right

81505.5


# Therefore, our interval is:
[left, right]

[68515.3125, 81505.5]


# Histogram of the bootstrapped medians, drawn beneath the other layers (zorder=1).
bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(kind='hist', density=True, bins=np.arange(63000, 88000, 1000), ec='w', figsize=(10, 5), zorder=1)
# Thick gold bar along y=0 spanning the interval [left, right].
plt.plot([left, right], [0, 0], color='gold', linewidth=12, label='95% confidence interval', zorder=2);
# Mark the population median on top of the interval bar (zorder=3).
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median', zorder=3)
plt.legend();


print('Interval:', [left, right])
print('Width:', right - left)

Interval: [68515.3125, 81505.5]
Width: 12990.1875

	TotalWages
0	384909
1	381566
2	350013
...	...
12826	6
12827	4
12828	2

	TotalWages
10301	27866
6913	71861
5163	91843
...	...
3002	121209
3718	109709
2394	131409

Lecture 15 – Bootstrapping and Confidence Intervals¶

DSC 10, Fall 2023¶

Announcements¶

Agenda¶

Recap: Statistical inference¶

City of San Diego employee salary data¶

The median salary¶

Let's be realistic...¶

Terminology recap¶

The sample median¶

How confident are we that this is a good estimate?¶

An impractical approach¶

The problem¶

Bootstrapping 🥾¶

Bootstrapping¶

To replace or not replace?¶

Bootstrapping the sample of salaries¶

Bootstrap distribution of the sample median¶

What's the point of bootstrapping?¶

Percentiles¶

Informal definition¶

Calculating percentiles¶

Confidence intervals¶

Using the bootstrapped distribution of sample medians¶

Using the bootstrapped distribution of sample medians¶

Confidence intervals¶

Finding endpoints¶

Computing a confidence interval¶

Visualizing our 95% confidence interval¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Reflection¶

Summary, next time¶

Summary¶

Next time¶

	Year	EmployerType	EmployerName	DepartmentOrSubdivision	...	EmployerCounty	SpecialDistrictActivities	IncludesUnfundedLiability	SpecialDistrictType
0	2022	City	San Diego	Police	...	San Diego	NaN	False	NaN
1	2022	City	San Diego	Police	...	San Diego	NaN	False	NaN
2	2022	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
...	...	...	...	...	...	...	...	...	...
12826	2022	City	San Diego	Public Utilities	...	San Diego	NaN	False	NaN
12827	2022	City	San Diego	Police	...	San Diego	NaN	False	NaN
12828	2022	City	San Diego	Police	...	San Diego	NaN	False	NaN

Lecture 15 – Bootstrapping and Confidence Intervals¶

DSC 10, Fall 2023¶

Announcements¶

Agenda¶

Recap: Statistical inference¶

City of San Diego employee salary data¶

The median salary¶

Let's be realistic...¶

Terminology recap¶

The sample median¶

How confident are we that this is a good estimate?¶

An impractical approach¶

The problem¶

Bootstrapping 🥾¶

Bootstrapping¶

To replace or not replace?¶

Bootstrapping the sample of salaries¶

Bootstrap distribution of the sample median¶

What's the point of bootstrapping?¶

Percentiles¶

Informal definition¶

Calculating percentiles¶

Confidence intervals¶

Using the bootstrapped distribution of sample medians¶

Using the bootstrapped distribution of sample medians¶

Confidence intervals¶

Finding endpoints¶

Computing a confidence interval¶

Visualizing our 95% confidence interval¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Reflection¶

Summary, next time¶

Summary¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com¶