# Run this cell to set up packages for lecture.
from lec14_imports import *

# Load the salary records from the CSV file into a DataFrame named population.
population = bpd.read_csv('data/2022_salaries.csv')
population

# List the names of all columns in the DataFrame.
population.columns

Index(['Year', 'EmployerType', 'EmployerName', 'DepartmentOrSubdivision',
       'Position', 'ElectedOfficial', 'Judicial', 'OtherPositions',
       'MinPositionSalary', 'MaxPositionSalary', 'ReportedBaseWage',
       'RegularPay', 'OvertimePay', 'LumpSumPay', 'OtherPay', 'TotalWages',
       'DefinedBenefitPlanContribution', 'EmployeesRetirementCostCovered',
       'DeferredCompensationPlan', 'HealthDentalVision',
       'TotalRetirementAndHealthContribution', 'PensionFormula', 'EmployerURL',
       'EmployerPopulation', 'LastUpdatedDate', 'EmployerCounty',
       'SpecialDistrictActivities', 'IncludesUnfundedLiability',
       'SpecialDistrictType'],
      dtype='object')

# Keep only the 'TotalWages' column. Passing a list to .get keeps the result
# a DataFrame (rather than a Series), so DataFrame methods still apply below.
population = population.get(['TotalWages'])
population

# Density histogram of total wages, using bins that are 10,000 wide.
wage_bins = np.arange(0, 400000, 10000)
population.plot(
    kind='hist',
    bins=wage_bins,
    density=True,
    ec='w',
    figsize=(10, 5),
    title='Distribution of Total Wages of San Diego City Employees in 2022',
);

# Median of the 'TotalWages' column over the entire population DataFrame.
population_median = population.get('TotalWages').median()
population_median

78136.0

np.random.seed(38) # Seed NumPy's random number generator so that we get the same results every time this code is run.

# Take a random sample of 500 rows from the population.
my_sample = population.sample(500)
my_sample

# Compute the sample median: the median of 'TotalWages' among the sampled rows only.
sample_median = my_sample.get('TotalWages').median()
sample_median

76237.0

# Draw 1000 fresh samples of size 500 directly from the population and
# record the median of 'TotalWages' in each one.
sample_medians = np.array([])
for i in np.arange(1000):
    sampled_wages = population.sample(500).get('TotalWages')
    median = sampled_wages.median()
    sample_medians = np.append(sample_medians, median)
sample_medians

array([81686.5, 79641. , 75592. , ..., 79350. , 78826.5, 78459.5])

# Density histogram of the sample medians collected from the population.
medians_df = bpd.DataFrame().assign(SampleMedians=sample_medians)
medians_df.plot(
    kind='hist',
    density=True,
    bins=30,
    ec='w',
    figsize=(8, 5),
    title='Distribution of the Sample Median of 1000 Samples from the Population\nSample Size = 500',
);

# Overlay the wage distributions of the population and of my_sample on one
# set of axes, using identical bins and styling for both.
fig, ax = plt.subplots(figsize=(10, 5))
bins = np.arange(10_000, 300_000, 10_000)
shared_options = dict(kind='hist', y='TotalWages', ax=ax, density=True, alpha=.75, bins=bins, ec='w')
population.plot(**shared_options)
my_sample.plot(**shared_options)
plt.legend(['Population', 'My Sample']);

# Show the bootstrapping slides. The helper is not defined in this file;
# presumably it comes from lec14_imports — confirm.
show_bootstrapping_slides()

# Resampling WITHOUT replacement: drawing 3 values from a 3-element list can
# only reorder it, so every resample has the same median.
original = [1, 2, 3]
for i in np.arange(10):
    resample = np.random.choice(original, size=3, replace=False)
    resample_median = np.median(resample)
    print("Resample: ", resample, "    Median: ", resample_median)

Resample:  [2 1 3]     Median:  2.0
Resample:  [1 2 3]     Median:  2.0
Resample:  [1 2 3]     Median:  2.0
Resample:  [3 1 2]     Median:  2.0
Resample:  [1 3 2]     Median:  2.0
Resample:  [1 3 2]     Median:  2.0
Resample:  [3 1 2]     Median:  2.0
Resample:  [3 2 1]     Median:  2.0
Resample:  [1 2 3]     Median:  2.0
Resample:  [3 2 1]     Median:  2.0

# Resampling WITH replacement: values may repeat within a resample, so the
# median can differ from one resample to the next.
original = [1, 2, 3]
for i in np.arange(10):
    resample = np.random.choice(original, size=3, replace=True)
    resample_median = np.median(resample)
    print("Resample: ", resample, "    Median: ", resample_median)

Resample:  [3 2 1]     Median:  2.0
Resample:  [1 1 3]     Median:  1.0
Resample:  [3 2 1]     Median:  2.0
Resample:  [1 1 2]     Median:  1.0
Resample:  [2 1 3]     Median:  2.0
Resample:  [3 3 3]     Median:  3.0
Resample:  [1 1 1]     Median:  1.0
Resample:  [2 2 3]     Median:  2.0
Resample:  [2 3 2]     Median:  2.0
Resample:  [3 3 2]     Median:  3.0

# Note that the population DataFrame, population, doesn't appear anywhere here.
# This is all based on one sample, my_sample.

np.random.seed(38) # Seed NumPy's random number generator so that we get the same results every time this code is run.

n_resamples = 5000
boot_medians = np.array([])

# A bootstrap resample must be the same size as the original sample, so take
# the size from my_sample instead of hard-coding it.
sample_size = my_sample.shape[0]

for i in np.arange(n_resamples):

    # Resample from my_sample WITH REPLACEMENT.
    resample = my_sample.sample(sample_size, replace=True)

    # Compute the median.
    median = resample.get('TotalWages').median()

    # Store it in our array of medians.
    boot_medians = np.append(boot_medians, median)

boot_medians

array([76896. , 72945. , 73555. , ..., 74431. , 75868. , 78601.5])

# Density histogram of the bootstrapped medians, with the population median
# marked as a blue dot.
boot_df = bpd.DataFrame().assign(BootstrapMedians=boot_medians)
boot_df.plot(
    kind='hist',
    density=True,
    bins=np.arange(63000, 88000, 1000),
    ec='w',
    figsize=(10, 5),
)
population_dot = plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median')
population_dot.set_zorder(2)  # Raise the dot's z-order so it is not hidden behind the bars.
plt.legend();

# Recompute the median of 'TotalWages' in the original sample, my_sample.
my_sample.get('TotalWages').median()

76237.0

# Density histogram of the bootstrapped medians on their own.
boot_df = bpd.DataFrame().assign(BootstrapMedians=boot_medians)
boot_df.plot(kind='hist', density=True, bins=np.arange(63000, 88000, 1000), ec='w', figsize=(10, 5))
plt.legend();

# 50th percentile of five numbers, given here in unsorted order.
np.percentile([4, 6, 9, 2, 7], 50)

6.0

# 50th percentile of the five numbers 2, 4, 6, 7, 9, given here in sorted order.
np.percentile([2, 4, 6, 7, 9], 50)

6.0

# Density histogram of the bootstrapped medians, with the population median
# marked as a blue dot.
boot_df = bpd.DataFrame().assign(BootstrapMedians=boot_medians)
boot_df.plot(
    kind='hist',
    density=True,
    bins=np.arange(63000, 88000, 1000),
    ec='w',
    figsize=(10, 5),
)
population_dot = plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median')
population_dot.set_zorder(2)  # Raise the dot's z-order so it is not hidden behind the bars.
plt.legend();

boot_medians

array([76896. , 72945. , 73555. , ..., 74431. , 75868. , 78601.5])

# Left endpoint: the 2.5th percentile of the bootstrapped medians.
left = np.percentile(boot_medians, 2.5)
left

68469.0

# Right endpoint: the 97.5th percentile of the bootstrapped medians.
right = np.percentile(boot_medians, 97.5)
right

81253.5

# Therefore, our interval runs from the 2.5th to the 97.5th percentile,
# i.e. it spans the middle 95% of the bootstrapped medians:
[left, right]

[68469.0, 81253.5]

# Three layers, ordered by zorder: the histogram of bootstrapped medians (1),
# the confidence interval drawn as a thick gold bar at height 0 (2), and the
# population median as a blue dot (3).
boot_df = bpd.DataFrame().assign(BootstrapMedians=boot_medians)
boot_df.plot(
    kind='hist',
    density=True,
    bins=np.arange(63000, 88000, 1000),
    ec='w',
    figsize=(10, 5),
    zorder=1,
)
interval_x = [left, right]
interval_y = [0, 0]
plt.plot(interval_x, interval_y, color='gold', linewidth=12, label='95% confidence interval', zorder=2)
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median', zorder=3)
plt.legend();

# Report the interval's endpoints and its width (right endpoint minus left).
interval = [left, right]
width = right - left
print('Interval:', interval)
print('Width:', width)

Interval: [68469.0, 81253.5]
Width: 12784.5

	TotalWages
0	384909
1	381566
2	350013
...	...
12826	6
12827	4
12828	2

	TotalWages
10301	27866
6913	71861
5163	91843
...	...
3002	121209
3718	109709
2394	131409

Lecture 14 – Bootstrapping and Confidence Intervals¶

DSC 10, Winter 2024¶

Announcements¶

Agenda¶

Recap: Statistical inference¶

City of San Diego employee salary data¶

The median salary¶

Let's be realistic...¶

Terminology recap¶

The sample median¶

How confident are we that this is a good estimate?¶

An impractical approach¶

The problem¶

Bootstrapping 🥾¶

Bootstrapping¶

To replace or not replace?¶

Bootstrapping the sample of salaries¶

Bootstrap distribution of the sample median¶

What's the point of bootstrapping?¶

Percentiles¶

Informal definition¶

Calculating percentiles¶

Confidence intervals¶

Using the bootstrapped distribution of sample medians¶

Using the bootstrapped distribution of sample medians¶

Confidence intervals¶

Finding endpoints¶

Computing a confidence interval¶

Visualizing our 95% confidence interval¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Reflection¶

Summary, next time¶

Summary¶

Next time¶

	Year	EmployerType	EmployerName	DepartmentOrSubdivision	...	EmployerCounty	SpecialDistrictActivities	IncludesUnfundedLiability	SpecialDistrictType
0	2022	City	San Diego	Police	...	San Diego	NaN	False	NaN
1	2022	City	San Diego	Police	...	San Diego	NaN	False	NaN
2	2022	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
...	...	...	...	...	...	...	...	...	...
12826	2022	City	San Diego	Public Utilities	...	San Diego	NaN	False	NaN
12827	2022	City	San Diego	Police	...	San Diego	NaN	False	NaN
12828	2022	City	San Diego	Police	...	San Diego	NaN	False	NaN

Lecture 14 – Bootstrapping and Confidence Intervals¶

DSC 10, Winter 2024¶

Announcements¶

Agenda¶

Recap: Statistical inference¶

City of San Diego employee salary data¶

The median salary¶

Let's be realistic...¶

Terminology recap¶

The sample median¶

How confident are we that this is a good estimate?¶

An impractical approach¶

The problem¶

Bootstrapping 🥾¶

Bootstrapping¶

To replace or not replace?¶

Bootstrapping the sample of salaries¶

Bootstrap distribution of the sample median¶

What's the point of bootstrapping?¶

Percentiles¶

Informal definition¶

Calculating percentiles¶

Confidence intervals¶

Using the bootstrapped distribution of sample medians¶

Using the bootstrapped distribution of sample medians¶

Confidence intervals¶

Finding endpoints¶

Computing a confidence interval¶

Visualizing our 95% confidence interval¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Reflection¶

Summary, next time¶

Summary¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com¶