# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')

# Keep printed NumPy arrays and pandas tables short: two decimal places,
# no scientific notation, and truncated output for anything large.
np.set_printoptions(precision=2, suppress=True, threshold=20)
for option_name, option_value in [
    ("display.max_rows", 7),
    ("display.max_columns", 8),
    ("display.precision", 2),
]:
    pd.set_option(option_name, option_value)

# Animations
from IPython.display import display, IFrame

def show_confidence_interval_slides():
    """Display the embedded Google Slides presentation in a 940x940 IFrame."""
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vTaPZsueXI6fey_5cj2Y1TevkR1joBvpwaWVsZNvgBlnJSrw1EiBLHJywkFH_QNLU5Tdr6JZgDrhFxG/embed?start=false&loop=false&delayms=3000&rm=minimal"
    side_length = 940  # The frame is square: width and height are equal.
    display(IFrame(slides_url, side_length, side_length))


# Load the salary data, keeping only the 'TotalWages' column.
population = bpd.read_csv('data/2021_salaries.csv').get(['TotalWages'])
# Median of 'TotalWages' over every row that was loaded.
population_median = population.get('TotalWages').median()
population_median # Can't see this in real life!

74441.0


np.random.seed(38) # Magic to ensure that we get the same results every time this code is run.
# Draw 500 rows from the population; the bootstrap below resamples from this sample.
my_sample = population.sample(500)
# Median of 'TotalWages' within the sample.
sample_median = my_sample.get('TotalWages').median()
sample_median

72016.0


np.random.seed(38)

# Bootstrap the sample to get more sample medians.
n_resamples = 5000

# Preallocate the result array and fill it in place. Calling np.append inside
# the loop would copy the whole array on every iteration.
boot_medians = np.empty(n_resamples)

for i in np.arange(n_resamples):
    # Each resample is drawn with replacement and has as many rows as the
    # original sample.
    resample = my_sample.sample(my_sample.shape[0], replace=True)
    median = resample.get('TotalWages').median()
    boot_medians[i] = median

boot_medians

array([74261. , 73080. , 72486. , ..., 68216. , 76159. , 69768.5])


# The 2.5th and 97.5th percentiles of the bootstrapped medians are the
# endpoints of the interval.
left, right = np.percentile(boot_medians, [2.5, 97.5])

# Therefore, our interval is:
[left, right]

[66987.0, 76527.0]


# Density histogram of the bootstrapped medians.
bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(
    kind='hist',
    density=True,
    bins=np.arange(60000, 85000, 1000),
    ec='w',
    figsize=(10, 5),
)
# Gold bar along the x-axis spans the interval; the blue dot marks the
# population median and is given a higher zorder than the default.
plt.plot([left, right], [0, 0], color='gold', linewidth=12, label='95% confidence interval');
plt.scatter(population_median, 0.000004, color='blue', s=100, label='population median', zorder=3)
plt.legend();


# Report the interval's endpoints and its width (right endpoint minus left endpoint).
print('Interval:', [left, right])
print('Width:', right - left)

Interval: [66987.0, 76527.0]
Width: 9540.0


# Load an array of confidence intervals from disk; each row holds one
# interval's two endpoints.
many_cis = np.load('data/many_cis.npy')
many_cis

array([[70247.  , 80075.68],
       [63787.65, 75957.5 ],
       [71493.  , 82207.5 ],
       ...,
       [66679.64, 81308.  ],
       [65735.68, 80060.21],
       [69756.1 , 80383.5 ]])


plt.figure(figsize=(10, 6))
# One horizontal gold segment per interval, drawn at a height equal to its row number.
for row, (ci_left, ci_right) in enumerate(many_cis):
    plt.plot([ci_left, ci_right], [row, row], color='gold', linewidth=2)
# Vertical blue line at the population median.
plt.axvline(x=population_median, color='blue');


plt.figure(figsize=(10, 6))
count_outside = 0
for row, (ci_left, ci_right) in enumerate(many_cis):
    # An interval misses the population median when it lies entirely to the
    # right of it or entirely to the left of it.
    misses_median = ci_left > population_median or ci_right < population_median
    if not misses_median:
        continue
    # Only the intervals that miss are drawn and counted.
    plt.plot([ci_left, ci_right], [row, row], color='gold', linewidth=2)
    count_outside += 1
plt.axvline(x=population_median, color='blue');


count_outside

11


# Our interval:
[left, right]

[66987.0, 76527.0]


# Density histogram of the 'TotalWages' column of the population, with the
# interval drawn as a gold bar along the x-axis.
population.plot(
    kind='hist',
    y='TotalWages',
    density=True,
    ec='w',
    figsize=(10, 5),
)
plt.plot([left, right], [0, 0], color='gold', linewidth=12, label='95% confidence interval');
plt.legend();


# Density histogram of the bootstrapped medians, with the interval drawn as a
# gold bar along the x-axis.
bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(
    kind='hist',
    density=True,
    bins=np.arange(60000, 85000, 1000),
    ec='w',
    figsize=(10, 5),
)
plt.plot([left, right], [0, 0], color='gold', linewidth=12, label='95% confidence interval');
plt.legend();


# Our interval:
[left, right]

[66987.0, 76527.0]


show_confidence_interval_slides()


n_resamples = 5000

# Preallocate a float array and fill it in place. Calling np.append inside
# the loop would copy the whole array on every iteration.
boot_maxes = np.empty(n_resamples)

for i in range(n_resamples):
    # Resample the sample with replacement, keeping the same number of rows.
    resample = my_sample.sample(my_sample.shape[0], replace=True)
    boot_max = resample.get('TotalWages').max()
    boot_maxes[i] = boot_max


boot_maxes

array([235709., 329949., 247093., ..., 329949., 329949., 235709.])


population_max = population.get('TotalWages').max()
population_max

359138


# Density histogram of the bootstrapped maxes, with the population max shown
# as a blue dot.
bpd.DataFrame().assign(BootstrapMax=boot_maxes).plot(
    kind='hist', density=True, bins=10, ec='w', figsize=(10, 5)
)
plt.scatter(population_max, 0.0000008, color='blue', s=100, label='population max')
plt.legend();


my_sample.get('TotalWages').max()

329949


# Reload the salary data with all of its columns. This rebinds `population`,
# which previously held only the 'TotalWages' column.
population = bpd.read_csv('data/2021_salaries.csv')
# Keep only the rows whose 'DepartmentOrSubdivision' is 'Fire-Rescue'.
fire_rescue_population = population[population.get('DepartmentOrSubdivision') == 'Fire-Rescue']
fire_rescue_population


# Seed NumPy's random number generator before drawing the sample.
np.random.seed(38)

# Let's once again suppose we only have access to a sample.
# The sample consists of 300 rows drawn from the Fire-Rescue rows.
fire_rescue_sample = fire_rescue_population.sample(300)
fire_rescue_sample


n_resamples = 500

# Preallocate the result array and fill it in place. Calling np.append inside
# the loop would copy the whole array on every iteration.
fire_rescue_medians = np.empty(n_resamples)
for i in range(n_resamples):
    # Resample from fire_rescue_sample, with replacement, keeping the same
    # number of rows as the sample itself.
    resample = fire_rescue_sample.sample(fire_rescue_sample.shape[0], replace=True)

    # Compute the median.
    median = resample.get('TotalWages').median()

    # Store it in our array of bootstrapped medians.
    fire_rescue_medians[i] = median


fire_rescue_medians

array([ 90959. , 100759. ,  92676. , ...,  95701.5,  94562. ,  99148. ])


# Left endpoint: the 0.5th percentile of the bootstrapped medians.
fire_left = np.percentile(fire_rescue_medians, 0.5)
fire_left

82766.5


# Right endpoint: the 99.5th percentile of the bootstrapped medians.
fire_right = np.percentile(fire_rescue_medians, 99.5)
fire_right

108676.585


# Resulting interval:
[fire_left, fire_right]

[82766.5, 108676.585]


# Density histogram of the bootstrapped Fire-Rescue medians, with the interval
# drawn as a gold bar along the x-axis.
bpd.DataFrame().assign(FireRescueBootstrapMedians=fire_rescue_medians).plot(
    kind='hist',
    density=True,
    bins=np.arange(75000, 125000, 1000),
    ec='w',
    figsize=(10, 5),
)
plt.plot([fire_left, fire_right], [0, 0], color='gold', linewidth=12, label='99% confidence interval');
plt.legend();


# Actual population median of Fire-Rescue Department salaries:
fire_rescue_population.get('TotalWages').median()

97388.0

	Year	EmployerType	EmployerName	DepartmentOrSubdivision	...	EmployerCounty	SpecialDistrictActivities	IncludesUnfundedLiability	SpecialDistrictType
4	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
5	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
6	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
...	...	...	...	...	...	...	...	...	...
12301	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
12302	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
12304	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN

	Year	EmployerType	EmployerName	DepartmentOrSubdivision	...	EmployerCounty	SpecialDistrictActivities	IncludesUnfundedLiability	SpecialDistrictType
6762	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
8754	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
3783	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
...	...	...	...	...	...	...	...	...	...
10812	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
11112	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
11009	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN

Lecture 20 – Confidence Intervals, Central Tendency¶

DSC 10, Spring 2023¶

Announcements¶

Agenda¶

Interpreting confidence intervals¶

Recap: City of San Diego employee salaries¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Confidence intervals describe a guess for the value of an unknown parameter¶

Interpreting confidence intervals¶

Capturing the true value¶

Many confidence intervals¶

Which confidence intervals don't contain the true parameter?¶

Confidence tradeoffs¶

Misinterpreting confidence intervals¶

Bootstrapping rules of thumb¶

Example: Estimating the max of a population¶

Visualize¶

Confidence intervals for hypothesis testing¶

Using confidence intervals for hypothesis testing¶

Example: Fire-Rescue Department 🚒¶

Setting up a hypothesis test¶

Testing the hypotheses¶

Finding the interval¶

Conclusion of the hypothesis test¶

Summary of methods¶

Central tendency¶

Some questions¶

The mean (i.e. average)¶

The median¶

Activity¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Summary, next time¶

Summary: Confidence intervals and hypothesis testing¶

Next time¶

Lecture 20 – Confidence Intervals, Central Tendency¶

DSC 10, Spring 2023¶

Announcements¶

Agenda¶

Interpreting confidence intervals¶

Recap: City of San Diego employee salaries¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Confidence intervals describe a guess for the value of an unknown parameter¶

Interpreting confidence intervals¶

Capturing the true value¶

Many confidence intervals¶

Which confidence intervals don't contain the true parameter?¶

Confidence tradeoffs¶

Misinterpreting confidence intervals¶

Bootstrapping rules of thumb¶

Example: Estimating the max of a population¶

Visualize¶

Confidence intervals for hypothesis testing¶

Using confidence intervals for hypothesis testing¶

Example: Fire-Rescue Department 🚒¶

Setting up a hypothesis test¶

Testing the hypotheses¶

Finding the interval¶

Conclusion of the hypothesis test¶

Summary of methods¶

Central tendency¶

Some questions¶

The mean (i.e. average)¶

The median¶

Activity¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Summary, next time¶

Summary: Confidence intervals and hypothesis testing¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Concept Check ✅ – Answer at cc.dsc10.com ¶