# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
# Render inline matplotlib figures as SVG and use the 'ggplot' style sheet.
set_matplotlib_formats("svg")
plt.style.use('ggplot')

# Keep printed output compact: summarize NumPy arrays longer than 20 elements,
# show 2 digits of precision, and suppress scientific notation for small values.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# Limit how much of a DataFrame pandas displays (rows, columns, decimal places).
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

# Animations
from IPython.display import display, IFrame

def show_confidence_interval_slides():
    """Display the embedded Google Slides presentation in a 960x989 inline frame."""
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vTaPZsueXI6fey_5cj2Y1TevkR1joBvpwaWVsZNvgBlnJSrw1EiBLHJywkFH_QNLU5Tdr6JZgDrhFxG/embed?start=false&loop=false&delayms=3000"
    frame = IFrame(slides_url, width=960, height=989)
    display(frame)


np.random.seed(38) # Magic to ensure that we get the same results every time this code is run.

# Load the salary data, keeping only the 'TotalWages' column, and record the
# median of the full population for later comparison.
population = bpd.read_csv('data/2021_salaries.csv').get(['TotalWages'])
population_median = population.get('TotalWages').median()

# Draw a single sample of 500 rows; every bootstrap resample below is drawn
# from this sample, not from the population.
my_sample = population.sample(500)
sample_median = my_sample.get('TotalWages').median()
sample_median

72016.0


np.random.seed(38)

# Bootstrap the sample to get more sample medians.
# Each resample is drawn with replacement and has the same number of rows as
# the original sample.
n_resamples = 5000

# Collect the medians in a list and convert once at the end: np.append copies
# the whole array on every call, which makes the loop quadratic in n_resamples.
boot_medians_list = []
for i in range(n_resamples):
    resample = my_sample.sample(my_sample.shape[0], replace=True)
    median = resample.get('TotalWages').median()
    boot_medians_list.append(median)

# dtype=float matches the float array that np.array([]) + np.append produced.
boot_medians = np.array(boot_medians_list, dtype=float)

boot_medians

array([74261. , 73080. , 72486. , ..., 68216. , 76159. , 69768.5])


# The middle 95% of the bootstrapped medians lies between the 2.5th and the
# 97.5th percentiles; compute both endpoints in a single call.
left, right = np.percentile(boot_medians, [2.5, 97.5])

# Therefore, our interval is:
[left, right]

[66987.0, 76527.0]


# Density histogram of the bootstrapped medians, with the 95% interval drawn
# along the x-axis and the population median marked as a blue dot.
bpd.DataFrame().assign(BootstrapMedians=boot_medians).plot(
    kind='hist',
    density=True,
    bins=np.arange(60000, 85000, 1000),
    ec='w',
    figsize=(10, 5),
)
plt.plot(
    [left, right],
    [0, 0],
    color='gold',
    linewidth=12,
    label='95% confidence interval',
);
# set_zorder(3) draws the dot on top of the bars and the interval line.
plt.scatter(
    population_median,
    0.000004,
    color='blue',
    s=100,
    label='population median',
).set_zorder(3)
plt.legend();


# Load a precomputed array of confidence intervals; as displayed below, each
# row holds one interval's [left, right] endpoints.
many_cis = np.load('data/many_cis.npy')
many_cis

array([[70247.  , 80075.68],
       [63787.65, 75957.5 ],
       [71493.  , 82207.5 ],
       ...,
       [66679.64, 81308.  ],
       [65735.68, 80060.21],
       [69756.1 , 80383.5 ]])


plt.figure(figsize=(10, 6))
# Draw every interval as a horizontal gold segment at a height equal to its
# row index, then mark the population median with a vertical line.
for row, (lower, upper) in enumerate(many_cis):
    plt.plot([lower, upper], [row, row], color='gold', linewidth=2)
plt.axvline(x=population_median, color='blue');


plt.figure(figsize=(10, 6))
count_outside = 0
for row, (lower, upper) in enumerate(many_cis):
    # An interval misses when the population median lies to the left of its
    # lower endpoint or to the right of its upper endpoint.
    misses = population_median < lower or population_median > upper
    if not misses:
        continue
    # Only the intervals that miss are drawn and counted.
    plt.plot([lower, upper], [row, row], color='gold', linewidth=2)
    count_outside += 1
plt.axvline(x=population_median, color='blue');


count_outside

11


# Our interval:
[left, right]

[66987.0, 76527.0]


# Density histogram of all population salaries, with the 95% interval drawn
# along the x-axis for comparison.
population.plot(
    kind='hist',
    y='TotalWages',
    density=True,
    ec='w',
    figsize=(10, 5),
)
plt.plot(
    [left, right],
    [0, 0],
    color='gold',
    linewidth=12,
    label='95% confidence interval',
);
plt.legend();


# Our interval:
[left, right]

[66987.0, 76527.0]


show_confidence_interval_slides()


# Bootstrap the sample maximum: resample with replacement from my_sample and
# record the largest 'TotalWages' value of each resample.
n_resamples = 5000

# Collect the maxes in a list and convert once at the end: np.append copies
# the whole array on every call, which makes the loop quadratic in n_resamples.
boot_maxes_list = []
for i in range(n_resamples):
    resample = my_sample.sample(my_sample.shape[0], replace=True)
    boot_max = resample.get('TotalWages').max()
    boot_maxes_list.append(boot_max)

# dtype=float keeps the float array that np.array([]) + np.append produced,
# even though the individual maxes are integers.
boot_maxes = np.array(boot_maxes_list, dtype=float)


boot_maxes

array([235709., 329949., 247093., ..., 329949., 329949., 235709.])


population_max = population.get('TotalWages').max()
population_max

359138


# Density histogram of the bootstrapped maxes, with the population max marked
# as a blue dot.
bpd.DataFrame().assign(BootstrapMax=boot_maxes).plot(
    kind='hist',
    density=True,
    bins=10,
    ec='w',
    figsize=(10, 5),
)
plt.scatter(
    population_max,
    0.0000008,
    color='blue',
    s=100,
    label='population max',
)
plt.legend();


my_sample.get('TotalWages').max()

329949


# Re-read the salary data, this time keeping every column (this rebinds
# `population`), because the department column is needed for filtering.
population = bpd.read_csv('data/2021_salaries.csv')
# Keep only the rows whose department is exactly 'Fire-Rescue'.
fire_rescue_population = population[population.get('DepartmentOrSubdivision') == 'Fire-Rescue']
fire_rescue_population


# The median salary of all City of SD employees, in all departments.
population_median = population.get('TotalWages').median()
population_median

74441.0


# Reseed so that the sample drawn below is the same on every run.
np.random.seed(38)

# Draw a single sample of 300 Fire-Rescue rows; the bootstrap below resamples
# from this sample.
fire_rescue_sample = fire_rescue_population.sample(300)
fire_rescue_sample


n_resamples = 500

# Collect the medians in a list and convert once at the end: np.append copies
# the whole array on every call, which makes the loop quadratic in n_resamples.
fire_rescue_medians_list = []
for i in range(n_resamples):
    # Resample from fire_rescue_sample, with replacement, using the sample's
    # own size rather than a hard-coded row count.
    resample = fire_rescue_sample.sample(fire_rescue_sample.shape[0], replace=True)

    # Compute the median.
    median = resample.get('TotalWages').median()

    # Add it to our list of bootstrapped medians.
    fire_rescue_medians_list.append(median)

# dtype=float matches the float array that np.array([]) + np.append produced.
fire_rescue_medians = np.array(fire_rescue_medians_list, dtype=float)


fire_rescue_medians

array([ 90959. , 100759. ,  92676. , ...,  95701.5,  94562. ,  99148. ])


# Left endpoint of the 99% interval: the 0.5th percentile of the bootstrapped
# medians (0.5% of the distribution is cut off on each side).
fire_left = np.percentile(fire_rescue_medians, 0.5)
fire_left

82766.5


# Right endpoint of the 99% interval: the 99.5th percentile of the
# bootstrapped medians.
fire_right = np.percentile(fire_rescue_medians, 99.5)
fire_right

108676.585


# Resulting interval:
[fire_left, fire_right]

[82766.5, 108676.585]


# Density histogram of the bootstrapped Fire-Rescue medians, with the 99%
# interval drawn along the x-axis.
bpd.DataFrame().assign(FireRescueBootstrapMedians=fire_rescue_medians).plot(
    kind='hist',
    density=True,
    bins=np.arange(75000, 125000, 1000),
    ec='w',
    figsize=(10, 5),
)
plt.plot(
    [fire_left, fire_right],
    [0, 0],
    color='gold',
    linewidth=12,
    label='99% confidence interval',
);
plt.legend();


# Actual population median of Fire-Rescue Department salaries:
fire_rescue_population.get('TotalWages').median()

97388.0


# Load the flight delay data and draw a density histogram of the 'Delay'
# column using 5-minute-wide bins that start at -20.5.
delays = bpd.read_csv('data/delays.csv')
delays.plot(
    kind='hist',
    y='Delay',
    bins=np.arange(-20.5, 210, 5),
    density=True,
    ec='w',
    figsize=(10, 5),
)
plt.title('Flight Delays')
plt.xlabel('Delay (minutes)');


delays.get('Delay').mean()

16.658155515370705


delays.get('Delay').median()

2.0


# Compute each statistic once instead of re-evaluating it for every plot call.
delay_mean = delays.get('Delay').mean()
delay_median = delays.get('Delay').median()

delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', alpha=0.65, figsize=(10, 5))
# Vertical lines at the mean (green) and the median (purple).
plt.plot([delay_mean, delay_mean], [0, 1], color='green', label='Mean')
# Triangle marker just below the x-axis, pointing up at the mean.
plt.scatter([delay_mean], [-0.0017], color='green', marker='^', s=250)
plt.plot([delay_median, delay_median], [0, 1], color='purple', label='Median')
plt.title('Flight Delays')
plt.xlabel('Delay (minutes)')
# The vertical lines run from 0 to 1, so clip the y-axis to this range.
plt.ylim(-0.005, 0.065)
plt.legend();


# A small four-value dataset used for the standard deviation example.
data = np.array((2, 3, 3, 9))
data.mean()

4.25


# Each value's deviation is its signed distance from the mean of the data.
deviations = np.subtract(data, data.mean())
deviations

array([-2.25, -1.25, -1.25,  4.75])


np.mean(deviations)

0.0


# Square all the deviations:
deviations ** 2

array([ 5.06,  1.56,  1.56, 22.56])


# Variance is the average of the squared deviations.
squared_deviations = np.square(deviations)
variance = squared_deviations.mean()
variance

7.6875


# Standard deviation (SD) is the square root of the variance.
# Raising to the power 0.5 takes the square root.
sd = variance ** 0.5
sd

2.7726341266023544


# Note that this evaluates to the same number we found on the previous slide.
np.std(data)

2.7726341266023544

	Year	EmployerType	EmployerName	DepartmentOrSubdivision	...	EmployerCounty	SpecialDistrictActivities	IncludesUnfundedLiability	SpecialDistrictType
4	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
5	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
6	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
...	...	...	...	...	...	...	...	...	...
12301	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
12302	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
12304	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN

	Year	EmployerType	EmployerName	DepartmentOrSubdivision	...	EmployerCounty	SpecialDistrictActivities	IncludesUnfundedLiability	SpecialDistrictType
6762	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
8754	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
3783	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
...	...	...	...	...	...	...	...	...	...
10812	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
11112	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN
11009	2021	City	San Diego	Fire-Rescue	...	San Diego	NaN	False	NaN

Lecture 20 – Confidence Intervals, Center and Spread

DSC 10, Fall 2022

Announcements

Agenda

Interpreting confidence intervals

Recap: City of San Diego employee salaries

Confidence intervals describe a guess for the value of an unknown parameter

Interpreting confidence intervals

Capturing the true value

Many confidence intervals

Which confidence intervals don't contain the true parameter?

Confidence tradeoffs

Misinterpreting confidence intervals

Bootstrap rules of thumb

Example: Estimating the max of a population

Visualize

Confidence intervals for hypothesis testing

Using a confidence interval for hypothesis testing

Example: Fire-Rescue Department 🚒

Setting up a hypothesis test

Testing the hypotheses

Finding the interval

Conclusion of the hypothesis test

Summary of methods

Mean and median

The mean (i.e. average)

The median

Activity

Concept Check ✅ – Answer at cc.dsc10.com

Example: Flight delays ✈️

Comparing the mean and median

Standard deviation

Question: How "wide" is a distribution?

Deviations from the mean

Average squared deviation

Standard deviation

Standard deviation

Variance and standard deviation

What can we do with the standard deviation?

Summary, next time

Summary: Confidence intervals and hypothesis testing

Summary: Center and spread

Next time

Concept Check ✅ – Answer at cc.dsc10.com