# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
Consider the pair of hypotheses "this coin is fair" and "this coin is unfair."
Which is the null hypothesis?
Consider the pair of hypotheses "this coin is fair" and "this coin is unfair." Which test statistic(s) could we use to test these hypotheses?
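Some candidates, sketched with a hypothetical outcome (the 230-heads/170-tails split below is made up for illustration, not part of the original example):
# Hypothetical outcome, for illustration only: 230 heads and 170 tails in 400 flips.
num_heads, num_tails = 230, 170
num_flips = num_heads + num_tails
# A few candidate test statistics:
print(num_heads)                                           # number of heads
print(abs(num_heads - num_flips / 2))                      # |number of heads - expected number of heads|
print(abs(num_heads / num_flips - num_tails / num_flips))  # |prop. heads - prop. tails|, used below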
Question for today: Is there a formal definition for what we mean by "consistent"?
fair_coin = [0.5, 0.5]
repetitions = 10000
test_stats = np.array([])
for i in np.arange(repetitions):
    coins = np.random.multinomial(400, fair_coin)
    test_stat = abs(coins[0]/400 - coins[1]/400)
    test_stats = np.append(test_stats, test_stat)
bpd.DataFrame().assign(test_stats = test_stats).plot(kind='hist', bins=20,
density=True, ec='w', figsize=(10, 5),
title='Empirical Distribution of the Absolute Difference in Proportion of Heads and Proportion of Tails');
plt.legend();
Small values of the observed statistic should make you side with the null hypothesis, that the coin is fair. But how small?
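One way to make this precise is to ask how often a fair coin would produce a statistic at least as large as the one we observed. As a sketch, reusing the hypothetical 230-heads outcome from above (not data from the original example):
# Hypothetical observed statistic: 230 heads and 170 tails in 400 flips.
observed_stat = abs(230/400 - 170/400)
# Estimated probability that a fair coin produces a statistic at least this extreme.
np.count_nonzero(test_stats >= observed_stat) / repetitions
The smaller this probability, the stronger the evidence against the null hypothesis; the midterm example below makes this idea, the p-value, concrete.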
# Midterm scores from DSC 10, Winter 2023.
scores = bpd.read_csv('data/wi23-midterm-scores.csv')
scores
|     | Section | Score |
|-----|---------|-------|
| 0   | 10am    | 59.0  |
| 1   | 9am     | 67.0  |
| 2   | 10am    | 56.0  |
| ... | ...     | ...   |
| 292 | 10am    | 30.0  |
| 293 | 11am    | 35.0  |
| 294 | 11am    | 63.0  |

295 rows × 2 columns
scores.plot(kind='hist', density=True, figsize=(10, 5), ec='w', bins=np.arange(20, 110, 5), title='Distribution of Midterm Exam Scores in DSC 10, out of 96 points');
# Total number of students who took the exam.
scores.shape[0]
295
# Calculate the number of students in each section.
scores.groupby('Section').count()
| Section | Score |
|---------|-------|
| 10am    | 104   |
| 11am    | 102   |
| 9am     | 89    |
# Calculate the average midterm score in each section.
scores.groupby('Section').mean()
| Section | Score |
|---------|-------|
| 10am    | 62.71 |
| 11am    | 68.15 |
| 9am     | 62.57 |
# Observed characteristics of the 11am section.
section_size = int(scores.groupby('Section').count().get('Score').loc['11am'])
observed_avg = scores.groupby('Section').mean().get('Score').loc['11am']
print(f'The 11am section had {section_size} students and an average midterm score of {observed_avg}.')
The 11am section had 102 students and an average midterm score of 68.1470588235294.
# Samples 102 students from the class, independent of section,
# and computes the average score.
scores.sample(section_size, replace=False).get('Score').mean()
63.1764705882353
averages = np.array([])
repetitions = 1000
for i in np.arange(repetitions):
    random_sample = scores.sample(section_size, replace=False)
    new_average = random_sample.get('Score').mean()
    averages = np.append(averages, new_average)
averages
array([63.19, 64.56, 64.02, ..., 67.15, 63.08, 63.71])
bpd.DataFrame().assign(RandomSampleAverage=averages).plot(kind='hist', bins=20,
density=True, ec='w', figsize=(10, 5),
title='Empirical Distribution of Midterm Averages for 102 Randomly Selected Students')
plt.axvline(observed_avg, color='black', linewidth=4, label='Observed Statistic')
plt.legend();
Question: What is the probability that, under the null hypothesis, a result at least as extreme as our observation occurs?
bpd.DataFrame().assign(RandomSampleAverage=averages).plot(kind='hist', bins=20,
density=True, ec='w', figsize=(10, 5),
title='Empirical Distribution of Midterm Averages for 102 Randomly Selected Students')
plt.axvline(observed_avg, color='black', linewidth=4, label='observed statistic')
plt.legend();
observed_avg
68.1470588235294
np.count_nonzero(averages >= observed_avg) / repetitions
0.004
The cutoff for the p-value is an error probability. If:

* you use a p-value cutoff of 0.05, and
* the null hypothesis happens to be true,

then there is about a 0.05 chance that your test will (incorrectly) reject the null hypothesis.

In other words, if I were to teach 20 sections of DSC 10, I would expect a "statistically significantly high" average in about one of those sections purely by chance.
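A minimal sketch of that claim, reusing the simulated averages from above (the 0.05 cutoff and the count of 20 sections are just the numbers from the sentences above):
# Averages above this cutoff count as "statistically significantly high" at the 0.05 level.
cutoff = np.percentile(averages, 95)
# Simulate 20 sections of 102 students, each drawn at random from the whole class,
# so the null hypothesis is true for every one of them.
num_significant = 0
for i in np.arange(20):
    section_avg = scores.sample(section_size, replace=False).get('Score').mean()
    if section_avg >= cutoff:
        num_significant = num_significant + 1
num_significant
On average, this count comes out to about 20 × 0.05 = 1.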
Recall from Lecture 14:
Section 197 of California's Code of Civil Procedure says,
"All persons selected for jury service shall be selected at random, from a source or sources inclusive of a representative cross section of the population of the area served by the court."
jury = bpd.DataFrame().assign(
Ethnicity=['Asian', 'Black', 'Latino', 'White', 'Other'],
Eligible=[0.15, 0.18, 0.12, 0.54, 0.01],
Panels=[0.26, 0.08, 0.08, 0.54, 0.04]
)
jury
|   | Ethnicity | Eligible | Panels |
|---|-----------|----------|--------|
| 0 | Asian     | 0.15     | 0.26   |
| 1 | Black     | 0.18     | 0.08   |
| 2 | Latino    | 0.12     | 0.08   |
| 3 | White     | 0.54     | 0.54   |
| 4 | Other     | 0.01     | 0.04   |
What do you notice? 👀
jury.plot(kind='barh', x='Ethnicity', figsize=(10, 5));
with_diffs = jury.assign(Difference=(jury.get('Panels') - jury.get('Eligible')))
with_diffs
|   | Ethnicity | Eligible | Panels | Difference |
|---|-----------|----------|--------|------------|
| 0 | Asian     | 0.15     | 0.26   | 0.11       |
| 1 | Black     | 0.18     | 0.08   | -0.10      |
| 2 | Latino    | 0.12     | 0.08   | -0.04      |
| 3 | White     | 0.54     | 0.54   | 0.00       |
| 4 | Other     | 0.01     | 0.04   | 0.03       |
with_abs_diffs = with_diffs.assign(AbsoluteDifference=np.abs(with_diffs.get('Difference')))
with_abs_diffs
|   | Ethnicity | Eligible | Panels | Difference | AbsoluteDifference |
|---|-----------|----------|--------|------------|--------------------|
| 0 | Asian     | 0.15     | 0.26   | 0.11       | 0.11               |
| 1 | Black     | 0.18     | 0.08   | -0.10      | 0.10               |
| 2 | Latino    | 0.12     | 0.08   | -0.04      | 0.04               |
| 3 | White     | 0.54     | 0.54   | 0.00       | 0.00               |
| 4 | Other     | 0.01     | 0.04   | 0.03       | 0.03               |
The Total Variation Distance (TVD) between two categorical distributions is the sum of the absolute differences of their proportions, divided by 2.
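In symbols, if the two distributions assign proportions $a_1, \dots, a_k$ and $b_1, \dots, b_k$ to the same $k$ categories, then

$$\text{TVD} = \frac{1}{2} \sum_{i=1}^{k} |a_i - b_i|.$$

Dividing by 2 avoids double counting: any proportion that some categories gain must be given up by the others, so each mismatch would otherwise be counted twice.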
with_abs_diffs
|   | Ethnicity | Eligible | Panels | Difference | AbsoluteDifference |
|---|-----------|----------|--------|------------|--------------------|
| 0 | Asian     | 0.15     | 0.26   | 0.11       | 0.11               |
| 1 | Black     | 0.18     | 0.08   | -0.10      | 0.10               |
| 2 | Latino    | 0.12     | 0.08   | -0.04      | 0.04               |
| 3 | White     | 0.54     | 0.54   | 0.00       | 0.00               |
| 4 | Other     | 0.01     | 0.04   | 0.03       | 0.03               |
with_abs_diffs.get('AbsoluteDifference').sum() / 2
0.14
def total_variation_distance(dist1, dist2):
    '''Computes the TVD between two categorical distributions,
    assuming the categories appear in the same order.'''
    return np.abs(dist1 - dist2).sum() / 2
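As a quick check, applying the function to the jury data should reproduce the 0.14 computed by hand above.
# Should match the 0.14 obtained from with_abs_diffs.
total_variation_distance(jury.get('Panels'), jury.get('Eligible'))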
jury.plot(kind='barh', x='Ethnicity', figsize=(10, 5))
plt.annotate('If you add up the total amount by which the blue bars\n are longer than the red bars, you get 0.14.', (0.08, 3.9), bbox=dict(boxstyle="larrow,pad=0.3", fc="#e5e5e5", ec="black", lw=2));
plt.annotate('If you add up the total amount by which the red bars\n are longer than the blue bars, you also get 0.14!', (0.23, 0.9), bbox=dict(boxstyle="larrow,pad=0.3", fc="#e5e5e5", ec="black", lw=2));
What is the TVD between the distributions of class standing in DSC 10 and DSC 40A?
| Class Standing | DSC 10 | DSC 40A |
|----------------|--------|---------|
| Freshman       | 0.45   | 0.15    |
| Sophomore      | 0.35   | 0.35    |
| Junior         | 0.15   | 0.35    |
| Senior+        | 0.05   | 0.15    |
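One way to check your answer, using the function defined above (the arrays below just transcribe the table, in the order Freshman, Sophomore, Junior, Senior+):
# Class standing distributions from the table above.
dsc10 = np.array([0.45, 0.35, 0.15, 0.05])
dsc40a = np.array([0.15, 0.35, 0.35, 0.15])
total_variation_distance(dsc10, dsc40a)
which works out to (0.30 + 0 + 0.20 + 0.10) / 2 = 0.3.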
Note: np.random.multinomial draws samples with replacement, even though real jury panels are drawn without replacement. However, when the sample size (1,453) is small relative to the population (the number of people in Alameda County), the resulting distributions are roughly the same whether we sample with or without replacement.
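To see why, here is a minimal sketch that samples 1,453 people without replacement from a hypothetical eligible population (the population of 1,000,000 people is an assumption for illustration, not a figure from the case):
# A made-up eligible population of 1,000,000 people with the given breakdown:
# 15% Asian, 18% Black, 12% Latino, 54% White, 1% Other.
counts = np.array([150_000, 180_000, 120_000, 540_000, 10_000])
population = np.repeat(np.arange(5), counts)
# Sample 1,453 people without replacement and compute the category proportions.
without_replacement = np.random.choice(population, 1453, replace=False)
np.bincount(without_replacement, minlength=5) / 1453
The proportions come out very close to the eligible distribution, just as they do when sampling with replacement via np.random.multinomial.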
eligible = jury.get('Eligible')
sample_distribution = np.random.multinomial(1453, eligible) / 1453
sample_distribution
array([0.15, 0.17, 0.11, 0.56, 0.01])
total_variation_distance(sample_distribution, eligible)
0.02057811424638678
tvds = np.array([])
repetitions = 10000
for i in np.arange(repetitions):
    sample_distribution = np.random.multinomial(1453, eligible) / 1453
    new_tvd = total_variation_distance(sample_distribution, eligible)
    tvds = np.append(tvds, new_tvd)
observed_tvd = total_variation_distance(jury.get('Panels'), eligible)
bpd.DataFrame().assign(tvds=tvds).plot(kind='hist', density=True, bins=20, ec='w', figsize=(10, 5),
title='Empirical Distribution of TVD Between Eligible Population and Random Sample')
plt.axvline(observed_tvd, color='black', linewidth=4, label='Observed Statistic')
plt.legend();
np.count_nonzero(tvds >= observed_tvd) / repetitions
0.0
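A p-value of 0.0 means that none of the 10,000 simulated TVDs were as large as the observed TVD, so the true probability isn't exactly 0, just smaller than we can estimate with 10,000 repetitions. A quick comparison makes this concrete:
# The observed TVD exceeds even the largest simulated TVD.
observed_tvd, tvds.max()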
jury.assign(RandomSample=sample_distribution).plot(kind='barh', x='Ethnicity', figsize=(10, 5),
title = "A Random Sample is Usually Similar to the Eligible Population");