# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
Consider the pair of hypotheses "this coin is fair" and "this coin is unfair."
Which is the null hypothesis?
Consider the pair of hypotheses "this coin is fair" and "this coin is unfair." Which test statistic(s) could we use to test these hypotheses?
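Some candidates, sketched with a hypothetical outcome (the 230-heads/170-tails split below is made up for illustration, not part of the original example):
# Hypothetical outcome, for illustration only: 230 heads and 170 tails in 400 flips.
num_heads, num_tails = 230, 170
num_flips = num_heads + num_tails
# A few candidate test statistics:
print(num_heads)                                           # number of heads
print(abs(num_heads - num_flips / 2))                      # |number of heads - expected number of heads|
print(abs(num_heads / num_flips - num_tails / num_flips))  # |prop. heads - prop. tails|, used below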
Question for today: Is there a formal definition for what we mean by "consistent"?
fair_coin = [0.5, 0.5]
repetitions = 10000
test_stats = np.array([])
for i in np.arange(repetitions):
    coins = np.random.multinomial(400, fair_coin)
    test_stat = abs(coins[0]/400 - coins[1]/400)
    test_stats = np.append(test_stats, test_stat)
bpd.DataFrame().assign(test_stats = test_stats).plot(kind='hist', bins=20,
density=True, ec='w', figsize=(10, 5),
title='Empirical Distribution of the Absolute Difference in Proportion of Heads and Proportion of Tails');
plt.legend();
Small values of the observed statistic should make you side with the null hypothesis, that the coin is fair. But how small?
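One way to make this precise is to ask how often a fair coin would produce a statistic at least as large as the one we observed. As a sketch, reusing the hypothetical 230-heads outcome from above (not data from the original example):
# Hypothetical observed statistic: 230 heads and 170 tails in 400 flips.
observed_stat = abs(230/400 - 170/400)
# Estimated probability that a fair coin produces a statistic at least this extreme.
np.count_nonzero(test_stats >= observed_stat) / repetitions
The smaller this probability, the stronger the evidence against the null hypothesis; the midterm example below makes this idea, the p-value, concrete.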
# Midterm scores from DSC 10, Winter 2023.
scores = bpd.read_csv('data/wi23-midterm-scores.csv')
scores
|     | Section | Score |
|-----|---------|-------|
| 0   | 10am    | 59.0  |
| 1   | 9am     | 67.0  |
| 2   | 10am    | 56.0  |
| ... | ...     | ...   |
| 292 | 10am    | 30.0  |
| 293 | 11am    | 35.0  |
| 294 | 11am    | 63.0  |

295 rows × 2 columns
scores.plot(kind='hist', density=True, figsize=(10, 5), ec='w', bins=np.arange(20, 110, 5), title='Distribution of Midterm Exam Scores in DSC 10, out of 96 points');
# Total number of students who took the exam.
scores.shape[0]
295
# Calculate the number of students in each section.
scores.groupby('Section').count()
| Section | Score |
|---------|-------|
| 10am    | 104   |
| 11am    | 102   |
| 9am     | 89    |
# Calculate the average midterm score in each section.
scores.groupby('Section').mean()
| Section | Score |
|---------|-------|
| 10am    | 62.71 |
| 11am    | 68.15 |
| 9am     | 62.57 |
# Observed characteristics of the 11am section.
section_size = int(scores.groupby('Section').count().get('Score').loc['11am'])
observed_avg = scores.groupby('Section').mean().get('Score').loc['11am']
print(f'The 11am section had {section_size} students and an average midterm score of {observed_avg}.')
The 11am section had 102 students and an average midterm score of 68.1470588235294.
# Samples 102 students from the class, independent of section,
# and computes the average score.
scores.sample(section_size, replace=False).get('Score').mean()
63.1764705882353
averages = np.array([])
repetitions = 1000
for i in np.arange(repetitions):
    random_sample = scores.sample(section_size, replace=False)
    new_average = random_sample.get('Score').mean()
    averages = np.append(averages, new_average)
averages
array([63.19, 64.56, 64.02, ..., 67.15, 63.08, 63.71])
bpd.DataFrame().assign(RandomSampleAverage=averages).plot(kind='hist', bins=20,
density=True, ec='w', figsize=(10, 5),
title='Empirical Distribution of Midterm Averages for 102 Randomly Selected Students')
plt.axvline(observed_avg, color='black', linewidth=4, label='Observed Statistic')
plt.legend();
Question: What is the probability that, under the null hypothesis, a result at least as extreme as our observation occurs?
bpd.DataFrame().assign(RandomSampleAverage=averages).plot(kind='hist', bins=20,
density=True, ec='w', figsize=(10, 5),
title='Empirical Distribution of Midterm Averages for 102 Randomly Selected Students')
plt.axvline(observed_avg, color='black', linewidth=4, label='observed statistic')
plt.legend();
observed_avg
68.1470588235294
np.count_nonzero(averages >= observed_avg) / repetitions
0.004
The cutoff for the p-value is an error probability. If:

* you use a p-value cutoff of 0.05, and
* the null hypothesis happens to be true,

then there is about a 0.05 chance that your test will (incorrectly) reject the null hypothesis.

In other words, if I were to teach 20 sections of DSC 10, I would expect a "statistically significantly high" average in about one of those sections purely by chance.
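A minimal sketch of that claim, reusing the simulated averages from above (the 0.05 cutoff and the count of 20 sections are just the numbers from the sentences above):
# Averages above this cutoff count as "statistically significantly high" at the 0.05 level.
cutoff = np.percentile(averages, 95)
# Simulate 20 sections of 102 students, each drawn at random from the whole class,
# so the null hypothesis is true for every one of them.
num_significant = 0
for i in np.arange(20):
    section_avg = scores.sample(section_size, replace=False).get('Score').mean()
    if section_avg >= cutoff:
        num_significant = num_significant + 1
num_significant
On average, this count comes out to about 20 × 0.05 = 1.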
Recall from Lecture 14:
Section 197 of California's Code of Civil Procedure says,
"All persons selected for jury service shall be selected at random, from a source or sources inclusive of a representative cross section of the population of the area served by the court."
jury = bpd.DataFrame().assign(
Ethnicity=['Asian', 'Black', 'Latino', 'White', 'Other'],
Eligible=[0.15, 0.18, 0.12, 0.54, 0.01],
Panels=[0.26, 0.08, 0.08, 0.54, 0.04]
)
jury
|   | Ethnicity | Eligible | Panels |
|---|-----------|----------|--------|
| 0 | Asian     | 0.15     | 0.26   |
| 1 | Black     | 0.18     | 0.08   |
| 2 | Latino    | 0.12     | 0.08   |
| 3 | White     | 0.54     | 0.54   |
| 4 | Other     | 0.01     | 0.04   |
What do you notice? 👀
jury.plot(kind='barh', x='Ethnicity', figsize=(10, 5));
with_diffs = jury.assign(Difference=(jury.get('Panels') - jury.get('Eligible')))
with_diffs
|   | Ethnicity | Eligible | Panels | Difference |
|---|-----------|----------|--------|------------|
| 0 | Asian     | 0.15     | 0.26   | 0.11       |
| 1 | Black     | 0.18     | 0.08   | -0.10      |
| 2 | Latino    | 0.12     | 0.08   | -0.04      |
| 3 | White     | 0.54     | 0.54   | 0.00       |
| 4 | Other     | 0.01     | 0.04   | 0.03       |
with_abs_diffs = with_diffs.assign(AbsoluteDifference=np.abs(with_diffs.get('Difference')))
with_abs_diffs
|   | Ethnicity | Eligible | Panels | Difference | AbsoluteDifference |
|---|-----------|----------|--------|------------|--------------------|
| 0 | Asian     | 0.15     | 0.26   | 0.11       | 0.11               |
| 1 | Black     | 0.18     | 0.08   | -0.10      | 0.10               |
| 2 | Latino    | 0.12     | 0.08   | -0.04      | 0.04               |
| 3 | White     | 0.54     | 0.54   | 0.00       | 0.00               |
| 4 | Other     | 0.01     | 0.04   | 0.03       | 0.03               |
The Total Variation Distance (TVD) between two categorical distributions is the sum of the absolute differences of their proportions, divided by 2.
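In symbols, if the two distributions assign proportions $a_1, \dots, a_k$ and $b_1, \dots, b_k$ to the same $k$ categories, then

$$\text{TVD} = \frac{1}{2} \sum_{i=1}^{k} |a_i - b_i|.$$

Dividing by 2 avoids double counting: any proportion that some categories gain must be given up by the others, so each mismatch would otherwise be counted twice.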
with_abs_diffs
|   | Ethnicity | Eligible | Panels | Difference | AbsoluteDifference |
|---|-----------|----------|--------|------------|--------------------|
| 0 | Asian     | 0.15     | 0.26   | 0.11       | 0.11               |
| 1 | Black     | 0.18     | 0.08   | -0.10      | 0.10               |
| 2 | Latino    | 0.12     | 0.08   | -0.04      | 0.04               |
| 3 | White     | 0.54     | 0.54   | 0.00       | 0.00               |
| 4 | Other     | 0.01     | 0.04   | 0.03       | 0.03               |
with_abs_diffs.get('AbsoluteDifference').sum() / 2
0.14
def total_variation_distance(dist1, dist2):
    '''Computes the TVD between two categorical distributions,
    assuming the categories appear in the same order.'''
    return np.abs(dist1 - dist2).sum() / 2
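As a quick check, applying the function to the jury data should reproduce the 0.14 computed by hand above.
# Should match the 0.14 obtained from with_abs_diffs.
total_variation_distance(jury.get('Panels'), jury.get('Eligible'))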
jury.plot(kind='barh', x='Ethnicity', figsize=(10, 5))
plt.annotate('If you add up the total amount by which the blue bars\n are longer than the red bars, you get 0.14.', (0.08, 3.9), bbox=dict(boxstyle="larrow,pad=0.3", fc="#e5e5e5", ec="black", lw=2));
plt.annotate('If you add up the total amount by which the red bars\n are longer than the blue bars, you also get 0.14!', (0.23, 0.9), bbox=dict(boxstyle="larrow,pad=0.3", fc="#e5e5e5", ec="black", lw=2));
What is the TVD between the distributions of class standing in DSC 10 and DSC 40A?
| Class Standing | DSC 10 | DSC 40A |
|----------------|--------|---------|
| Freshman       | 0.45   | 0.15    |
| Sophomore      | 0.35   | 0.35    |
| Junior         | 0.15   | 0.35    |
| Senior+        | 0.05   | 0.15    |
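One way to check your answer, using the function defined above (the arrays below just transcribe the table, in the order Freshman, Sophomore, Junior, Senior+):
# Class standing distributions from the table above.
dsc10 = np.array([0.45, 0.35, 0.15, 0.05])
dsc40a = np.array([0.15, 0.35, 0.35, 0.15])
total_variation_distance(dsc10, dsc40a)
which works out to (0.30 + 0 + 0.20 + 0.10) / 2 = 0.3.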
Note: np.random.multinomial draws samples with replacement, even though real jury panels are drawn without replacement. However, when the sample size (1,453) is small relative to the population (the number of people in Alameda County), the resulting distributions are roughly the same whether we sample with or without replacement.
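To see why, here is a minimal sketch that samples 1,453 people without replacement from a hypothetical eligible population (the population of 1,000,000 people is an assumption for illustration, not a figure from the case):
# A made-up eligible population of 1,000,000 people with the given breakdown:
# 15% Asian, 18% Black, 12% Latino, 54% White, 1% Other.
counts = np.array([150_000, 180_000, 120_000, 540_000, 10_000])
population = np.repeat(np.arange(5), counts)
# Sample 1,453 people without replacement and compute the category proportions.
without_replacement = np.random.choice(population, 1453, replace=False)
np.bincount(without_replacement, minlength=5) / 1453
The proportions come out very close to the eligible distribution, just as they do when sampling with replacement via np.random.multinomial.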
eligible = jury.get('Eligible')
sample_distribution = np.random.multinomial(1453, eligible) / 1453
sample_distribution
array([0.15, 0.17, 0.11, 0.56, 0.01])
total_variation_distance(sample_distribution, eligible)
0.02057811424638678
tvds = np.array([])
repetitions = 10000
for i in np.arange(repetitions):
    sample_distribution = np.random.multinomial(1453, eligible) / 1453
    new_tvd = total_variation_distance(sample_distribution, eligible)
    tvds = np.append(tvds, new_tvd)
observed_tvd = total_variation_distance(jury.get('Panels'), eligible)
bpd.DataFrame().assign(tvds=tvds).plot(kind='hist', density=True, bins=20, ec='w', figsize=(10, 5),
title='Empirical Distribution of TVD Between Eligible Population and Random Sample')
plt.axvline(observed_tvd, color='black', linewidth=4, label='Observed Statistic')
plt.legend();
np.count_nonzero(tvds >= observed_tvd) / repetitions
0.0
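A p-value of 0.0 means that none of the 10,000 simulated TVDs were as large as the observed TVD, so the true probability isn't exactly 0, just smaller than we can estimate with 10,000 repetitions. A quick comparison makes this concrete:
# The observed TVD exceeds even the largest simulated TVD.
observed_tvd, tvds.max()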
jury.assign(RandomSample=sample_distribution).plot(kind='barh', x='Ethnicity', figsize=(10, 5),
title = "A Random Sample is Usually Similar to the Eligible Population");