import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')
plt.rc('figure', dpi=100, figsize=(10, 5))
plt.rc('font', size=12)
Throughout this lecture, we will look at how to "speed up" our hypothesis tests by avoiding `for`-loops entirely.
Recall, a null hypothesis is an initial or default belief as to how data were generated.
Often, but not always, the null hypothesis states there is no association or difference between variables or subpopulations, and that any observed differences were due to random chance.
For the alternative hypothesis "the coin was biased towards heads", we could use the number of heads or the proportion of heads as our test statistic.
If these test statistics are large, it means there were many heads. If they are small, it means there were few heads.
For the alternative hypothesis "the coin was biased", we could use the absolute difference between the number of heads and the number of heads expected under the null (half the number of flips).
If this test statistic is large, it means that there were many more heads than expected, or many fewer heads than expected. If this test statistic is small, it means that the number of heads was close to expected.
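As a concrete sketch, here is how both test statistics could be computed from an array of flips (the array `sample_flips` below is made up purely for illustration):

# A made-up sequence of flips, just to illustrate the two test statistics
sample_flips = np.array(['H', 'T', 'H', 'H', 'T', 'H', 'T', 'H'])

# "Biased towards heads": the number of heads (or, equivalently, the proportion of heads)
num_heads = (sample_flips == 'H').sum()
prop_heads = num_heads / len(sample_flips)

# "Biased (in either direction)": the absolute difference between the number of heads
# and the number expected under the null (half of the flips)
abs_difference = abs(num_heads - len(sample_flips) / 2)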
After choosing a test statistic, we need to compute the distribution of the test statistic, under the assumption that the null hypothesis is true ("under the null").
Below, we create a DataFrame of coin flips.
np.random.seed(42)
flips = pd.DataFrame(np.random.choice(['H', 'T'], p=[0.55, 0.45], size=(114, 1)), columns=['result'])
flips.head()
 | result
---|---
0 | H
1 | T
2 | T
3 | T
4 | H
flips.shape
(114, 1)
flips['result'].value_counts()
H    68
T    46
Name: result, dtype: int64
There were 114 flips, 68 of which were heads.
Steps:
- Flip a fair coin 114 times and count the number of heads (our test statistic).
- Repeat 100,000 times, recording the test statistic each time.
# 100,000 times, we want to flip a coin 114 times
results = []
for _ in range(100000):
simulation = np.random.choice(['H', 'T'], size=114)
sim_heads = (simulation == 'H').sum() # Test statistic
results.append(sim_heads)
Each entry in `results` is the number of heads in 114 simulated coin flips.
results[:10]
[54, 51, 55, 52, 56, 66, 54, 66, 57, 57]
pd.Series(results).plot(kind='hist',
density=True,
bins=np.arange(35, 76, 1),
ec='w',
title='Number of Heads in 114 Flips of a Fair Coin');
obs = (flips['result'] == 'H').sum()
plt.axvline(x=obs, color='red', linewidth=2);
Question: Do you think the coin was fair?
(np.array(results) >= obs).mean()
0.02481
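The value above is a p-value: the proportion of simulations in which the test statistic was at least as large as the observed one. As a sketch of how one might turn this into a decision (the 0.05 cutoff is a conventional choice, not something dictated by the data):

# p-value: fraction of simulated head counts at least as large as the observed 68
p_value = (np.array(results) >= obs).mean()

# Compare to a conventional 0.05 significance level (an arbitrary but common cutoff).
# Given the p-value of roughly 0.025 above, this evaluates to True.
p_value < 0.05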
- The simulation above used a `for`-loop.
- However, we can get rid of the `for`-loop entirely by using the `size` argument in `np.random.choice` (and in other `np` random functions).
- Below, we run the same simulation without a `for`-loop.
- Idea: Replace `'H'` with 1 and `'T'` with 0, so that the number of heads in each simulation is just a sum.

flips.head()
 | result
---|---
0 | H
1 | T
2 | T
3 | T
4 | H
flips_fast = flips.replace({'H': 1, 'T': 0})
flips_fast.head()
 | result
---|---
0 | 1
1 | 0
2 | 0
3 | 0
4 | 1
The following function takes in a value of `N` and flips a fair coin `N * 114` times.
def flip_114(N):
return np.random.choice([1, 0], size=(N, 114))
flip_114(50)
array([[1, 0, 0, ..., 0, 0, 1],
       [0, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 1, 1, 0],
       ...,
       [0, 1, 0, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 0, 1],
       [0, 1, 0, ..., 0, 1, 0]])
flip_114(50).shape
(50, 114)
%%time
# Flips a fair coin 100,000 * 114 times
simulations = pd.DataFrame(flip_114(100000))
# Compute test statistics
# Note that axis=1 will take the sum of each row of 114, which is what we want
results_fast = simulations.sum(axis=1)
CPU times: user 75.2 ms, sys: 37.2 ms, total: 112 ms
Wall time: 113 ms
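For reference, wrapping the simulations in a DataFrame isn't required; here is a minimal NumPy-only sketch of the same computation (the variable names are ours, chosen to avoid clobbering the ones above):

# Flip a fair coin 100,000 * 114 times, then count the heads in each row of 114
simulations_arr = flip_114(100000)
results_fast_arr = simulations_arr.sum(axis=1)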
pd.Series(results_fast).plot(kind='hist',
density=True,
bins=np.arange(35, 76, 1),
ec='w',
title='Number of Heads in 114 Flips of a Fair Coin');
plt.axvline(x=obs, color='red', linewidth=2);
eth = pd.DataFrame([['Asian', 0.15, 0.51],
['Black', 0.05, 0.02],
['Latino', 0.39, 0.16],
['White', 0.35, 0.2],
['Other', 0.06, 0.11]],
columns=['Ethnicity', 'California', 'UCSD']).set_index('Ethnicity')
eth
Ethnicity | California | UCSD
---|---|---
Asian | 0.15 | 0.51
Black | 0.05 | 0.02
Latino | 0.39 | 0.16
White | 0.35 | 0.20
Other | 0.06 | 0.11
Let's establish our hypotheses.
- Null Hypothesis: UCSD students were selected at random from the ethnic distribution of California.
- Alternative Hypothesis: UCSD students were not selected at random from the ethnic distribution of California.
eth.plot(kind='barh', title='Ethnic Distribution of California and UCSD');
The total variation distance (TVD) is a test statistic that describes the distance between two categorical distributions.
If $A = [a_1, a_2, ..., a_k]$ and $B = [b_1, b_2, ..., b_k]$ are both categorical distributions, then the TVD between $A$ and $B$ is
$$\text{TVD}(A, B) = \frac{1}{2} \sum_{i = 1}^k |a_i - b_i|$$

def total_variation_distance(dist1, dist2):
'''Given two categorical distributions,
both sorted with same categories, calculates the TVD'''
return np.sum(np.abs(dist1 - dist2)) / 2
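As a quick sanity check of the formula, here is the TVD between two small made-up distributions (the numbers are chosen just for illustration):

# Two made-up categorical distributions over the same three categories
dist_a = np.array([0.5, 0.3, 0.2])
dist_b = np.array([0.2, 0.3, 0.5])

# |0.5 - 0.2| + |0.3 - 0.3| + |0.2 - 0.5| = 0.6, and half of that is 0.3
total_variation_distance(dist_a, dist_b)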
Below, we can compute the TVD between California's ethnic distribution and UCSD's ethnic distribution.
observed_tvd = total_variation_distance(eth['California'], eth['UCSD'])
observed_tvd
0.41000000000000003
The issue is that, on its own, we don't know whether this is a large value or a small value.
To conduct our hypothesis test, we repeatedly draw random samples of 30,000 people from California's ethnic distribution, compute the TVD between each random sample and California's distribution, and then see whether our observed TVD of 0.41 looks typical of those simulated TVDs.
To sample from a categorical distribution, we use `np.random.multinomial`.
np.random.multinomial(10, [0.5, 0.5])
array([9, 1])
eth['California']
Ethnicity
Asian     0.15
Black     0.05
Latino    0.39
White     0.35
Other     0.06
Name: California, dtype: float64
np.random.multinomial(30000, eth['California'])
array([ 4438, 1555, 11693, 10610, 1704])
np.random.multinomial(30000, eth['California']) / 30000
array([0.14996667, 0.04803333, 0.3917 , 0.3493 , 0.061 ])
Now, we need to repeat the process of creating samples many, many times.
num_reps = 10000
tvds = []
for i in np.arange(num_reps):
random_sample = np.random.multinomial(30000, eth['California']) / 30000
new_tvd = total_variation_distance(eth['California'], random_sample)
tvds.append(new_tvd)
pd.Series(tvds).plot(kind='hist',
density=True,
ec='w',
title='Simulated TVDs');
plt.axvline(x=observed_tvd, color='red', linewidth=2);
Again, we can get rid of the loop by using the `size` argument!
eth_draws = np.random.multinomial(30000, eth['California'], size=num_reps) / 30000
eth_draws
array([[0.1509    , 0.0478    , 0.3896    , 0.352     , 0.0597    ],
       [0.15026667, 0.0525    , 0.38723333, 0.35163333, 0.05836667],
       [0.15193333, 0.05166667, 0.38633333, 0.3499    , 0.06016667],
       ...,
       [0.15193333, 0.0506    , 0.39486667, 0.3439    , 0.0587    ],
       [0.14936667, 0.0477    , 0.39323333, 0.35003333, 0.05966667],
       [0.15036667, 0.0492    , 0.38883333, 0.354     , 0.0576    ]])
- Each row of `eth_draws` sums to 1, because each row is a simulated categorical distribution.
- `eth_draws` has `num_reps` (10,000) rows.

eth_draws.shape
(10000, 5)
Our previous `total_variation_distance` function won't work with our 2D array `eth_draws`, so instead we compute each row's TVD directly, summing across `axis=1`:
tvds_fast = np.sum(np.abs(eth_draws - eth['California'].to_numpy()), axis=1) / 2
tvds_fast
array([0.0029 , 0.0044 , 0.00376667, ..., 0.0074 , 0.00326667, 0.00436667])
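Alternatively, we could generalize the original function with an `axis` argument so that it handles a 2D array of distributions; a sketch (the function name and the `axis` parameter are ours, not part of the original function):

def total_variation_distance_2d(dist1, dist2, axis=None):
    '''Like total_variation_distance, but with an optional axis argument,
    so it can compute one TVD per row of a 2D array of distributions.'''
    return np.sum(np.abs(dist1 - dist2), axis=axis) / 2

# One TVD per simulated sample; should match tvds_fast above
total_variation_distance_2d(eth_draws, eth['California'].to_numpy(), axis=1)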
pd.Series(tvds_fast).plot(kind='hist',
density=True,
ec='w',
title='Simulated TVDs (without using a loop)');
# plt.axvline(x=observed_tvd, color='red', linewidth=2);
To assess whether an "observed sample" was drawn randomly from a known categorical distribution, use the TVD as the test statistic, simulate many samples from the known distribution with `np.random.multinomial`, and compare the observed TVD to the distribution of simulated TVDs.
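Putting that recipe together, here is a minimal sketch of the whole procedure as one function (the function name, parameter names, and default number of repetitions are ours, for illustration):

def tvd_p_value(observed_dist, known_dist, sample_size, num_reps=10000):
    '''Simulates num_reps samples of size sample_size from known_dist, computes
    the TVD of each simulated sample to known_dist, and returns the fraction of
    simulated TVDs at least as large as the observed TVD.'''
    observed = np.sum(np.abs(observed_dist - known_dist)) / 2
    draws = np.random.multinomial(sample_size, known_dist, size=num_reps) / sample_size
    simulated = np.sum(np.abs(draws - known_dist), axis=1) / 2
    return (simulated >= observed).mean()

# For example, with the distributions from above (30,000 is the sample size used earlier)
tvd_p_value(eth['UCSD'].to_numpy(), eth['California'].to_numpy(), 30000)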
import seaborn as sns
penguins = sns.load_dataset('penguins').dropna()
penguins.head()
 | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex
---|---|---|---|---|---|---|---
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female
5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | Male
We will learn about the `groupby` method next week.
penguins.groupby('island')['bill_length_mm'].agg(['mean', 'count'])
island | mean | count
---|---|---
Biscoe | 45.248466 | 163
Dream | 44.221951 | 123
Torgersen | 39.038298 | 47
It appears that penguins on Torgersen Island have shorter bills on average than penguins on other islands. Could this have happened due to random chance?
num_reps
10000
averages = []
for i in np.arange(num_reps):
random_sample = penguins.sample(47)
new_average = random_sample['bill_length_mm'].mean()
averages.append(new_average)
pd.Series(averages).plot(kind='hist',
density=True,
bins=30,
ec='w',
title='Average Bill Lengths in Samples of Size 47');
observed_average = penguins.groupby('island').mean().loc['Torgersen', 'bill_length_mm']
plt.axvline(x=observed_average, color='red', linewidth=2);
It doesn't look like the average bill length of Torgersen Island penguins came from the null distribution of average bill lengths.
Again (again), we can get rid of the loop by using the `size` argument!
np.random.choice(penguins['bill_length_mm'], size=(5, 47))
array([[35.7, 51. , 50.8, 53.5, 43.5, 38.1, 42.3, 32.1, 36.4, 42.5, 49.3, 35. , 42.2, 40.3, 42.9, 51.4, 35.2, 47.5, 43.6, 45.8, 45.7, 48.7, 45.4, 51.3, 40.8, 55.9, 46. , 51.3, 49.5, 51.5, 50. , 49. , 40.8, 45.7, 46.8, 50.5, 49.6, 43.5, 38.6, 46.9, 51.1, 49.5, 41.1, 50.5, 49.5, 42.8, 50.4], [46.5, 49. , 45.8, 48.1, 49. , 40.9, 41.3, 41.1, 38.2, 39. , 38.9, 39.2, 52. , 48.5, 38.1, 52.1, 44.9, 36.9, 40.7, 36. , 46.5, 47.5, 41.4, 37.6, 46.7, 41.8, 38.8, 50.1, 41.5, 50.7, 47.2, 37.7, 37. , 35.9, 46. , 32.1, 42.4, 43.5, 49.9, 39.2, 39. , 41.7, 50.2, 48.2, 43.5, 51.1, 46.4], [51.5, 38.6, 48.7, 36.2, 40.2, 43.5, 46.4, 48.8, 52.8, 41.4, 52.2, 46.1, 38.2, 47. , 41.1, 40.9, 47.6, 46.2, 50.2, 42.2, 39.6, 49.1, 46.7, 49.5, 45.1, 50.2, 49.3, 49.5, 46. , 50.8, 37.6, 43.3, 38.3, 46.5, 50.7, 36.3, 39.6, 54.2, 49. , 46.7, 39.7, 41.1, 45.2, 37. , 44. , 41.1, 54.2], [36.7, 51.3, 41.1, 36. , 42.9, 41.6, 46.8, 47. , 45.5, 39.2, 36.6, 45.8, 46. , 40.6, 43.5, 36.7, 49.5, 43.2, 50.8, 43.8, 46.5, 46. , 51.1, 37.9, 50. , 35.1, 50.1, 47.7, 39.5, 49. , 50.6, 51.4, 42.7, 42.5, 35.1, 46.2, 48.4, 45.9, 46.4, 36.6, 41.8, 37.2, 48.4, 42.3, 45.5, 51.5, 48.8], [42.2, 53.4, 45.5, 49. , 46.5, 45.5, 35.7, 50.2, 37. , 47.7, 37.9, 43.3, 36.6, 38.1, 47.2, 47.2, 45.5, 36. , 48.7, 52.5, 42.2, 34.5, 48.7, 36. , 42.2, 39.6, 44.1, 34.6, 50. , 50.8, 39.3, 38.2, 43.5, 37.6, 37. , 45.2, 42.5, 41.1, 36.3, 46.8, 50.5, 39.6, 45.2, 50. , 46.2, 35.7, 39.2]])
averages_fast = np.random.choice(penguins['bill_length_mm'], size=(num_reps, 47)).mean(axis=1)
averages_fast
array([44.41276596, 44.35319149, 44.73404255, ..., 43.35957447, 43.25957447, 44.87021277])
pd.Series(averages_fast).plot(kind='hist',
density=True,
bins=30,
ec='w',
title='Average Bill Lengths in Samples of Size 47 (without using a loop)');
observed_average = penguins.groupby('island').mean().loc['Torgersen', 'bill_length_mm']
plt.axvline(x=observed_average, color='red', linewidth=2);
We get the same result, but much more quickly!
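As with the coin example, we can also attach a p-value to this picture: the fraction of simulated sample means that were at least as small as Torgersen Island's observed average (a sketch, using the arrays defined above):

# Fraction of simulated averages (samples of 47 bill lengths) at or below the observed average
(averages_fast <= observed_average).mean()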
Faced with a question about the data raised by an observation...
We looked at three key hypothesis testing examples:
- Was our coin fair, or biased towards heads?
- Was UCSD's ethnic distribution drawn at random from California's ethnic distribution?
- Do the bill lengths of Torgersen Island penguins look like a random sample of all penguins' bill lengths?
Next time: Data granularity and the `groupby` DataFrame method.