import pandas as pd
import numpy as np
import os
import seaborn as sns
import plotly.express as px
pd.options.plotting.backend = 'plotly'
from ipywidgets import interact
We'll look at many examples, and cover the necessary theory along the way.
"Standard" hypothesis testing helps us answer questions of the form:
I have a population distribution, and I have one sample. Does this sample look like it was drawn from the population?
Let's recap the example we saw last time.
The number of heads in 100 flips of a fair coin follows the $\text{Binomial(100, 0.5)}$ distribution, in which
$$P(\text{# heads} = k) = {100 \choose k} (0.5)^k (1-0.5)^{100-k} = {100 \choose k} 0.5^{100}$$

from scipy.special import comb

def p_k_heads(k):
    return comb(100, k) * (0.5) ** 100
The probability that we see at least 59 heads is then:
sum([p_k_heads(k) for k in range(59, 101)])
0.04431304005703377
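As a sanity check, we can compute the same tail probability with scipy.stats (a quick check, not part of the original lecture code): binom.sf(58, 100, 0.5) evaluates $P(\text{# heads} > 58) = P(\text{# heads} \geq 59)$.
from scipy import stats

# Survival function of the Binomial(100, 0.5) distribution: P(X > 58) = P(X >= 59).
stats.binom.sf(58, 100, 0.5)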
Let's look at this distribution visually.
plot_df = pd.DataFrame().assign(k = range(101))
plot_df['p_k'] = p_k_heads(plot_df['k'])
plot_df['color'] = plot_df['k'].apply(lambda k: 'orange' if k >= 59 else 'blue')
fig = plot_df.plot(kind='bar', x='k', y='p_k', color='color', width=1000)
fig.add_annotation(text='This orange area is called the p-value!', x=77, y=0.008, showarrow=False)
We saw that, in 100 flips of a fair coin, $P(\text{# heads} \geq 59)$ is only ~4.4%.
Now let's approximate this probability through simulation. First, we need to figure out how to perform one instance of the experiment – that is, how to flip 100 coins once. Recall that to sample from a categorical distribution, we use np.random.multinomial.
# Flipping a fair coin 100 times.
# Interpret the result as [Heads, Tails].
np.random.multinomial(100, [0.5, 0.5])
array([46, 54])
Then, we can repeat it a large number of times.
# 100,000 times, we want to flip a coin 100 times.
results = []
for _ in range(100_000):
    num_heads = np.random.multinomial(100, [0.5, 0.5])[0]
    results.append(num_heads)
Each entry in results is the number of heads in 100 simulated coin flips.
results[:10]
[52, 42, 49, 46, 59, 54, 44, 45, 50, 48]
fig = px.histogram(pd.DataFrame(results), x=0, nbins=50, histnorm='probability',
title='Empirical Distribution of the Number of Heads in 100 Flips of a Fair Coin')
fig.add_vline(x=59, line_color='red')
fig.update_layout(xaxis_range=[0, 100])
Again, we can compute the p-value: the probability, under the null, of seeing a result at least as extreme as the observed one.
(np.array(results) >= 59).mean()
0.0451
Note that this number is close, but not identical, to the true p-value we found before. That's because we computed this p-value using a simulation, and hence an approximation.
A mantra so far in this course has been avoid for-loops whenever possible. That applies here, too.

np.random.multinomial (and np.random.choice) accepts a size argument. By providing size=100_000, we can tell numpy to flip a coin 100 times, 100_000 times over, without needing a for-loop!
# An array with 100000 rows and 2 columns.
np.random.multinomial(100, [0.5, 0.5], size=100_000)
array([[49, 51],
       [57, 43],
       [49, 51],
       ...,
       [52, 48],
       [52, 48],
       [50, 50]])
# Just the first column of the above array. Note the iloc-like syntax.
np.random.multinomial(100, [0.5, 0.5], size=100_000)[:, 0]
array([49, 44, 48, ..., 48, 55, 54])
%%time
faster_results = np.random.multinomial(100, [0.5, 0.5], size=100_000)[:, 0]
CPU times: user 9.33 ms, sys: 875 µs, total: 10.2 ms
Wall time: 8.76 ms
The above approach is orders of magnitude faster than the for-loop approach! With that said, you are still allowed to use for-loops for hypothesis (and permutation) tests on assignments.
%%time
# 100,000 times, we want to flip a coin 100 times.
results = []
for _ in range(100_000):
    num_heads = np.random.multinomial(100, [0.5, 0.5])[0]
    results.append(num_heads)
CPU times: user 285 ms, sys: 7.34 ms, total: 292 ms
Wall time: 286 ms
For the alternative hypothesis "the coin is biased", one test statistic we could use is $|N_H - \frac{N}{2}|$, the absolute difference from the expected number of heads.

We'd like to choose a test statistic such that large values of the test statistic correspond to one hypothesis, and small values correspond to the other. In other words, we'll try to avoid "two-tailed tests". Rough rule of thumb: if the alternative hypothesis is "A > B", the test statistic should measure a difference (no absolute value); if the alternative hypothesis is "A and B are different", the test statistic should measure a distance (with an absolute value).
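For instance, reusing the simulated results from earlier (a sketch, not part of the original lecture code), the p-value for the "the coin is biased" alternative with this absolute-difference statistic is:
# Test statistic: absolute difference from the expected 50 heads.
abs_diffs = np.abs(np.array(results) - 50)

# Observed statistic, for the 59 observed heads.
observed_abs_diff = abs(59 - 50)

# p-value: proportion of simulated statistics at least as extreme as observed.
(abs_diffs >= observed_abs_diff).mean()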
eth = pd.DataFrame([['Asian', 0.15, 0.51],
['Black', 0.05, 0.02],
['Latino', 0.39, 0.16],
['White', 0.35, 0.2],
['Other', 0.06, 0.11]],
columns=['Ethnicity', 'California', 'UCSD']).set_index('Ethnicity')
eth
| Ethnicity | California | UCSD |
|---|---|---|
| Asian | 0.15 | 0.51 |
| Black | 0.05 | 0.02 |
| Latino | 0.39 | 0.16 |
| White | 0.35 | 0.20 |
| Other | 0.06 | 0.11 |
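Both columns are categorical distributions, so each should sum to 1 (a quick check, not in the original lecture):
# Each column of eth should sum to 1.
eth.sum()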
Let's establish our hypotheses.

Null Hypothesis: UCSD students were selected at random from the population of California residents.

Alternative Hypothesis: UCSD students were not selected at random from the population of California residents.
eth.plot(kind='barh', title='Ethnic Distribution of California and UCSD', barmode='group')
The total variation distance (TVD) is a test statistic that describes the distance between two categorical distributions.
If $A = [a_1, a_2, ..., a_k]$ and $B = [b_1, b_2, ..., b_k]$ are both categorical distributions, then the TVD between $A$ and $B$ is
$$\text{TVD}(A, B) = \frac{1}{2} \sum_{i = 1}^k |a_i - b_i|$$

def total_variation_distance(dist1, dist2):
    '''Given two categorical distributions,
    both sorted with the same categories, computes the TVD.'''
    return np.sum(np.abs(dist1 - dist2)) / 2
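For intuition, here's a quick worked example (not from the original lecture): the TVD between a fair coin's distribution and that of a coin that always lands heads is $\frac{1}{2}(|0.5 - 1| + |0.5 - 0|) = 0.5$.
# TVD between a fair coin and a coin that always lands heads.
total_variation_distance(np.array([0.5, 0.5]), np.array([1.0, 0.0]))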
Let's compute the TVD between UCSD's ethnic distribution and California's ethnic distribution.
observed_tvd = total_variation_distance(eth['UCSD'], eth['California'])
observed_tvd
0.41000000000000003
The issue is we don't know whether this is a large value or a small value – we don't know where it lies in the distribution of TVDs under the null.
To conduct our hypothesis test, we will:
- Repeatedly generate simulated samples of 30,000 people from the ethnic distribution of all of California.
- Each time, compute the TVD between the simulated distribution and California's distribution.
- This gives us an empirical distribution of the TVD, under the null.
- Finally, check where the observed TVD of 0.41 lies in that empirical distribution.
Again, to sample from a categorical distribution, we use np.random.multinomial.
Important: We must sample from the "population" distribution here, which is the ethnic distribution of everyone in California.
# Number of students at UCSD in this example.
N_STUDENTS = 30_000
eth['California']
Ethnicity
Asian     0.15
Black     0.05
Latino    0.39
White     0.35
Other     0.06
Name: California, dtype: float64
np.random.multinomial(N_STUDENTS, eth['California'])
array([ 4515, 1469, 11708, 10536, 1772])
np.random.multinomial(N_STUDENTS, eth['California']) / N_STUDENTS
array([0.15213333, 0.04926667, 0.3831 , 0.3541 , 0.0614 ])
We could write a for-loop to repeat the process on the previous slide (and you can in labs and projects). However, we now know about the size argument in np.random.multinomial, so let's use that here.
num_reps = 100_000
eth_draws = np.random.multinomial(N_STUDENTS, eth['California'], size=num_reps) / N_STUDENTS
eth_draws
array([[0.14913333, 0.04916667, 0.39296667, 0.3475    , 0.06123333],
       [0.1528    , 0.05103333, 0.38396667, 0.35256667, 0.05963333],
       [0.1535    , 0.0501    , 0.3879    , 0.3483    , 0.0602    ],
       ...,
       [0.15296667, 0.04906667, 0.38603333, 0.35276667, 0.05916667],
       [0.1474    , 0.0496    , 0.38776667, 0.35326667, 0.06196667],
       [0.1522    , 0.05096667, 0.38763333, 0.34836667, 0.06083333]])
eth_draws.shape
(100000, 5)
Notice that each row of eth_draws sums to 1, because each row is a simulated categorical distribution.
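We can verify this directly (a quick check, not in the original lecture):
# Every row should sum to 1 (up to floating-point error).
np.allclose(eth_draws.sum(axis=1), 1)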
Computing the TVDs, without a for-loop

One issue is that the total_variation_distance function we've defined won't work with eth_draws (unless we use a for-loop), so we'll have to compute the TVD again.
# Broadcasting subtracts eth['California'] from each row of eth_draws,
# then we sum the absolute differences along each row and halve.
tvds = np.sum(np.abs(eth_draws - eth['California'].to_numpy()), axis=1) / 2
tvds
array([0.0042 , 0.0064 , 0.0038 , ..., 0.00573333, 0.00523333, 0.004 ])
Just to make sure we did things correctly, we can compute the TVD between the first row of eth_draws and eth['California'] using our previous function.
# Note that this is the same as the first element in tvds.
total_variation_distance(eth_draws[0], eth['California'])
0.004200000000000002
fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=20, histnorm='probability',
title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd, line_color='red')
fig
(np.array(tvds) >= observed_tvd).mean()
0.0
No, there's not a mistake in our code! The observed TVD of 0.41 is far larger than every single one of the 100,000 simulated TVDs, so the (approximate) p-value is 0. We reject the null: it doesn't look like UCSD students were selected at random from the California population.
To assess whether an "observed sample" was drawn randomly from a known categorical distribution:
- Use the TVD as the test statistic.
- Repeatedly sample from the known distribution, computing the TVD between each simulated sample's distribution and the known distribution.
- Compare the observed TVD to the resulting empirical distribution of TVDs.

What would the p-value have been with a smaller sample size – N_STUDENTS = 300, N_STUDENTS = 30, or N_STUDENTS = 3? At what value of N_STUDENTS would we fail to reject the null (at a 0.05 p-value cutoff)?
def p_value_given_n_students(N_STUDENTS):
    eth_draws = np.random.multinomial(N_STUDENTS, eth['California'], size=num_reps) / N_STUDENTS
    tvds = np.sum(np.abs(eth_draws - eth['California'].to_numpy()), axis=1) / 2
    p_value = (tvds >= observed_tvd).mean()
    return p_value
interact(p_value_given_n_students, N_STUDENTS=(1, 300));
To fail to reject the null, our sample size (that is, the number of students at UCSD) would have to be in the single digits.
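If you're not running the notebook interactively, a short loop paints the same picture (a sketch, not part of the original lecture code):
# Scan downward to find the largest small sample size at which
# the p-value exceeds the 0.05 cutoff.
for n in range(30, 0, -1):
    if p_value_given_n_students(n) >= 0.05:
        print(f'Fail to reject the null at N_STUDENTS = {n}')
        break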
Consider the penguins dataset from a few lectures ago.
penguins = sns.load_dataset('penguins').dropna()
penguins.head()
| | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
| 5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | Male |
penguins.groupby('island')['bill_length_mm'].agg(['mean', 'count'])
| island | mean | count |
|---|---|---|
| Biscoe | 45.248466 | 163 |
| Dream | 44.221951 | 123 |
| Torgersen | 39.038298 | 47 |
It appears that penguins on Torgersen Island have shorter bills on average than penguins on other islands.
Again, while you could do this with a for-loop (and you can use a for-loop for hypothesis tests in labs and projects), we'll use the faster size approach here.

Instead of using np.random.multinomial, which samples from a categorical distribution, we'll use np.random.choice, which samples from a known sequence of values.
# Draws two samples of size 47 from penguins['bill_length_mm'].
# Question: Why must we sample with replacement here (or, more specifically, in the next cell)?
np.random.choice(penguins['bill_length_mm'], size=(2, 47))
array([[45.1, 44.1, 46.1, 49.2, 41.1, 46.1, 35.9, 49.8, 47.8, 36. , 45.7,
        39.7, 36.6, 47.5, 36.5, 47. , 52.7, 39.6, 52. , 39.7, 46.4, 45.5,
        51.9, 40.6, 36. , 37.7, 48.2, 39. , 42.7, 43.2, 48.4, 50.3, 38.8,
        47.2, 48.1, 47.2, 45.5, 42.8, 46.5, 50. , 49.1, 40.7, 35.7, 40.2,
        50.9, 38.9, 39.6],
       [38.6, 48.5, 39.8, 36.4, 39.6, 45.2, 49. , 45.3, 50.2, 37.8, 46.1,
        41.4, 43.5, 50.8, 52.7, 41. , 51.5, 45.5, 47.5, 46.5, 44.1, 50.2,
        52.2, 50.7, 40.6, 42.7, 36.7, 50.8, 46.2, 50.2, 39. , 45.4, 46.2,
        50.5, 42.7, 47.5, 55.8, 36. , 50. , 50.1, 42.7, 45.3, 37.2, 40.9,
        49.1, 41.5, 48.7]])
# Draws 100000 samples of size 47 from penguins['bill_length_mm'].
num_reps = 100_000
averages = np.random.choice(penguins['bill_length_mm'], size=(num_reps, 47)).mean(axis=1)
averages
array([44.0106383 , 44.12340426, 44.64042553, ..., 42.97234043, 44.72765957, 42.26382979])
fig = px.histogram(pd.DataFrame(averages), x=0, nbins=50, histnorm='probability',
title='Empirical Distribution of the Average Bill Length in Samples of Size 47')
fig.add_vline(x=penguins.loc[penguins['island'] == 'Torgersen', 'bill_length_mm'].mean(), line_color='red')
It doesn't look like the average bill length of penguins on Torgersen Island came from the distribution of bill lengths of all penguins in our dataset.
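To quantify this, we can compute the (approximate) p-value: the proportion of simulated sample means as small as or smaller than Torgersen's observed mean (a sketch consistent with the code above, not part of the original lecture):
# Observed statistic: mean bill length of Torgersen penguins.
observed_mean = penguins.loc[penguins['island'] == 'Torgersen', 'bill_length_mm'].mean()

# p-value: proportion of simulated means at least as extreme (as small) as observed.
(averages <= observed_mean).mean()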
There is a statistical tool you've learned about that would allow us to find the true probability distribution of the test statistic in this case. What is it?

The Central Limit Theorem! Per the CLT, the distribution of the mean of a sample of 47 values drawn from penguins['bill_length_mm'] is roughly normal, with mean penguins['bill_length_mm'].mean() and standard deviation penguins['bill_length_mm'].std(ddof=0) / np.sqrt(47).
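As a quick numerical check (a sketch reusing the averages array from above, not part of the original lecture code), the mean and standard deviation of the simulated sample means should be close to the CLT's predictions:
# CLT predictions for the distribution of the sample mean.
clt_mean = penguins['bill_length_mm'].mean()
clt_sd = penguins['bill_length_mm'].std(ddof=0) / np.sqrt(47)

# Compare with the simulated sample means; each pair should be close.
print(clt_mean, averages.mean())
print(clt_sd, averages.std())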
Faced with a question about the data raised by an observation, our general approach has been to:
1. Pose the question as a testable pair of hypotheses – a null and an alternative.
2. Choose a test statistic that distinguishes between the two hypotheses.
3. Simulate the distribution of the test statistic under the null.
4. Compute a p-value – the probability, under the null, of seeing a result at least as extreme as the observed statistic – and use it to decide whether to reject the null.
"Standard" hypothesis testing helps us answer questions of the form:
I have a population distribution, and I have one sample. Does this sample look like it was drawn from the population?
It does not help us answer questions of the form:
I have two samples, but no information about any population distributions. Do these samples look like they were drawn from the same population?
That's where permutation testing comes in.
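To preview the idea with a minimal sketch (an illustration, not the lecture's implementation): in a permutation test, we repeatedly shuffle the group labels and recompute the test statistic, to see how large a between-group difference could arise by chance alone. Here we ask whether Torgersen and non-Torgersen penguins have different mean bill lengths, using the penguins columns from above.
# Group labels and values, as numpy arrays.
is_torgersen = (penguins['island'] == 'Torgersen').to_numpy()
lengths = penguins['bill_length_mm'].to_numpy()

# Observed difference in group means.
observed_diff = lengths[is_torgersen].mean() - lengths[~is_torgersen].mean()

# Shuffle the labels many times and recompute the difference each time.
diffs = []
for _ in range(10_000):
    shuffled = np.random.permutation(is_torgersen)
    diffs.append(lengths[shuffled].mean() - lengths[~shuffled].mean())

# p-value for the alternative "Torgersen bills are shorter".
(np.array(diffs) <= observed_diff).mean()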
Here are a few more slides with examples that we won't cover in lecture.
Recall, a null hypothesis is an initial or default belief as to how data were generated.
Often, but not always, the null hypothesis states there is no association or difference between variables or subpopulations, and that any observed differences were due to random chance.