In [1]:
# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')

np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

Lecture 14 – Models and Viewpoints¶

DSC 10, Winter 2023¶

Announcements¶

  • Midterm Exam scores are available. See this post for details.
    • Only worth 10%. Take it as a learning experience!
  • The Midterm Project is due tomorrow at 11:59PM.
    • Slip days can be used if needed. They will be deducted from both partners' allocations.
    • Only one partner should submit and "Add Group Member" on Gradescope.
  • Lab 5 is due Saturday 2/18 at 11:59PM.

Agenda¶

  • Statistical models.
  • Example: Jury selection.
  • Example: Genetics of peas. 🟢
  • Viewpoints and test statistics.
  • Example: Is our coin fair?

Statistical models¶

Models¶

  • A model is a set of assumptions about how data was generated.
  • We want a way to assess the quality of a given model.

Example¶

Galileo's Leaning Tower of Pisa Experiment

Example: Jury selection¶

Swain vs. Alabama, 1965¶

  • Robert Swain was a Black man convicted of a crime in Talladega County, Alabama.
  • He appealed the jury's decision all the way to the Supreme Court, on the grounds that Talladega County systematically excluded Black people from juries.
  • At the time, only men 21 years or older were allowed to serve on juries. 26% of this eligible population was Black.
  • But of the 100 men on Robert Swain's jury panel, only 8 were Black.
$\substack{\text{eligible} \\ \text{population}} \xrightarrow{\substack{\text{representative} \\ \text{sample}}} \substack{\text{jury} \\ \text{panel}} \xrightarrow{\substack{\text{selection by} \\ \text{judge/attorneys}}} \substack{\text{actual} \\ \text{jury}}$

Supreme Court ruling¶

  • About disparities between the percentages in the eligible population and the jury panel, the Supreme Court wrote:

"... the overall percentage disparity has been small...”

  • The Supreme Court denied Robert Swain’s appeal and he was sentenced to life in prison.
  • We now have the tools to show quantitatively that the Supreme Court's claim was misguided.
  • This "overall percentage disparity" turns out to be not so small, and is an example of racial bias.
    • Jury panels were often made up of people in the jury commissioner's professional and social circles.
    • Of the 8 Black men on the jury panel, none were selected to be part of the actual jury.

Our model for simulating Swain's jury panel¶

  • We will assume the jury panel consists of 100 men, randomly chosen from a population that is 26% Black.
  • Our question: is this model (i.e. assumption) right or wrong?

Our approach: simulation¶

  • We'll start by assuming that this model is true.
  • We'll generate many jury panels using this assumption.
  • We'll count the number of Black men in each simulated jury panel to see how likely it is for a random panel to contain 8 or fewer Black men.

Simulating statistics¶

Recall, a statistic is a number calculated from a sample.

  1. Run an experiment once to generate one value of a statistic.
    • In this case, sample 100 people at random from a population that is 26% Black, and count the number of Black men (the statistic).
  2. Run the experiment many times, generating many values of the statistic, and store these statistics in an array.
  3. Visualize the resulting empirical distribution of the statistic.

Step 1 – Running the experiment once¶

  • How do we randomly sample a jury panel?
    • np.random.choice won't help us, because we don't know how large the eligible population is.
  • The function np.random.multinomial helps us sample at random from a categorical distribution.
np.random.multinomial(sample_size, pop_distribution)
  • np.random.multinomial samples at random from the population, with replacement, and returns an array containing the number of sampled individuals in each category.
    • pop_distribution needs to be an array containing the probabilities of each category.

Aside: Example usage of np.random.multinomial

On Halloween 👻 you'll trick-or-treat at 35 houses, each of which has an identical candy box, containing:

  • 30% Starbursts.
  • 30% Sour Patch Kids.
  • 40% Twix.

At each house, you'll select one candy blindly from the candy box.

To simulate the act of going to 35 houses, we can use np.random.multinomial:

In [2]:
np.random.multinomial(35, [0.3, 0.3, 0.4])
Out[2]:
array([10, 11, 14])
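Note that the counts in the output always sum to 35, the number of houses, since each selection falls in exactly one category; only the split across categories varies from run to run. A quick check:

# The three counts always total the sample size.
np.random.multinomial(35, [0.3, 0.3, 0.4]).sum()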

Step 1 – Running the experiment once¶

In our case, a randomly selected member of our population is Black with probability 0.26 and not Black with probability 1 - 0.26 = 0.74.

In [3]:
demographics = [0.26, 0.74]

Each time we run the following cell, we'll get a new random sample of 100 people from this population.

  • The first element of the resulting array is the number of Black men in the sample.
  • The second element is the number of non-Black men in the sample.
In [4]:
np.random.multinomial(100, demographics)
Out[4]:
array([26, 74])

Step 1 – Running the experiment once¶

We also need to calculate the statistic, which in this case is the number of Black men in the random sample of 100.

In [5]:
np.random.multinomial(100, demographics)[0]
Out[5]:
22

Step 2 – Repeat the experiment many times¶

  • Let's run 10,000 simulations.
  • We'll keep track of the number of Black men in each simulated jury panel in the array counts.
In [6]:
counts = np.array([])

for i in np.arange(10000):
    new_count = np.random.multinomial(100, demographics)[0]
    counts = np.append(counts, new_count)
In [7]:
counts
Out[7]:
array([27., 28., 25., ..., 27., 20., 22.])

Step 3 – Visualize the resulting distribution¶

Was a jury panel with 8 Black men suspiciously unusual?

In [8]:
(bpd.DataFrame().assign(count_black_men=counts)
                .plot(kind='hist', bins=np.arange(9.5, 45, 1),
                      density=True, ec='w', figsize=(10, 5),
                      title='Empirical Distribution of the Number of Black Men in Simulated Jury Panels of Size 100'));
observed_count = 8
plt.axvline(observed_count, color='black', linewidth=4, label='Observed Number of Black Men in Actual Jury Panel')
plt.legend();
In [9]:
# In 10,000 random experiments, the panel with the fewest Black men had how many?
counts.min()
Out[9]:
11.0

Conclusion¶

  • Our simulation shows that there's essentially no chance that a random sample of 100 men drawn from a population in which 26% of men are Black will contain 8 or fewer Black men (quantified below).
  • As a result, it seems that the model we proposed – that the jury panel was drawn at random from the eligible population – is flawed.
  • There were likely factors other than chance that explain why there were only 8 Black men on the jury panel.
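To put "essentially no chance" in numbers, we can compute the proportion of the 10,000 simulated panels that contained 8 or fewer Black men. Since the smallest simulated count above was 11, this proportion is 0.

# Proportion of simulated panels with 8 or fewer Black men.
# counts.min() was 11 above, so this evaluates to 0.0.
np.count_nonzero(counts <= 8) / 10000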

Example: Genetics of peas 🟢¶

Gregor Mendel, 1822-1884¶


  • Mendel is known as the father of genetics.
  • Many of his experiments involved pea plants.

Mendel's model¶

  • One type of pea plant that Mendel studied had flowers that were either purple or white.
  • Mendel grew 929 of these pea plants, and 705 of them had purple flowers.
  • Mendel’s model: each plant has a 75% chance of growing purple flowers and a 25% chance of growing white flowers, independent of the colors of the other plants.
  • Question: Is this model good or not?

Choosing a statistic¶

  • Plan: Repeatedly draw samples of 929 pea plants from a population that is 75% purple and 25% white.
  • Each time, compute the proportion of plants in the sample with purple flowers. If that proportion is much larger or much smaller than 0.75, that is evidence against the model!
  • One of the many possible statistics that encapsulates this logic:
$$| \text{sample proportion of plants with purple flowers} - 0.75 |$$
In [10]:
model = [0.75, 0.25]
In [11]:
# Draw 929 plants and calculate the proportion of plants with purple flowers
np.random.multinomial(929, model)[0] / 929
Out[11]:
0.751345532831001

Simulating Mendel's experiment¶

  • Mendel grew 929 pea plants.
  • We will simulate the act of growing 929 pea plants many times and look at the resulting values of the statistic we chose.
In [12]:
distances = np.array([])

for i in np.arange(10000):
    new_distance = abs(np.random.multinomial(929, model)[0] / 929 - 0.75)
    distances = np.append(distances, new_distance)
In [13]:
bpd.DataFrame().assign(simulated_abs_differences=distances) \
               .plot(kind='hist', bins=np.arange(0, 0.055, 0.0025),
                     density=True, ec='w', figsize=(10, 5),
                     title='Empirical Distribution of the Statistic | proportion purple - 0.75 |');

Without context, these numbers aren't helpful – we need to see where the value of the statistic in Mendel's original observation lies in this distribution!

Mendel's experiment¶

  • Of the 929 pea plants that Mendel grew, 705 had purple flowers.
  • Where does the observed statistic lie on the empirical distribution we just computed?
In [14]:
observed_distance = abs(705 / 929 - 0.75)
observed_distance
Out[14]:
0.008880516684607098
In [15]:
bpd.DataFrame().assign(simulated_absolute_differences=distances) \
               .plot(kind='hist', bins=np.arange(0, 0.055, 0.0025),
                     density=True, ec='w', figsize=(10, 5),
                     title='Empirical Distribution of the Statistic | proportion purple - 0.75 |');
plt.axvline(observed_distance, color='black', linewidth=4, label='Observed Value of the Statistic | proportion purple - 0.75 |')
plt.legend();

Was Mendel's model any good?¶

  • The empirical distribution we just drew shows the distribution of our statistic, $$| \text{sample proportion of plants with purple flowers} - 0.75 |$$ for many different samples created according to Mendel's model.
  • The statistic calculated for his observation, where 705 of 929 plants had purple flowers, seems like a pretty typical value according to the model (quantified below).
  • Thus, it seems that according to the data we have, the model is reasonable.
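One way to make "pretty typical" concrete: compute the proportion of simulated statistics that are at least as large as the observed one, using the distances array and observed_distance from above. The exact proportion varies from run to run, but it is far from 0.

# Proportion of simulated values of the statistic that are at least
# as large as the observed value (about 0.0089).
np.count_nonzero(distances >= observed_distance) / 10000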

Mendelian inheritance¶

Viewpoints and test statistics¶

Choosing one of two viewpoints¶

Goal: choose between two views of the world, based on data in a sample.

  • "This jury panel was selected at random." OR "This jury panel was not selected at random, since there aren't enough Black men on it."
  • "The probability of a pea plant having purple flowers is 75%." OR "The probability of a pea plant having purple flowers is not 75%."

Test statistics¶

  • For each sample, we calculate a single number – that is, a statistic.
    • Jury panel: # of Black men.
    • Pea plants: $ | \text{proportion purple} - 0.75 | $.
  • This single number is called the test statistic since we use it when "testing" which viewpoint the data better supports.
  • Think of the test statistic as the number you record each time you perform an experiment.

Choosing one of two viewpoints¶

Is the observed value of the test statistic consistent with the empirical distribution of the test statistic (i.e., the simulated test statistics)?

  • If so, we think the viewpoint we used for the simulation is correct, so we fail to reject the viewpoint that we used for simulation.
  • If not, we think the other viewpoint is correct, and we reject the viewpoint that we used for simulation.

Example: Is our coin fair?¶

Example: Is our coin fair?¶

  • Let's suppose we find a coin on the ground and we aren't sure whether it's fair.
  • We flip the coin 400 times. Our results are recorded in the flips_400 array below.
In [16]:
flips_400 = bpd.read_csv('data/flips.csv').get('flips').values
flips_400
Out[16]:
array(['Tails', 'Tails', 'Tails', ..., 'Heads', 'Heads', 'Tails'],
      dtype=object)
In [17]:
heads = np.count_nonzero(flips_400 == 'Heads')
heads
Out[17]:
188
In [18]:
tails = len(flips_400) - heads
tails
Out[18]:
212

Let's put these values in an array, since our simulations will also result in arrays.

In [19]:
flips = np.array([heads, tails])
flips
Out[19]:
array([188, 212])

Designing a test statistic for a pair of viewpoints¶

Let's consider the pair of viewpoints “This coin is fair.” OR “No, it’s not.”

  • If we observe a large number of heads or a small number of heads, this suggests that the coin is "not fair".
  • As such, our test statistic should capture how far our number of heads is from the expected number for a fair coin, which is 200 out of 400.
  • One idea: $| \text{number of heads} - 200 |$.
    • If this number is large, that's evidence that the coin is not fair.
    • If this number is small, that's evidence that the coin is fair.
In [20]:
def dist_from_200(arr):
    heads = arr[0]
    return abs(heads - 200)
In [21]:
dist_from_200(flips)
Out[21]:
12

Simulating a fair coin¶

  • Define the model for a fair coin (done).
  • Define the test statistic (done).
  • Run the simulation: Flip the coin 400 times, calculate the test statistic, and add it to a results array. Repeat this process many, many times.
  • Plot a histogram of the results.
In [22]:
model = [0.5, 0.5]

repetitions = 10000
results = np.array([])
for i in np.arange(repetitions):
    coins = np.random.multinomial(400, model)
    result = dist_from_200(coins)
    results = np.append(results, result)

results
Out[22]:
array([11.,  2., 13., ...,  2., 21., 22.])
In [23]:
bpd.DataFrame().assign(results=results).plot(kind='hist', bins=np.arange(0, 40, 2), 
                                             density=True, ec='w', figsize=(10, 5),
                                             title='Empirical Distribution of the Statistic | Number of Heads - 200 |');
plt.axvline(dist_from_200(flips), color='black', linewidth=4, label='Observed Value of the Statistic | Number of Heads - 200 |')
plt.legend();
  • The distance between the number of heads in our observed sample (188) and 200 is 12, which is a typical value under the assumption that the coin is fair (see the check below).
  • We don't have evidence to deny the viewpoint "The coin is fair," so that's what we side with.
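Here's the check: the proportion of simulated statistics that are at least as large as the observed value of 12. The exact proportion varies from run to run, but it is not small.

# Proportion of simulations where |number of heads - 200| was at
# least as large as the observed value, dist_from_200(flips) = 12.
np.count_nonzero(results >= dist_from_200(flips)) / repetitions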

Concept Check ✅ – Answer at cc.dsc10.com¶

Let's now consider the pair of viewpoints “This coin is fair.” OR “No, it's biased towards heads.” Which test statistic would be appropriate?

  • A) $| \text{number of heads} - 200 |$.
  • B) $\text{number of heads}$.
  • C) Both are appropriate.
  • D) Neither is appropriate.

Another pair of viewpoints¶

  • If our two viewpoints are “This coin is fair.” OR “No, it's biased towards heads,” the test statistic $| \text{number of heads} - 200 |$ won't work. Why not?
  • In our current example, the value of the statistic $| \text{number of heads} - 200 |$ is 12. However, given just this information, we can't tell whether we saw:
    • 212 heads, which would be evidence that the coin is biased towards heads.
    • 188 heads, which would not be evidence that the coin is biased towards heads.
  • As such, for this pair of viewpoints, we need another test statistic.
  • Idea: $\text{number of heads}$.
    • If this number is large, that's evidence that the coin is biased towards heads.
    • If this number is small, that's evidence that the coin is fair.
In [24]:
def num_heads(arr):
    return arr[0]

Simulating a fair coin, again¶

All that will change from our previous simulation is the function we use to compute our test statistic.

In [25]:
model = [0.5, 0.5]

repetitions = 10000
results = np.array([])
for i in np.arange(repetitions):
    coins = np.random.multinomial(400, model)
    result = num_heads(coins)
    results = np.append(results, result)

results
Out[25]:
array([207., 200., 222., ..., 194., 180., 204.])
In [26]:
bpd.DataFrame().assign(results=results).plot(kind='hist', bins=np.arange(160, 240, 4), 
                                             density=True, ec='w', figsize=(10, 5),
                                             title='Empirical Distribution of the Number of Heads');
plt.axvline(num_heads(flips), color='black', linewidth=4, label='Observed Value of the Number of Heads')
plt.legend();
  • The number of heads in our observed sample is 188.
  • Under the assumption that the coin is fair, we often saw 188 or more heads (verified below).
  • As such, the coin is probably not biased in favor of heads, and we side with the viewpoint "The coin is fair."
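To verify "we often saw 188 or more heads," we can compute the proportion of simulations with at least 188 heads. The exact proportion varies from run to run, but since 188 is below the fair-coin center of 200, it is large.

# Proportion of simulations with at least 188 heads.
np.count_nonzero(results >= num_heads(flips)) / repetitions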

Questions to consider before choosing a test statistic¶

  • Key idea: Our choice of test statistic depends on the pair of viewpoints we want to decide between.
  • Our test statistic should be such that either:
    • High values lean towards one viewpoint and low values lean towards the other.
    • Low values lean towards one viewpoint and high values lean towards the other.
  • We will avoid test statistics where both high and low values lean towards one viewpoint and values in the middle lean towards the other.
    • In other words, we will avoid "two-sided" tests.
  • In our recent exploration, viewpoint 1 was "The coin is fair."
    • When viewpoint 2 was "No, it's not," the test statistic we chose was $|\text{number of heads} - 200 |$.
    • When viewpoint 2 was "No, it's biased towards heads", the test statistic we chose was $\text{number of heads}$.

Summary, next time¶

Summary¶

  • A model is a set of assumptions about how data was generated. We're interested in determining the validity of models.
  • When assessing a model, we consider two viewpoints of the world: one where the model is true, and another where the model is false for some reason.
  • To assess a model, we:
    1. Conduct an experiment to form an observation.
    2. Simulate that experiment many times under the assumption that the model is true.
    3. Compute a test statistic on each of the simulated samples, as well as on the observed sample.
    4. Look at the resulting empirical distribution of test statistics and see where the observed test statistic falls. If it seems like an atypical value (too large or too small), that can be reason to believe the model is wrong.
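The simulations for the jury panel, pea plant, and coin examples all follow this recipe. Here is a minimal sketch that consolidates the pattern into a single helper; simulate_statistic is hypothetical (we never defined it in lecture), and it assumes the model is a list of category probabilities and the test statistic is a function of the counts array.

def simulate_statistic(model, n, statistic, repetitions=10000):
    # Draw n individuals from the model's categorical distribution,
    # compute the test statistic on the resulting counts, and repeat.
    results = np.array([])
    for i in np.arange(repetitions):
        counts = np.random.multinomial(n, model)
        results = np.append(results, statistic(counts))
    return results

# For example, the coin-flipping simulation above is equivalent to:
results = simulate_statistic([0.5, 0.5], 400, num_heads)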

Next time¶

  • All throughout today's lecture, we've been performing what are known as hypothesis tests.
  • Next time, we'll formalize some of the language needed when working with hypothesis testing and make precise the process of deciding between two viewpoints.