from dsc80_utils import *
Lecture 6 – EDA Part 2, Hypothesis Testing
DSC 80, Fall 2024
In case you need a review from DSC 10, I've made a Pre-Lecture Review for this lecture.
Announcements 📣
- Project 1 is due today, Oct 15.
- Lab 3 is due on Fri, Oct 18.
- When submitting answers for attendance, copy-paste what you have. Answers that are some variant of "I don't know" will be treated as no submission. Take your best guess!
Agenda 📆
- Finish up our EDA example from last lecture.
- Data scope.
- Overview of hypothesis testing.
- Example: Total variation distance.
- Permutation testing.
- Example: Birth weight and smoking 🚬.
- Example (that you'll read on your own): Permutation testing meets TVD.
SD Food Safety Data
First, let's retrace our steps from last lecture.
rest_path = Path('data') / 'restaurants.csv'
insp_path = Path('data') / 'inspections.csv'
viol_path = Path('data') / 'violations.csv'
def subset_rest(rest):
    return rest[['business_id', 'name', 'address', 'zip', 'opened_date']]

def subset_insp(insp):
    return (
        insp[['business_id', 'inspection_id', 'score', 'grade', 'completed_date', 'status']]
        .rename(columns={'completed_date': 'date'})
    )

def subset_viol(viol):
    return (
        viol[['inspection_id', 'violation', 'major_violation', 'violation_accela']]
        .rename(columns={'violation': 'kind',
                         'major_violation': 'is_major',
                         'violation_accela': 'violation'})
    )
rest = (pd.read_csv(rest_path)
        .pipe(subset_rest))
insp = (pd.read_csv(insp_path)
        .pipe(subset_insp))
viol = (pd.read_csv(viol_path)
        .pipe(subset_viol))
Combining the restaurant data
Let's join all three DataFrames together so that we have all the data in a single DataFrame.
def merge_all_restaurant_data():
    return (
        rest
        .merge(insp, on='business_id', how='left')
        .merge(viol, on='inspection_id', how='left')
    )
df = merge_all_restaurant_data()
df.head(2)
|   | business_id | name | address | zip | ... | status | kind | is_major | violation |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 211898487641 | MOBIL MART LA JOLLA VILLAGE | 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 | 92037 | ... | Complete | Hot and Cold Water | Y | 21. Hot & cold water available |
| 1 | 211898487641 | MOBIL MART LA JOLLA VILLAGE | 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 | 92037 | ... | Complete | Hot and Cold Water | N | 21. Hot & cold water available |
2 rows × 13 columns
Question 🤔 (Answer at dsc80.com/q)
Code: lefts
Why should the function above use two left joins? What would go wrong if we used other kinds of joins?
Data cleaning: Missing values
Missing values
Next, it's important to check for and handle missing values, as they can have a big effect on your analysis.
insp[['score', 'grade']]
|   | score | grade |
|---|---|---|
| 0 | 96 | NaN |
| 1 | 98 | NaN |
| 2 | 98 | NaN |
| ... | ... | ... |
| 5176 | 0 | NaN |
| 5177 | 0 | NaN |
| 5178 | 90 | A |
5179 rows × 2 columns
# The proportion of values in each column that are missing.
insp.isna().mean()
business_id      0.00
inspection_id    0.00
score            0.00
grade            0.42
date             0.00
status           0.00
dtype: float64
# Why are there null values here?
# insp['inspection_id'] and viol['inspection_id'] don't have any null values...
df[df['inspection_id'].isna()]
|   | business_id | name | address | zip | ... | status | kind | is_major | violation |
|---|---|---|---|---|---|---|---|---|---|
| 759 | 211941133403 | TASTY CHAI | 8878 REGENTS RD 105, SAN DIEGO, CA 92122-5853 | 92122-5853 | ... | NaN | NaN | NaN | NaN |
| 1498 | 211915545446 | EMBASSY SUITES SAN DIEGO LA JOLLA | 4550 LA JOLLA VILLAGE DR, SAN DIEGO, CA 92122-... | 92122-1248 | ... | NaN | NaN | NaN | NaN |
| 1672 | 211937443689 | SERVICENOW | 4770 EASTGATE MALL, SAN DIEGO, CA 92121-1970 | 92121-1970 | ... | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8094 | 211997340975 | COOKIE SCOOP | 7759 GASTON DR, SAN DIEGO, CA 92126-3036 | 92126-3036 | ... | NaN | NaN | NaN | NaN |
| 8450 | 211900595220 | I LOVE BANANA BREAD CO | 4068 DALLES AVE, SAN DIEGO, CA 92117-5518 | 92117-5518 | ... | NaN | NaN | NaN | NaN |
| 8545 | 211963768842 | PETRA KITCHEN | 5252 BALBOA ARMS DR 175, SAN DIEGO, CA 92117-4949 | 92117-4949 | ... | NaN | NaN | NaN | NaN |
29 rows × 13 columns
There are many ways of handling missing values, which we'll cover in an entire lecture next week. But a good first step is to check how many there are!
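For example, a couple of quick first steps might look like the sketch below, using the df from above. The dropna call is just one provisional option among the many we'll discuss, not necessarily the right choice here.

# Count the number of missing values in each column.
df.isna().sum()

# One possible (provisional) fix: keep only the rows that actually have an
# inspection attached. This is only a sketch of one option!
df_with_inspections = df.dropna(subset=['inspection_id'])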
Data cleaning: Transformations and timestamps
Transformations and timestamps
From last class:
A transformation results from performing some operation on every element in a sequence, e.g. a Series.
It's often useful to look at ways of transforming your data to make it easier to work with.
- Type conversions (e.g. changing the string "$2.99" to the number 2.99).
- Unit conversion (e.g. feet to meters).
- Extraction (getting 'vermin' out of 'Vermin Violation Recorded on 10/10/2023').
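As a small illustration of the first kind of transformation, here's how a string-to-number conversion might look. The prices Series below is made up for the example; it's not part of our dataset.

# Hypothetical Series of price strings.
prices = pd.Series(['$2.99', '$10.50', '$7.00'])

# Strip the '$' and convert the remaining strings to floats.
prices.str.strip('$').astype(float)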
Creating timestamps
Most commonly, we'll parse dates into pd.Timestamp objects.
# Look at the dtype!
insp['date']
0       2023-02-16
1       2022-01-03
2       2020-12-03
           ...
5176    2023-03-06
5177    2022-12-09
5178    2022-11-30
Name: date, Length: 5179, dtype: object
# This magical string tells Python what format the date is in.
# For more info: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
date_format = '%Y-%m-%d'
pd.to_datetime(insp['date'], format=date_format)
0       2023-02-16
1       2022-01-03
2       2020-12-03
           ...
5176    2023-03-06
5177    2022-12-09
5178    2022-11-30
Name: date, Length: 5179, dtype: datetime64[ns]
# Another advantage of defining functions is that we can reuse this function
# for the 'opened_date' column in `rest` if we wanted to.
def parse_dates(insp, col):
    date_format = '%Y-%m-%d'
    dates = pd.to_datetime(insp[col], format=date_format)
    return insp.assign(**{col: dates})

insp = (
    pd.read_csv(insp_path)
    .pipe(subset_insp)
    .pipe(parse_dates, 'date')
)
# We should also remake df, since it depends on insp.
# Note that the new insp is used to create df!
df = merge_all_restaurant_data()
# Look at the dtype now!
df['date']
0       2023-02-16
1       2022-01-03
2       2020-12-03
           ...
8728    2022-11-30
8729    2022-11-30
8730    2022-11-30
Name: date, Length: 8731, dtype: datetime64[ns]
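As the comment above suggests, we could reuse parse_dates on the 'opened_date' column in rest. A sketch of what that might look like, assuming 'opened_date' is also stored as 'YYYY-MM-DD' strings:

# Hypothetical reuse of parse_dates on rest (assumes the same date format).
rest = (
    pd.read_csv(rest_path)
    .pipe(subset_rest)
    .pipe(parse_dates, 'opened_date')
)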
Working with timestamps
- We often want to adjust the granularity of timestamps to see overall trends, or seasonality.
- Use the resample method in pandas (documentation).
    - Think of it like a version of groupby, but for timestamps.
    - For instance, insp.resample('2W', on='date') separates every two weeks of data into a different group.
insp.resample('2W', on='date')['score'].mean()
date
2020-01-05    42.67
2020-01-19    59.33
2020-02-02    56.34
              ...
2023-09-24    66.60
2023-10-08    59.58
2023-10-22    66.81
Freq: 2W-SUN, Name: score, Length: 100, dtype: float64
# Where are those numbers coming from?
insp[
    (insp['date'] >= pd.Timestamp('2020-01-05')) &
    (insp['date'] < pd.Timestamp('2020-01-19'))
]['score']
10        0
11       92
12        0
        ...
4709      0
4988    100
5107     96
Name: score, Length: 86, dtype: int64
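To check, we can average the scores in this two-week slice and compare against the resampled means above. (Exactly which bin it lines up with depends on resample's closed/label conventions for weekly frequencies, so treat this as a sanity check rather than an exact reproduction.)

# Mean score for inspections in this two-week window.
insp[
    (insp['date'] >= pd.Timestamp('2020-01-05')) &
    (insp['date'] < pd.Timestamp('2020-01-19'))
]['score'].mean()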
(insp.resample('2W', on='date')
     .size()
     .plot(title='Number of Inspections Over Time')
)
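To look at longer-term trends, we could resample at a coarser granularity, e.g. monthly. A sketch (older versions of pandas use 'M' for month-end; newer versions prefer 'ME'):

# Average inspection score per month (swap 'M' for 'ME' on newer pandas).
insp.resample('M', on='date')['score'].mean()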