# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
from scipy import stats
import otter
set_matplotlib_formats("svg")
plt.style.use('ggplot')
import warnings
warnings.simplefilter('ignore')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Animation
from IPython.display import IFrame, display
def show_clt_slides():
    src = "https://docs.google.com/presentation/d/e/2PACX-1vTcJd3U1H1KoXqBFcWGKFUPjZbeW4oiNZZLCFY8jqvSDsl4L1rRTg7980nPs1TGCAecYKUZxH5MZIBh/embed?start=false&loop=false&delayms=3000"
    width = 960
    height = 509
    display(IFrame(src, width, height))
Our data comes from data.sfgov.org.
restaurants = bpd.read_csv('data/restaurants.csv')
restaurants
 | Unnamed: 0 | business_id | business_name | business_address | ... | Supervisor Districts | Fire Prevention Districts | Zip Codes | Analysis Neighborhoods |
---|---|---|---|---|---|---|---|---|---|
0 | 9671 | 3838 | CAFE PICARO | 3120 16th St | ... | 5.0 | 8.0 | 28853.0 | 20.0 |
1 | 9679 | 63619 | Subway Sandwiches | 77 Van Ness Ave #100 | ... | 9.0 | 7.0 | 28852.0 | 36.0 |
2 | 9695 | 7786 | DIANDA'S ITAL-AMER.PASTRY CO. | 2883 Mission St | ... | 7.0 | 2.0 | 28859.0 | 20.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5274 | 54150 | 3895 | Cable Car Restaurant | 1040 Columbus Ave | ... | 10.0 | 5.0 | 308.0 | 23.0 |
5275 | 54202 | 1668 | SILVER CREST DONUT SHOP | 340 BAYSHORE Blvd | ... | 7.0 | 10.0 | 58.0 | 1.0 |
5276 | 54203 | 5820 | GLEN PARK ELEMENTARY SCHOOL | 151 LIPPARD Ave | ... | 5.0 | 9.0 | 63.0 | 41.0 |
5277 rows × 24 columns
We won't look at many of the columns in our DataFrame, so let's just get
the ones we're interested in.
keep_cols = ['business_name', 'inspection_date', 'inspection_score', 'risk_category', 'Neighborhoods', 'Zip Codes']
restaurants = restaurants.get(keep_cols) # SOLUTION
restaurants
 | business_name | inspection_date | inspection_score | risk_category | Neighborhoods | Zip Codes |
---|---|---|---|---|---|---|
0 | CAFE PICARO | 2018-02-22T00:00:00.000 | 72.0 | Low Risk | 19.0 | 28853.0 |
1 | Subway Sandwiches | 2017-09-19T00:00:00.000 | 92.0 | Low Risk | 36.0 | 28852.0 |
2 | DIANDA'S ITAL-AMER.PASTRY CO. | 2017-10-18T00:00:00.000 | 85.0 | Moderate Risk | 19.0 | 28859.0 |
... | ... | ... | ... | ... | ... | ... |
5274 | Cable Car Restaurant | 2018-03-13T00:00:00.000 | 82.0 | Moderate Risk | 23.0 | 308.0 |
5275 | SILVER CREST DONUT SHOP | 2019-02-27T00:00:00.000 | 84.0 | Moderate Risk | 1.0 | 58.0 |
5276 | GLEN PARK ELEMENTARY SCHOOL | 2016-09-02T00:00:00.000 | 88.0 | Low Risk | 40.0 | 63.0 |
5277 rows × 6 columns
For each restaurant, we have an inspection score.
restaurants.take(np.arange(5))
 | business_name | inspection_date | inspection_score | risk_category | Neighborhoods | Zip Codes |
---|---|---|---|---|---|---|
0 | CAFE PICARO | 2018-02-22T00:00:00.000 | 72.0 | Low Risk | 19.0 | 28853.0 |
1 | Subway Sandwiches | 2017-09-19T00:00:00.000 | 92.0 | Low Risk | 36.0 | 28852.0 |
2 | DIANDA'S ITAL-AMER.PASTRY CO. | 2017-10-18T00:00:00.000 | 85.0 | Moderate Risk | 19.0 | 28859.0 |
3 | KEZAR PUB | 2017-12-15T00:00:00.000 | 91.0 | High Risk | 9.0 | 29492.0 |
4 | Piccadilly Fish & Chips | 2016-10-24T00:00:00.000 | 90.0 | Low Risk | 21.0 | 28858.0 |
In the preview above, we see that CAFE PICARO is 'Low Risk' with an inspection score of 72, KEZAR PUB is 'High Risk' with a score of 91, and Piccadilly Fish & Chips is 'Low Risk' with a score of 90.
This means that inspection scores don't directly translate to risk categories. Let's investigate the difference between the inspection scores of low risk and high risk restaurants.
Let's start by visualizing the distribution of inspection scores for low risk and high risk restaurants.
# Don't worry about the plotting code here.
fig, ax = plt.subplots()
score_bins = np.arange(50, 102, 2)
restaurants[restaurants.get('risk_category') == 'Low Risk'].plot(
kind='hist', y='inspection_score', density=True, ec='w', bins=score_bins, ax=ax,
figsize=(10, 5), title='Inspection Scores for Low Risk vs. High Risk Restaurants', alpha=0.65, label='Low Risk'
);
restaurants[restaurants.get('risk_category') == 'High Risk'].plot(
kind='hist', y='inspection_score', density=True, ec='w', bins=score_bins, ax=ax,
figsize=(10, 5), alpha=0.65, label='High Risk'
);
We want to compare low risk restaurants to high risk restaurants and see if their inspection scores are significantly different. What technique should we use?
A. Standard hypothesis testing
B. Permutation testing
C. Bootstrapping
D. The Central Limit Theorem
Let's keep only the relevant information.
high_low = restaurants[(restaurants.get('risk_category') == 'Low Risk') | (restaurants.get('risk_category') == 'High Risk')] # SOLUTION
high_low = high_low.get(['inspection_score', 'risk_category']) # SOLUTION
high_low
 | inspection_score | risk_category |
---|---|---|
0 | 72.0 | Low Risk |
1 | 92.0 | Low Risk |
3 | 91.0 | High Risk |
... | ... | ... |
5266 | 90.0 | Low Risk |
5272 | 62.0 | High Risk |
5276 | 88.0 | Low Risk |
3308 rows × 2 columns
Now, let's try shuffling a single one of the columns above. (Does it matter which one?)
np.random.permutation(high_low.get('risk_category')) # SOLUTION
array(['Low Risk', 'High Risk', 'High Risk', ..., 'Low Risk', 'Low Risk', 'Low Risk'], dtype=object)
Let's assign this shuffled column back into our original DataFrame. The resulting DataFrame is called original_and_shuffled.
shuffled_labels = np.random.permutation(high_low.get('risk_category')) # SOLUTION
# add a new column called shuffled_label
original_and_shuffled = high_low.assign(shuffled_label=shuffled_labels) # SOLUTION
original_and_shuffled
 | inspection_score | risk_category | shuffled_label |
---|---|---|---|
0 | 72.0 | Low Risk | High Risk |
1 | 92.0 | Low Risk | Low Risk |
3 | 91.0 | High Risk | Low Risk |
... | ... | ... | ... |
5266 | 90.0 | Low Risk | Low Risk |
5272 | 62.0 | High Risk | Low Risk |
5276 | 88.0 | Low Risk | Low Risk |
3308 rows × 3 columns
Let's now visualize the distribution of inspection scores for low risk and high risk restaurants, in both our original dataset and after shuffling the labels.
# Don't worry about the plotting code here.
fig, ax = plt.subplots()
score_bins = np.arange(50, 102, 2)
restaurants[restaurants.get('risk_category') == 'Low Risk'].plot(
kind='hist', y='inspection_score', density=True, ec='w', bins=score_bins, ax=ax,
figsize=(10, 5), title='Inspection Scores for Low Risk vs. High Risk Restaurants Before Shuffling', alpha=0.65, label='Low Risk'
);
restaurants[restaurants.get('risk_category') == 'High Risk'].plot(
kind='hist', y='inspection_score', density=True, ec='w', bins=score_bins, ax=ax,
figsize=(10, 5), alpha=0.65, label='High Risk'
);
# Don't worry about the plotting code here.
fig, ax = plt.subplots()
score_bins = np.arange(50, 102, 2)
original_and_shuffled[original_and_shuffled.get('shuffled_label') == 'Low Risk'].plot(
kind='hist', y='inspection_score', density=True, ec='w', bins=score_bins, ax=ax,
figsize=(10, 5), title='Inspection Scores for Low Risk vs. High Risk Restaurants After Shuffling', alpha=0.65, label='Low Risk'
);
original_and_shuffled[original_and_shuffled.get('shuffled_label') == 'High Risk'].plot(
kind='hist', y='inspection_score', density=True, ec='w', bins=score_bins, ax=ax,
figsize=(10, 5), alpha=0.65, label='High Risk'
);
It looks like the two groups in the first histogram (original labels) differ from each other substantially more than the two groups in the second histogram (shuffled labels).
What test statistic(s) can we use to quantify the difference between the two groups displayed in a given histogram?
A. Total variation distance
B. Difference in group means
C. Either of the above
original_and_shuffled.groupby('risk_category').mean() # SOLUTION
risk_category | inspection_score |
---|---|
High Risk | 81.24 |
Low Risk | 87.75 |
Let's compute the difference in mean inspection scores for the low risk group and high risk group (low minus high).
First, for our observed data:
grouped = original_and_shuffled.groupby('risk_category').mean() # SOLUTION
observed_difference = grouped.get('inspection_score').loc['Low Risk'] - grouped.get('inspection_score').loc['High Risk'] # SOLUTION
observed_difference
6.5084648663377465
Then, for our shuffled data:
original_and_shuffled.groupby('shuffled_label').mean() # SOLUTION
shuffled_label | inspection_score |
---|---|
High Risk | 86.12 |
Low Risk | 86.48 |
shuffled_and_grouped = original_and_shuffled.groupby('shuffled_label').mean() # SOLUTION
simulated_difference = shuffled_and_grouped.get('inspection_score').loc['Low Risk'] - shuffled_and_grouped.get('inspection_score').loc['High Risk'] # SOLUTION
simulated_difference
0.36321982074139214
We're going to need to shuffle the 'risk_category' column many, many times, and compute this difference in group means each time.
Let's put some of our code in a function to make it easier to repeat.
def calculate_test_statistic():
# BEGIN SOLUTION
shuffled_labels = np.random.permutation(high_low.get('risk_category'))
original_and_shuffled = high_low.assign(shuffled_label=shuffled_labels)
shuffled_and_grouped = original_and_shuffled.groupby('shuffled_label').mean()
simulated_difference = shuffled_and_grouped.get('inspection_score').loc['Low Risk'] - shuffled_and_grouped.get('inspection_score').loc['High Risk']
return simulated_difference
# END SOLUTION
Each time we call this function, it shuffles the 'risk_category' column and returns the difference in group means (again, by taking low minus high).
calculate_test_statistic()
-0.11702175180298013
We need to simulate this difference in group means many, many times. Let's call our function many, many times and keep track of its result in an array.
simulated_stats = np.array([])
n_reps = 100 # We're using a small number of reps to save time in lecture.
# BEGIN SOLUTION
for i in np.arange(n_reps):
sim_stat = calculate_test_statistic()
simulated_stats = np.append(simulated_stats, sim_stat)
# END SOLUTION
Now that we've done that, let's visualize the distribution of the simulated test statistics, and also see where the observed statistic lies:
bpd.DataFrame().assign(simulated_stats=simulated_stats) \
.plot(kind='hist', density=True, ec='w', figsize=(10, 5), bins=20, label='difference in group means');
plt.axvline(observed_difference, lw=3, color='black', label='observed statistic')
plt.legend();
What's the p-value? Well, it depends on what our alternative hypothesis is. Here, our alternative hypothesis is that low risk restaurants have higher inspection scores on average than high risk restaurants.
Since our test statistic was
$$\text{low risk mean} - \text{high risk mean}$$
larger values of the test statistic favor the alternative.
np.count_nonzero(simulated_stats >= observed_difference) / n_reps # SOLUTION
0.0
This is lower than any cutoff we'd consider, so we reject the null hypothesis that inspection scores for low risk and high risk restaurants come from the same distribution.
We'll load in a version of the restaurants dataset that has many more rows, some of which contain null values.
restaurants_full = bpd.read_csv('data/restaurants_full.csv').get(keep_cols)
restaurants_full
 | business_name | inspection_date | inspection_score | risk_category | Neighborhoods | Zip Codes |
---|---|---|---|---|---|---|
0 | Golden Waffle | 2018-08-08T00:00:00.000 | NaN | NaN | NaN | NaN |
1 | Hakkasan San Francisco | 2018-04-18T00:00:00.000 | 88.0 | Moderate Risk | NaN | NaN |
2 | Chopsticks Restaurant | 2017-08-18T00:00:00.000 | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... |
54474 | Farmhouse Kitchen Thai Cuisine | 2018-10-10T00:00:00.000 | NaN | High Risk | NaN | NaN |
54475 | Wago Sushi | 2018-01-30T00:00:00.000 | 88.0 | Moderate Risk | NaN | NaN |
54476 | Yemen Cafe & Restaurant | 2016-05-20T00:00:00.000 | 76.0 | Low Risk | NaN | NaN |
54477 rows × 6 columns
Let's look at just the restaurants with 'Bake' in the name whose inspection score we know. .str.contains can help us here.
restaurants_full.get('business_name').str.contains('Bake') # SOLUTION
0 False 1 False 2 False ... 54474 False 54475 False 54476 False Name: business_name, Length: 54477, dtype: bool
Some bakeries may have 'bake' in their name, rather than 'Bake'. To account for this, we can convert the entire Series to lowercase using .str.lower(), and then use .str.contains('bake').
restaurants_full.get('business_name').str.lower().str.contains('bake') # SOLUTION
0 False 1 False 2 False ... 54474 False 54475 False 54476 False Name: business_name, Length: 54477, dtype: bool
bakeries = restaurants_full[restaurants_full.get('business_name').str.lower().str.contains('bake')] # SOLUTION
bakeries = bakeries[bakeries.get('inspection_score') >= 0] # SOLUTION # Keeping only the rows where we know the inspection score
bakeries
 | business_name | inspection_date | inspection_score | risk_category | Neighborhoods | Zip Codes |
---|---|---|---|---|---|---|
327 | Le Marais Bakery Castro | 2018-08-06T00:00:00.000 | 90.0 | Moderate Risk | NaN | NaN |
365 | Pho Luen Fat Bakery & Restaurant | 2019-04-08T00:00:00.000 | 76.0 | Low Risk | NaN | NaN |
372 | Brioche Bakery & Cafe | 2019-01-31T00:00:00.000 | 88.0 | Low Risk | NaN | NaN |
... | ... | ... | ... | ... | ... | ... |
53954 | Fancy Wheatfield Bakery | 2019-03-04T00:00:00.000 | 83.0 | Moderate Risk | NaN | NaN |
54102 | New Hollywood Bakery & Restaurant | 2016-08-30T00:00:00.000 | 74.0 | High Risk | NaN | NaN |
54171 | Speciality's Cafe and Bakery | 2019-04-29T00:00:00.000 | 89.0 | Moderate Risk | NaN | NaN |
1216 rows × 6 columns
We can plot the population distribution, i.e. the distribution of inspection scores for all bakeries in San Francisco.
bakeries.plot(kind='hist', y='inspection_score', density=True, bins=score_bins, ec='w', figsize=(10, 5),
title='Population Distribution');
For reference, the mean and standard deviation of the population distribution are calculated below.
bakeries.get('inspection_score').describe() # SOLUTION
count 1216.00 mean 84.20 std 8.35 ... 50% 86.00 75% 90.00 max 100.00 Name: inspection_score, Length: 8, dtype: float64
In this case we happen to have the inspection scores for all members of the population, but in reality we won't. So let's instead take a random sample of 200 bakeries from the population.
Aside: Does the .sample method sample with or without replacement by default?
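As a quick check, here's a sketch that assumes babypandas' .sample behaves like the pandas method it wraps: sampling is without replacement by default, so asking for more rows than the DataFrame contains only works if we opt into replacement.
# Without replacement (the default), we can't draw more distinct rows than exist,
# so the commented-out line below would raise an error.
# bakeries.sample(bakeries.shape[0] + 1)
# With replace=True, rows can repeat, so drawing "too many" rows is fine.
bakeries.sample(bakeries.shape[0] + 1, replace=True).shape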
np.random.seed(23) # Ignore this
sample_of_bakeries = bakeries.sample(200) # SOLUTION
sample_of_bakeries
 | business_name | inspection_date | inspection_score | risk_category | Neighborhoods | Zip Codes |
---|---|---|---|---|---|---|
33359 | Universal Bakery Inc. | 2019-01-28T00:00:00.000 | 83.0 | Low Risk | 2.0 | 28859.0 |
19980 | Cherry Blossom Bakery 2 | 2016-06-28T00:00:00.000 | 90.0 | Moderate Risk | NaN | NaN |
29825 | Waterfront Bakery | 2018-06-07T00:00:00.000 | 94.0 | Low Risk | 32.0 | 308.0 |
... | ... | ... | ... | ... | ... | ... |
4835 | Marla Bakery | 2018-09-10T00:00:00.000 | 91.0 | High Risk | NaN | NaN |
26932 | PRINCESS BAKERY | 2016-08-16T00:00:00.000 | 79.0 | Low Risk | 5.0 | 28861.0 |
34201 | Castro Tarts Cafe and Bakery Inc. | 2017-08-23T00:00:00.000 | 82.0 | Low Risk | NaN | NaN |
200 rows × 6 columns
We can plot the sample distribution:
sample_of_bakeries.plot(kind='hist', y='inspection_score', density=True, bins=score_bins, ec='w', figsize=(10, 5),
title='Sample Distribution');
Note that since we took a large, random sample of the population, we expect that our sample looks similar to the population and has a similar mean and SD.
sample_of_bakeries.get('inspection_score').describe() # SOLUTION
count 200.00 mean 84.67 std 8.38 ... 50% 87.00 75% 91.25 max 98.00 Name: inspection_score, Length: 8, dtype: float64
Indeed, the sample mean is quite close to the population mean, and the sample standard deviation is quite close to the population standard deviation.
Let's suppose we want to estimate the population mean (that is, the mean inspection score of all bakeries in SF).
One estimate of the population mean is the mean of our sample.
sample_of_bakeries.get('inspection_score').mean()
84.665
However, our sample was random and could have been different, meaning our sample mean could also have been different.
Question: What's a reasonable range of possible values for the sample mean? What is the distribution of the sample mean?
The Central Limit Theorem (CLT) says that the probability distribution of the sum or mean of a large random sample drawn with replacement will be roughly normal, regardless of the distribution of the population from which the sample is drawn.
show_clt_slides()
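Before returning to our bakeries, here's a minimal sketch of that claim using a synthetic, deliberately skewed population (for illustration only; it is not part of our dataset). Even though the population is far from normal, the distribution of its sample means comes out roughly bell-shaped.
# A synthetic, heavily right-skewed population (for illustration only).
synthetic_population = np.random.exponential(scale=5, size=10000)
# Compute the mean of many large samples drawn with replacement from it.
synthetic_means = np.array([])
for i in np.arange(2000):
    one_sample = np.random.choice(synthetic_population, 300, replace=True)
    synthetic_means = np.append(synthetic_means, one_sample.mean())
# Despite the skewed population, this histogram looks roughly normal.
bpd.DataFrame().assign(synthetic_means=synthetic_means).plot(
    kind='hist', density=True, ec='w', bins=25, figsize=(10, 5),
    title='Sample Means Drawn from a Skewed Population');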
To see an empirical distribution of the sample mean, let's take a large number of samples directly from the population and compute the mean of each one.
Remember, in real life we wouldn't be able to do this, since we wouldn't have access to the population.
sample_means = np.array([])
# BEGIN SOLUTION
for i in np.arange(5000):
sample_mean = bakeries.sample(200).get('inspection_score').mean()
sample_means = np.append(sample_means, sample_mean)
# END SOLUTION
sample_means
array([84.34, 85.02, 83.79, ..., 84.64, 84.49, 84.17])
bpd.DataFrame().assign(sample_means=sample_means).plot(kind='hist', density=True, ec='w', bins=25, figsize=(10, 5));
Unsurprisingly, the distribution of the sample mean is bell-shaped. The CLT told us that!
The CLT also tells us that
$$\text{SD of Distribution of Possible Sample Means} = \frac{\text{Population SD}}{\sqrt{\text{sample size}}}$$
Let's try this out.
np.std(bakeries.get('inspection_score')) / np.sqrt(200)
0.5904894545352809
np.std(sample_means)
0.5469232018985846
Pretty close! Remember that sample_means is an array of simulated sample means; the more samples we simulate, the closer np.std(sample_means) will get to the SD described by the CLT.
Note that in practice, we won't have the SD of the population, since we'll usually just have a single sample. In such cases, we can use the SD of the sample as an estimate of the SD of the population:
np.std(sample_of_bakeries.get('inspection_score')) / np.sqrt(200)
0.5909855116667413
Using the CLT, we have that the distribution of the sample mean is roughly normal, centered at the population mean, with a standard deviation equal to the population SD divided by the square root of the sample size.
Using this information, we can build a confidence interval for where we think the population mean might be. A 95% confidence interval for the population mean is given by
$$ \left[ \text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{\text{sample size}}}, \ \text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{\text{sample size}}} \right] $$
sample_mean = sample_of_bakeries.get('inspection_score').mean()
sample_std = np.std(sample_of_bakeries.get('inspection_score'))
[sample_mean - 2 * sample_std / np.sqrt(200), sample_mean + 2 * sample_std / np.sqrt(200)] # SOLUTION
[83.48302897666652, 85.8469710233335]
Using a single sample of 200 bakeries, how can we estimate the median inspection score of all bakeries in San Francisco with an inspection score? What technique should we use?
A. Standard hypothesis testing
B. Permutation testing
C. Bootstrapping
D. The Central Limit Theorem
There is no CLT for sample medians, so instead we'll have to resort to bootstrapping to estimate the distribution of the sample median.
Recall, bootstrapping is the act of sampling from the original sample, with replacement. This is also called resampling.
# The median of our original sample – this is just one number
sample_of_bakeries.get('inspection_score').median() # SOLUTION
87.0
# The median of a single bootstrap resample – this is just one number
sample_of_bakeries.sample(200, replace=True).get('inspection_score').median() # SOLUTION
86.0
Let's resample repeatedly.
np.random.seed(23) # Ignore this
boot_medians = np.array([])
# BEGIN SOLUTION
for i in np.arange(5000):
boot_median = sample_of_bakeries.sample(200, replace=True).get('inspection_score').median()
boot_medians = np.append(boot_medians, boot_median)
# END SOLUTION
boot_medians
array([87. , 85. , 86.5, ..., 87.5, 88. , 86. ])
bpd.DataFrame().assign(boot_medians=boot_medians).plot(kind='hist', density=True, ec='w', bins=10, figsize=(10, 5));
Note that this distribution is not at all normal.
To compute a 95% confidence interval, we take the middle 95% of the bootstrapped medians.
# BEGIN SOLUTION
left = np.percentile(boot_medians, 2.5)
right = np.percentile(boot_medians, 97.5)
[left, right]
# END SOLUTION
[85.0, 88.0]
Which of the following interpretations of this confidence interval are valid?