# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
from scipy import stats
import otter
set_matplotlib_formats("svg")
plt.style.use('ggplot')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Setup to start where we left off last time
keep_cols = ['business_name', 'inspection_date', 'inspection_score', 'risk_category', 'Neighborhoods', 'Zip Codes']
restaurants_full = bpd.read_csv('data/restaurants_full.csv').get(keep_cols)
bakeries = restaurants_full[restaurants_full.get('business_name').str.lower().str.contains('bake')]
bakeries = bakeries[bakeries.get('inspection_score') >= 0] # Keeping only the rows where we know the inspection score
# Animation
from IPython.display import IFrame, display
def show_clt_slides():
    src = "https://docs.google.com/presentation/d/e/2PACX-1vTcJd3U1H1KoXqBFcWGKFUPjZbeW4oiNZZLCFY8jqvSDsl4L1rRTg7980nPs1TGCAecYKUZxH5MZIBh/embed?start=false&loop=false&delayms=3000"
    width = 960
    height = 509
    display(IFrame(src, width, height))
Consider this population of bakeries in San Francisco.
bakeries
|  | business_name | inspection_date | inspection_score | risk_category | Neighborhoods | Zip Codes |
| --- | --- | --- | --- | --- | --- | --- |
| 327 | Le Marais Bakery Castro | 2018-08-06T00:00:00.000 | 90.0 | Moderate Risk | NaN | NaN |
| 365 | Pho Luen Fat Bakery & Restaurant | 2019-04-08T00:00:00.000 | 76.0 | Low Risk | NaN | NaN |
| 372 | Brioche Bakery & Cafe | 2019-01-31T00:00:00.000 | 88.0 | Low Risk | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... |
| 53954 | Fancy Wheatfield Bakery | 2019-03-04T00:00:00.000 | 83.0 | Moderate Risk | NaN | NaN |
| 54102 | New Hollywood Bakery & Restaurant | 2016-08-30T00:00:00.000 | 74.0 | High Risk | NaN | NaN |
| 54171 | Speciality's Cafe and Bakery | 2019-04-29T00:00:00.000 | 89.0 | Moderate Risk | NaN | NaN |

1216 rows × 6 columns
score_bins = np.arange(50, 102, 2)
bakeries.plot(kind='hist', y='inspection_score', density=True, bins=score_bins, ec='w', figsize=(10, 5),
title='Population Distribution');
For reference, the mean and standard deviation of the population distribution are calculated below.
bakeries.get('inspection_score').describe()
count    1216.00
mean       84.20
std         8.35
            ...
50%        86.00
75%        90.00
max       100.00
Name: inspection_score, Length: 8, dtype: float64
In this case we happen to have the inspection scores for all members of the population, but in reality we won't. So let's instead take a random sample of 200 bakeries from the population.
np.random.seed(23) # Ignore this
sample_of_bakeries = bakeries.sample(200) # SOLUTION
sample_of_bakeries
|  | business_name | inspection_date | inspection_score | risk_category | Neighborhoods | Zip Codes |
| --- | --- | --- | --- | --- | --- | --- |
| 33359 | Universal Bakery Inc. | 2019-01-28T00:00:00.000 | 83.0 | Low Risk | 2.0 | 28859.0 |
| 19980 | Cherry Blossom Bakery 2 | 2016-06-28T00:00:00.000 | 90.0 | Moderate Risk | NaN | NaN |
| 29825 | Waterfront Bakery | 2018-06-07T00:00:00.000 | 94.0 | Low Risk | 32.0 | 308.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 4835 | Marla Bakery | 2018-09-10T00:00:00.000 | 91.0 | High Risk | NaN | NaN |
| 26932 | PRINCESS BAKERY | 2016-08-16T00:00:00.000 | 79.0 | Low Risk | 5.0 | 28861.0 |
| 34201 | Castro Tarts Cafe and Bakery Inc. | 2017-08-23T00:00:00.000 | 82.0 | Low Risk | NaN | NaN |

200 rows × 6 columns
sample_of_bakeries.plot(kind='hist', y='inspection_score', density=True, bins=score_bins, ec='w', figsize=(10, 5),
title='Sample Distribution');
Note that since we took a large, random sample of the population, we expect that our sample looks similar to the population and has a similar mean and SD.
sample_of_bakeries.get('inspection_score').describe()
count    200.00
mean      84.67
std        8.38
           ...
50%       87.00
75%       91.25
max       98.00
Name: inspection_score, Length: 8, dtype: float64
Indeed, the sample mean is quite close to the population mean, and the sample standard deviation is quite close to the population standard deviation.
Let's suppose we want to estimate the population mean (that is, the mean inspection score of all bakeries in SF).
One estimate of the population mean is the mean of our sample.
sample_of_bakeries.get('inspection_score').mean()
84.665
However, our sample was random and could have been different, meaning our sample mean could also have been different.
Question: What's a reasonable range of possible values for the sample mean? What is the distribution of the sample mean?
The Central Limit Theorem (CLT) says that the probability distribution of the sum or mean of a large random sample drawn with replacement will be roughly normal, regardless of the distribution of the population from which the sample is drawn.
show_clt_slides()
To see an empirical distribution of the sample mean, let's take a large number of samples directly from the population and compute the mean of each one.
Remember, in real life we wouldn't be able to do this, since we wouldn't have access to the population.
sample_means = np.array([])
# BEGIN SOLUTION
for i in np.arange(5000):
    sample_mean = bakeries.sample(200).get('inspection_score').mean()
    sample_means = np.append(sample_means, sample_mean)
# END SOLUTION
sample_means
array([84.34, 85.02, 83.79, ..., 84.64, 84.49, 84.17])
bpd.DataFrame().assign(sample_means=sample_means).plot(kind='hist', density=True, ec='w', bins=25, figsize=(10, 5));
Unsurprisingly, the distribution of the sample mean is bell-shaped. The CLT told us that!
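As an optional check (a sketch, not from the original lecture), we can draw the normal curve that the CLT predicts, centered at the population mean with an SD of the population SD divided by the square root of 200, on top of this histogram:

# Redraw the histogram and overlay the CLT's predicted normal curve
bpd.DataFrame().assign(sample_means=sample_means).plot(kind='hist', density=True, ec='w', bins=25, figsize=(10, 5))
x = np.linspace(sample_means.min(), sample_means.max(), 200)
pop_mean = bakeries.get('inspection_score').mean()
pop_sd = np.std(bakeries.get('inspection_score'))
plt.plot(x, stats.norm.pdf(x, pop_mean, pop_sd / np.sqrt(200)), color='black', lw=3);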
The CLT also tells us that
$$\text{SD of Distribution of Possible Sample Means} = \frac{\text{Population SD}}{\sqrt{\text{sample size}}}$$

Let's try this out.
np.std(bakeries.get('inspection_score')) / np.sqrt(200)
0.5904894545352809
np.std(sample_means)
0.5469232018985846
Pretty close! Remember that sample_means is an array of simulated sample means; the more samples we simulate, the closer np.std(sample_means) will get to the SD described by the CLT.
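To see this concretely, here's a minimal sketch (not from the original lecture) that simulates increasing numbers of sample means and prints the SD of each batch; the printed SDs should settle near the CLT's prediction of roughly 0.59 computed above.

# A sketch: the SD of the simulated sample means approaches the CLT's prediction
for num_samples in [100, 1000, 5000]:
    means = np.array([bakeries.sample(200).get('inspection_score').mean()
                      for _ in np.arange(num_samples)])
    print(num_samples, np.std(means))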
Note that in practice, we won't have the SD of the population, since we'll usually just have a single sample. In such cases, we can use the SD of the sample as an estimate of the SD of the population:
np.std(sample_of_bakeries.get('inspection_score')) / np.sqrt(200)
0.5909855116667413
Using the CLT, we have that the distribution of the sample mean is roughly normal, is centered at the population mean, and has an SD equal to the population SD divided by the square root of the sample size.
Using this information, we can build a confidence interval for where we think the population mean might be. A 95% confidence interval for the population mean is given by
$$ \left[ \text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{\text{sample size}}}, \ \text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{\text{sample size}}} \right] $$

sample_mean = sample_of_bakeries.get('inspection_score').mean()
sample_std = np.std(sample_of_bakeries.get('inspection_score'))
[sample_mean - 2 * sample_std / np.sqrt(200), sample_mean + 2 * sample_std / np.sqrt(200)] # SOLUTION
[83.48302897666652, 85.8469710233335]
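As an aside, the 2 in the formula above is a convenient round-up of the exact normal multiplier. Since scipy's stats module was imported in the setup cell, we can check it with a one-liner (an illustrative aside, not part of the original solution):

# ~95% of a normal curve's area lies within about 1.96 SDs of its mean
stats.norm.ppf(0.975)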
Using a single sample of 200 bakeries, how can we estimate the median inspection score of all bakeries in San Francisco that have an inspection score? What technique should we use?
A. Standard hypothesis testing
B. Permutation testing
C. Bootstrapping
D. The Central Limit Theorem
There is no CLT for sample medians, so instead we'll have to resort to bootstrapping to estimate the distribution of the sample median.
Recall, bootstrapping is the act of sampling from the original sample, with replacement. This is also called resampling.
# The median of our original sample – this is just one number
sample_of_bakeries.get('inspection_score').median() # SOLUTION
87.0
# The median of a single bootstrap resample – this is just one number
sample_of_bakeries.sample(200, replace=True).get('inspection_score').median() # SOLUTION
86.0
Let's resample repeatedly.
np.random.seed(23) # Ignore this
boot_medians = np.array([])
# BEGIN SOLUTION
for i in np.arange(5000):
    boot_median = sample_of_bakeries.sample(200, replace=True).get('inspection_score').median()
    boot_medians = np.append(boot_medians, boot_median)
# END SOLUTION
boot_medians
array([87. , 85. , 86.5, ..., 87.5, 88. , 86. ])
bpd.DataFrame().assign(boot_medians=boot_medians).plot(kind='hist', density=True, ec='w', bins=10, figsize=(10, 5));
Note that this distribution is not at all normal.
To compute a 95% confidence interval, we take the middle 95% of the bootstrapped medians.
# BEGIN SOLUTION
left = np.percentile(boot_medians, 2.5)
right = np.percentile(boot_medians, 97.5)
[left, right]
# END SOLUTION
[85.0, 88.0]
Which of the following interpretations of this confidence interval are valid?
You work as a family physician. You collect data and you find that in 6354 patients, 3115 were children and 3239 were adults.
You want to test the following hypotheses:

Null Hypothesis: Each patient is equally likely to be a child or an adult.

Alternative Hypothesis: Children are less likely to be patients than adults.
Which test statistic(s) could be used for this hypothesis test? Which values of the test statistic point towards the alternative?
A. Proportion of children seen
B. Number of children seen
C. Number of children minus number of adults seen
D. Absolute value of number of children minus number of adults seen
There may be multiple correct answers; we only need to choose one to proceed.
Let's use option B, the number of children seen, as a test statistic. Small values of this statistic favor the alternative hypothesis.
How do we generate a single value of the test statistic?
np.random.multinomial(6354, [0.5, 0.5])[0] # SOLUTION
3172
As usual, let's simulate the test statistic many, many times (10,000).
test_stats = np.array([])
# BEGIN SOLUTION
for i in np.arange(10000):
    stat = np.random.multinomial(6354, [0.5, 0.5])[0]
    test_stats = np.append(test_stats, stat)
# END SOLUTION
test_stats
array([3204., 3213., 3172., ..., 3150., 3198., 3213.])
bpd.DataFrame().assign(test_stats=test_stats) \
.plot(kind='hist', density=True, ec='w', figsize=(10, 5), bins=20);
plt.axvline(3115, lw=3, color='black', label='observed statistic')
plt.legend();
Recall that you collected data and found that in 6354 patients, 3115 were children and 3239 were adults.
What goes in blank (a)?
p_value = np.count_nonzero(test_stats __(a)__ 3115) / 10000
A. >=
B. >
C. <=
D. <
<=
# Calculate the p-value
p_value = np.count_nonzero(test_stats <= 3115) / 10000
p_value
What do we do, assuming that we're using a 5% p-value cutoff?
A. Reject the null
B. Fail to reject the null
C. It depends
Note that while we used np.random.multinomial to simulate the test statistic, we could have used np.random.choice, too:
choices = np.random.choice(['adult', 'child'], p=[0.5, 0.5], size=6354, replace=True) # SOLUTION
choices
array(['adult', 'adult', 'adult', ..., 'child', 'child', 'adult'], dtype='<U5')
np.count_nonzero(choices == 'child') # SOLUTION
3142
Is this an example of bootstrapping?
A. Yes, because we are sampling with replacement.
B. No, this is not bootstrapping.
babypandas code is regular pandas code, too!
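For instance, here's a sketch (using the pd module imported in the setup cell) that re-runs the bakery selection from the top of this notebook with plain pandas; the only change is bpd to pd:

# The same selection code from the setup cell, run with regular pandas
bakeries_pd = pd.read_csv('data/restaurants_full.csv').get(keep_cols)
bakeries_pd = bakeries_pd[bakeries_pd.get('business_name').str.lower().str.contains('bake')]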
These sites allow you to search for datasets (in CSV format) from a variety of different domains. Some may require you to sign up for an account; these are generally reputable sources.
Note that all of these links are also available at rampure.org/find-datasets.
Tip: if a site only allows you to download a file as an Excel file, not a CSV file, you can download it, open it in a spreadsheet viewer (Excel, Numbers, Google Sheets), and export it to a CSV.
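If you'd rather do the conversion in Python, pandas can handle it directly; here's a sketch, where 'data.xlsx' and 'data.csv' are placeholder filenames and reading .xlsx files requires the openpyxl package:

# Convert an Excel sheet to a CSV with pandas
pd.read_excel('data.xlsx').to_csv('data.csv', index=False)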
The Data Science Student Society organizes project groups, which are a great way to get experience and build your resume. Keep your eye out for applications!
plotly

So far, all of our visualizations have been created with matplotlib, which we've used indirectly through df.plot. plotly is a different visualization library that allows us to create interactive visualizations.

import plotly.express as px
Gapminder Foundation is a non-profit venture registered in Stockholm, Sweden, that promotes sustainable global development and achievement of the United Nations Millennium Development Goals by increased use and understanding of statistics and other information about social, economic and environmental development at local, national and global levels. - Gapminder Wikipedia
gapminder = px.data.gapminder()
gapminder
|  | country | continent | year | lifeExp | pop | gdpPercap | iso_alpha | iso_num |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | Afghanistan | Asia | 1952 | 28.80 | 8425333 | 779.45 | AFG | 4 |
| 1 | Afghanistan | Asia | 1957 | 30.33 | 9240934 | 820.85 | AFG | 4 |
| 2 | Afghanistan | Asia | 1962 | 32.00 | 10267083 | 853.10 | AFG | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1701 | Zimbabwe | Africa | 1997 | 46.81 | 11404948 | 792.45 | ZWE | 716 |
| 1702 | Zimbabwe | Africa | 2002 | 39.99 | 11926563 | 672.04 | ZWE | 716 |
| 1703 | Zimbabwe | Africa | 2007 | 43.49 | 12311143 | 469.71 | ZWE | 716 |

1704 rows × 8 columns
The dataset contains information for each country for several different years.
gapminder.get('year').unique()
array([1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002, 2007], dtype=int64)
Let's start by just looking at 2007 data (the most recent year in the dataset).
gapminder_2007 = gapminder[gapminder.get('year') == 2007]
gapminder_2007
|  | country | continent | year | lifeExp | pop | gdpPercap | iso_alpha | iso_num |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 11 | Afghanistan | Asia | 2007 | 43.83 | 31889923 | 974.58 | AFG | 4 |
| 23 | Albania | Europe | 2007 | 76.42 | 3600523 | 5937.03 | ALB | 8 |
| 35 | Algeria | Africa | 2007 | 72.30 | 33333216 | 6223.37 | DZA | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1679 | Yemen, Rep. | Asia | 2007 | 62.70 | 22211743 | 2280.77 | YEM | 887 |
| 1691 | Zambia | Africa | 2007 | 42.38 | 11746035 | 1271.21 | ZMB | 894 |
| 1703 | Zimbabwe | Africa | 2007 | 43.49 | 12311143 | 469.71 | ZWE | 716 |

142 rows × 8 columns
We can plot life expectancy vs. GDP per capita. If you hover over a point, you will see the name of the country.
px.scatter(gapminder_2007, x='gdpPercap', y='lifeExp', hover_name='country')
In future courses, you'll learn about transformations. Here, we'll apply a log transformation to the x-axis to make the plot look a little more linear.
px.scatter(gapminder_2007, x='gdpPercap', y='lifeExp', log_x=True, hover_name='country')
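To make the transformation explicit, here's a sketch that applies np.log10 to the column manually and plots it on a linear axis; log_gdp is a made-up column name, and the result should look like the log_x=True plot above (just with different tick labels):

# Manually log-transforming GDP per capita instead of using log_x=True
px.scatter(gapminder_2007.assign(log_gdp=np.log10(gapminder_2007.get('gdpPercap'))),
           x='log_gdp', y='lifeExp', hover_name='country')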
We can take things one step further.
px.scatter(gapminder,
x = 'gdpPercap',
y = 'lifeExp',
hover_name = 'country',
color = 'continent',
size = 'pop',
size_max = 60,
log_x = True,
range_y = [30, 90],
animation_frame = 'year',
title = 'Life Expectancy, GDP Per Capita, and Population over Time'
)
Watch this video if you want to see an even-more-animated version of this plot.
px.histogram(gapminder,
x = 'lifeExp',
animation_frame = 'year',
range_x = [20, 90],
range_y = [0, 50],
title = 'Distribution of Life Expectancy over Time')
px.choropleth(gapminder,
locations = 'iso_alpha',
color = 'lifeExp',
hover_name = 'country',
hover_data = {'iso_alpha': False},
title = 'Life Expectancy Per Country',
color_continuous_scale = px.colors.sequential.tempo
)
Data science is about drawing useful conclusions from data using computation. Throughout the quarter, we touched on several aspects of data science:
This course would not have been possible without...