from dsc80_utils import *

# Locations of the three raw CSV files, all inside the local data/ directory.
rest_path = Path('data', 'restaurants.csv')
insp_path = Path('data', 'inspections.csv')
viol_path = Path('data', 'violations.csv')

# Read each file into its own DataFrame with pandas' default parsing.
rest, insp, viol = (
    pd.read_csv(csv_path) for csv_path in (rest_path, insp_path, viol_path)
)

rest.head(2)

rest.columns

Index(['business_id', 'name', 'business_type', 'address', 'city', 'zip',
       'phone', 'status', 'lat', 'long', 'opened_date', 'distance'],
      dtype='object')

insp.head(2)

insp.columns

Index(['custom_id', 'business_id', 'inspection_id', 'description', 'type',
       'score', 'grade', 'completed_date', 'status', 'link', 'status_link'],
      dtype='object')

viol.head(2)

viol.columns

Index(['inspection_id', 'violation', 'major_violation', 'status',
       'violation_text', 'correction_type_link', 'violation_accela', 'link'],
      dtype='object')

# Histogram of every inspection score.
fig = px.histogram(insp['score'])
fig

# Mean score for each letter grade; rows missing a grade or a score are
# dropped before grouping. as_index=False keeps 'grade' as a regular column.
scores = (
    insp[['grade', 'score']]
    .dropna()
    .groupby('grade', as_index=False)
    .mean()
)
# px.bar takes the DataFrame plus the names of the columns to put on each axis.
px.bar(scores, x='grade', y='score')

# The same bar chart, requested through the DataFrame's own .plot accessor.
scores.plot.bar(x='grade', y='score')

# Your code goes here.

# pandas stores these as ints, but they're actually nominal.
rest['business_id']

0      211898487641
1      211930769329
2      211909057778
           ...     
997    211899338714
998    211942150255
999    211925713322
Name: business_id, Length: 1000, dtype: int64

# pandas stores these as strings, but they're actually dates, which behave like numeric values (they can be ordered and subtracted).
rest['opened_date']

0      2002-05-05
1      2023-07-24
2      2019-01-22
          ...    
997    2002-05-05
998    2016-11-03
999    2022-11-03
Name: opened_date, Length: 1000, dtype: object

rest.sample(5)

insp['grade'].value_counts()

A    2978
B      11
Name: grade, dtype: int64

insp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5179 entries, 0 to 5178
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   custom_id       5179 non-null   object 
 1   business_id     5179 non-null   int64  
 2   inspection_id   5179 non-null   int64  
 3   description     0 non-null      float64
 4   type            5179 non-null   object 
 5   score           5179 non-null   int64  
 6   grade           2989 non-null   object 
 7   completed_date  5179 non-null   object 
 8   status          5179 non-null   object 
 9   link            5179 non-null   object 
 10  status_link     5179 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 445.2+ KB

# Are there multiple restaurants with the same address?
rest['address'].value_counts()

5300 GRAND DEL MAR CT, SAN DIEGO, CA 92130       9
8657 VILLA LA JOLLA DR, LA JOLLA, CA 92037       8
4545 LA JOLLA VILLAGE DR, SAN DIEGO, CA 92122    8
                                                ..
3963 GOVERNOR DR, SAN DIEGO, CA 92122            1
4041 GOVERNOR DR, SAN DIEGO, CA 92122-2520       1
2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014       1
Name: address, Length: 863, dtype: int64

# Every row whose address appears on two or more rows.
(
    rest
    .groupby('address')
    .filter(lambda group: len(group) > 1)
    .sort_values(by='address')
)

# Same selection without a groupby: keep=False flags every member of a
# duplicated address, not just the repeats after the first occurrence.
rest.loc[rest.duplicated(subset='address', keep=False)].sort_values(by='address')

rest[['address', 'zip']]

insp[['score', 'grade']]

def subset_rest(rest):
    """Return only the restaurant columns used by the rest of the analysis.

    Selects business_id, name, address, zip and opened_date, in that order,
    from `rest` and returns them as a new DataFrame. A KeyError is raised if
    any of those columns is missing.
    """
    keep = ['business_id', 'name', 'address', 'zip', 'opened_date']
    return rest.loc[:, keep]

# Load the raw restaurant CSV and immediately narrow it to the kept columns.
rest = pd.read_csv(rest_path).pipe(subset_rest)
rest

# Equivalent nested-call form. The .pipe version reads left to right and is
# easier to extend with further .pipe steps later.
subset_rest(pd.read_csv(rest_path))

def subset_insp(insp):
    """Keep the inspection columns needed later and rename completed_date to date.

    Returns a new DataFrame with the columns business_id, inspection_id,
    score, grade, date and status, in that order.
    """
    keep = ['business_id', 'inspection_id', 'score', 'grade', 'completed_date', 'status']
    trimmed = insp.loc[:, keep]
    return trimmed.rename(columns={'completed_date': 'date'})

# Reload the inspections CSV and keep only the columns chosen by subset_insp.
insp = pd.read_csv(insp_path).pipe(subset_insp)

def subset_viol(viol):
    """Keep four violation columns and give three of them clearer names.

    Returns a new DataFrame with the columns inspection_id, kind, is_major
    and violation, in that order.
    """
    keep = ['inspection_id', 'violation', 'major_violation', 'violation_accela']
    new_names = {
        'violation': 'kind',
        'major_violation': 'is_major',
        'violation_accela': 'violation',
    }
    # rename maps the original labels in one pass, so the old 'violation'
    # becomes 'kind' while 'violation_accela' takes over the name 'violation'.
    return viol.loc[:, keep].rename(columns=new_names)

# Reload the violations CSV and keep only the renamed columns from subset_viol.
viol = pd.read_csv(viol_path).pipe(subset_viol)

def merge_all_restaurant_data():
    """Left-join the module-level rest, insp and viol frames into one DataFrame.

    Restaurants are joined to their inspections on business_id, and the
    result is joined to violations on inspection_id. Both joins are left
    joins, so rows from the left-hand frame are kept even without a match.
    The three frames are read when the function is called, so calling it
    again after rebuilding any of them yields an updated result.
    """
    with_inspections = rest.merge(insp, how='left', on='business_id')
    return with_inspections.merge(viol, how='left', on='inspection_id')

# Build the combined restaurant/inspection/violation table from the current
# rest, insp and viol frames, then display it.
df = merge_all_restaurant_data()
df

insp[['score', 'grade']]

# The proportion of values in each column that are missing.
insp.isna().mean()

business_id      0.00
inspection_id    0.00
score            0.00
grade            0.42
date             0.00
status           0.00
dtype: float64

# Why are there null values here?
# insp['inspection_id'] and viol['inspection_id'] don't have any null values...
df[df['inspection_id'].isna()]

# Look at the dtype!
insp['date']

0       2023-02-16
1       2022-01-03
2       2020-12-03
           ...    
5176    2023-03-06
5177    2022-12-09
5178    2022-11-30
Name: date, Length: 5179, dtype: object

# This format string tells pandas how the date text is laid out:
# %Y is the four-digit year, %m the zero-padded month, %d the zero-padded day.
# For more info: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
date_format = '%Y-%m-%d'
# Returns a new datetime Series; insp itself is not modified by this call.
pd.to_datetime(insp['date'], format=date_format)

0      2023-02-16
1      2022-01-03
2      2020-12-03
          ...    
5176   2023-03-06
5177   2022-12-09
5178   2022-11-30
Name: date, Length: 5179, dtype: datetime64[ns]

# Another advantage of defining functions is that we can reuse this function
# for the 'opened_date' column in `rest` if we wanted to.
def parse_dates(insp, col):
    """Return a copy of `insp` with column `col` parsed into datetimes.

    The values in `col` are parsed with the format '%Y-%m-%d'. The input
    DataFrame is not modified; the parsed column replaces the original one
    in the returned frame.
    """
    converted = {col: pd.to_datetime(insp[col], format='%Y-%m-%d')}
    return insp.assign(**converted)

# Rebuild insp from the CSV, this time with 'date' parsed into datetimes.
insp = pd.read_csv(insp_path).pipe(subset_insp).pipe(parse_dates, 'date')

# df was merged from the previous insp, so it has to be rebuilt as well;
# merge_all_restaurant_data picks up the new insp when it is called.
df = merge_all_restaurant_data()

# The merged 'date' column now has a datetime dtype.
df['date']

0      2023-02-16
1      2022-01-03
2      2020-12-03
          ...    
8728   2022-11-30
8729   2022-11-30
8730   2022-11-30
Name: date, Length: 8731, dtype: datetime64[ns]

# Average the numeric columns within each two-week bin of inspection dates.
# numeric_only=True skips the text columns ('grade', 'status'): newer pandas
# versions raise a TypeError on them instead of silently dropping them.
insp.resample('2W', on='date').mean(numeric_only=True)

# Where are those numbers coming from?
# Weekly resampling bins are closed on the right and labelled by their right
# edge, so the row labelled 2020-01-19 covers (2020-01-05, 2020-01-19]:
# the left boundary is excluded and the right boundary is included.
insp[
    (insp['date'] > pd.Timestamp('2020-01-05')) &
    (insp['date'] <= pd.Timestamp('2020-01-19'))
]['score']

10        0
11       92
12        0
       ... 
4709      0
4988    100
5107     96
Name: score, Length: 86, dtype: int64

# Count the inspections in each two-week bin and plot the counts over time.
insp.resample('2W', on='date').size().plot(title='Number of Inspections Over Time')

insp['date']

0      2023-02-16
1      2022-01-03
2      2020-12-03
          ...    
5176   2023-03-06
5177   2022-12-09
5178   2022-11-30
Name: date, Length: 5179, dtype: datetime64[ns]

insp['date'].dt.day

0       16
1        3
2        3
        ..
5176     6
5177     9
5178    30
Name: date, Length: 5179, dtype: int64

insp['date'].dt.dayofweek

0       3
1       0
2       3
       ..
5176    0
5177    4
5178    2
Name: date, Length: 5179, dtype: int64

# Number of inspections on each day of the week (0 = Monday, ..., 6 = Sunday).
dow_counts = insp['date'].dt.dayofweek.value_counts()
fig = px.bar(dow_counts)
# Replace the numeric ticks 0-6 with readable day names.
fig.update_xaxes(
    tickvals=np.arange(7),
    ticktext=['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'],
)

# A small "wide" table: one row per year (the index), one column per month.
wide_example = pd.DataFrame(
    {'Jan': [10, 130], 'Feb': [20, 200], 'Mar': [30, 340]},
    index=pd.Index([2001, 2002], name='Year'),
)
wide_example

# Unpivot to "long" form: one row per (year, month) pair. ignore_index=False
# keeps Year as the index instead of replacing it with 0..n-1.
wide_example.melt(ignore_index=False)

	inspection_id	violation	major_violation	status	violation_text	correction_type_link	violation_accela	link
0	6886133	Hot and Cold Water	Y	Out of Compliance - Major	Hot and Cold Water	http://www.sandiegocounty.gov/deh/fhd/ffis/vio...	21. Hot & cold water available	http://www.sandiegocounty.gov/deh/fhd/ffis/vio...
1	6631228	Hot and Cold Water	N	Out of Compliance - Minor	Hot and Cold Water	http://www.sandiegocounty.gov/deh/fhd/ffis/vio...	21. Hot & cold water available	http://www.sandiegocounty.gov/deh/fhd/ffis/vio...

	business_id	name	business_type	address	...	lat	long	opened_date	distance
479	211980154608	OLIVE AND BASIL	Restaurant Food Facility	8008 GIRARD AVE, SUITE# 220, LA JOLLA, CA 9203...	...	32.85	-117.27	2019-03-13	3.23
452	211969917510	HAAGEN DAZS SHOP 146	Low Risk Food Facility	1172 PROSPECT ST, LA JOLLA, CA 92037-4533	...	32.85	-117.27	2014-12-29	3.12
635	211946349576	GLUTEN FREEDOM BAKERY	Restaurant Food Facility	8597 SPECTRUM LN, SAN DIEGO, CA 92121-2652	...	32.88	-117.17	2019-12-19	3.74
662	211899382194	CENTERPARK CAFE & CATERING II	Restaurant Food Facility	9975 SUMMERS RIDGE RD, SAN DIEGO, CA 92121-2997	...	32.90	-117.17	2011-05-09	3.83
381	211972295203	THE COFFEE BEAN AND TEA LEAF	Low Risk Food Facility	3939 GOVERNOR DR, SAN DIEGO, CA 92122-2520	...	32.85	-117.20	2014-07-02	2.51

	business_id	name	business_type	address	...	lat	long	opened_date	distance
406	211899308875	NASEEMS BAKERY & KABOB	Restaurant Food Facility	10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121	...	32.90	-117.19	2012-04-17	2.77
402	211898699154	HANAYA SUSHI CAFE	Restaurant Food Facility	10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121	...	32.90	-117.19	2011-03-22	2.77
401	211899558107	ARMANDOS MEXICAN FOOD	Restaurant Food Facility	10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121	...	32.90	-117.19	2005-06-28	2.77
...	...	...	...	...	...	...	...	...	...
575	211972411855	TARA HEATHER CAKE DESIGN	Caterer	9932 MESA RIM RD, SUITE# A, SAN DIEGO, CA 9212...	...	32.90	-117.18	2014-04-24	3.51
344	211990537315	COMPASS GROUP FEDEX EXPRESS OLSON	Pre-Packaged Retail Market	9999 OLSON DR, SAN DIEGO, CA 92121-2837	...	32.89	-117.20	2022-10-19	2.27
343	211976587262	CANTEEN - FED EX OLSON	Pre-Packaged Retail Market	9999 OLSON DR, SAN DIEGO, CA 92121-2837	...	32.89	-117.20	2020-07-31	2.27

	business_id	name	business_type	address	...	lat	long	opened_date	distance
406	211899308875	NASEEMS BAKERY & KABOB	Restaurant Food Facility	10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121	...	32.90	-117.19	2012-04-17	2.77
402	211898699154	HANAYA SUSHI CAFE	Restaurant Food Facility	10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121	...	32.90	-117.19	2011-03-22	2.77
401	211899558107	ARMANDOS MEXICAN FOOD	Restaurant Food Facility	10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121	...	32.90	-117.19	2005-06-28	2.77
...	...	...	...	...	...	...	...	...	...
575	211972411855	TARA HEATHER CAKE DESIGN	Caterer	9932 MESA RIM RD, SUITE# A, SAN DIEGO, CA 9212...	...	32.90	-117.18	2014-04-24	3.51
344	211990537315	COMPASS GROUP FEDEX EXPRESS OLSON	Pre-Packaged Retail Market	9999 OLSON DR, SAN DIEGO, CA 92121-2837	...	32.89	-117.20	2022-10-19	2.27
343	211976587262	CANTEEN - FED EX OLSON	Pre-Packaged Retail Market	9999 OLSON DR, SAN DIEGO, CA 92121-2837	...	32.89	-117.20	2020-07-31	2.27

	business_id	name	address	zip	...	status	kind	is_major	violation
759	211941133403	TASTY CHAI	8878 REGENTS RD 105, SAN DIEGO, CA 92122-5853	92122-5853	...	NaN	NaN	NaN	NaN
1498	211915545446	EMBASSY SUITES SAN DIEGO LA JOLLA	4550 LA JOLLA VILLAGE DR, SAN DIEGO, CA 92122-...	92122-1248	...	NaN	NaN	NaN	NaN
1672	211937443689	SERVICENOW	4770 EASTGATE MALL, SAN DIEGO, CA 92121-1970	92121-1970	...	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...
8094	211997340975	COOKIE SCOOP	7759 GASTON DR, SAN DIEGO, CA 92126-3036	92126-3036	...	NaN	NaN	NaN	NaN
8450	211900595220	I LOVE BANANA BREAD CO	4068 DALLES AVE, SAN DIEGO, CA 92117-5518	92117-5518	...	NaN	NaN	NaN	NaN
8545	211963768842	PETRA KITCHEN	5252 BALBOA ARMS DR 175, SAN DIEGO, CA 92117-4949	92117-4949	...	NaN	NaN	NaN	NaN

	business_id	name	business_type	address	...	lat	long	opened_date	distance
0	211898487641	MOBIL MART LA JOLLA VILLAGE	Pre-Packaged Retail Market	3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037	...	32.87	-117.23	2002-05-05	0.62
1	211930769329	CAFE 477	Low Risk Food Facility	8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,...	...	32.87	-117.24	2023-07-24	0.64

	custom_id	business_id	inspection_id	description	...	completed_date	status	link	status_link
0	DEH2002-FFPN-310012	211898487641	6886133	NaN	...	2023-02-16	Complete	http://www.sandiegocounty.gov/deh/fhd/ffis/ins...	http://www.sandiegocounty.gov/deh/fhd/ffis/ins...
1	DEH2002-FFPN-310012	211898487641	6631228	NaN	...	2022-01-03	Complete	http://www.sandiegocounty.gov/deh/fhd/ffis/ins...	http://www.sandiegocounty.gov/deh/fhd/ffis/ins...

	address	zip
0	3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037	92037
1	8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,...	92037-1704
2	6902 LA JOLLA BLVD, LA JOLLA, CA 92037	92037
...	...	...
997	1234 TOURMALINE ST, SAN DIEGO, CA 92109-1856	92109-1856
998	12925 EL CAMINO REAL, SUITE# AA4, SAN DIEGO, C...	92130
999	2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014	92014

	business_id	inspection_id	score
date
2020-01-05	2.12e+11	6.35e+06	42.67
2020-01-19	2.12e+11	6.30e+06	59.33
2020-02-02	2.12e+11	6.32e+06	56.34
...	...	...	...
2023-09-24	2.12e+11	7.15e+06	66.60
2023-10-08	2.12e+11	7.19e+06	59.58
2023-10-22	2.12e+11	7.20e+06	66.81

	variable	value
Year
2001	Jan	10
2002	Jan	130
2001	Feb	20
2002	Feb	200
2001	Mar	30
2002	Mar	340

Lecture 5 – Exploratory Data Analysis and Data Cleaning¶

DSC 80, Spring 2024¶

Announcements 📣¶

Agenda 📆¶

Question 🤔 (Answer at q.dsc80.com)

Dataset overview¶

San Diego food safety¶

99% Of San Diego Restaurants Earn ‘A’ Grades, Bringing Usefulness of System Into Question¶

The data¶

Question 🤔 (Answer at q.dsc80.com)

Introduction to plotly¶

plotly¶

Using plotly¶

Initial plots¶

Exploratory data analysis and feature types¶

The data science lifecycle, revisited¶

Exploratory data analysis (EDA)¶

Different feature types¶

Question 🤔 (Answer at q.dsc80.com)

Feature types vs. data types¶

Data cleaning¶

Four pillars of data cleaning¶

Data cleaning: Data quality checks¶

Data quality checks¶

Scope¶

Measurements and values¶

Relationships¶

Analysis¶

💡 Pro-Tip: Using pipe¶

Combining the restaurant data¶

Question 🤔 (Answer at q.dsc80.com)

Data cleaning: Missing values¶

Missing values¶

Data cleaning: Transformations and timestamps¶

Transformations and timestamps¶

Creating timestamps¶

Working with timestamps¶

The .dt accessor¶

Data cleaning: Modifying structure¶

Reshaping DataFrames¶

Using melt¶

Example usage of melt¶

Exploration¶

Question 🤔 (Answer at q.dsc80.com)

Example question: Can we rank restaurants by their number of violations? How about separately for each zip code?¶

Summary, next time¶

Summary¶

Next time¶

Introduction to `plotly`¶

`plotly`¶

Using `plotly`¶

💡 Pro-Tip: Using `pipe`¶

The `.dt` accessor¶

Using `melt`¶

Example usage of `melt`¶