from dsc80_utils import *
Announcements 📣¶
- Project 1 is due on Saturday, January 27th.
- Lab 3 will be released tonight and will be due on Monday, January 29th.
- If you submitted Lab 2, make sure to attend discussion tomorrow and submit the reflection form on Gradescope by this Thursday for extra credit!
- Plans for discussion tomorrow, in addition to lab review: the difference between `int` and `np.int64`, and a review of the various DataFrameGroupBy methods (`agg`, `transform`, `filter`, `apply`).
Agenda 📆¶
- Other data representations.
- Dataset overview.
- Introduction to `plotly`.
- Exploratory data analysis and feature types.
- Data cleaning.
  - Data quality checks.
  - Missing values.
  - Transformations and timestamps.
  - Modifying structure.
- Investigating student-submitted questions!
Question 🤔 (Answer at q.dsc80.com)
Remember, you can always ask questions at q.dsc80.com! If the link doesn't work for you, click the 🤔 Lecture Questions link in the top right corner of the course website.
Other data representations¶
Representations of tabular data¶
- In DSC 80, we work with DataFrames in `pandas`.
  - When we say `pandas` DataFrame, we're talking about the `pandas` API for its DataFrame objects.
  - API stands for "application programming interface." We'll learn more about these soon.
  - When we say "DataFrame", we're referring to a general way to represent data (rows and columns, with labels for both rows and columns).
- There are many other ways to work with data tables!
  - Examples: R data frames, SQL databases, spreadsheets, or even matrices from linear algebra.
  - When you learn SQL in DSC 100, you'll find many similarities (e.g. slicing columns, filtering rows, grouping, joining, etc.).
  - Relational algebra captures common data operations between many data table systems.
- Why use DataFrames over something else?
DataFrames vs. spreadsheets¶
- DataFrames give us a data lineage: the code records each change made to the data. Not so in spreadsheets!
- Using a general-purpose programming language gives us the ability to handle much larger datasets, and we can use distributed computing systems to handle massive datasets.
DataFrames vs. matrices¶
$$\mathbf{X} = \begin{bmatrix} 1 & 0 \\ 0 & 4 \\ 0 & 0 \end{bmatrix}$$

- Matrices are mathematical objects. They only hold numbers, but have many useful properties (which you've learned about in your linear algebra class, Math 18).
- Often, we process data from a DataFrame into matrix format for machine learning models. You saw this a bit in DSC 40A, and we'll see this more in DSC 80 in a few weeks.
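A minimal sketch of that handoff (the DataFrame and column names here are hypothetical, not from our dataset):

# A small DataFrame of numeric features...
features = pd.DataFrame({'score': [96, 98, 90], 'num_violations': [1, 0, 2]})

# ...becomes a plain matrix (a NumPy array), ready for a machine learning model.
X = features.to_numpy()
X.shape  # (3, 2)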
DataFrames vs. relations¶
- Relations are the data representation for relational database systems (e.g. MySQL, PostgreSQL, etc.).
- You'll learn all about these in DSC 100.
- Database systems are much better than DataFrames at storing many data tables and handling concurrency (many people reading and writing data at the same time).
- Common workflow: load a subset of data from a database system into `pandas`, then make a plot.
- Or: load and clean data in `pandas`, then store it in a database system for others to use.
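A hedged sketch of that workflow using Python's built-in sqlite3 (the database file and table names here are hypothetical):

import sqlite3

# Connect to a (hypothetical) database and load just the rows we need.
conn = sqlite3.connect('food_safety.db')
low_scores = pd.read_sql('SELECT business_id, score FROM inspections WHERE score < 90', conn)

# ...clean and analyze in pandas, then write results back for others to use.
low_scores.to_sql('low_scores', conn, if_exists='replace', index=False)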
Dataset overview¶
San Diego food safety¶
From this article (archive link):
In the last three years, one third of San Diego County restaurants have had at least one major food safety violation.
99% Of San Diego Restaurants Earn ‘A’ Grades, Bringing Usefulness of System Into Question¶
From this article (archive link):
Food held at unsafe temperatures. Employees not washing their hands. Dirty countertops. Vermin in the kitchen. An expired restaurant permit.
Restaurant inspectors for San Diego County found these violations during a routine health inspection of a diner in La Mesa in November 2016. Despite the violations, the restaurant was awarded a score of 90 out of 100, the lowest possible score to achieve an ‘A’ grade.
The data¶
- We downloaded the data about the 1000 restaurants closest to UCSD from here.
- We had to download the data as JSON files, then process it into DataFrames. You'll learn how to do this soon!
- Until now, you've (largely) been presented with CSV files that `pd.read_csv` could load without any issues.
  - But there are many different formats and possible issues when loading data in from files.
  - See Chapter 8 of Learning DS for more.
rest_path = Path('data') / 'restaurants.csv'
insp_path = Path('data') / 'inspections.csv'
viol_path = Path('data') / 'violations.csv'
rest = pd.read_csv(rest_path)
insp = pd.read_csv(insp_path)
viol = pd.read_csv(viol_path)
Exercise
The first article said that one third of restaurants had at least one major safety violation. Which DataFrames and columns seem most useful to verify this?
rest
business_id | name | business_type | address | ... | lat | long | opened_date | distance | |
---|---|---|---|---|---|---|---|---|---|
0 | 211898487641 | MOBIL MART LA JOLLA VILLAGE | Pre-Packaged Retail Market | 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 | ... | 32.87 | -117.23 | 2002-05-05 | 0.62 |
1 | 211930769329 | CAFE 477 | Low Risk Food Facility | 8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,... | ... | 32.87 | -117.24 | 2023-07-24 | 0.64 |
2 | 211909057778 | VALLEY FARM MARKET | Retail Market with Deli | 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 | ... | 32.87 | -117.24 | 2019-01-22 | 0.64 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
997 | 211899338714 | PACIFIC BEACH ELEMENTARY | School Processing Food Facility | 1234 TOURMALINE ST, SAN DIEGO, CA 92109-1856 | ... | 32.81 | -117.25 | 2002-05-05 | 4.97 |
998 | 211942150255 | POKEWAN DEL MAR | Restaurant Food Facility | 12925 EL CAMINO REAL, SUITE# AA4, SAN DIEGO, C... | ... | 32.95 | -117.23 | 2016-11-03 | 4.97 |
999 | 211925713322 | SAFFRONO LOUNGE RESTAURANT | Restaurant Food Facility | 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 | ... | 32.95 | -117.25 | 2022-11-03 | 4.97 |
1000 rows × 12 columns
rest.columns
Index(['business_id', 'name', 'business_type', 'address', 'city', 'zip', 'phone', 'status', 'lat', 'long', 'opened_date', 'distance'], dtype='object')
insp.head()
custom_id | business_id | inspection_id | description | ... | completed_date | status | link | status_link | |
---|---|---|---|---|---|---|---|---|---|
0 | DEH2002-FFPN-310012 | 211898487641 | 6886133 | NaN | ... | 2023-02-16 | Complete | http://www.sandiegocounty.gov/deh/fhd/ffis/ins... | http://www.sandiegocounty.gov/deh/fhd/ffis/ins... |
1 | DEH2002-FFPN-310012 | 211898487641 | 6631228 | NaN | ... | 2022-01-03 | Complete | http://www.sandiegocounty.gov/deh/fhd/ffis/ins... | http://www.sandiegocounty.gov/deh/fhd/ffis/ins... |
2 | DEH2002-FFPN-310012 | 211898487641 | 6357338 | NaN | ... | 2020-12-03 | Complete | http://www.sandiegocounty.gov/deh/fhd/ffis/ins... | http://www.sandiegocounty.gov/deh/fhd/ffis/ins... |
3 | DEH2023-FFPP-016887 | 211930769329 | 7329834 | NaN | ... | 2023-09-20 | Complete | http://www.sandiegocounty.gov/deh/fhd/ffis/ins... | http://www.sandiegocounty.gov/deh/fhd/ffis/ins... |
4 | DEH2019-FFPP-010654 | 211909057778 | 7233091 | NaN | ... | 2023-05-26 | Complete | http://www.sandiegocounty.gov/deh/fhd/ffis/ins... | http://www.sandiegocounty.gov/deh/fhd/ffis/ins... |
5 rows × 11 columns
insp.columns
Index(['custom_id', 'business_id', 'inspection_id', 'description', 'type', 'score', 'grade', 'completed_date', 'status', 'link', 'status_link'], dtype='object')
viol.head()
inspection_id | violation | major_violation | status | violation_text | correction_type_link | violation_accela | link | |
---|---|---|---|---|---|---|---|---|
0 | 6886133 | Hot and Cold Water | Y | Out of Compliance - Major | Hot and Cold Water | http://www.sandiegocounty.gov/deh/fhd/ffis/vio... | 21. Hot & cold water available | http://www.sandiegocounty.gov/deh/fhd/ffis/vio... |
1 | 6631228 | Hot and Cold Water | N | Out of Compliance - Minor | Hot and Cold Water | http://www.sandiegocounty.gov/deh/fhd/ffis/vio... | 21. Hot & cold water available | http://www.sandiegocounty.gov/deh/fhd/ffis/vio... |
2 | 6357338 | Holding Temperatures | N | Out of Compliance - Minor | Holding Temperatures | http://www.sandiegocounty.gov/deh/fhd/ffis/vio... | 7. Proper hot & cold holding temperatures | http://www.sandiegocounty.gov/deh/fhd/ffis/vio... |
3 | 6939628 | Holding Temperatures | Y | Out of Compliance - Major | Holding Temperatures | http://www.sandiegocounty.gov/deh/fhd/ffis/vio... | 7. Proper hot & cold holding temperatures | http://www.sandiegocounty.gov/deh/fhd/ffis/vio... |
4 | 6939628 | Approved Procedures | N | Out of Compliance - Minor | Approved Procedures | http://www.sandiegocounty.gov/deh/fhd/ffis/vio... | 18. Compliance with | http://www.sandiegocounty.gov/deh/fhd/ffis/vio... |
viol.columns
Index(['inspection_id', 'violation', 'major_violation', 'status', 'violation_text', 'correction_type_link', 'violation_accela', 'link'], dtype='object')
Introduction to `plotly`¶
- We've used `plotly` in lecture briefly, and you even have to use it in Project 1 Question 13, but we haven't yet discussed it formally.
- It's a visualization library that enables interactive visualizations.
Using `plotly`¶

There are a few ways we can use `plotly`:
- Using the `plotly.express` syntax.
  - `plotly` is very flexible, but it can be verbose; `plotly.express` allows us to make plots quickly.
  - See the documentation here – it's very rich (there are good examples for almost everything).
- By setting the `pandas` plotting backend to `'plotly'` (by default, it's `'matplotlib'`) and using the DataFrame `plot` method, as in the sketch below.
  - The DataFrame `plot` method is how you created plots in DSC 10!
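The `dsc80_utils` setup likely already configures this, but as a minimal sketch of the backend approach:

# Tell pandas to use plotly for .plot calls (the default is matplotlib).
pd.options.plotting.backend = 'plotly'

# Now the familiar DSC 10-style .plot method returns an interactive figure.
insp['score'].plot(kind='hist')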
For now, we'll use `plotly.express` syntax; it's imported in the `dsc80_utils.py` file that we import at the top of each lecture notebook.
Initial plots¶
First, let's look at the distribution of inspection `'score'`s:
fig = px.histogram(insp['score'])
fig
How about the distribution of average inspection `'score'` per `'grade'`?
scores = (
insp[['grade', 'score']]
.dropna()
.groupby('grade')
.mean()
.reset_index()
)
# px.bar wants a DataFrame with two columns, one with the labels and one with the values.
px.bar(scores, x='grade', y='score')
# Same as the above!
scores.plot(kind='bar', x='grade', y='score')
Exploratory data analysis and feature types¶
Exploratory data analysis (EDA)¶
- Historically, data analysis was dominated by formal statistics, including tools like confidence intervals, hypothesis tests, and statistical modeling.
- In 1977, John Tukey coined the term exploratory data analysis, describing a philosophy for how to approach data analysis:
Exploratory data analysis is actively incisive, rather than passively descriptive, with real emphasis on the discovery of the unexpected.
- Practically, EDA involves, among other things, computing summary statistics and drawing plots to understand the nature of the data at hand.
The greatest gains from data come from surprises… The unexpected is best brought to our attention by pictures.
Different feature types¶
Exercise

Determine the feature type of each of the following variables.

- `insp['score']`
- `insp['grade']`
- `viol['violation_accela']`
- `viol['major_violation']`
- `rest['business_id']`
- `rest['opened_date']`
# Your code goes here.
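This sketch doesn't answer the exercise for you, but it surfaces the evidence you'd use: the storage type and the number of distinct values of each variable.

# Peek at the dtype and number of distinct values for each variable.
cols = [insp['score'], insp['grade'], viol['violation_accela'],
        viol['major_violation'], rest['business_id'], rest['opened_date']]
for s in cols:
    print(f"{s.name}: dtype={s.dtype}, {s.nunique()} distinct values")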
Feature types vs. data types¶
- The data type `pandas` uses is not the same as the feature type we talked about just now!
  - There's a difference between feature type and computational data type.
- Take care when the two don't match up very well!
# pandas stores these as ints, but they're actually nominal.
rest['business_id']
0 211898487641 1 211930769329 2 211909057778 ... 997 211899338714 998 211942150255 999 211925713322 Name: business_id, Length: 1000, dtype: int64
# pandas stores these as strings, but they're actually dates.
rest['opened_date']
0 2002-05-05 1 2023-07-24 2 2019-01-22 ... 997 2002-05-05 998 2016-11-03 999 2022-11-03 Name: opened_date, Length: 1000, dtype: object
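One hedged sketch of fixing the mismatch (`rest_typed` is a name of our choosing): convert each column so its storage type matches its feature type.

# business_id is nominal, so store it as strings;
# opened_date holds dates, so store it as timestamps.
rest_typed = rest.assign(
    business_id=rest['business_id'].astype(str),
    opened_date=pd.to_datetime(rest['opened_date']),
)
rest_typed.dtypes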
Data cleaning¶
Four pillars of data cleaning¶
When loading in a dataset, to clean the data – that is, to prepare it for further analysis – we will:
- Perform data quality checks.
- Identify and handle missing values.
- Perform transformations, including converting time series data to timestamps.
- Modify structure as necessary.
Data cleaning: Data quality checks¶
Data quality checks¶
We often start an analysis by checking the quality of the data.
- Scope: Do the data match your understanding of the population?
- Measurements and values: Are the values reasonable?
- Relationships: Are related features in agreement?
- Analysis: Which features might be useful in a future analysis?
Scope¶
Do the data match your understanding of the population?
We were told that we're only looking at the 1000 restaurants closest to UCSD, so the restaurants in `rest` should agree with that.
rest.sample(5)
business_id | name | business_type | address | ... | lat | long | opened_date | distance | |
---|---|---|---|---|---|---|---|---|---|
259 | 211979092584 | CANTEEN COMPASS GROUP ILLUMINA BUILDING A | Pre-Packaged Retail Market | 4795 EXECUTIVE DR, SAN DIEGO, CA 92121-3091 | ... | 32.88 | -117.20 | 2017-09-29 | 1.83 |
215 | 211925186977 | BLUE BOTTLE COFFEE | Restaurant Food Facility | 4575 LA JOLLA VILLAGE DR, SAN DIEGO, CA 92122-... | ... | 32.87 | -117.21 | 2019-12-23 | 1.57 |
766 | 211898281635 | BOOMERANGS | Restaurant Food Facility | 4577 CLAIREMONT DR, SAN DIEGO, CA 92117 | ... | 32.83 | -117.21 | 2007-04-05 | 4.01 |
629 | 211898997541 | CARMEL VALLEY SHELL | Pre-Packaged Retail Market | 3060 CARMEL VALLEY RD, SAN DIEGO, CA 92130 | ... | 32.93 | -117.24 | 2009-07-22 | 3.72 |
776 | 211947621245 | BOAT TO TABLE HATANAKA | Retail Food Processing | 4706 CLAIREMONT MESA BLVD, SAN DIEGO, CA 92117... | ... | 32.84 | -117.19 | 2022-04-01 | 4.04 |
5 rows × 12 columns
Measurements and values¶
Are the values reasonable?
Do the values in the `'grade'` column match what we'd expect grades to look like?
insp['grade'].value_counts()
A 2978 B 11 Name: grade, dtype: int64
What kinds of information does the `insp` DataFrame hold?
insp.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5179 entries, 0 to 5178 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 custom_id 5179 non-null object 1 business_id 5179 non-null int64 2 inspection_id 5179 non-null int64 3 description 0 non-null float64 4 type 5179 non-null object 5 score 5179 non-null int64 6 grade 2989 non-null object 7 completed_date 5179 non-null object 8 status 5179 non-null object 9 link 5179 non-null object 10 status_link 5179 non-null object dtypes: float64(1), int64(3), object(7) memory usage: 445.2+ KB
What's going on in the `'address'` column of `rest`?
# Are there multiple restaurants with the same address?
rest['address'].value_counts()
5300 GRAND DEL MAR CT, SAN DIEGO, CA 92130 9 8657 VILLA LA JOLLA DR, LA JOLLA, CA 92037 8 4545 LA JOLLA VILLAGE DR, SAN DIEGO, CA 92122 8 .. 3963 GOVERNOR DR, SAN DIEGO, CA 92122 1 4041 GOVERNOR DR, SAN DIEGO, CA 92122-2520 1 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 1 Name: address, Length: 863, dtype: int64
# Keeps all rows with duplicate addresses.
(
rest
.groupby('address')
.filter(lambda df: df.shape[0] >= 2)
.sort_values('address')
)
business_id | name | business_type | address | ... | lat | long | opened_date | distance | |
---|---|---|---|---|---|---|---|---|---|
406 | 211899308875 | NASEEMS BAKERY & KABOB | Restaurant Food Facility | 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 | ... | 32.90 | -117.19 | 2012-04-17 | 2.77 |
402 | 211898699154 | HANAYA SUSHI CAFE | Restaurant Food Facility | 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 | ... | 32.90 | -117.19 | 2011-03-22 | 2.77 |
401 | 211899558107 | ARMANDOS MEXICAN FOOD | Restaurant Food Facility | 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 | ... | 32.90 | -117.19 | 2005-06-28 | 2.77 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
575 | 211972411855 | TARA HEATHER CAKE DESIGN | Caterer | 9932 MESA RIM RD, SUITE# A, SAN DIEGO, CA 9212... | ... | 32.90 | -117.18 | 2014-04-24 | 3.51 |
344 | 211990537315 | COMPASS GROUP FEDEX EXPRESS OLSON | Pre-Packaged Retail Market | 9999 OLSON DR, SAN DIEGO, CA 92121-2837 | ... | 32.89 | -117.20 | 2022-10-19 | 2.27 |
343 | 211976587262 | CANTEEN - FED EX OLSON | Pre-Packaged Retail Market | 9999 OLSON DR, SAN DIEGO, CA 92121-2837 | ... | 32.89 | -117.20 | 2020-07-31 | 2.27 |
213 rows × 12 columns
# Does the same thing as above!
(
rest[rest.duplicated(subset=['address'], keep=False)]
.sort_values('address')
)
business_id | name | business_type | address | ... | lat | long | opened_date | distance | |
---|---|---|---|---|---|---|---|---|---|
406 | 211899308875 | NASEEMS BAKERY & KABOB | Restaurant Food Facility | 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 | ... | 32.90 | -117.19 | 2012-04-17 | 2.77 |
402 | 211898699154 | HANAYA SUSHI CAFE | Restaurant Food Facility | 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 | ... | 32.90 | -117.19 | 2011-03-22 | 2.77 |
401 | 211899558107 | ARMANDOS MEXICAN FOOD | Restaurant Food Facility | 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 | ... | 32.90 | -117.19 | 2005-06-28 | 2.77 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
575 | 211972411855 | TARA HEATHER CAKE DESIGN | Caterer | 9932 MESA RIM RD, SUITE# A, SAN DIEGO, CA 9212... | ... | 32.90 | -117.18 | 2014-04-24 | 3.51 |
344 | 211990537315 | COMPASS GROUP FEDEX EXPRESS OLSON | Pre-Packaged Retail Market | 9999 OLSON DR, SAN DIEGO, CA 92121-2837 | ... | 32.89 | -117.20 | 2022-10-19 | 2.27 |
343 | 211976587262 | CANTEEN - FED EX OLSON | Pre-Packaged Retail Market | 9999 OLSON DR, SAN DIEGO, CA 92121-2837 | ... | 32.89 | -117.20 | 2020-07-31 | 2.27 |
213 rows × 12 columns
Relationships¶
Are related features in agreement?
Do the `'address'`es and `'zip'` codes in `rest` match?
rest[['address', 'zip']]
address | zip | |
---|---|---|
0 | 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 | 92037 |
1 | 8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,... | 92037-1704 |
2 | 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 | 92037 |
... | ... | ... |
997 | 1234 TOURMALINE ST, SAN DIEGO, CA 92109-1856 | 92109-1856 |
998 | 12925 EL CAMINO REAL, SUITE# AA4, SAN DIEGO, C... | 92130 |
999 | 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 | 92014 |
1000 rows × 2 columns
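The notebook doesn't verify this automatically, but here's one possible check: pull the ZIP code out of the end of each address with a regular expression and compare it to the `'zip'` column. (The exact match rate will depend on formatting quirks, like ZIP+4 suffixes appearing in one column but not the other.)

# Extract a trailing ZIP (optionally ZIP+4) from each address string.
zip_from_address = rest['address'].str.extract(r'(\d{5}(?:-\d{4})?)\s*$')[0]

# Proportion of rows where the two columns agree exactly.
(zip_from_address == rest['zip']).mean()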
What about the `'score'`s and `'grade'`s in `insp`?
insp[['score', 'grade']]
score | grade | |
---|---|---|
0 | 96 | NaN |
1 | 98 | NaN |
2 | 98 | NaN |
... | ... | ... |
5176 | 0 | NaN |
5177 | 0 | NaN |
5178 | 90 | A |
5179 rows × 2 columns
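Again, a quick hedged check: if scores and grades agree, each grade's scores should fall in the expected range (e.g. 90 or above for an 'A').

# Range of scores observed for each grade (rows with missing grades are
# excluded by groupby automatically).
insp.groupby('grade')['score'].agg(['min', 'max', 'count'])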
Analysis¶
Which features might be useful in a future analysis?
- We're most interested in:
  - These columns in the `rest` DataFrame: `'business_id'`, `'name'`, `'address'`, `'zip'`, and `'opened_date'`.
  - These columns in the `insp` DataFrame: `'business_id'`, `'inspection_id'`, `'score'`, `'grade'`, `'completed_date'`, and `'status'`.
  - These columns in the `viol` DataFrame: `'inspection_id'`, `'violation'`, `'major_violation'`, `'violation_text'`, and `'violation_accela'`.
- Also, let's rename a few columns to make them easier to work with.
💡 Pro-Tip: Using `pipe`¶

When we manipulate DataFrames, it's best to define individual functions for each step, then use the `pipe` method to chain them all together.

The `pipe` DataFrame method takes in a function, which itself takes in a DataFrame and returns a DataFrame.

- In practice, we would add functions one by one to the top of a notebook, then `pipe` them all.
- For today, we'll keep re-running `pipe` to show the data cleaning process.
def subset_rest(rest):
return rest[['business_id', 'name', 'address', 'zip', 'opened_date']]
rest = (
pd.read_csv(rest_path)
.pipe(subset_rest)
)
rest
business_id | name | address | zip | opened_date | |
---|---|---|---|---|---|
0 | 211898487641 | MOBIL MART LA JOLLA VILLAGE | 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 | 92037 | 2002-05-05 |
1 | 211930769329 | CAFE 477 | 8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,... | 92037-1704 | 2023-07-24 |
2 | 211909057778 | VALLEY FARM MARKET | 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 | 92037 | 2019-01-22 |
... | ... | ... | ... | ... | ... |
997 | 211899338714 | PACIFIC BEACH ELEMENTARY | 1234 TOURMALINE ST, SAN DIEGO, CA 92109-1856 | 92109-1856 | 2002-05-05 |
998 | 211942150255 | POKEWAN DEL MAR | 12925 EL CAMINO REAL, SUITE# AA4, SAN DIEGO, C... | 92130 | 2016-11-03 |
999 | 211925713322 | SAFFRONO LOUNGE RESTAURANT | 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 | 92014 | 2022-11-03 |
1000 rows × 5 columns
# Same as the above – but the above makes it easier to chain more .pipe calls afterwards.
subset_rest(pd.read_csv(rest_path))
business_id | name | address | zip | opened_date | |
---|---|---|---|---|---|
0 | 211898487641 | MOBIL MART LA JOLLA VILLAGE | 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 | 92037 | 2002-05-05 |
1 | 211930769329 | CAFE 477 | 8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,... | 92037-1704 | 2023-07-24 |
2 | 211909057778 | VALLEY FARM MARKET | 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 | 92037 | 2019-01-22 |
... | ... | ... | ... | ... | ... |
997 | 211899338714 | PACIFIC BEACH ELEMENTARY | 1234 TOURMALINE ST, SAN DIEGO, CA 92109-1856 | 92109-1856 | 2002-05-05 |
998 | 211942150255 | POKEWAN DEL MAR | 12925 EL CAMINO REAL, SUITE# AA4, SAN DIEGO, C... | 92130 | 2016-11-03 |
999 | 211925713322 | SAFFRONO LOUNGE RESTAURANT | 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 | 92014 | 2022-11-03 |
1000 rows × 5 columns
Let's use `pipe` to keep (and rename) the subset of the columns we care about in the other two DataFrames as well.
def subset_insp(insp):
return (
insp[['business_id', 'inspection_id', 'score', 'grade', 'completed_date', 'status']]
.rename(columns={'completed_date': 'date'})
)
insp = (
pd.read_csv(insp_path)
.pipe(subset_insp)
)
def subset_viol(viol):
return (
viol[['inspection_id', 'violation', 'major_violation', 'violation_accela']]
.rename(columns={'violation': 'kind',
'major_violation': 'is_major',
'violation_accela': 'violation'})
)
viol = (
pd.read_csv(viol_path)
.pipe(subset_viol)
)
Combining the restaurant data¶
Let's join all three DataFrames together so that we have all the data in a single DataFrame.
def merge_all_restaurant_data():
return (
rest
.merge(insp, on='business_id', how='left')
.merge(viol, on='inspection_id', how='left')
)
df = merge_all_restaurant_data()
df
business_id | name | address | zip | ... | status | kind | is_major | violation | |
---|---|---|---|---|---|---|---|---|---|
0 | 211898487641 | MOBIL MART LA JOLLA VILLAGE | 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 | 92037 | ... | Complete | Hot and Cold Water | Y | 21. Hot & cold water available |
1 | 211898487641 | MOBIL MART LA JOLLA VILLAGE | 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 | 92037 | ... | Complete | Hot and Cold Water | N | 21. Hot & cold water available |
2 | 211898487641 | MOBIL MART LA JOLLA VILLAGE | 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 | 92037 | ... | Complete | Holding Temperatures | N | 7. Proper hot & cold holding temperatures |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8728 | 211925713322 | SAFFRONO LOUNGE RESTAURANT | 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 | 92014 | ... | Complete | Equipment and Utensil Storage, Use | N | 35. Equipment / Utensils -approved, installed,... |
8729 | 211925713322 | SAFFRONO LOUNGE RESTAURANT | 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 | 92014 | ... | Complete | Toilet Facilities | N | 43. Toilet facilities -properly constructed, s... |
8730 | 211925713322 | SAFFRONO LOUNGE RESTAURANT | 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 | 92014 | ... | Complete | Floors, Walls, and Ceilings | N | 45. Floor, walls and ceilings - built, maintai... |
8731 rows × 13 columns
Data cleaning: Missing values¶
Missing values¶
Next, it's important to check for and handle missing values, as they can have a big effect on your analysis.
insp[['score', 'grade']]
score | grade | |
---|---|---|
0 | 96 | NaN |
1 | 98 | NaN |
2 | 98 | NaN |
... | ... | ... |
5176 | 0 | NaN |
5177 | 0 | NaN |
5178 | 90 | A |
5179 rows × 2 columns
# The proportion of values in each column that are missing.
insp.isna().mean()
business_id 0.00 inspection_id 0.00 score 0.00 grade 0.42 date 0.00 status 0.00 dtype: float64
# Why are there null values here?
# insp['inspection_id'] and viol['inspection_id'] don't have any null values...
df[df['inspection_id'].isna()]
business_id | name | address | zip | ... | status | kind | is_major | violation | |
---|---|---|---|---|---|---|---|---|---|
759 | 211941133403 | TASTY CHAI | 8878 REGENTS RD 105, SAN DIEGO, CA 92122-5853 | 92122-5853 | ... | NaN | NaN | NaN | NaN |
1498 | 211915545446 | EMBASSY SUITES SAN DIEGO LA JOLLA | 4550 LA JOLLA VILLAGE DR, SAN DIEGO, CA 92122-... | 92122-1248 | ... | NaN | NaN | NaN | NaN |
1672 | 211937443689 | SERVICENOW | 4770 EASTGATE MALL, SAN DIEGO, CA 92121-1970 | 92121-1970 | ... | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8094 | 211997340975 | COOKIE SCOOP | 7759 GASTON DR, SAN DIEGO, CA 92126-3036 | 92126-3036 | ... | NaN | NaN | NaN | NaN |
8450 | 211900595220 | I LOVE BANANA BREAD CO | 4068 DALLES AVE, SAN DIEGO, CA 92117-5518 | 92117-5518 | ... | NaN | NaN | NaN | NaN |
8545 | 211963768842 | PETRA KITCHEN | 5252 BALBOA ARMS DR 175, SAN DIEGO, CA 92117-4949 | 92117-4949 | ... | NaN | NaN | NaN | NaN |
29 rows × 13 columns
There are many ways of handling missing values, which we'll cover in an entire lecture next week. But a good first step is to check how many there are!
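As a preview of next week, here are two common (but not automatically appropriate!) strategies, sketched on the `'grade'` column:

# Option 1: drop rows that are missing a grade.
insp.dropna(subset=['grade'])

# Option 2: fill missing grades with a placeholder value.
# Whether either option is appropriate depends on WHY the values are missing!
insp['grade'].fillna('Missing')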
Data cleaning: Transformations and timestamps¶
Transformations and timestamps¶
From last class:
A transformation results from performing some operation on every element in a sequence, e.g. a Series.
It's often useful to look at ways of transforming your data to make it easier to work with.
- Type conversions (e.g. changing the string `"$2.99"` to the number `2.99`).
- Unit conversions (e.g. feet to meters).
- Extractions (e.g. getting `'vermin'` out of `'Vermin Violation Recorded on 10/10/2023'`).
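Here are minimal sketches of each kind of transformation, using hypothetical Series rather than columns from our dataset:

# Type conversion: '$2.99' (a string) -> 2.99 (a float).
prices = pd.Series(['$2.99', '$10.50'])
prices.str.strip('$').astype(float)

# Unit conversion: feet -> meters.
heights_ft = pd.Series([3.0, 10.0])
heights_m = heights_ft * 0.3048

# Extraction: pull 'Vermin' out of a longer description.
notes = pd.Series(['Vermin Violation Recorded on 10/10/2023'])
notes.str.extract(r'^(\w+) Violation')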
Creating timestamps¶
Most commonly, we'll parse dates into `pd.Timestamp` objects.
# Look at the dtype!
insp['date']
0 2023-02-16 1 2022-01-03 2 2020-12-03 ... 5176 2023-03-06 5177 2022-12-09 5178 2022-11-30 Name: date, Length: 5179, dtype: object
# This magical string tells Python what format the date is in.
# For more info: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
date_format = '%Y-%m-%d'
pd.to_datetime(insp['date'], format=date_format)
0 2023-02-16 1 2022-01-03 2 2020-12-03 ... 5176 2023-03-06 5177 2022-12-09 5178 2022-11-30 Name: date, Length: 5179, dtype: datetime64[ns]
# Another advantage of defining functions is that we can reuse this function
# for the 'opened_date' column in `rest` if we wanted to.
def parse_dates(insp, col):
date_format = '%Y-%m-%d'
dates = pd.to_datetime(insp[col], format=date_format)
return insp.assign(**{col: dates})
insp = (
pd.read_csv(insp_path)
.pipe(subset_insp)
.pipe(parse_dates, 'date')
)
# We should also remake df, since it depends on insp.
# Note that the new insp is used to create df!
df = merge_all_restaurant_data()
# Look at the dtype now!
df['date']
0 2023-02-16 1 2022-01-03 2 2020-12-03 ... 8728 2022-11-30 8729 2022-11-30 8730 2022-11-30 Name: date, Length: 8731, dtype: datetime64[ns]
Working with timestamps¶
- We often want to adjust the granularity of timestamps to see overall trends or seasonality.
- Use the `resample` method in `pandas` (documentation).
  - Think of it like a version of `groupby`, but for timestamps.
  - For instance, `insp.resample('2W', on='date')` separates every two weeks of data into a different group.
insp.resample('2W', on='date').mean()
business_id | inspection_id | score | |
---|---|---|---|
date | |||
2020-01-05 | 2.12e+11 | 6.35e+06 | 42.67 |
2020-01-19 | 2.12e+11 | 6.30e+06 | 59.33 |
2020-02-02 | 2.12e+11 | 6.32e+06 | 56.34 |
... | ... | ... | ... |
2023-09-24 | 2.12e+11 | 7.15e+06 | 66.60 |
2023-10-08 | 2.12e+11 | 7.19e+06 | 59.58 |
2023-10-22 | 2.12e+11 | 7.20e+06 | 66.81 |
100 rows × 3 columns
# Where are those numbers coming from?
insp[
(insp['date'] >= pd.Timestamp('2020-01-05')) &
(insp['date'] < pd.Timestamp('2020-01-19'))
]['score'].mean()
59.325581395348834
(insp.resample('2W', on='date')
.size()
.plot(title='Number of Inspections Over Time')
)
The `.dt` accessor¶

Like with Series of strings, `pandas` has a `.dt` accessor for properties of timestamps (documentation).
insp['date']
0 2023-02-16 1 2022-01-03 2 2020-12-03 ... 5176 2023-03-06 5177 2022-12-09 5178 2022-11-30 Name: date, Length: 5179, dtype: datetime64[ns]
insp['date'].dt.day
0 16 1 3 2 3 .. 5176 6 5177 9 5178 30 Name: date, Length: 5179, dtype: int64
insp['date'].dt.dayofweek
0 3 1 0 2 3 .. 5176 0 5177 4 5178 2 Name: date, Length: 5179, dtype: int64
dow_counts = insp['date'].dt.dayofweek.value_counts()
fig = px.bar(dow_counts)
fig.update_xaxes(tickvals=np.arange(7), ticktext=['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
Data cleaning: Modifying structure¶
Reshaping DataFrames¶
We often reshape the DataFrame's structure to make it more convenient for analysis. For example, we can:
- Simplify structure by removing columns or taking a set of rows for a particular period of time or geographic area.
  - We already did this!
- Adjust granularity by aggregating rows together.
  - To do this, use `groupby` (or `resample`, if working with timestamps).
- Reshape structure, most commonly by using the DataFrame `melt` method to un-pivot a DataFrame.
Using `melt`¶

- The `melt` method is common enough that we'll give it a special mention.
- We'll often encounter pivot tables (especially from government data), which we call wide data.
- The methods we've introduced work better with long-form data, or tidy data.
- To go from wide to long, `melt`.
Example usage of `melt`¶
wide_example = pd.DataFrame({
'Year': [2001, 2002],
'Jan': [10, 130],
'Feb': [20, 200],
'Mar': [30, 340]
}).set_index('Year')
wide_example
Jan | Feb | Mar | |
---|---|---|---|
Year | |||
2001 | 10 | 20 | 30 |
2002 | 130 | 200 | 340 |
wide_example.melt(ignore_index=False)
variable | value | |
---|---|---|
Year | ||
2001 | Jan | 10 |
2002 | Jan | 130 |
2001 | Feb | 20 |
2002 | Feb | 200 |
2001 | Mar | 30 |
2002 | Mar | 340 |
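In practice, we usually also give the resulting columns descriptive names; one way (the names `'month'` and `'count'` here are our choice, not fixed by the method):

(
    wide_example
    .reset_index()
    .melt(id_vars='Year', var_name='month', value_name='count')
)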
Exploration¶
Question 🤔 (Answer at q.dsc80.com)
What questions do you want me to try and answer with the data? I'll start with a single pre-prepared question, and then answer student questions until we run out of time.
Example question: Can we rank restaurants by their number of violations? How about separately for each zip code?¶
And why would we want to do that? 🤔
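One hedged sketch of an answer, using the merged `df` (one row per violation, plus placeholder rows from the left join):

# Rank restaurants by total number of violations. We group by 'name' for
# readability; grouping by 'business_id' would distinguish chains with
# the same name.
(
    df[df['violation'].notna()]
    .groupby('name')
    .size()
    .sort_values(ascending=False)
)

# And separately within each ZIP code:
(
    df[df['violation'].notna()]
    .groupby(['zip', 'name'])
    .size()
    .sort_values(ascending=False)
)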
Summary, next time¶
Summary¶
- Data cleaning is a necessary starting step in data analysis. There are four pillars of data cleaning:
- Quality checks.
- Missing values.
- Transformations and timestamps.
- Modifying structure.
- Approach EDA with an open mind, and draw lots of visualizations.
Next time¶
Hypothesis and permutation testing. Some of this will be DSC 10 review, but we'll also push further! Expect a pre-lecture reading tomorrow!