In [1]:
from dsc80_utils import *
This notebook contains code (e.g. the answers to exercises) that was written live during Lecture 5. If you haven't already watched and worked through this lecture, you might find it more beneficial to look at the "blank" version of this lecture and answer the exercises yourself.

Lecture 5 – Exploratory Data Analysis and Data Cleaning¶

DSC 80, Winter 2024¶

Announcements 📣¶

  • Project 1 is due on Saturday, January 27th.
  • Lab 3 will be released tonight and will be due on Monday, January 29th.
  • If you submitted Lab 2, make sure to attend discussion tomorrow and submit the reflection form on Gradescope by this Thursday for extra credit!
    • Plans for discussion tomorrow, in addition to lab review: the difference between int and np.int64, a review of the various DataFrameGroupBy methods (agg, transform, filter, apply).

Agenda 📆¶

  • Other data representations.
  • Dataset overview.
  • Introduction to plotly.
  • Exploratory data analysis and feature types.
  • Data cleaning.
    • Data quality checks.
    • Missing values.
    • Transformations and timestamps.
    • Modifying structure.
  • Investigating student-submitted questions!

Question 🤔 (Answer at q.dsc80.com)

Remember, you can always ask questions at q.dsc80.com! If the link doesn't work for you, click the 🤔 Lecture Questions link in the top right corner of the course website.

Other data representations¶

Representations of tabular data¶

  • In DSC 80, we work with DataFrames in pandas.
    • When we say pandas DataFrame, we're talking about the pandas API for its DataFrame objects.
      • API stands for "application programming interface." We'll learn about these more soon.
    • When we say "DataFrame", we're referring to a general way to represent data (rows and columns, with labels for both rows and columns).
  • There are many other ways to work with data tables!
    • Examples: R data frames, SQL databases, spreadsheets, or even matrices from linear algebra.
    • When you learn SQL in DSC 100, you'll find many similarities (e.g. slicing columns, filtering rows, grouping, joining, etc.).
    • Relational algebra captures common data operations between many data table systems.
  • Why use DataFrames over something else?

DataFrames vs. spreadsheets¶

  • DataFrames give us a data lineage: the code records how the data have been changed. Not so in spreadsheets!
  • Using a general-purpose programming language lets us handle much larger datasets, and we can use distributed computing systems for truly massive ones.

DataFrames vs. matrices¶

$$\mathbf{X} = \begin{bmatrix} 1 & 0 \\ 0 & 4 \\ 0 & 0 \end{bmatrix}$$
  • Matrices are mathematical objects. They only hold numbers, but have many useful properties (which you've learned about in your linear algebra class, Math 18).
  • Often, we process data from a DataFrame into matrix format for machine learning models. You saw this a bit in DSC 40A, and we'll see this more in DSC 80 in a few weeks.
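
Here's a minimal sketch of that conversion (the tiny DataFrame below is hypothetical, not part of today's dataset):

import pandas as pd

# A small numeric DataFrame; to_numpy() drops the row and column
# labels and leaves just the numbers, matching the matrix above.
df_num = pd.DataFrame({'a': [1, 0, 0], 'b': [0, 4, 0]})
X = df_num.to_numpy()
X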

DataFrames vs. relations¶

  • Relations are the data representation for relational database systems (e.g. MySQL, PostgreSQL, etc.).
  • You'll learn all about these in DSC 100.
  • Database systems are much better than DataFrames at storing many data tables and handling concurrency (many people reading and writing data at the same time).
  • Common workflow: load a subset of data in from a database system into pandas, then make a plot.
  • Or: load and clean data in pandas, then store it in a database system for others to use.
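
As a hedged sketch of that round trip, using Python's built-in sqlite3 (the database file and table name here are made up for illustration):

import sqlite3
import pandas as pd

conn = sqlite3.connect('example.db')

# pandas -> database: store a cleaned table for others to use.
cleaned = pd.DataFrame({'business_id': [1, 2], 'score': [96, 88]})
cleaned.to_sql('inspections_clean', conn, if_exists='replace', index=False)

# database -> pandas: load only the subset we need, then analyze or plot.
pd.read_sql('SELECT * FROM inspections_clean WHERE score >= 90', conn)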

Dataset overview¶

San Diego food safety¶

From this article (archive link):

In the last three years, one third of San Diego County restaurants have had at least one major food safety violation.

99% Of San Diego Restaurants Earn ‘A’ Grades, Bringing Usefulness of System Into Question¶

From this article (archive link):

Food held at unsafe temperatures. Employees not washing their hands. Dirty countertops. Vermin in the kitchen. An expired restaurant permit.

Restaurant inspectors for San Diego County found these violations during a routine health inspection of a diner in La Mesa in November 2016. Despite the violations, the restaurant was awarded a score of 90 out of 100, the lowest possible score to achieve an ‘A’ grade.

The data¶

  • We downloaded the data about the 1000 restaurants closest to UCSD from here.
  • We had to download the data as JSON files, then process them into DataFrames (a sketch of the idea appears after this list). You'll learn how to do this soon!
    • Until now, you've (largely) been presented with CSV files that pd.read_csv could load without any issues.
    • But there are many different formats and possible issues when loading data in from files.
    • See Chapter 8 of Learning DS for more.
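
As a preview, here's a minimal sketch of that JSON-to-DataFrame step, assuming a hypothetical file 'restaurants.json' that holds a list of records (the real files needed more processing than this):

import json
import pandas as pd

with open('restaurants.json') as f:
    records = json.load(f)  # A list of dicts, one per restaurant.

pd.DataFrame(records)        # One row per record.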
In [2]:
rest_path = Path('data') / 'restaurants.csv'
insp_path = Path('data') / 'inspections.csv'
viol_path = Path('data') / 'violations.csv'
In [3]:
rest = pd.read_csv(rest_path)
insp = pd.read_csv(insp_path)
viol = pd.read_csv(viol_path)

Exercise

The first article said that one third of restaurants had at least one major safety violation.
Which DataFrames and columns seem most useful to verify this?
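
One possible approach, sketched below (not a verified answer; the article's one-third figure covers all of San Diego County, so it won't match exactly for our 1000 nearby restaurants): viol['major_violation'] flags major violations, and insp links each 'inspection_id' to a 'business_id'.

# Fraction of our restaurants with at least one major violation.
major_ids = viol.loc[viol['major_violation'] == 'Y', 'inspection_id']
flagged = insp[insp['inspection_id'].isin(major_ids)]['business_id'].nunique()
flagged / rest.shape[0]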
In [4]:
rest
Out[4]:
business_id name business_type address ... lat long opened_date distance
0 211898487641 MOBIL MART LA JOLLA VILLAGE Pre-Packaged Retail Market 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 ... 32.87 -117.23 2002-05-05 0.62
1 211930769329 CAFE 477 Low Risk Food Facility 8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,... ... 32.87 -117.24 2023-07-24 0.64
2 211909057778 VALLEY FARM MARKET Retail Market with Deli 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 ... 32.87 -117.24 2019-01-22 0.64
... ... ... ... ... ... ... ... ... ...
997 211899338714 PACIFIC BEACH ELEMENTARY School Processing Food Facility 1234 TOURMALINE ST, SAN DIEGO, CA 92109-1856 ... 32.81 -117.25 2002-05-05 4.97
998 211942150255 POKEWAN DEL MAR Restaurant Food Facility 12925 EL CAMINO REAL, SUITE# AA4, SAN DIEGO, C... ... 32.95 -117.23 2016-11-03 4.97
999 211925713322 SAFFRONO LOUNGE RESTAURANT Restaurant Food Facility 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 ... 32.95 -117.25 2022-11-03 4.97

1000 rows × 12 columns

In [5]:
rest.columns
Out[5]:
Index(['business_id', 'name', 'business_type', 'address', 'city', 'zip',
       'phone', 'status', 'lat', 'long', 'opened_date', 'distance'],
      dtype='object')
In [6]:
insp.head()
Out[6]:
custom_id business_id inspection_id description ... completed_date status link status_link
0 DEH2002-FFPN-310012 211898487641 6886133 NaN ... 2023-02-16 Complete http://www.sandiegocounty.gov/deh/fhd/ffis/ins... http://www.sandiegocounty.gov/deh/fhd/ffis/ins...
1 DEH2002-FFPN-310012 211898487641 6631228 NaN ... 2022-01-03 Complete http://www.sandiegocounty.gov/deh/fhd/ffis/ins... http://www.sandiegocounty.gov/deh/fhd/ffis/ins...
2 DEH2002-FFPN-310012 211898487641 6357338 NaN ... 2020-12-03 Complete http://www.sandiegocounty.gov/deh/fhd/ffis/ins... http://www.sandiegocounty.gov/deh/fhd/ffis/ins...
3 DEH2023-FFPP-016887 211930769329 7329834 NaN ... 2023-09-20 Complete http://www.sandiegocounty.gov/deh/fhd/ffis/ins... http://www.sandiegocounty.gov/deh/fhd/ffis/ins...
4 DEH2019-FFPP-010654 211909057778 7233091 NaN ... 2023-05-26 Complete http://www.sandiegocounty.gov/deh/fhd/ffis/ins... http://www.sandiegocounty.gov/deh/fhd/ffis/ins...

5 rows × 11 columns

In [7]:
insp.columns
Out[7]:
Index(['custom_id', 'business_id', 'inspection_id', 'description', 'type',
       'score', 'grade', 'completed_date', 'status', 'link', 'status_link'],
      dtype='object')
In [8]:
viol.head()
Out[8]:
inspection_id violation major_violation status violation_text correction_type_link violation_accela link
0 6886133 Hot and Cold Water Y Out of Compliance - Major Hot and Cold Water http://www.sandiegocounty.gov/deh/fhd/ffis/vio... 21. Hot & cold water available http://www.sandiegocounty.gov/deh/fhd/ffis/vio...
1 6631228 Hot and Cold Water N Out of Compliance - Minor Hot and Cold Water http://www.sandiegocounty.gov/deh/fhd/ffis/vio... 21. Hot & cold water available http://www.sandiegocounty.gov/deh/fhd/ffis/vio...
2 6357338 Holding Temperatures N Out of Compliance - Minor Holding Temperatures http://www.sandiegocounty.gov/deh/fhd/ffis/vio... 7. Proper hot & cold holding temperatures http://www.sandiegocounty.gov/deh/fhd/ffis/vio...
3 6939628 Holding Temperatures Y Out of Compliance - Major Holding Temperatures http://www.sandiegocounty.gov/deh/fhd/ffis/vio... 7. Proper hot & cold holding temperatures http://www.sandiegocounty.gov/deh/fhd/ffis/vio...
4 6939628 Approved Procedures N Out of Compliance - Minor Approved Procedures http://www.sandiegocounty.gov/deh/fhd/ffis/vio... 18. Compliance with http://www.sandiegocounty.gov/deh/fhd/ffis/vio...
In [9]:
viol.columns
Out[9]:
Index(['inspection_id', 'violation', 'major_violation', 'status',
       'violation_text', 'correction_type_link', 'violation_accela', 'link'],
      dtype='object')

Introduction to plotly¶

plotly¶

  • We've used plotly in lecture briefly, and you even have to use it in Project 1 Question 13, but we haven't yet discussed it formally.
  • It's a visualization library that enables interactive visualizations.

Using plotly¶

There are a few ways we can use plotly:

  • Using the plotly.express syntax.
    • plotly is very flexible, but it can be verbose; plotly.express allows us to make plots quickly.
    • See the documentation here – it's very rich (there are good examples for almost everything).
  • By setting the pandas plotting backend to 'plotly' (by default, it's 'matplotlib') and using the DataFrame plot method (sketched below).
    • The DataFrame plot method is how you created plots in DSC 10!

For now, we'll use the plotly.express syntax, which is imported as px in the dsc80_utils.py file that we import at the top of each lecture notebook.
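
That said, here's a quick sketch of the second approach, in case you want to try it:

# Tell pandas to draw plots with plotly instead of matplotlib.
pd.options.plotting.backend = 'plotly'
insp['score'].plot(kind='hist')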

Initial plots¶

First, let's look at the distribution of inspection 'score's:

In [10]:
px.histogram(insp['score'])
In [11]:
insp.columns
Out[11]:
Index(['custom_id', 'business_id', 'inspection_id', 'description', 'type',
       'score', 'grade', 'completed_date', 'status', 'link', 'status_link'],
      dtype='object')

How about the distribution of average inspection 'score' per 'grade'?

In [12]:
scores = (
    insp[['grade', 'score']]
    .groupby('grade')
    .mean()
    .reset_index()
)
scores
Out[12]:
grade score
0 A 97.27
1 B 81.91
In [13]:
px.bar(scores, x='grade', y='score')
In [14]:
scores.plot(kind='bar', x='grade', y='score')

Exploratory data analysis and feature types¶

The data science lifecycle, revisited¶


We're at the stage of understanding the data.

Exploratory data analysis (EDA)¶

  • Historically, data analysis was dominated by formal statistics, including tools like confidence intervals, hypothesis tests, and statistical modeling.
  • In 1977, John Tukey defined the term exploratory data analysis, which describes a philosophy for approaching data analysis:

Exploratory data analysis is actively incisive, rather than passively descriptive, with real emphasis on the discovery of the unexpected.

  • Practically, EDA involves, among other things, computing summary statistics and drawing plots to understand the nature of the data at hand.

The greatest gains from data come from surprises… The unexpected is best brought to our attention by pictures.

Different feature types¶

(Diagram: feature types. Variables are either quantitative, i.e. numeric, which can be discrete or continuous, or categorical, which can be ordinal (ordered categories) or nominal (unordered categories).)

Exercise

Determine the feature type of each of the following variables.
  • insp['score']
  • insp['grade']
  • viol['violation_accela']
  • viol['major_violation']
  • rest['business_id']
  • rest['opened_date']
In [15]:
# Your code goes here.
In [16]:
insp['score']
Out[16]:
0       96
1       98
2       98
        ..
5176     0
5177     0
5178    90
Name: score, Length: 5179, dtype: int64
In [17]:
insp['grade'].value_counts()
Out[17]:
A    2978
B      11
Name: grade, dtype: int64
In [18]:
viol['violation_accela']
Out[18]:
0                          21. Hot & cold water available
1                          21. Hot & cold water available
2               7. Proper hot & cold holding temperatures
                              ...                        
5967    35. Equipment / Utensils -approved, installed,...
5968    43. Toilet facilities -properly constructed, s...
5969    45. Floor, walls and ceilings - built, maintai...
Name: violation_accela, Length: 5970, dtype: object
In [19]:
viol['major_violation']
Out[19]:
0       Y
1       N
2       N
       ..
5967    N
5968    N
5969    N
Name: major_violation, Length: 5970, dtype: object
In [20]:
rest['business_id']
Out[20]:
0      211898487641
1      211930769329
2      211909057778
           ...     
997    211899338714
998    211942150255
999    211925713322
Name: business_id, Length: 1000, dtype: int64
In [21]:
rest['opened_date']
Out[21]:
0      2002-05-05
1      2023-07-24
2      2019-01-22
          ...    
997    2002-05-05
998    2016-11-03
999    2022-11-03
Name: opened_date, Length: 1000, dtype: object

Feature types vs. data types¶

  • The data type pandas uses is not the same as the feature type we just discussed!
    • There's a difference between the (conceptual) feature type and the (computational) data type.
  • Take care when the two don't match up well!
In [22]:
# pandas stores these as ints, but they're actually nominal.
rest['business_id']
Out[22]:
0      211898487641
1      211930769329
2      211909057778
           ...     
997    211899338714
998    211942150255
999    211925713322
Name: business_id, Length: 1000, dtype: int64
In [23]:
# pandas stores these as strings, but they're actually dates (timestamps).
rest['opened_date']
Out[23]:
0      2002-05-05
1      2023-07-24
2      2019-01-22
          ...    
997    2002-05-05
998    2016-11-03
999    2022-11-03
Name: opened_date, Length: 1000, dtype: object

Data cleaning¶

Four pillars of data cleaning¶

When loading in a dataset, to clean the data – that is, to prepare it for further analysis – we will:

  1. Perform data quality checks.
  2. Identify and handle missing values.
  3. Perform transformations, including converting time series data to timestamps.
  4. Modify structure as necessary.

Data cleaning: Data quality checks¶

Data quality checks¶

We often start an analysis by checking the quality of the data.

  • Scope: Do the data match your understanding of the population?
  • Measurements and values: Are the values reasonable?
  • Relationships: Are related features in agreement?
  • Analysis: Which features might be useful in a future analysis?

Scope¶

Do the data match your understanding of the population?

We were told that we're only looking at the 1000 restaurants closest to UCSD, so the restaurants in rest should agree with that.

In [24]:
rest.sample(5)
Out[24]:
business_id name business_type address ... lat long opened_date distance
671 211988325240 MORELIANAS LA JOLLA Restaurant Food Facility 456 PEARL ST, LA JOLLA, CA 92037 ... 32.84 -117.28 2023-05-09 3.84
571 211903764423 CUVIER CLUB BY WEDGEWOOD WEDDINGS Restaurant Food Facility 7776 EADS AVE, LA JOLLA, CA 92037-4302 ... 32.84 -117.28 2023-05-22 3.51
722 211944067622 COPA VIDA CAFE Restaurant Food Facility 3721 VALLEY CENTRE DR, SAN DIEGO, CA 92130-3329 ... 32.94 -117.23 2020-01-31 3.90
464 211935879286 KEVIES KITCHEN Class A Cottage Food Operation 5371 COLE ST, SAN DIEGO, CA 92117-1120 ... 32.84 -117.20 2015-02-25 3.18
514 211973977434 PICNIC PEOPLE AND COWGIRLQ CATERING Caterer 6355 MARINDUSTRY DR, SUITE# B, SAN DIEGO, CA 9... ... 32.88 -117.18 2019-10-02 3.34

5 rows × 12 columns

Measurements and values¶

Are the values reasonable?

Do the values in the 'grade' column match what we'd expect grades to look like?

In [25]:
insp['grade'].value_counts()
Out[25]:
A    2978
B      11
Name: grade, dtype: int64

What kinds of information does the insp DataFrame hold?

In [26]:
insp.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5179 entries, 0 to 5178
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   custom_id       5179 non-null   object 
 1   business_id     5179 non-null   int64  
 2   inspection_id   5179 non-null   int64  
 3   description     0 non-null      float64
 4   type            5179 non-null   object 
 5   score           5179 non-null   int64  
 6   grade           2989 non-null   object 
 7   completed_date  5179 non-null   object 
 8   status          5179 non-null   object 
 9   link            5179 non-null   object 
 10  status_link     5179 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 445.2+ KB

What's going on in the 'address' column of rest?

In [27]:
# Are there multiple restaurants with the same address?
rest['address'].value_counts()
Out[27]:
5300 GRAND DEL MAR CT, SAN DIEGO, CA 92130       9
8657 VILLA LA JOLLA DR, LA JOLLA, CA 92037       8
4545 LA JOLLA VILLAGE DR, SAN DIEGO, CA 92122    8
                                                ..
3963 GOVERNOR DR, SAN DIEGO, CA 92122            1
4041 GOVERNOR DR, SAN DIEGO, CA 92122-2520       1
2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014       1
Name: address, Length: 863, dtype: int64
In [28]:
# Keeps all rows with duplicate addresses.
(
    rest
    .groupby('address')
    .filter(lambda df: df.shape[0] >= 2)
    .sort_values('address')
)
Out[28]:
business_id name business_type address ... lat long opened_date distance
406 211899308875 NASEEMS BAKERY & KABOB Restaurant Food Facility 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 ... 32.90 -117.19 2012-04-17 2.77
402 211898699154 HANAYA SUSHI CAFE Restaurant Food Facility 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 ... 32.90 -117.19 2011-03-22 2.77
401 211899558107 ARMANDOS MEXICAN FOOD Restaurant Food Facility 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 ... 32.90 -117.19 2005-06-28 2.77
... ... ... ... ... ... ... ... ... ...
575 211972411855 TARA HEATHER CAKE DESIGN Caterer 9932 MESA RIM RD, SUITE# A, SAN DIEGO, CA 9212... ... 32.90 -117.18 2014-04-24 3.51
344 211990537315 COMPASS GROUP FEDEX EXPRESS OLSON Pre-Packaged Retail Market 9999 OLSON DR, SAN DIEGO, CA 92121-2837 ... 32.89 -117.20 2022-10-19 2.27
343 211976587262 CANTEEN - FED EX OLSON Pre-Packaged Retail Market 9999 OLSON DR, SAN DIEGO, CA 92121-2837 ... 32.89 -117.20 2020-07-31 2.27

213 rows × 12 columns

In [29]:
# Does the same thing as above!
rest[rest.duplicated(subset=['address'], keep=False)].sort_values('address')
Out[29]:
business_id name business_type address ... lat long opened_date distance
406 211899308875 NASEEMS BAKERY & KABOB Restaurant Food Facility 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 ... 32.90 -117.19 2012-04-17 2.77
402 211898699154 HANAYA SUSHI CAFE Restaurant Food Facility 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 ... 32.90 -117.19 2011-03-22 2.77
401 211899558107 ARMANDOS MEXICAN FOOD Restaurant Food Facility 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 ... 32.90 -117.19 2005-06-28 2.77
... ... ... ... ... ... ... ... ... ...
575 211972411855 TARA HEATHER CAKE DESIGN Caterer 9932 MESA RIM RD, SUITE# A, SAN DIEGO, CA 9212... ... 32.90 -117.18 2014-04-24 3.51
344 211990537315 COMPASS GROUP FEDEX EXPRESS OLSON Pre-Packaged Retail Market 9999 OLSON DR, SAN DIEGO, CA 92121-2837 ... 32.89 -117.20 2022-10-19 2.27
343 211976587262 CANTEEN - FED EX OLSON Pre-Packaged Retail Market 9999 OLSON DR, SAN DIEGO, CA 92121-2837 ... 32.89 -117.20 2020-07-31 2.27

213 rows × 12 columns

Relationships¶

Are related features in agreement?

Do the 'address'es and 'zip' codes in rest match?

In [30]:
rest[['address', 'zip']]
Out[30]:
address zip
0 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 92037
1 8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,... 92037-1704
2 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 92037
... ... ...
997 1234 TOURMALINE ST, SAN DIEGO, CA 92109-1856 92109-1856
998 12925 EL CAMINO REAL, SUITE# AA4, SAN DIEGO, C... 92130
999 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 92014

1000 rows × 2 columns

What about the 'score's and 'grade's in insp?

In [31]:
insp[['score', 'grade']]
Out[31]:
score grade
0 96 NaN
1 98 NaN
2 98 NaN
... ... ...
5176 0 NaN
5177 0 NaN
5178 90 A

5179 rows × 2 columns

Analysis¶

Which features might be useful in a future analysis?

  • We're most interested in:
    • These columns in the rest DataFrame: 'business_id', 'name', 'address', 'zip', and 'opened_date'.
    • These columns in the insp DataFrame: 'business_id', 'inspection_id', 'score', 'grade', 'completed_date', and 'status'.
    • These columns in the viol DataFrame: 'inspection_id', 'violation', 'major_violation', 'violation_text', and 'violation_accela'.
  • Also, let's rename a few columns to make them easier to work with.

💡 Pro-Tip: Using pipe¶

When we manipulate DataFrames, it's best to define individual functions for each step, then use the pipe method to chain them all together.

The pipe DataFrame method takes in a function, which itself takes in a DataFrame and returns a DataFrame.

  • In practice, we would add functions one by one to the top of a notebook, then pipe them all.
  • For today, we'll keep re-running pipe to show the data cleaning process.
In [32]:
def subset_rest(rest):
    return rest[['business_id', 'name', 'address', 'zip', 'opened_date']]

rest = (
    pd.read_csv(rest_path)
    .pipe(subset_rest)
)
rest
Out[32]:
business_id name address zip opened_date
0 211898487641 MOBIL MART LA JOLLA VILLAGE 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 92037 2002-05-05
1 211930769329 CAFE 477 8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,... 92037-1704 2023-07-24
2 211909057778 VALLEY FARM MARKET 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 92037 2019-01-22
... ... ... ... ... ...
997 211899338714 PACIFIC BEACH ELEMENTARY 1234 TOURMALINE ST, SAN DIEGO, CA 92109-1856 92109-1856 2002-05-05
998 211942150255 POKEWAN DEL MAR 12925 EL CAMINO REAL, SUITE# AA4, SAN DIEGO, C... 92130 2016-11-03
999 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 92014 2022-11-03

1000 rows × 5 columns

In [33]:
# Same as the above – but the above makes it easier to chain more .pipe calls afterwards.
subset_rest(pd.read_csv(rest_path))
Out[33]:
business_id name address zip opened_date
0 211898487641 MOBIL MART LA JOLLA VILLAGE 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 92037 2002-05-05
1 211930769329 CAFE 477 8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,... 92037-1704 2023-07-24
2 211909057778 VALLEY FARM MARKET 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 92037 2019-01-22
... ... ... ... ... ...
997 211899338714 PACIFIC BEACH ELEMENTARY 1234 TOURMALINE ST, SAN DIEGO, CA 92109-1856 92109-1856 2002-05-05
998 211942150255 POKEWAN DEL MAR 12925 EL CAMINO REAL, SUITE# AA4, SAN DIEGO, C... 92130 2016-11-03
999 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 92014 2022-11-03

1000 rows × 5 columns

Let's use pipe to keep (and rename) the subset of the columns we care about in the other two DataFrames as well.

In [34]:
def subset_insp(insp):
    return (
        insp[['business_id', 'inspection_id', 'score', 'grade', 'completed_date', 'status']]
        .rename(columns={'completed_date': 'date'})
    )

insp = (
    pd.read_csv(insp_path)
    .pipe(subset_insp)
)
In [35]:
def subset_viol(viol):
    return (
        viol[['inspection_id', 'violation', 'major_violation', 'violation_accela']]
        .rename(columns={'violation': 'kind',
                         'major_violation': 'is_major',
                         'violation_accela': 'violation'})
    )

viol = (
    pd.read_csv(viol_path)
    .pipe(subset_viol)
)

Combining the restaurant data¶

Let's join all three DataFrames together so that we have all the data in a single DataFrame.

In [36]:
rest
Out[36]:
business_id name address zip opened_date
0 211898487641 MOBIL MART LA JOLLA VILLAGE 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 92037 2002-05-05
1 211930769329 CAFE 477 8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,... 92037-1704 2023-07-24
2 211909057778 VALLEY FARM MARKET 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 92037 2019-01-22
... ... ... ... ... ...
997 211899338714 PACIFIC BEACH ELEMENTARY 1234 TOURMALINE ST, SAN DIEGO, CA 92109-1856 92109-1856 2002-05-05
998 211942150255 POKEWAN DEL MAR 12925 EL CAMINO REAL, SUITE# AA4, SAN DIEGO, C... 92130 2016-11-03
999 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 92014 2022-11-03

1000 rows × 5 columns

In [37]:
insp
Out[37]:
business_id inspection_id score grade date status
0 211898487641 6886133 96 NaN 2023-02-16 Complete
1 211898487641 6631228 98 NaN 2022-01-03 Complete
2 211898487641 6357338 98 NaN 2020-12-03 Complete
... ... ... ... ... ... ...
5176 211925713322 7183979 0 NaN 2023-03-06 Complete
5177 211925713322 7120778 0 NaN 2022-12-09 Complete
5178 211925713322 7104077 90 A 2022-11-30 Complete

5179 rows × 6 columns

In [38]:
def merge_all_restaurant_data():
    return (
        rest
        .merge(insp, on='business_id', how='left')
        .merge(viol, on='inspection_id', how='left')
    )

df = merge_all_restaurant_data()
df
Out[38]:
business_id name address zip ... status kind is_major violation
0 211898487641 MOBIL MART LA JOLLA VILLAGE 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 92037 ... Complete Hot and Cold Water Y 21. Hot & cold water available
1 211898487641 MOBIL MART LA JOLLA VILLAGE 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 92037 ... Complete Hot and Cold Water N 21. Hot & cold water available
2 211898487641 MOBIL MART LA JOLLA VILLAGE 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 92037 ... Complete Holding Temperatures N 7. Proper hot & cold holding temperatures
... ... ... ... ... ... ... ... ... ...
8728 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 92014 ... Complete Equipment and Utensil Storage, Use N 35. Equipment / Utensils -approved, installed,...
8729 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 92014 ... Complete Toilet Facilities N 43. Toilet facilities -properly constructed, s...
8730 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 92014 ... Complete Floors, Walls, and Ceilings N 45. Floor, walls and ceilings - built, maintai...

8731 rows × 13 columns

Data cleaning: Missing values¶

Missing values¶

Next, it's important to check for and handle missing values, as they can have a big effect on your analysis.

In [39]:
insp[['score', 'grade']]
Out[39]:
score grade
0 96 NaN
1 98 NaN
2 98 NaN
... ... ...
5176 0 NaN
5177 0 NaN
5178 90 A

5179 rows × 2 columns

In [40]:
# The proportion of values in each column that are missing.
insp.isna().mean()
Out[40]:
business_id      0.00
inspection_id    0.00
score            0.00
grade            0.42
date             0.00
status           0.00
dtype: float64
In [41]:
# Why are there null values here?
# insp['inspection_id'] and viol['inspection_id'] don't have any null values...
df['inspection_id'].isna().mean()
Out[41]:
0.0033214981101821095
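
One likely explanation, which the sketch below checks: merge_all_restaurant_data merges with how='left', so restaurants that have no inspections at all end up with NaN in every inspection column, including 'inspection_id'.

# Restaurants in `rest` with no matching rows in `insp`.
no_insp = rest[~rest['business_id'].isin(insp['business_id'])]
no_insp.shape[0]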

There are many ways of handling missing values, which we'll cover in an entire lecture next week. But a good first step is to check how many there are!

Data cleaning: Transformations and timestamps¶

Transformations and timestamps¶

From last class:

A transformation results from performing some operation on every element in a sequence, e.g. a Series.

It's often useful to look at ways of transforming your data to make it easier to work with.

  • Type conversions (e.g. changing the string "$2.99" to the number 2.99; see the sketch after this list).
  • Unit conversion (e.g. feet to meters).
  • Extraction (getting 'vermin' out of 'Vermin Violation Recorded on 10/10/2023').
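
For instance, a minimal sketch of the first kind of transformation (the prices Series is made up, not from our dataset):

# Strip the '$' and convert the strings to floats.
prices = pd.Series(['$2.99', '$10.50'])
prices.str.replace('$', '', regex=False).astype(float)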

Creating timestamps¶

Most commonly, we'll parse dates into pd.Timestamp objects.

In [42]:
# Look at the dtype!
insp['date']
Out[42]:
0       2023-02-16
1       2022-01-03
2       2020-12-03
           ...    
5176    2023-03-06
5177    2022-12-09
5178    2022-11-30
Name: date, Length: 5179, dtype: object
In [43]:
# This magical string tells Python what format the date is in.
# For more info: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
date_format = '%Y-%m-%d'
pd.to_datetime(insp['date'], format=date_format)
Out[43]:
0      2023-02-16
1      2022-01-03
2      2020-12-03
          ...    
5176   2023-03-06
5177   2022-12-09
5178   2022-11-30
Name: date, Length: 5179, dtype: datetime64[ns]
In [44]:
# Another advantage of defining functions is that we can reuse this function
# for the 'opened_date' column in `rest` if we wanted to.
def parse_dates(insp, col):
    date_format = '%Y-%m-%d'
    dates = pd.to_datetime(insp[col], format=date_format)
    return insp.assign(**{col: dates})

insp = (
    pd.read_csv(insp_path)
    .pipe(subset_insp)
    .pipe(parse_dates, 'date')
)

# We should also remake df, since it depends on insp.
# Note that the new insp is used to create df!
df = merge_all_restaurant_data()
In [45]:
# Look at the dtype now!
df['date']
Out[45]:
0      2023-02-16
1      2022-01-03
2      2020-12-03
          ...    
8728   2022-11-30
8729   2022-11-30
8730   2022-11-30
Name: date, Length: 8731, dtype: datetime64[ns]

Working with timestamps¶

  • We often want to adjust the granularity of timestamps to see overall trends or seasonality.
  • Use the resample method in pandas (documentation).
    • Think of it like a version of groupby, but for timestamps.
    • For instance, insp.resample('2W', on='date') separates every two weeks of data into a different group.
In [46]:
insp.resample('2W', on='date').mean()
Out[46]:
business_id inspection_id score
date
2020-01-05 2.12e+11 6.35e+06 42.67
2020-01-19 2.12e+11 6.30e+06 59.33
2020-02-02 2.12e+11 6.32e+06 56.34
... ... ... ...
2023-09-24 2.12e+11 7.15e+06 66.60
2023-10-08 2.12e+11 7.19e+06 59.58
2023-10-22 2.12e+11 7.20e+06 66.81

100 rows × 3 columns

In [47]:
# Where are those numbers coming from?
insp[(insp['date'] >= pd.Timestamp('2020-01-05')) &
     (insp['date'] < pd.Timestamp('2020-01-19'))]['score'].mean()
Out[47]:
59.325581395348834
In [48]:
insp.resample('2W', on='date')['score'].mean().plot(title='Average Inspection Score Over Time')

The .dt accessor¶

Like with Series of strings, pandas has a .dt accessor for properties of timestamps (documentation).

In [49]:
insp['date']
Out[49]:
0      2023-02-16
1      2022-01-03
2      2020-12-03
          ...    
5176   2023-03-06
5177   2022-12-09
5178   2022-11-30
Name: date, Length: 5179, dtype: datetime64[ns]
In [50]:
insp['date'].dt.day
Out[50]:
0       16
1        3
2        3
        ..
5176     6
5177     9
5178    30
Name: date, Length: 5179, dtype: int64
In [51]:
insp['date'].dt.dayofweek
Out[51]:
0       3
1       0
2       3
       ..
5176    0
5177    4
5178    2
Name: date, Length: 5179, dtype: int64
In [52]:
fig = px.bar(
    insp['date'].dt.dayofweek.value_counts()
)
fig.update_xaxes(tickvals=np.arange(7), ticktext=['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])

Data cleaning: Modifying structure¶

Reshaping DataFrames¶

We often reshape the DataFrame's structure to make it more convenient for analysis. For example, we can:

  • Simplify structure by removing columns or taking a set of rows for a particular period of time or geographic area.
    • We already did this!
  • Adjust granularity by aggregating rows together.
    • To do this, use groupby (or resample, if working with timestamps).
  • Reshape structure, most commonly by using the DataFrame melt method to un-pivot a DataFrame.

Using melt¶

  • The melt method is common enough that we'll give it a special mention.
  • We'll often encounter pivot tables (esp. from government data), which we call wide data.
  • The methods we've introduced work better with long-form data, or tidy data.
  • To go from wide to long, melt.

Example usage of melt¶

In [53]:
wide_example = pd.DataFrame({
    'Year': [2001, 2002],
    'Jan': [10, 130],
    'Feb': [20, 200],
    'Mar': [30, 340]
}).set_index('Year')
wide_example
Out[53]:
Jan Feb Mar
Year
2001 10 20 30
2002 130 200 340
In [54]:
wide_example.melt(ignore_index=False)
Out[54]:
variable value
Year
2001 Jan 10
2002 Jan 130
2001 Feb 20
2002 Feb 200
2001 Mar 30
2002 Mar 340

Exploration¶

Question 🤔 (Answer at q.dsc80.com)

What questions do you want me to try and answer with the data? I'll start with a single pre-prepared question, and then answer student questions until we run out of time.

Example question: Can we rank restaurants by their number of violations? How about separately for each zip code?¶

And why would we want to do that? 🤔

In [55]:
def rank_by_violations(df):
    # Rank restaurants by their number of rows (violations) in df,
    # keeping each restaurant's first zip code along the way.
    return (
        df
        .groupby('name')
        ['zip']
        .agg(['first', 'size'])
        .sort_values('size', ascending=False)
        .reset_index()
        .pipe(lambda frame: frame.assign(rank=frame.index + 1)
                                 .rename(columns={'first': 'zip'}))
    )
In [56]:
df.pipe(rank_by_violations)
Out[56]:
name zip size rank
0 WHOLE FOODS MARKET 92037-1949 62 1
1 RIGOBERTOS TACO SHOP 92037 61 2
2 OKAN DINER 92117 52 3
... ... ... ... ...
944 JUST LIKE MOM MAKES 92130-2283 1 945
945 KEVIES KITCHEN 92117-1120 1 946
946 PETRA KITCHEN 92117-4949 1 947

947 rows × 4 columns

In [57]:
(
    df
    .groupby('zip')
    .apply(rank_by_violations)
    .reset_index(drop=True)
    .pivot(index='rank', columns='zip', values='name')
)
Out[57]:
zip 92014 92014-3101 92014-3110 92014-3149 ... 92130-6657 92130-6974 92130-8605 92590
rank
1 LE BAMBOU RESTAURANT 7-ELEVEN #13628C-2111 BUSHFIRE KITCHEN JERSEY MIKES SUBS ... COPA VIDA CAFE OCEAN AIR ELEMENTARY CRISCITO PIZZA DEVILICIOUS FOOD TRUCK
2 SAFFRONO LOUNGE RESTAURANT NaN STARBUCKS CRUMBL COOKIE ... NaN NaN NaN NaN
3 JACK IN THE BOX #0081 NaN NaN NaN ... NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ...
155 NaN NaN NaN NaN ... NaN NaN NaN NaN
156 NaN NaN NaN NaN ... NaN NaN NaN NaN
157 NaN NaN NaN NaN ... NaN NaN NaN NaN

157 rows × 322 columns

In [58]:
df.query('zip == "92014-3149"').value_counts('name')
Out[58]:
name
JERSEY MIKES SUBS    13
CRUMBL COOKIE         4
dtype: int64

Example question: Which restaurants have vermin violations?¶

In [59]:
(
    df[df['violation'].str.lower().str.contains('vermin', na=False)]
    .value_counts('name')
    .head(6)
)
Out[59]:
name
ISOLA LA JOLLA                 4
L AND F CAFE                   3
MONGOLIAN HOT POT              3
GIRARD GOURMET                 3
HOLY GAO CHINESE RESTAURANT    3
JOSES COURT ROOM               3
dtype: int64

Example question: What are some of the most uncommon violations?¶

In [60]:
df.groupby('kind').filter(lambda df: df.shape[0] == 1)
Out[60]:
business_id name address zip ... status kind is_major violation
441 211996758893 SHAKA JAVA 3655 NOBEL DR, SAN DIEGO, CA 92122 92122 ... Complete Commissary/MSU Operation N Commissary/MSU Operation
2311 211973776043 BARBARELLA RESTAURANT 2171 AVENIDA DE LA PLAYA, LA JOLLA, CA 92037 92037 ... Complete Gulf Oysters Y 17. Compliance with Gulf Oyster Regulations
5096 211898213216 BISHOP SCHOOL 7607 LA JOLLA BLVD, LA JOLLA, CA 92037 92037 ... Complete Highly Susceptible Populations Y 20. Licensed health care facilities / public &...
6386 211918747238 L & E MUSCLE MEALS 4577 CLAIREMONT DR, SAN DIEGO, CA 92117 92117 ... Complete No discharge from eyes, nose, or mouth Y 3. No discharge from eyes, nose or mouth
7848 211986408394 LAZY EYE COFFEE 4011 AVATI DR, SAN DIEGO, CA 92117-4403 92117-4403 ... Complete Food Labeling N 29. Food properly labeled and honestly presented

5 rows × 13 columns

What's the worst Subway?¶

In [61]:
df[df['name'].str.lower().str.contains('subway')].value_counts('address')
Out[61]:
address
3860 VALLEY CENTRE DR, SUITE# 403, SAN DIEGO, CA 92130       18
6715 MIRA MESA BLVD, SUITE# 102, SAN DIEGO, CA 92121-4379    13
4653 CARMEL MTN RD, SUITE# 310, SAN DIEGO, CA 92130           9
2206 TORREY PINES RD, SUITE# C, LA JOLLA, CA 92037-3472       7
4973 CLAIREMONT DR, SUITE# A, SAN DIEGO, CA 92117             3
5517 CLAIREMONT MESA BLVD, SAN DIEGO, CA 92117                1
dtype: int64

Summary, next time¶

Summary¶

  • Data cleaning is a necessary starting step in data analysis. There are four pillars of data cleaning:
    • Quality checks.
    • Missing values.
    • Transformations and timestamps.
    • Modifying structure.
  • Approach EDA with an open mind, and draw lots of visualizations.

Next time¶

Hypothesis and permutation testing. Some of this will be DSC 10 review, but we'll also push further! Expect a pre-lecture reading tomorrow!