In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_inline.backend_inline import set_matplotlib_formats
from IPython.display import display, IFrame

set_matplotlib_formats("svg")
sns.set_context("poster")
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)
pd.set_option("display.max_rows", 8)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

Lecture 5 – Exploring and Cleaning Data¶

DSC 80, Fall 2023¶

📣 Announcements 📣¶

  • Project 1 due Wed!
  • Lab 3 out, due on Mon

📆 Agenda¶

  • Introduce dataset
  • Introduce plotly
  • Statistical vs. computational data types
  • Data cleaning
    • Data quality checks
    • Missing data
    • Transformations and timestamps
    • Modifying structure
  • Investigating student-submitted questions

San Diego Food Safety¶

In the last three years, one third of San Diego County restaurants have had at least one major food safety violation.

https://inewsource.org/2023/02/09/san-diego-restaurants-food-safety-violations/

(https://archive.ph/gz8BL)

99% Of San Diego Restaurants Earn ‘A’ Grades, Bringing Usefulness of System Into Question¶

Food held at unsafe temperatures. Employees not washing their hands. Dirty countertops. Vermin in the kitchen. An expired restaurant permit.

Restaurant inspectors for San Diego County found these violations during a routine health inspection of a diner in La Mesa in November 2016. Despite the violations, the restaurant was awarded a score of 90 out of 100, the lowest possible score to achieve an ‘A’ grade.

https://www.nbcsandiego.com/news/local/99-of-san-diego-restaurants-earn-a-grades-bringing-usefulness-of-system-into-question/25381/

(https://archive.ph/yB6RU)

The Data¶

https://www.sandiegocounty.gov/content/sdc/deh/fhd/ffis/intro.html.html

  • Had to download the data as JSON, then process into dataframes (will cover in future weeks!)
  • Downloaded the 1000 restaurants closest to UCSD.
In [2]:
rest = pd.read_csv('data/restaurants.csv')
insp = pd.read_csv('data/inspections.csv')
viol = pd.read_csv('data/violations.csv')

Understanding the Data¶

Aside: Working with files¶

  • So far, all data came in CSV files that loaded without problem.
  • But data comes in many different formats, and many issues can come up when loading it (see the sketch below)!
  • See Chapter 8 of Learning DS for more.
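
As a taste of what those issues look like, here's a minimal read_csv sketch for a hypothetical messy file (the path and the options are made up for illustration, not something our data needs):

# Hypothetical messy file: semicolon-separated, Latin-1 encoded, odd missing-value codes
messy = pd.read_csv(
    'data/some_messy_file.csv',   # hypothetical path
    sep=';',                      # non-comma delimiter
    encoding='latin-1',           # non-UTF-8 encoding
    na_values=['-', 'N/A'],       # extra strings to treat as missing
)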

You Try: Looking at the Data¶

  • The articles said that one third of restaurants had at least one major safety violation.
  • Which dataframes and columns seem most useful to verify this?
In [3]:
# Fill me in

Using plotly for Data Visualization¶

I've used plotly before in class, but let's talk about it now.

  • Library for interactive data visualizations
  • Install with conda install plotly
    • Discussion this week: why use conda install instead of pip install?

plotly.express Syntax¶

plotly is very flexible but can be verbose. We use plotly.express to make plots quickly.

  • Docs: https://plotly.com/python/plotly-express
In [4]:
# Will include this at the top of each notebook from now on.
import plotly.express as px

# DSC 80 preferred styles, but not necessary
import plotly.graph_objects as go
import plotly.io as pio
pio.templates["dsc80"] = go.layout.Template(
    layout=dict(
        margin=dict(l=30, r=30, t=30, b=30),
        autosize=True,
        width=600,
        height=400,
        xaxis=dict(showgrid=True),
        yaxis=dict(showgrid=True),
        title=dict(x=0.5, xanchor="center"),
    )
)
pio.templates.default = "simple_white+dsc80"
In [5]:
fig = px.histogram(insp['score'])
fig
In [6]:
scores = (
    insp[['grade', 'score']]
    .dropna()
    .groupby('grade')
    .mean()
    .reset_index()
)
px.bar(scores, x='grade', y='score')

Feature Types¶

Different Kinds of Data¶

[Figure: taxonomy of kinds of data]

Discussion Question¶

Determine the kind of each of the following variables:

  • insp['score']
  • insp['grade']
  • viol['violation_accela']
  • viol['major_violation']
  • rest['business_id']
  • rest['opened_date']

Feature types vs. data types¶

  • The data type pandas uses to store a column is not the same as the kind of data we just discussed!
    • This is the difference between a feature type and a computational data type.
  • Take care when the two don't match up well!
In [7]:
# pandas stores as int, but actually nominal
rest['business_id']
Out[7]:
0      211898487641
1      211930769329
2      211909057778
3      211921416009
           ...     
996    211924765619
997    211899338714
998    211942150255
999    211925713322
Name: business_id, Length: 1000, dtype: int64
In [8]:
# pandas stores as a string (object), but actually a date
rest['opened_date']
Out[8]:
0      2002-05-05
1      2023-07-24
2      2019-01-22
3      2020-11-20
          ...    
996    2017-01-18
997    2002-05-05
998    2016-11-03
999    2022-11-03
Name: opened_date, Length: 1000, dtype: object
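
A hedged sketch of bringing the storage types in line with the feature types for the two mismatches above (we'll parse dates properly with a reusable function later in the lecture):

rest.assign(
    business_id=rest['business_id'].astype(str),      # nominal: treat IDs as labels, not numbers
    opened_date=pd.to_datetime(rest['opened_date']),  # temporal: parse the date strings
)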

Data Cleaning: Quality Checks¶

We often start an analysis by checking the quality of the data.

  • Scope: Do the data match your understanding of the population?
  • Measurements and values: Are the values reasonable?
  • Relationships: Are related features in agreement?
  • Analysis: Which features might be useful in a future analysis?

Scope¶

In [9]:
rest.head(2)
Out[9]:
business_id name business_type address ... lat long opened_date distance
0 211898487641 MOBIL MART LA JOLLA VILLAGE Pre-Packaged Retail Market 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 ... 32.87 -117.23 2002-05-05 0.62
1 211930769329 CAFE 477 Low Risk Food Facility 8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,... ... 32.87 -117.24 2023-07-24 0.64

2 rows × 12 columns

Measurements and Values¶

In [10]:
insp['grade'].value_counts()
Out[10]:
A    2978
B      11
Name: grade, dtype: int64
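
One follow-up sketch: value_counts drops NaNs by default, so it's worth counting them explicitly to see how many inspections have no grade at all:

insp['grade'].value_counts(dropna=False)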
In [11]:
insp.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5179 entries, 0 to 5178
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   custom_id       5179 non-null   object 
 1   business_id     5179 non-null   int64  
 2   inspection_id   5179 non-null   int64  
 3   description     0 non-null      float64
 4   type            5179 non-null   object 
 5   score           5179 non-null   int64  
 6   grade           2989 non-null   object 
 7   completed_date  5179 non-null   object 
 8   status          5179 non-null   object 
 9   link            5179 non-null   object 
 10  status_link     5179 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 445.2+ KB
In [12]:
(rest[rest.duplicated(subset=['address'])]
 .sort_values('address')
)
Out[12]:
business_id name business_type address ... lat long opened_date distance
402 211898699154 HANAYA SUSHI CAFE Restaurant Food Facility 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 ... 32.90 -117.19 2011-03-22 2.77
406 211899308875 NASEEMS BAKERY & KABOB Restaurant Food Facility 10066 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 92121 ... 32.90 -117.19 2012-04-17 2.77
412 211976116534 QUALCOMM CAFE AZ Restaurant Food Facility 10165 PACIFIC HEIGHTS BLVD, SAN DIEGO, CA 9212... ... 32.90 -117.19 2014-01-16 2.81
132 211921220423 INTERNATIONAL COFFEE SCRIPPS GEISEL HEALTH CLINIC Single Operating Site 10666 N TORREY PINES RD, LA JOLLA, CA 92037-1027 ... 32.90 -117.24 2020-11-20 1.39
... ... ... ... ... ... ... ... ... ...
572 211943123695 DAILY CHEW CATERING Caterer 9932 MESA RIM RD, SUITE# A, SAN DIEGO, CA 9212... ... 32.90 -117.18 2020-11-04 3.51
540 211911015928 KUMO Retail Food Processing 9932 MESA RIM RD, SUITE# A, SAN DIEGO, CA 9212... ... 32.90 -117.18 2022-05-31 3.40
575 211972411855 TARA HEATHER CAKE DESIGN Caterer 9932 MESA RIM RD, SUITE# A, SAN DIEGO, CA 9212... ... 32.90 -117.18 2014-04-24 3.51
344 211990537315 COMPASS GROUP FEDEX EXPRESS OLSON Pre-Packaged Retail Market 9999 OLSON DR, SAN DIEGO, CA 92121-2837 ... 32.89 -117.20 2022-10-19 2.27

137 rows × 12 columns
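
Note that duplicated() with the default keep='first' only flags the later rows at each repeated address. A small sketch to see every restaurant that shares an address:

(rest[rest.duplicated(subset=['address'], keep=False)]
 .sort_values('address')
)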

Relationships¶

In [13]:
# Useful to check whether ZIP codes match!
rest[['address', 'zip']]
Out[13]:
address zip
0 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 92037
1 8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,... 92037-1704
2 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 92037
3 9888 GENESEE AVE, LA JOLLA, CA 92037-1205 92037-1205
... ... ...
996 12873 EL CAMINO REAL, SUITE# M4, SAN DIEGO, CA... 92130
997 1234 TOURMALINE ST, SAN DIEGO, CA 92109-1856 92109-1856
998 12925 EL CAMINO REAL, SUITE# AA4, SAN DIEGO, C... 92130
999 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 92014

1000 rows × 2 columns
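
A rough sketch of checking that agreement programmatically, assuming the ZIP always appears at the end of the address string:

addr_zip = rest['address'].str.extract(r'(\d{5})(-\d{4})?\s*$')[0]  # ZIP pulled out of the address text
(addr_zip == rest['zip'].astype(str).str[:5]).mean()                # fraction of rows where the two agree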

In [14]:
insp[['score', 'grade']]
Out[14]:
score grade
0 96 NaN
1 98 NaN
2 98 NaN
3 100 A
... ... ...
5175 84 B
5176 0 NaN
5177 0 NaN
5178 90 A

5179 rows × 2 columns
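
Another relationship worth a sketch: do the letter grades agree with the scores? Assuming the standard cutoffs (A for 90–100, B for 80–89):

graded = insp.dropna(subset=['grade'])
expected = pd.cut(graded['score'], bins=[-1, 79, 89, 100], labels=['C', 'B', 'A'])
(graded['grade'] == expected.astype(str)).mean()  # fraction of graded inspections where grade matches score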

Analysis¶

  • Most interested in:
    • rest df: business_id, name, address, opened_date
    • insp df: business_id, inspection_id, score, grade, completed_date, status
    • viol df: inspection_id, violation, major_violation, violation_text, violation_accela
  • Also, let's rename a few columns to make them easier to work with.

💡 Pro-Tip: Using pipe¶

When we manipulate dataframes, we prefer to define individual functions for each step, then use df.pipe to chain them all together.

  • In practice, we would add functions one by one to the top of a notebook, then pipe them all.
  • For today, we'll keep re-running pipe to show the data cleaning process.
In [15]:
def subset_rest(rest):
    return rest[['business_id', 'name', 'address', 'opened_date']]

rest = (
    pd.read_csv('data/restaurants.csv')
    .pipe(subset_rest)
)
In [16]:
def subset_insp(insp):
    return (
        insp[['business_id', 'inspection_id', 'score', 'grade', 'completed_date', 'status']]
        .rename(columns={'completed_date': 'date'})
    )

insp = (
    pd.read_csv('data/inspections.csv')
    .pipe(subset_insp)
)
In [17]:
def subset_viol(viol):
    return (
        viol[['inspection_id', 'violation', 'major_violation', 'violation_accela']]
        .rename(columns={'violation': 'kind',
                         'major_violation': 'is_major',
                         'violation_accela': 'violation'})
    )

viol = (
    pd.read_csv('data/violations.csv')
    .pipe(subset_viol)
)

Joining the Restaurant Data¶

Let's join all three dataframes together so that we have all the data in one dataframe:

In [18]:
def merge_all_restaurant_data():
    return (
        rest
        .merge(insp, on='business_id', how='left')
        .merge(viol, on='inspection_id', how='left')
    )

df = merge_all_restaurant_data()
df
Out[18]:
business_id name address opened_date ... status kind is_major violation
0 211898487641 MOBIL MART LA JOLLA VILLAGE 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 2002-05-05 ... Complete Hot and Cold Water Y 21. Hot & cold water available
1 211898487641 MOBIL MART LA JOLLA VILLAGE 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 2002-05-05 ... Complete Hot and Cold Water N 21. Hot & cold water available
2 211898487641 MOBIL MART LA JOLLA VILLAGE 3233 LA JOLLA VILLAGE DR, LA JOLLA, CA 92037 2002-05-05 ... Complete Holding Temperatures N 7. Proper hot & cold holding temperatures
3 211930769329 CAFE 477 8950 VILLA LA JOLLA DR, SUITE# B123, LA JOLLA,... 2023-07-24 ... Complete NaN NaN NaN
... ... ... ... ... ... ... ... ... ...
8727 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 2022-11-03 ... Complete Nonfood Contact Surfaces Clean N 33. Nonfood contact surfaces clean
8728 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 2022-11-03 ... Complete Equipment and Utensil Storage, Use N 35. Equipment / Utensils -approved, installed,...
8729 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 2022-11-03 ... Complete Toilet Facilities N 43. Toilet facilities -properly constructed, s...
8730 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 2022-11-03 ... Complete Floors, Walls, and Ceilings N 45. Floor, walls and ceilings - built, maintai...

8731 rows × 12 columns

Data Cleaning: Looking for Missing Values¶

Next, it's important to check for missing values, as they can have a big effect on your analysis.
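
Before drilling in, a quick sketch to get an overview: the fraction of missing values in each column of the merged dataframe:

df.isna().mean().sort_values(ascending=False)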

In [19]:
insp[['score', 'grade']]
Out[19]:
score grade
0 96 NaN
1 98 NaN
2 98 NaN
3 100 A
... ... ...
5175 84 B
5176 0 NaN
5177 0 NaN
5178 90 A

5179 rows × 2 columns

In [20]:
df[df['inspection_id'].isna()]
Out[20]:
business_id name address opened_date ... status kind is_major violation
759 211941133403 TASTY CHAI 8878 REGENTS RD 105, SAN DIEGO, CA 92122-5853 2019-01-07 ... NaN NaN NaN NaN
1498 211915545446 EMBASSY SUITES SAN DIEGO LA JOLLA 4550 LA JOLLA VILLAGE DR, SAN DIEGO, CA 92122-... 2022-11-22 ... NaN NaN NaN NaN
1672 211937443689 SERVICENOW 4770 EASTGATE MALL, SAN DIEGO, CA 92121-1970 2023-06-06 ... NaN NaN NaN NaN
2526 211915567037 CAFE BOLERO 5623 STRESEMANN ST, SAN DIEGO, CA 92122-3125 2022-09-14 ... NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ...
8008 211999542439 CANDRA'S DELIGHTFUL DESSERTS LLC 4233 MT VOSS DR, SAN DIEGO, CA 92117-4753 2022-08-30 ... NaN NaN NaN NaN
8094 211997340975 COOKIE SCOOP 7759 GASTON DR, SAN DIEGO, CA 92126-3036 2016-05-04 ... NaN NaN NaN NaN
8450 211900595220 I LOVE BANANA BREAD CO 4068 DALLES AVE, SAN DIEGO, CA 92117-5518 2020-08-26 ... NaN NaN NaN NaN
8545 211963768842 PETRA KITCHEN 5252 BALBOA ARMS DR 175, SAN DIEGO, CA 92117-4949 2020-07-30 ... NaN NaN NaN NaN

29 rows × 12 columns

In [21]:
df.query('score == 0')
Out[21]:
business_id name address opened_date ... status kind is_major violation
4 211909057778 VALLEY FARM MARKET 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 2019-01-22 ... Complete NaN NaN NaN
9 211909057778 VALLEY FARM MARKET 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 2019-01-22 ... Complete NaN NaN NaN
13 211909057778 VALLEY FARM MARKET 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 2019-01-22 ... Complete NaN NaN NaN
15 211909057778 VALLEY FARM MARKET 6902 LA JOLLA BLVD, LA JOLLA, CA 92037 2019-01-22 ... Complete NaN NaN NaN
... ... ... ... ... ... ... ... ... ...
8708 211942150255 POKEWAN DEL MAR 12925 EL CAMINO REAL, SUITE# AA4, SAN DIEGO, C... 2016-11-03 ... Complete Holding Temperatures N 7. Proper hot & cold holding temperatures
8712 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 2022-11-03 ... Complete NaN NaN NaN
8723 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 2022-11-03 ... Complete NaN NaN NaN
8724 211925713322 SAFFRONO LOUNGE RESTAURANT 2672 DEL MAR HEIGHTS RD, DEL MAR, CA 92014 2022-11-03 ... Complete NaN NaN NaN

2005 rows × 12 columns

There are many ways of handling missing values, which we'll cover soon, but you must first check how many you have!
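
As a preview, a few common options, sketched on the grade column; which one is appropriate depends on the analysis:

# Drop rows without a grade (loses the roughly 40% of inspections that are ungraded)
insp.dropna(subset=['grade'])

# Fill missing grades with an explicit placeholder category
insp['grade'].fillna('No grade')

# Keep the NaNs but record their presence as an explicit feature
insp.assign(has_grade=insp['grade'].notna())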

Data Cleaning: Transformations and Timestamps¶

It's often useful to look at ways of transforming your data to make it easier to work with, for example:

  • Type conversions (changing the string "$2.99" to the number 2.99).
  • Converting units (feet to meters)
  • Extraction (Getting 'vermin' out of 'Vermin Violation Recorded on 10/10/2023').
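
Quick sketches of the first two kinds of transformations, using made-up Series for illustration:

prices = pd.Series(['$2.99', '$10.50'])
prices.str.replace('$', '', regex=False).astype(float)  # type conversion: string -> number

notes = pd.Series(['Vermin Violation Recorded on 10/10/2023'])
notes.str.extract(r'^(\w+)')[0].str.lower()             # extraction: pull 'vermin' out of the text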

Most commonly, we'll parse dates into pd.Timestamp objects.

In [22]:
# This magical string tells Python what format the date is in.
# For more info: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
date_format = '%Y-%m-%d'

pd.to_datetime(insp['date'], format=date_format)
Out[22]:
0      2023-02-16
1      2022-01-03
2      2020-12-03
3      2023-09-20
          ...    
5175   2023-09-14
5176   2023-03-06
5177   2022-12-09
5178   2022-11-30
Name: date, Length: 5179, dtype: datetime64[ns]
In [23]:
# Another advantage of defining functions is that we can reuse this function
# for the opened_date column in `rest` if we wanted to
def parse_dates(insp, col):
    date_format = '%Y-%m-%d'
    dates = pd.to_datetime(insp[col], format=date_format)
    return insp.assign(**{col: dates})

insp = (
    pd.read_csv('data/inspections.csv')
    .pipe(subset_insp)
    .pipe(parse_dates, 'date')
)

# Should also remake df, since it depends on insp
df = merge_all_restaurant_data()
In [24]:
insp['date']
Out[24]:
0      2023-02-16
1      2022-01-03
2      2020-12-03
3      2023-09-20
          ...    
5175   2023-09-14
5176   2023-03-06
5177   2022-12-09
5178   2022-11-30
Name: date, Length: 5179, dtype: datetime64[ns]

Working with Timestamps¶

  • We often want to adjust the granularity of timestamps to see overall trends or seasonality.
  • Use the resample method in pandas.
  • https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
In [25]:
(insp.resample('2W', on='date')
 .size()
 .plot()
);
[Figure: line plot of the number of inspections per two-week period]

The .dt accessor¶

  • Like with Series of strings, pandas has a .dt accessor for properties of Timestamps.
  • https://pandas.pydata.org/docs/user_guide/basics.html#basics-dt-accessors
In [26]:
insp['date'].dt.day
Out[26]:
0       16
1        3
2        3
3       20
        ..
5175    14
5176     6
5177     9
5178    30
Name: date, Length: 5179, dtype: int64
In [27]:
insp['date'].dt.dayofweek
Out[27]:
0       3
1       0
2       3
3       2
       ..
5175    3
5176    0
5177    4
5178    2
Name: date, Length: 5179, dtype: int64
In [28]:
dow_counts = (
    insp.assign(dow=insp['date'].dt.dayofweek)
    .groupby('dow')
    .size()
)
px.bar(dow_counts)

Data Cleaning: Modifying Structure¶

We often reshape the dataframe's structure to make it more convenient for analysis. For example, we can:

  • Simplify structure by removing columns or taking a set of rows for a particular period of time or geographic area.
    • Already did this!
  • Adjust granularity by aggregating rows together.
  • Reshape structure, most commonly by using df.melt() to un-pivot a dataframe.

Using melt¶

  • melt() is common enough that we'll give it a special mention.
  • We often encounter pivot tables (especially from government data), which we call wide data, and want to reshape them into long form:
[Figure: reshaping a wide (pivot) table into long-form data with melt]
  • With long-form data, we can group by both year and month, which is hard to do with wide-form data! (See the sketch below.)
  • Long-form data is also called tidy data.
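
A minimal melt sketch with made-up wide data (months as columns):

wide = pd.DataFrame({
    'year': [2022, 2023],
    'Jan': [10, 12],
    'Feb': [8, 15],
})
# Un-pivot: one row per (year, month) pair
wide.melt(id_vars='year', var_name='month', value_name='count')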

Student-Submitted Questions¶

  • Come up with a question or two that you want to answer with the data.
    • E.g. "What are the most common violations around UCSD?"
  • I will (attempt to) answer them live and explain as we go!
  • Will go until no more questions or we run out of time.
  • https://wall.sli.do/event/g2dESFa2co9kwmUwqfuCNL?section=e82e38eb-254e-4eb9-9de3-341347c52119

Questions submitted during lecture¶

  • Which street address has the most violations proportionally to the number of restaurants?
  • Which restaurant has the highest chance for food poisoning?
  • Can we see when restaurants shut down using duplicate addresses and the date of opening?
  • Are restaurants farther away from UCSD more likely to have a smaller number of major violations?
  • Does FanFan have any food violations? (I like fanfan)
  • Given the distance of a restaurant from UCSD, what is the probability that it will get an A? (In other words, how does distance impact rating, or do they have no relationship?)

Does FanFan have any food violations? (I like fanfan)¶

In [29]:
# Doesn't look like Fan Fan (or any UCSD restaurants)
# are in the data, sad!
(df[df['name'].str.contains('REGENTS')]
 [['date', 'violation']]
 .groupby('date')
 .size()
)
Out[29]:
date
2020-04-13    1
2021-01-11    1
2021-05-05    5
2021-05-13    1
             ..
2022-03-24    3
2022-08-03    2
2022-12-07    1
2023-08-11    4
Length: 9, dtype: int64

Restaurants that shut down¶

In [30]:
insp['status'].unique()
Out[30]:
array(['Complete', 'No Access', 'Self Closed', 'Ordered Closed',
       'Approved to Reopen'], dtype=object)
In [31]:
(insp.query('status == ["Ordered Closed", "Approved to Reopen"]')
 .groupby('business_id')
 # Keep businesses with exactly 4 closure-related rows, i.e. two closed-then-reopened cycles
 .filter(lambda df: df.shape[0] == 4)
)
Out[31]:
business_id inspection_id score grade date status
2195 211917396140 7251538 0 NaN 2023-06-09 Approved to Reopen
2198 211917396140 7155435 0 NaN 2023-06-06 Ordered Closed
2202 211917396140 7076472 0 NaN 2022-09-09 Approved to Reopen
2204 211917396140 7076471 0 NaN 2022-09-08 Ordered Closed
... ... ... ... ... ... ...
4681 211986408394 6548512 0 NaN 2020-07-31 Approved to Reopen
4682 211986408394 6548511 0 NaN 2020-07-31 Ordered Closed
4684 211986408394 6442731 0 NaN 2020-02-14 Approved to Reopen
4685 211986408394 6255627 0 NaN 2020-02-13 Ordered Closed

16 rows × 6 columns

In [32]:
rest.query('business_id == 211917396140')
Out[32]:
business_id name address opened_date
440 211917396140 THAI GULF BY AROI THAI 1271 - 1277 PROSPECT ST, LA JOLLA, CA 92037-3610 2022-09-16

Example question: What are the most common violations?¶

In [33]:
fig = px.bar(
    viol['kind'].value_counts().sort_values(),
    orientation='h'
)
fig.update_layout(width=1000, height=800)

Takeaways¶

  • Data cleaning is a necessary starting step in data analysis. There are four main things to watch out for:
    • Quality checks
    • Missing values
    • Transformations and timestamps
    • Modifying structure