In [14]:
from dsc80_utils import *
In [15]:
# Pandas Tutor setup
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Lecture 5 – Exploratory Data Analysis and Data Cleaning¶

DSC 80, Fall 2024¶


Announcements 📣¶

  • Lab 2 due tomorrow, Fri, Oct 11.
  • Project 1 is due this Tue, Oct 15.

Agenda 📆¶

  • Dataset overview.
  • Introduction to plotly.
  • Exploratory data analysis and feature types.
  • Data cleaning.
    • Data quality checks.
    • Missing values.
    • Transformations and timestamps.
    • Modifying structure.
  • Investigating student-submitted questions!

Merging¶

Example: Name categories¶

The New York Times article from Lecture 1 claims that certain categories of names are becoming more popular. For example:

  • Forbidden names like Lucifer, Lilith, Kali, and Danger.

  • Evangelical names like Amen, Savior, Canaan, and Creed.

  • Mythological names.

  • It also claims that baby boomer names are becoming less popular.

Let's see if we can verify these claims using data!

Loading in the data¶

Our first DataFrame, baby, is the same as we saw in Lecture 1. It has one row for every combination of 'Name', 'Sex', and 'Year'.

In [ ]:
baby_path = Path('data') / 'baby.csv'
baby = pd.read_csv(baby_path)
baby

Our second DataFrame, nyt, contains the New York Times' categorization of several names, based on the aforementioned article.

In [ ]:
nyt_path = Path('data') / 'nyt_names.csv'
nyt = pd.read_csv(nyt_path)
nyt

Issue: To find the number of babies born with (for example) forbidden names each year, we need to combine information from both baby and nyt.

Merging¶

  • We want to link rows from baby and nyt together whenever the names match up.
  • This is a merge (pandas term), i.e. a join (SQL term).
  • A merge is appropriate when we have two sources of information about the same individuals, linked by one or more common columns.
  • The common column(s) are called the join key.

Example merge¶

Let's demonstrate on a small subset of baby and nyt.

In [ ]:
nyt_small = nyt.iloc[[11, 12, 14]].reset_index(drop=True)

names_to_keep = ['Julius', 'Karen', 'Noah']
baby_small = (baby
 .query("Year == 2020 and Name in @names_to_keep")
 .reset_index(drop=True)
)

dfs_side_by_side(baby_small, nyt_small)
In [ ]:
%%pt
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name')

The merge method¶

  • The merge DataFrame method joins two DataFrames by columns or indexes.

    • As mentioned before, "merge" is just the pandas word for "join."
  • When using the merge method, the DataFrame that merge is called on is the "left" DataFrame, and the DataFrame passed into merge is the "right" DataFrame.

    • In baby_small.merge(nyt_small), baby_small is considered the "left" DataFrame and nyt_small is the "right" DataFrame; the columns from the left DataFrame appear to the left of the columns from the right DataFrame.
  • By default:

    • If join keys are not specified, all shared columns between the two DataFrames are used.
    • The "type" of join performed is an inner join. This is the only type of join you saw in DSC 10, but there are more, as we'll now see!

Join types: inner joins¶

In [ ]:
%%pt
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name')
  • Note that 'Noah' and 'Freya' do not appear in the merged DataFrame.
  • This is because there is:
    • no 'Noah' in the right DataFrame (nyt_small), and
    • no 'Freya' in the left DataFrame (baby_small).
  • The default type of join that merge performs is an inner join, which keeps the intersection of the join keys.

Different join types¶

We can change the type of join performed by changing the how argument in merge. Let's experiment!

In [ ]:
%%pt
# Note the NaNs!
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='left')
In [ ]:
%%pt
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='right')
In [ ]:
%%pt
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='outer')

Different join types handle mismatches differently¶

There are four types of joins.

  • Inner: keep only matching keys (intersection).
  • Outer: keep all keys in both DataFrames (union).
  • Left: keep all keys in the left DataFrame, whether or not they are in the right DataFrame.
  • Right: keep all keys in the right DataFrame, whether or not they are in the left DataFrame.
    • Note that a.merge(b, how='left') contains the same information as b.merge(a, how='right'), just in a different order.
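
As a quick check of the last point above, here's a sketch using the small DataFrames from earlier: the two results contain the same matches, just with the columns (and possibly rows) in a different order.

In [ ]:
# Sketch: both results contain the same information;
# only the column (and possibly row) order differs.
left_version = baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='left')
right_version = nyt_small.merge(baby_small, left_on='nyt_name', right_on='Name', how='right')
dfs_side_by_side(left_version, right_version)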

Notes on the merge method¶

  • merge is flexible – you can merge using a combination of columns, or the index of the DataFrame.
  • If the two DataFrames share column names that aren't join keys, pandas adds the suffixes _x and _y to the duplicated column names to avoid having columns with the same name (change these with the suffixes argument; see the sketch after this list).
  • There is, in fact, a join method, but it's actually a wrapper around merge with fewer options.
  • As always, the documentation is your friend!
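
For instance, here's a made-up sketch of the suffixes behavior (the DataFrames and counts below are invented for illustration):

In [ ]:
# Sketch: both DataFrames have a 'Count' column that isn't a join key,
# so pandas would normally append '_x' and '_y'; suffixes= overrides that.
counts_a = pd.DataFrame({'Name': ['Karen', 'Noah'], 'Count': [5, 7]})
counts_b = pd.DataFrame({'Name': ['Karen', 'Noah'], 'Count': [3, 2]})
counts_a.merge(counts_b, on='Name', suffixes=('_a', '_b'))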

Lots of pandas operations do an implicit outer join!¶

  • pandas will almost always try to match up index values using an outer join.
  • It won't tell you that it's doing an outer join; it'll just throw NaNs into your result!
In [ ]:
df1 = pd.DataFrame({'a': [1, 2, 3]}, index=['hello', 'dsc80', 'students'])
df2 = pd.DataFrame({'b': [10, 20, 30]}, index=['dsc80', 'is', 'awesome'])
dfs_side_by_side(df1, df2)
In [ ]:
df1['a'] + df2['b']

Many-to-one & many-to-many joins¶

One-to-one joins¶

  • So far in this lecture, the joins we have worked with are called one-to-one joins.
  • Neither the left DataFrame (baby_small) nor the right DataFrame (nyt_small) contained any duplicates in the join key.
  • What if there are duplicated join keys, in one or both of the DataFrames we are merging?
In [ ]:
# Run this cell to set up the next example.
profs = pd.DataFrame(
[['Sam', 'UCB', 5],
 ['Sam', 'UCSD', 5],
 ['Janine', 'UCSD', 8],
 ['Marina', 'UIC', 7],
 ['Justin', 'OSU', 5],
 ['Soohyun', 'UCSD', 2],
 ['Suraj', 'UCB', 2]],
    columns=['Name', 'School', 'Years']
)

schools = pd.DataFrame({
    'Abr': ['UCSD', 'UCLA', 'UCB', 'UIC'],
    'Full': ['University of California San Diego', 'University of California, Los Angeles', 'University of California, Berkeley', 'University of Illinois Chicago']
})

programs = pd.DataFrame({
    'uni': ['UCSD', 'UCSD', 'UCSD', 'UCB', 'OSU', 'OSU'],
    'dept': ['Math', 'HDSI', 'COGS', 'CS', 'Math', 'CS'],
    'grad_students': [205, 54, 281, 439, 304, 193]
})

Many-to-one joins¶

  • Many-to-one joins are joins where one of the DataFrames contains duplicate values in the join key.
  • In the result, each row from the "one" side is repeated once for every matching row on the "many" side.
In [ ]:
dfs_side_by_side(profs, schools)

Note that when merging profs and schools, the information from schools is duplicated.

  • 'University of California San Diego' appears three times.
  • 'University of California, Berkeley' appears twice.
In [ ]:
%%pt
profs.merge(schools, left_on='School', right_on='Abr', how='left')

Many-to-many joins¶

Many-to-many joins are joins where both DataFrames have duplicate values in the join key.

In [ ]:
dfs_side_by_side(profs, programs)

Before running the following cell, try predicting the number of rows in the output.

In [ ]:
%%pt
profs.merge(programs, left_on='School', right_on='uni')
  • merge stitched together every UCSD row in profs with every UCSD row in programs.
  • Since there were 3 UCSD rows in profs and 3 in programs, there are $3 \cdot 3 = 9$ UCSD rows in the output. The same applies for all other schools.

Question 🤔 (Answer at dsc80.com/q)

Code: merge

Fill in the blank so that the last statement evaluates to True.

df = profs.merge(programs, left_on='School', right_on='uni')
df.shape[0] == (____).sum()

Don't use merge (or join) in your solution!

In [ ]:
dfs_side_by_side(profs, programs)
In [ ]:
# Your code goes here.

Returning to our original question¶

Let's find the popularity of baby name categories over time. To start, we'll define a DataFrame that has one row for every combination of 'category' and 'Year'.

In [ ]:
cate_counts = (
    baby
    .merge(nyt, left_on='Name', right_on='nyt_name')
    .groupby(['category', 'Year'])
    ['Count']
    .sum()
    .reset_index()
)
cate_counts
In [ ]:
# We'll talk about plotting code soon!
import plotly.express as px
fig = px.line(cate_counts, x='Year', y='Count',
              facet_col='category', facet_col_wrap=3,
              facet_row_spacing=0.15,
              width=600, height=400)
fig.update_yaxes(matches=None, showticklabels=False)
fig

Questions? 🤔

Transforming¶

Transforming values¶

  • A transformation results from performing some operation on every element in a sequence, e.g. a Series.

  • While we haven't discussed it yet in DSC 80, you learned how to transform Series in DSC 10, using the apply method. apply is very flexible – it takes in a function, which itself takes in a single value as input and returns a single value.

In [ ]:
baby
In [ ]:
def number_of_vowels(string):
    return sum(c in 'aeiou' for c in string.lower())

baby['Name'].apply(number_of_vowels)
In [ ]:
# Built-in functions work with apply, too.
baby['Name'].apply(len)

The price of apply¶

Unfortunately, apply runs really slowly!

In [ ]:
%%timeit
baby['Name'].apply(number_of_vowels)
In [ ]:
%%timeit
res = []
for name in baby['Name']:
    res.append(number_of_vowels(name))

Internally, apply actually just runs a for-loop!

So, when possible – say, when applying arithmetic operations – we should work on Series objects directly and avoid apply!

The price of apply¶

In [ ]:
%%timeit
baby['Year'] // 10 * 10 # Rounds down to the nearest multiple of 10.
In [ ]:
%%timeit
baby['Year'].apply(lambda y: y // 10 * 10)

Roughly 100x slower!

The .str accessor¶

For string operations, pandas provides a convenient .str accessor.

In [ ]:
%%timeit
baby['Name'].str.len()
In [ ]:
%%timeit
baby['Name'].apply(len)

It's very convenient and runs about the same speed as apply!

Dataset overview¶

San Diego food safety¶

From this article (archive link):

In the last three years, one third of San Diego County restaurants have had at least one major food safety violation.

99% Of San Diego Restaurants Earn ‘A’ Grades, Bringing Usefulness of System Into Question¶

From this article (archive link):

Food held at unsafe temperatures. Employees not washing their hands. Dirty countertops. Vermin in the kitchen. An expired restaurant permit.

Restaurant inspectors for San Diego County found these violations during a routine health inspection of a diner in La Mesa in November 2016. Despite the violations, the restaurant was awarded a score of 90 out of 100, the lowest possible score to achieve an ‘A’ grade.

The data¶

  • We downloaded the data about the 1000 restaurants closest to UCSD from here.
  • We had to download the data as JSON files, then process it into DataFrames. You'll learn how to do this soon!
    • Until now, you've (largely) been presented with CSV files that pd.read_csv could load without any issues.
    • But there are many different formats and possible issues when loading data in from files.
    • See Chapter 8 of Learning DS for more.
In [ ]:
rest_path = Path('data') / 'restaurants.csv'
insp_path = Path('data') / 'inspections.csv'
viol_path = Path('data') / 'violations.csv'
In [ ]:
rest = pd.read_csv(rest_path)
insp = pd.read_csv(insp_path)
viol = pd.read_csv(viol_path)

Question 🤔 (Answer at dsc80.com/q)

Code: dfs

The first article said that one third of restaurants had at least one major safety violation.
Which DataFrames and columns seem most useful to verify this?

In [ ]:
rest.head(2)
In [ ]:
rest.columns
In [ ]:
insp.head(2)
In [ ]:
insp.columns
In [ ]:
viol.head(2)
In [ ]:
viol.columns

Introduction to plotly¶

plotly¶

  • We've used plotly in lecture briefly, and you even have to use it in Project 1 Question 13, but we haven't yet discussed it formally.
  • It's a visualization library that enables interactive visualizations.

Using plotly¶

There are a few ways we can use plotly:

  • Using the plotly.express syntax.
    • plotly is very flexible, but it can be verbose; plotly.express allows us to make plots quickly.
    • See the documentation here – it's very rich (there are good examples for almost everything).
  • By setting the pandas plotting backend to 'plotly' (by default, it's 'matplotlib') and using the DataFrame plot method (see the sketch after this list).
    • The DataFrame plot method is how you created plots in DSC 10!
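
Here's a minimal sketch of the backend approach (the dsc80_utils setup most likely configures this for us already):

In [ ]:
# Sketch: with the plotting backend set to 'plotly',
# the familiar .plot method produces interactive plotly figures.
pd.options.plotting.backend = 'plotly'
insp['score'].plot(kind='hist')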

For now, we'll use plotly.express syntax; we've imported it in the dsc80_utils.py file that we import at the top of each lecture notebook.

Initial plots¶

First, let's look at the distribution of inspection 'score's:

In [ ]:
fig = px.histogram(insp['score'])
fig

How about the distribution of average inspection 'score' per 'grade'?

In [ ]:
scores = (
    insp[['grade', 'score']]
    .dropna()
    .groupby('grade')
    .mean()
    .reset_index()
)
# x= and y= are columns of scores. Convenient!
px.bar(scores, x='grade', y='score')
In [ ]:
# Same as the above!
scores.plot(kind='bar', x='grade', y='score')

Exploratory data analysis and feature types¶

The data science lifecycle, revisited¶


We're at the stage of understanding the data.

Exploratory data analysis (EDA)¶

  • Historically, data analysis was dominated by formal statistics, including tools like confidence intervals, hypothesis tests, and statistical modeling.

  • In 1977, John Tukey defined the term exploratory data analysis, which describes a philosophy for how to approach data analysis:

Exploratory data analysis is actively incisive, rather than passively descriptive, with real emphasis on the discovery of the unexpected.

  • Practically, EDA involves, among other things, computing summary statistics and drawing plots to understand the nature of the data at hand.

The greatest gains from data come from surprises… The unexpected is best brought to our attention by pictures.
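
For the restaurant data, that first look might be a sketch like the following: a peek at the rows, summary statistics, and a quick picture.

In [ ]:
# Sketch: a few typical first-look EDA calls on the inspections data.
display(insp.head())            # What does a row look like?
display(insp.describe())        # Summary statistics for the numeric columns.
px.histogram(insp, x='score')   # A quick picture of the 'score' distribution.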

Different feature types¶

(Image: a taxonomy of feature types, splitting features into quantitative (discrete or continuous) and categorical (ordinal or nominal).)

Question 🤔 (Answer at dsc80.com/q)

Code: types

Determine the feature type of each of the following variables.

  • insp['score']
  • insp['grade']
  • viol['violation_accela']
  • viol['major_violation']
  • rest['business_id']
  • rest['opened_date']
In [ ]:
# Your code goes here.

Feature types vs. data types¶

  • The data type pandas uses to store a column is not the same as that column's feature type, which we just discussed!

    • There's a difference between feature type and computational data type.
  • Take care when the two don't match up very well!

In [ ]:
# pandas stores these as ints, but they're actually nominal.
rest['business_id']
In [ ]:
# pandas stores these as strings, but they're actually dates (timestamps).
rest['opened_date']
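
When they don't match, we can convert. A sketch, assuming the 'opened_date' strings are in a format pd.to_datetime can infer:

In [ ]:
# Sketch: align the stored data type with the feature type.
rest.assign(
    business_id=rest['business_id'].astype(str),      # A nominal label, not a number.
    opened_date=pd.to_datetime(rest['opened_date']),  # Parse date strings into timestamps.
).dtypes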

Data cleaning¶

Four pillars of data cleaning¶

When loading in a dataset, to clean the data – that is, to prepare it for further analysis – we will:

  1. Perform data quality checks.

  2. Identify and handle missing values.

  3. Perform transformations, including converting time series data to timestamps.

  4. Modify structure as necessary.

Data cleaning: Data quality checks¶

Data quality checks¶

We often start an analysis by checking the quality of the data.

  • Scope: Do the data match your understanding of the population?
  • Measurements and values: Are the values reasonable?
  • Relationships: Are related features in agreement?
  • Analysis: Which features might be useful in a future analysis?

Scope¶

Do the data match your understanding of the population?

We were told that we're only looking at the 1000 restaurants closest to UCSD, so the restaurants in rest should agree with that.

In [ ]:
rest.sample(5)

Measurements and values¶

Are the values reasonable?

Do the values in the 'grade' column match what we'd expect grades to look like?

In [ ]:
insp['grade'].value_counts()

What kinds of information does the insp DataFrame hold?

In [ ]:
insp.info()

What's going on in the 'address' column of rest?

In [ ]:
# Are there multiple restaurants with the same address?
rest['address'].value_counts()
In [ ]:
# Keeps all rows with duplicate addresses.
(
    rest
    .groupby('address')
    .filter(lambda df: df.shape[0] >= 2)
    .sort_values('address')
)
In [ ]:
# Does the same thing as above!
(
    rest[rest.duplicated(subset=['address'], keep=False)]
    .sort_values('address')
)

Relationships¶

Are related features in agreement?

Do the 'address'es and 'zip' codes in rest match?

In [ ]:
rest[['address', 'zip']]
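
One way to check, as a sketch: assuming each address string ends with its ZIP code, pull out the last token of the address and compare it to the 'zip' column.

In [ ]:
# Sketch: what fraction of addresses end with the listed ZIP code?
# Assumes the ZIP code is the final token of the address string.
zip_from_address = rest['address'].str.strip().str.split().str[-1]
(zip_from_address == rest['zip'].astype(str)).mean()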

What about the 'score's and 'grade's in insp?

In [ ]:
insp[['score', 'grade']]
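
Again, a sketch of one possible check: the article above says 90 is the lowest score that earns an 'A', so compare the letter grade to a cutoff of 90.

In [ ]:
# Sketch: among inspections with both a score and a grade,
# how often does grade == 'A' line up with score >= 90?
both = insp.dropna(subset=['score', 'grade'])
(both['grade'].eq('A') == both['score'].ge(90)).mean()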

Analysis¶

Which features might be useful in a future analysis?

  • We're most interested in:

    • These columns in the rest DataFrame: 'business_id', 'name', 'address', 'zip', and 'opened_date'.
    • These columns in the insp DataFrame: 'business_id', 'inspection_id', 'score', 'grade', 'completed_date', and 'status'.
    • These columns in the viol DataFrame: 'inspection_id', 'violation', 'major_violation', 'violation_text', and 'violation_accela'.
  • Also, let's rename a few columns to make them easier to work with.

💡 Pro-Tip: Using pipe¶

When we manipulate DataFrames, it's best to define individual functions for each step, then use the pipe method to chain them all together.

The pipe DataFrame method takes in a function, which itself takes in a DataFrame and returns a DataFrame.

  • In practice, we would add functions one by one to the top of a notebook, then chain them all together with pipe.
  • For today, we'll keep re-running pipe to show the data cleaning process step by step.
In [ ]:
def subset_rest(rest):
    return rest[['business_id', 'name', 'address', 'zip', 'opened_date']]

rest = (
    pd.read_csv(rest_path)
    .pipe(subset_rest)
)
rest
In [ ]:
# Same as the above – but the above makes it easier to chain more .pipe calls afterwards.
subset_rest(pd.read_csv(rest_path))

Let's use pipe to keep (and rename) the subset of the columns we care about in the other two DataFrames as well.

In [ ]:
def subset_insp(insp):
    return (
        insp[['business_id', 'inspection_id', 'score', 'grade', 'completed_date', 'status']]
        .rename(columns={'completed_date': 'date'})
    )

insp = (
    pd.read_csv(insp_path)
    .pipe(subset_insp)
)
In [ ]:
def subset_viol(viol):
    return (
        viol[['inspection_id', 'violation', 'major_violation', 'violation_accela']]
        .rename(columns={'violation': 'kind',
                         'major_violation': 'is_major',
                         'violation_accela': 'violation'})
    )

viol = (
    pd.read_csv(viol_path)
    .pipe(subset_viol)
)

Combining the restaurant data¶

Let's join all three DataFrames together so that we have all the data in a single DataFrame.

In [ ]:
def merge_all_restaurant_data():
    return (
        rest
        .merge(insp, on='business_id', how='left')
        .merge(viol, on='inspection_id', how='left')
    )

df = merge_all_restaurant_data()
df

Question 🤔 (Answer at dsc80.com/q)

Code: lefts

Why should the function above use two left joins? What would go wrong if we used other kinds of joins?

Data cleaning: Missing values¶

Missing values¶

Next, it's important to check for and handle missing values, as they can have a big effect on your analysis.

In [ ]:
insp[['score', 'grade']]
In [ ]:
# The proportion of values in each column that are missing.
insp.isna().mean()
In [ ]:
# Why are there null values here?
# insp['inspection_id'] and viol['inspection_id'] don't have any null values...
df[df['inspection_id'].isna()]

There are many ways of handling missing values, which we'll cover in an entire lecture next week. But a good first step is to check how many there are!

Data cleaning: Transformations and timestamps¶

Transformations and timestamps¶

From last class:

A transformation results from performing some operation on every element in a sequence, e.g. a Series.

It's often useful to look at ways of transforming your data to make it easier to work with.

  • Type conversions (e.g. changing the string "$2.99" to the number 2.99).

  • Unit conversion (e.g. feet to meters).

  • Extraction (getting 'vermin' out of 'Vermin Violation Recorded on 10/10/2023').
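
Here's a sketch of each kind of transformation, on made-up Series:

In [ ]:
# Sketch: one tiny example of each transformation, using made-up values.
prices = pd.Series(['$2.99', '$10.50'])
as_floats = prices.str.strip('$').astype(float)               # Type conversion.

feet = pd.Series([3, 6, 30])
meters = feet * 0.3048                                        # Unit conversion.

msgs = pd.Series(['Vermin Violation Recorded on 10/10/2023'])
kinds = msgs.str.extract(r'(\w+) Violation')[0].str.lower()   # Extraction.

as_floats, meters, kinds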

Creating timestamps¶

Most commonly, we'll parse dates into pd.Timestamp objects.

In [ ]:
# Look at the dtype!
insp['date']
In [ ]:
# This magical string tells Python what format the date is in.
# For more info: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
date_format = '%Y-%m-%d'
pd.to_datetime(insp['date'], format=date_format)
In [ ]:
# Another advantage of defining functions is that we can reuse this function
# for the 'opened_date' column in `rest` if we wanted to.
def parse_dates(insp, col):
    date_format = '%Y-%m-%d'
    dates = pd.to_datetime(insp[col], format=date_format)
    return insp.assign(**{col: dates})

insp = (
    pd.read_csv(insp_path)
    .pipe(subset_insp)
    .pipe(parse_dates, 'date')
)

# We should also remake df, since it depends on insp.
# Note that the new insp is used to create df!
df = merge_all_restaurant_data()
In [ ]:
# Look at the dtype now!
df['date']

Working with timestamps¶

  • We often want to adjust the granularity of timestamps to see overall trends or seasonality.
  • Use the resample method in pandas (documentation).
    • Think of it like a version of groupby, but for timestamps.
    • For instance, insp.resample('2W', on='date') separates every two weeks of data into a different group.
In [ ]:
insp.resample('2W', on='date')['score'].mean()
In [ ]:
# Where are those numbers coming from?
insp[
    (insp['date'] >= pd.Timestamp('2020-01-05')) &
    (insp['date'] < pd.Timestamp('2020-01-19'))
]['score']
In [ ]:
(insp.resample('2W', on='date')
 .size()
 .plot(title='Number of Inspections Over Time')
)

The .dt accessor¶

Like with Series of strings, pandas has a .dt accessor for properties of timestamps (documentation).

In [ ]:
insp['date']
In [ ]:
insp['date'].dt.day
In [ ]:
insp['date'].dt.dayofweek
In [ ]:
dow_counts = insp['date'].dt.dayofweek.value_counts()
fig = px.bar(dow_counts)
fig.update_xaxes(tickvals=np.arange(7), ticktext=['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])

Data cleaning: Modifying structure¶

Reshaping DataFrames¶

We often reshape the DataFrame's structure to make it more convenient for analysis. For example, we can:

  • Simplify structure by removing columns or taking a set of rows for a particular period of time or geographic area.

    • We already did this!
  • Adjust granularity by aggregating rows together.

    • To do this, use groupby (or resample, if working with timestamps).
  • Reshape structure, most commonly by using the DataFrame melt method to un-pivot a DataFrame.

Using melt¶

  • The melt method is common enough that we'll give it a special mention.
  • We'll often encounter pivot tables (esp. from government data), which we call wide data.
  • The methods we've introduced work better with long-form data, or tidy data.
  • To go from wide to long, melt.

Example usage of melt¶

In [ ]:
wide_example = pd.DataFrame({
    'Year': [2001, 2002],
    'Jan': [10, 130],
    'Feb': [20, 200],
    'Mar': [30, 340]
}).set_index('Year')
wide_example
In [ ]:
wide_example.melt(ignore_index=False)
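
In practice, we usually give the melted columns descriptive names using var_name= and value_name= (the names below are just for illustration):

In [ ]:
# Sketch: name the resulting columns instead of the default 'variable' and 'value'.
wide_example.melt(ignore_index=False, var_name='Month', value_name='Value')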

Exploration¶

Question 🤔 (Answer at dsc80.com/q)

Code: qs

What questions do you want me to try and answer with the data? I'll start with a single pre-prepared question, and then answer student questions until we run out of time.

Example question: Can we rank restaurants by their number of violations? How about separately for each zip code?¶

And why would we want to do that? 🤔
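
Here's one possible sketch for this example question (not necessarily the approach used in lecture): count violation records per restaurant, then rank within each ZIP code.

In [ ]:
# Sketch: rank restaurants by number of recorded violations, within each ZIP code.
# Note: restaurants with zero recorded violations won't appear in this result.
violation_counts = (
    df[df['violation'].notna()]
    .groupby(['zip', 'name'])
    .size()
    .reset_index(name='num_violations')
)
violation_counts['rank_in_zip'] = (
    violation_counts
    .groupby('zip')['num_violations']
    .rank(method='min', ascending=False)
)
violation_counts.sort_values(['zip', 'rank_in_zip'])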

In [ ]:
 

Summary, next time¶

Summary¶

  • Data cleaning is a necessary starting step in data analysis. There are four pillars of data cleaning:
    • Quality checks.
    • Missing values.
    • Transformations and timestamps.
    • Modifying structure.
  • Approach EDA with an open mind, and draw lots of visualizations.

Next time¶

Hypothesis and permutation testing. Some of this will be DSC 10 review, but we'll also push further!