In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_inline.backend_inline import set_matplotlib_formats
from IPython.display import display, IFrame

# Pandas Tutor setup
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}

set_matplotlib_formats("svg")
sns.set_context("poster")
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)
pd.set_option("display.max_rows", 8)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

def show_paradox_slides():
    src = 'https://docs.google.com/presentation/d/e/2PACX-1vSbFSaxaYZ0NcgrgqZLvjhkjX-5MQzAITWAsEFZHnix3j1c0qN8Vd1rogTAQP7F7Nf5r-JWExnGey7h/embed?start=false'
    width = 960
    height = 569
    display(IFrame(src, width, height))

Lecture 4 – Simpson's Paradox, Joining, Transforming¶

DSC 80, Fall 2023¶

📣 Announcements 📣¶

  • Good job going to the right lecture room!
    • Discussion in Center 115.
  • Lab 2 due Monday.
  • Project 1 due next Wed.

Clarification about for loops¶

  • We've said that for loops are bad and that you should avoid them.
    • Then you had to use them a lot for Lab 1 and Project 1. What gives?
  • Using a for loop to iterate through the individual values of a DataFrame is usually slow.
    • Prefer filtering, grouping, and other built-in operations.
    • But for loops are still the best way to iterate over regular Python values (e.g. a dictionary).
  • On assignments, we won't take off points for using for loops unless a question explicitly says otherwise.
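For instance, here's a minimal sketch of the distinction (the data here is hypothetical):

import pandas as pd

prices = {'apple': 1.5, 'banana': 0.5}
for fruit, price in prices.items():   # fine: iterating over a Python dict
    print(fruit, price)

fruits = pd.DataFrame({'price': [1.5, 0.5]}, index=['apple', 'banana'])
total = fruits['price'].sum()         # preferred over adding up prices in a loop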

Review: Distributions¶

In [2]:
penguins = sns.load_dataset('penguins').dropna()

Joint distribution¶

When using aggfunc='count', a pivot table describes the joint distribution of two categorical variables, in terms of counts rather than proportions. Such a table is also called a contingency table.

In [3]:
counts = penguins.pivot_table(
    index='species', 
    columns='sex', 
    values='body_mass_g', 
    aggfunc='count', 
    fill_value=0
)
counts
Out[3]:
sex Female Male
species
Adelie 73 73
Chinstrap 34 34
Gentoo 58 61
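To turn these counts into proportions (i.e. the joint distribution itself), divide by the grand total. A quick sketch:

joint = counts / counts.to_numpy().sum()   # each entry is now P(species, sex)
joint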

Discussion question:¶

Suppose we draw one penguin from penguins at random. Compute the following probabilities:

  1. P(Adelie | Female)
  2. P(Female | Adelie)
  3. P(Male)
  4. P(Chinstrap ⋂ Male)
In [4]:
counts
Out[4]:
sex Female Male
species
Adelie 73 73
Chinstrap 34 34
Gentoo 58 61
In [5]:
# Compute your answers here
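One possible approach, reading the answers directly off of counts (a sketch; the variable names are ours):

total = counts.to_numpy().sum()                                          # 333 penguins overall
p_adelie_given_female = counts.loc['Adelie', 'Female'] / counts['Female'].sum()      # 73 / 165
p_female_given_adelie = counts.loc['Adelie', 'Female'] / counts.loc['Adelie'].sum()  # 73 / 146
p_male = counts['Male'].sum() / total                                    # 168 / 333
p_chinstrap_and_male = counts.loc['Chinstrap', 'Male'] / total           # 34 / 333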

Simpson's paradox¶


Example: Grades¶

  • Two students, Lisa and Bart, just finished freshman year. They both took a different number of classes in Fall, Winter, and Spring.

  • Each quarter, Lisa had a higher GPA than Bart.

  • But Bart has a higher overall GPA.

  • How is this possible? 🤔

Run this cell to create DataFrames that contain each student's grades.

In [6]:
lisa = pd.DataFrame([[20, 46], [18, 54], [5, 20]],
    columns=['Units', 'Grade Points Earned'], 
    index=['Fall', 'Winter', 'Spring'],
)

bart = pd.DataFrame([[5, 10], [5, 13.5], [22, 81.4]],
    columns=['Units', 'Grade Points Earned'], 
    index=['Fall', 'Winter', 'Spring'],
)

Quarter-specific vs. overall GPAs¶

Note: The number of "grade points" earned for a course is

$$\text{number of units} \cdot \text{grade (out of 4)}$$

For instance, an A- (a grade of 3.7) in a 4-unit course earns $4 \cdot 3.7 = 14.8$ grade points.

In [7]:
lisa
Out[7]:
Units Grade Points Earned
Fall 20 46
Winter 18 54
Spring 5 20
In [8]:
bart
Out[8]:
Units Grade Points Earned
Fall 5 10.0
Winter 5 13.5
Spring 22 81.4

Lisa had a higher GPA in all three quarters:

In [9]:
quarterly_gpas = pd.DataFrame({
    "Lisa's Quarter GPA": lisa['Grade Points Earned'] / lisa['Units'],
    "Bart's Quarter GPA": bart['Grade Points Earned'] / bart['Units'],
})

quarterly_gpas
Out[9]:
Lisa's Quarter GPA Bart's Quarter GPA
Fall 2.3 2.0
Winter 3.0 2.7
Spring 4.0 3.7

But Lisa's overall GPA was less than Bart's overall GPA:

In [10]:
tot = lisa.sum()
tot['Grade Points Earned'] / tot['Units']
Out[10]:
2.7906976744186047
In [11]:
tot = bart.sum()
tot['Grade Points Earned'] / tot['Units']
Out[11]:
3.278125

What happened?¶

In [12]:
(quarterly_gpas
 .assign(Lisa_units=lisa['Units'],
         Bart_units=bart['Units']) 
 .iloc[:, [0, 2, 1, 3]]
)
Out[12]:
Lisa's Quarter GPA Lisa_units Bart's Quarter GPA Bart_units
Fall 2.3 20 2.0 5
Winter 3.0 18 2.7 5
Spring 4.0 5 3.7 22
  • When Lisa and Bart both performed poorly, Lisa took more units than Bart. This brought down 📉 Lisa's overall average.

  • When Lisa and Bart both performed well, Bart took more units than Lisa. This brought up 📈 Bart's overall average.
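Put differently, each overall GPA is a weighted average of the quarterly GPAs, with units as the weights. A quick sketch using the DataFrames above:

lisa_gpa = np.average(lisa['Grade Points Earned'] / lisa['Units'], weights=lisa['Units'])
bart_gpa = np.average(bart['Grade Points Earned'] / bart['Units'], weights=bart['Units'])
lisa_gpa, bart_gpa   # (2.79..., 3.28...), matching the totals computed above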

Simpson's paradox¶

  • Simpson's paradox occurs when grouped data and ungrouped data show opposing trends.

    • It is named after Edward H. Simpson, not Lisa or Bart Simpson.
  • It is purely arithmetic – it is a consequence of weighted averages.

  • It often happens because there is a hidden factor (i.e. a confounder) within the data that influences results.

  • Question: What is the "correct" way to summarize your data? What if you had to act on these results?

Example: How Berkeley was almost sued for gender discrimination (1973)¶

What do you notice?

In [13]:
show_paradox_slides()

What happened?¶

  • The overall acceptance rate for women (30%) was lower than it was for men (45%).

  • However, most departments (A, B, D, F) had a higher acceptance rate for women.

  • Department A had a 62% acceptance rate for men and an 82% acceptance rate for women!

    • 31% of men applied to Department A.
    • 6% of women applied to Department A.
  • Department F had a 6% acceptance rate for men and a 7% acceptance rate for women!

    • 14% of men applied to Department F.
    • 19% of women applied to Department F.
  • Conclusion: Women tended to apply to departments with lower acceptance rates; the data don't support the hypothesis that there was major gender discrimination against women.

Caution!¶

This doesn't mean that admissions are free from gender discrimination!

From Moss-Racusin et al., 2012, PNAS (cited 2600+ times):

In a randomized double-blind study (n = 127), science faculty from research-intensive universities rated the application materials of a student—who was randomly assigned either a male or female name—for a laboratory manager position. Faculty participants rated the male applicant as significantly more competent and hireable than the (identical) female applicant. These participants also selected a higher starting salary and offered more career mentoring to the male applicant. The gender of the faculty participants did not affect responses, such that female and male faculty were equally likely to exhibit bias against the female student.

But then...¶

From Williams and Ceci, 2015, PNAS:

Here we report five hiring experiments in which faculty evaluated hypothetical female and male applicants, using systematically varied profiles disguising identical scholarship, for assistant professorships in biology, engineering, economics, and psychology. Contrary to prevailing assumptions, men and women faculty members from all four fields preferred female applicants 2:1 over identically qualified males with matching lifestyles (single, married, divorced), with the exception of male economists, who showed no gender preference.

Do these conflict?¶

Not necessarily. One explanation, from Williams and Ceci:

Instead, past studies have used ratings of students’ hirability for a range of posts that do not include tenure-track jobs, such as managing laboratories or performing math assignments for a company. However, hiring tenure-track faculty differs from hiring lower-level staff: it entails selecting among highly accomplished candidates, all of whom have completed Ph.D.s and amassed publications and strong letters of support. Hiring bias may occur when applicants’ records are ambiguous, as was true in studies of hiring bias for lower-level staff posts, but such bias may not occur when records are clearly strong, as is the case with tenure-track hiring.

Do these conflict?¶

From Witteman et al., 2019, in The Lancet:

Thus, evidence of scientists favouring women comes exclusively from hypothetical scenarios, whereas evidence of scientists favouring men comes from hypothetical scenarios and real behaviour. This might reflect academics' growing awareness of the social desirability of achieving gender balance, while real academic behaviour might not yet put such ideals into action.

Example: Restaurant reviews and phone types¶

  • You are deciding whether to eat at Dirty Birds or The Loft.

  • Suppose Yelp shows ratings aggregated by phone type (Android vs. iPhone).

Phone Type   Stars for Dirty Birds   Stars for The Loft
Android              4.24                   4.0
iPhone               2.99                   2.79
All                  3.32                   3.37
  • Question: Should you choose Dirty Birds or The Loft?
  • Answer: The type of phone you use likely has nothing to do with your taste in food – pick the restaurant that is rated higher overall.
  • Remember, Simpson's paradox is merely a property of weighted averages!
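To see that the reversal is just weighted-average arithmetic, here's a sketch with hypothetical review counts chosen to reproduce the table above (not real Yelp data!):

db_stars, db_reviews = [4.24, 2.99], [264, 736]        # Dirty Birds: Android, iPhone
loft_stars, loft_reviews = [4.00, 2.79], [479, 521]    # The Loft: Android, iPhone

np.average(db_stars, weights=db_reviews), np.average(loft_stars, weights=loft_reviews)
# (roughly 3.32 and 3.37): The Loft wins overall, even though it loses within each phone type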

Rule of thumb 👍¶

  • Let $(X, Y)$ be a pair of variables of interest. Simpson's paradox happens when the association between $X$ and $Y$ reverses when we condition on $Z$, a third variable.
  • If $Z$ has a causal connection to both $X$ and $Y$, we should condition on $Z$ and use the disaggregated data.
  • If not, we shouldn't condition on $Z$; we should use the aggregated data instead.
  • Berkeley gender discrimination: $X$ is gender, $Y$ is acceptance rate. $Z$ is the department.
    • $Z$ has a plausible causal effect on both $X$ and $Y$, so we should condition on $Z$.
  • Yelp ratings: $X$ is the restaurant, $Y$ is the average stars. $Z$ is the phone type.
    • $Z$ doesn't plausibly cause $X$ to change, so we should not condition on $Z$.

Takeaways¶

Be skeptical of...

  • Aggregate statistics.
  • People misusing statistics to "prove" that discrimination doesn't exist.
  • Drawing conclusions from individual publications ($p$-hacking, publication bias, narrow focus, etc.).
  • Everything!

We need to apply domain knowledge and human judgement calls to decide what to do when Simpson's paradox is present.

Really?¶

To handle Simpson's paradox with rigor, we need some ideas from causal inference which we don't have time to cover in DSC 80. This video has a good example of how to approach Simpson's paradox using a minimal amount of causal inference, if you're curious (not required for DSC 80).

In [14]:
IFrame('https://www.youtube-nocookie.com/embed/zeuW1Z2EtLs?si=l2Dl7P-5RCq3ODpo',
       width=560, height=315)
Out[14]:

Further reading¶

  • Gender Bias in Admission Statistics?
    • Contains a great visualization, but seems to be paywalled now.
  • What is Simpson's Paradox?
  • Understanding Simpson's Paradox: Requires more statistics background, gives rigorous understanding of when to use aggregated vs. unaggregated data.

Merging¶

Motivating Example: Name categories¶

A New York Times article about baby names (https://archive.is/NpORG) claims that certain categories of names are becoming more popular. For example:

  1. Forbidden names like Lucifer, Lilith, Kali, and Danger.
  2. Evangelical names like Amen, Savior, Canaan, and Creed.
  3. Mythological names
  4. etc.

The article also claims that baby boomer names are becoming less popular.

Let's see if we can verify these claims using data!

In [15]:
baby = pd.read_csv('data/baby.csv')
baby
Out[15]:
Name Sex Count Year
0 Liam M 20456 2022
1 Noah M 18621 2022
2 Olivia F 16573 2022
3 Oliver M 15076 2022
... ... ... ... ...
2085154 Worthy M 5 1880
2085155 Wright M 5 1880
2085156 York M 5 1880
2085157 Zachariah M 5 1880

2085158 rows × 4 columns

In [17]:
nyt = pd.read_csv('data/nyt_names.csv')
nyt
Out[17]:
nyt_name category
0 Lucifer forbidden
1 Lilith forbidden
2 Danger forbidden
3 Amen evangelical
... ... ...
19 Amethyst mineral
20 Venus celestial
21 Celestia celestial
22 Skye celestial

23 rows × 2 columns

Joining¶

  • We want to link rows together whenever the names match up.
  • This is a join (SQL term), i.e. a merge (pandas term).
  • A join is appropriate when we have two sources of information about the same individuals that are linked by one or more common columns.
  • The common column(s) are called the join key.

Let's demonstrate on a small subset of baby and nyt.

In [18]:
nyt_small = nyt.iloc[[11, 12, 14]].reset_index(drop=True)

names_to_keep = ['Julius', 'Karen', 'Noah']
baby_small = (baby
 .query("Year == 2020 and Name in @names_to_keep")
 .reset_index(drop=True)
)
In [19]:
nyt_small
Out[19]:
nyt_name category
0 Karen boomer
1 Julius mythology
2 Freya mythology
In [20]:
baby_small
Out[20]:
Name Sex Count Year
0 Noah M 18407 2020
1 Julius M 966 2020
2 Karen F 330 2020
3 Noah F 306 2020
4 Karen M 6 2020

Let's do a join!¶

In [22]:
%%pt
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name')

The merge method¶

  • The merge DataFrame method joins two tables by columns or indexes.

    • "Merge" is just the pandas word for "join".
  • When using the merge method, the DataFrame before .merge is the "left" DataFrame, and the DataFrame passed into .merge is the "right" DataFrame.

    • In baby_small.merge(nyt_small), baby_small is considered the "left" DataFrame and nyt_small the "right" DataFrame; the columns from the left DataFrame appear to the left of the columns from the right DataFrame.
  • By default:

    • If join keys are not specified, all shared columns between the two DataFrames are used.
    • The "type" of join performed is an inner join.
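For example, here's a sketch of the default behavior with two hypothetical DataFrames that share a column name:

left = pd.DataFrame({'Name': ['Karen', 'Noah'], 'Count': [330, 18407]})
right = pd.DataFrame({'Name': ['Karen', 'Julius'], 'category': ['boomer', 'mythology']})
left.merge(right)   # joins on the shared column 'Name'; the inner join keeps only 'Karen'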

Join types: inner joins¶

  • Note that 'Noah' and 'Freya' do not appear in the merged DataFrame.
  • This is because there is:
    • no 'Noah' in the right DataFrame, and
    • no 'Freya' in the left DataFrame.
  • The default type of join that merge performs is an inner join, which keeps the intersection of the join keys.

Different join types¶

We can change the type of join performed by changing the how argument in merge. Let's experiment!

In [26]:
%%pt
# Note the NaNs!
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='left')
In [27]:
%%pt
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='right')
In [28]:
%%pt
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='outer')

Different join types handle mismatches differently¶

There are four types of joins.

  • Inner: keep only matching keys (intersection).
  • Outer: keep all keys in both DataFrames (union).
  • Left: keep all keys in the left DataFrame, whether or not they are in the right DataFrame.
  • Right: keep all keys in the right DataFrame, whether or not they are in the left DataFrame.
    • Note that a.merge(b, how='left') contains the same information as b.merge(a, how='right'), just in a different order.

Notes on the merge method¶

  • .merge is flexible: you can merge using a combination of columns, or using the index of the DataFrame.
  • If the two DataFrames share column names that aren't join keys, pandas adds the suffixes _x and _y to the duplicated column names to avoid ambiguity (customize them with the suffixes argument).
  • There is, in fact, a .join method, but it's a wrapper around .merge with fewer options.
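A quick sketch of the suffixes behavior, using two hypothetical DataFrames with an overlapping non-key column:

a = pd.DataFrame({'key': [1, 2], 'val': [10, 20]})
b = pd.DataFrame({'key': [1, 2], 'val': [30, 40]})
a.merge(b, on='key')                            # columns: key, val_x, val_y
a.merge(b, on='key', suffixes=('_a', '_b'))     # columns: key, val_a, val_b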

Many-to-one & many-to-many joins¶

One-to-one joins¶

  • So far in this lecture, the joins we have worked with are called one-to-one joins.
  • Neither the left DataFrame nor the right DataFrame contained any duplicates in the join key.
  • What if there are duplicated join keys, in one or both of the DataFrames we are merging?
In [35]:
# Run this cell to set up the next example.

profs = pd.DataFrame(
[['Sam', 'UCB', 5],
 ['Sam', 'UCSD', 5],
 ['Janine', 'UCSD', 8],
 ['Marina', 'UIC', 7],
 ['Justin', 'OSU', 5],
 ['Soohyun', 'UCSD', 2],
 ['Suraj', 'UCB', 2]],
    columns=['Name', 'School', 'Years']
)

schools = pd.DataFrame({
    'Abr': ['UCSD', 'UCLA', 'UCB', 'UIC'],
    'Full': ['University of California San Diego', 'University of California, Los Angeles', 'University of California, Berkeley', 'University of Illinois Chicago']
})

programs = pd.DataFrame({
    'uni': ['UCSD', 'UCSD', 'UCSD', 'UCB', 'OSU', 'OSU'],
    'dept': ['Math', 'HDSI', 'COGS', 'CS', 'Math', 'CS'],
    'grad_students': [205, 54, 281, 439, 304, 193]
})

Many-to-one joins¶

  • Many-to-one joins are joins where one of the DataFrames contains duplicate values in the join key.
  • The resulting DataFrame will preserve those duplicate entries as appropriate.
In [36]:
profs
Out[36]:
Name School Years
0 Sam UCB 5
1 Sam UCSD 5
2 Janine UCSD 8
3 Marina UIC 7
4 Justin OSU 5
5 Soohyun UCSD 2
6 Suraj UCB 2
In [37]:
schools
Out[37]:
Abr Full
0 UCSD University of California San Diego
1 UCLA University of California, Los Angeles
2 UCB University of California, Berkeley
3 UIC University of Illinois Chicago

Note that when merging profs and schools, the information from schools is duplicated.

  • 'University of California San Diego' appears three times.
  • 'University of California, Berkeley' appears twice.
In [38]:
# Why is a left merge most appropriate here?
profs.merge(schools, left_on='School', right_on='Abr', how='left')
Out[38]:
Name School Years Abr Full
0 Sam UCB 5 UCB University of California, Berkeley
1 Sam UCSD 5 UCSD University of California San Diego
2 Janine UCSD 8 UCSD University of California San Diego
3 Marina UIC 7 UIC University of Illinois Chicago
4 Justin OSU 5 NaN NaN
5 Soohyun UCSD 2 UCSD University of California San Diego
6 Suraj UCB 2 UCB University of California, Berkeley

Many-to-many joins¶

Many-to-many joins are joins where both DataFrames have duplicate values in the join key.

In [39]:
profs
Out[39]:
Name School Years
0 Sam UCB 5
1 Sam UCSD 5
2 Janine UCSD 8
3 Marina UIC 7
4 Justin OSU 5
5 Soohyun UCSD 2
6 Suraj UCB 2
In [40]:
programs
Out[40]:
uni dept grad_students
0 UCSD Math 205
1 UCSD HDSI 54
2 UCSD COGS 281
3 UCB CS 439
4 OSU Math 304
5 OSU CS 193

Before running the following cell, try predicting the number of rows in the output.

In [41]:
%%pt

profs.merge(programs, left_on='School', right_on='uni')
  • merge stitched together every UCSD row in profs with every UCSD row in programs.
  • Since there were 3 UCSD rows in profs and 3 in programs, there are $3 \cdot 3 = 9$ UCSD rows in the output. The same applies for all other schools.
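We can verify that count without Pandas Tutor (a sketch):

merged = profs.merge(programs, left_on='School', right_on='uni')
len(merged)   # 13 = 3 * 3 (UCSD) + 2 * 1 (UCB) + 1 * 2 (OSU); Marina (UIC) is dropped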

Returning back to our original question¶

Let's find the popularity of baby name categories over time.

In [46]:
cate_counts = (
    baby
    .merge(nyt, left_on='Name', right_on='nyt_name')
    .groupby(['category', 'Year'])
    ['Count']
    .sum()
    .reset_index()
)
cate_counts
Out[46]:
category Year Count
0 boomer 1880 292
1 boomer 1881 298
2 boomer 1882 326
3 boomer 1883 322
... ... ... ...
658 mythology 2019 3330
659 mythology 2020 3516
660 mythology 2021 3895
661 mythology 2022 4049

662 rows × 3 columns

In [53]:
# We'll talk about plotting code soon!
import plotly.express as px
fig = px.line(cate_counts, x='Year', y='Count',
              facet_col='category', facet_col_wrap=3,
              facet_row_spacing=0.15,
              width=600, height=400)
fig.update_yaxes(matches=None, showticklabels=False)

Transforming¶

In [54]:
baby
Out[54]:
Name Sex Count Year
0 Liam M 20456 2022
1 Noah M 18621 2022
2 Olivia F 16573 2022
3 Oliver M 15076 2022
... ... ... ... ...
2085154 Worthy M 5 1880
2085155 Wright M 5 1880
2085156 York M 5 1880
2085157 Zachariah M 5 1880

2085158 rows × 4 columns

Transforming values¶

The most flexible method is apply: it takes in a function that receives a single value as input and returns a single value.

In [58]:
def number_of_vowels(string):
    return sum(c in 'aeiou' for c in string.lower())

baby['Name'].apply(number_of_vowels)
Out[58]:
0          2
1          2
2          4
3          3
          ..
2085154    1
2085155    1
2085156    1
2085157    4
Name: Name, Length: 2085158, dtype: int64
In [60]:
# Builtin functions work
baby['Name'].apply(len)
Out[60]:
0          4
1          4
2          6
3          6
          ..
2085154    6
2085155    6
2085156    4
2085157    9
Name: Name, Length: 2085158, dtype: int64

Looking at "L" names¶

In [68]:
def first_letter(string):
    return string[0]

(baby
 .assign(first=baby['Name'].apply(first_letter))
 .query('first == "L"')
 .groupby('Year')
 ['Count']
 .sum()
 .plot()
)
Out[68]:
<AxesSubplot:xlabel='Year'>

The Price of .apply¶

Actually, .apply runs really slowly!

In [72]:
%%timeit
baby['Name'].apply(first_letter)
151 ms ± 622 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [73]:
%%timeit
res = []
for name in baby['Name']:
    res.append(name[0])
176 ms ± 1.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Internally, .apply actually just runs a for loop.

  • So for arithmetic operations, definitely prefer working on Series objects directly!

The Price of .apply¶

In [84]:
%%timeit
baby['Year'] // 10 * 10
4.87 ms ± 38.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [85]:
%%timeit
baby['Year'].apply(lambda y: y // 10 * 10)
453 ms ± 3.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Nearly 100x slower!

The .str accessor¶

For string operations, pandas provides a convenient str accessor.

In [87]:
# Equivalent to:
# baby['Name'].apply(first_letter)
baby['Name'].str[0]
Out[87]:
0          L
1          N
2          O
3          O
          ..
2085154    W
2085155    W
2085156    Y
2085157    Z
Name: Name, Length: 2085158, dtype: object
In [92]:
baby['Name'].str.len()
Out[92]:
0          4
1          4
2          6
3          6
          ..
2085154    6
2085155    6
2085156    4
2085157    9
Name: Name, Length: 2085158, dtype: int64
  • Very convenient, but runs even slower than .apply!
  • Even though it's slow, we use .str anyway because it makes life easier.
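A few other .str methods you'll use often (a sketch; all of these are built into pandas):

baby['Name'].str.lower()             # lowercase every name
baby['Name'].str.contains('ell')     # boolean Series: does the name contain 'ell'?
baby['Name'].str.startswith('L')     # boolean Series: handy for the "L" names example above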

Other Data Representations¶

  • In DSC 80, we work with dataframes in pandas.
    • When we say pandas DataFrame, we're talking about the pandas API for its DataFrame objects.
    • When we say "dataframe", we're referring to the general concept: data represented as rows and columns, with labels for both.
  • Of course, there are many other ways to work with data tables! R has data frames, SQL databases have relations, and there are spreadsheets and even matrices from linear algebra.
    • When you learn SQL in DSC 100, you'll find many similarities (e.g. slicing columns, filtering rows, grouping, joining, etc.).
    • Relational algebra captures the common data operations shared by many data table systems.
  • Why use dataframes over something else?

Dataframes vs. Spreadsheets¶

  • Dataframes give us data lineage: the code records how the data were changed. Not so in spreadsheets!
  • Using a general-purpose programming language lets us handle much larger datasets, and we can use distributed computing systems to handle massive ones.

Dataframes vs. Matrices¶

$$\mathbf{X} = \begin{bmatrix} 1 & 0 \\ 0 & 4 \\ 0 & 0 \end{bmatrix}$$
  • Matrices are mathematical objects. They can only hold numbers, but they have many useful properties (see the entire branch of linear algebra).
  • Often, we process data from a dataframe into matrix format for machine learning models. We'll see this process later in DSC 80.
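A minimal sketch of that conversion, using the penguins DataFrame loaded earlier:

X = penguins[['bill_length_mm', 'body_mass_g']].to_numpy()
X.shape   # (333, 2): one row per penguin, ready for a model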

Dataframes vs. Relations¶

  • Relations are the data representation used by relational database systems (e.g. MySQL, PostgreSQL).
  • You'll learn all about these in DSC 100.
  • Database systems are much better than dataframes at storing many data tables and at handling concurrency (many people reading and writing data at the same time).
  • A common workflow: load a subset of the data from a database system into pandas, then make plots.
  • Or: load and clean data in pandas, then store it in a database system for others to use.
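A minimal sketch of the first workflow, assuming a hypothetical SQLite file names.db containing a table called names:

import sqlite3

conn = sqlite3.connect('names.db')                                    # hypothetical database file
recent = pd.read_sql('SELECT * FROM names WHERE Year >= 2000', conn)  # hypothetical table
conn.close()
recent.head()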

Takeaways¶

  • Neither CS nor Statistics can tell us how to automatically resolve Simpson's paradox! Domain knowledge is important.
  • We've covered the main dataframe manipulations: subsetting, aggregating, joining, and transforming.
  • Next time: applying what we know to real-world, messy data!