from dsc80_utils import *

def show_paradox_slides():
    """Display the embedded Google Slides deck in a 960x569 inline frame."""
    display(
        IFrame(
            'https://docs.google.com/presentation/d/e/2PACX-1vSbFSaxaYZ0NcgrgqZLvjhkjX-5MQzAITWAsEFZHnix3j1c0qN8Vd1rogTAQP7F7Nf5r-JWExnGey7h/embed?start=false&rm=minimal',
            960,  # width
            569,  # height
        )
    )

# Pandas Tutor setup
# (Re)load the pandas_tutor IPython extension and configure its display options.
# NOTE(review): presumably this extension provides the `%%pt` cell magic used
# in later cells — confirm.
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}
import warnings
# Ignore FutureWarning messages from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)

import seaborn as sns
# Load the seaborn 'penguins' dataset, then drop every row with a missing value.
penguins = sns.load_dataset('penguins')
penguins = penguins.dropna()
penguins

# Mean bill length and body mass for each (species, island) pair. Grouping on
# two columns gives the result a two-level row index.
species_and_island = penguins.groupby(by=['species', 'island'])[
    ['bill_length_mm', 'body_mass_g']
].mean()
species_and_island

species_and_island

# Selecting a column works as usual and keeps the two-level index.
species_and_island['body_mass_g']

# A single label selects on the outer index level...
species_and_island.loc['Adelie']

# ...and a tuple supplies one label per level.
species_and_island.loc[('Adelie', 'Torgersen')]

# reset_index turns the index levels back into regular columns.
species_and_island.reset_index()

# Passing as_index=False to groupby keeps the group keys as columns directly.
penguins.groupby(by=['species', 'island'], as_index=False)[
    ['bill_length_mm', 'body_mass_g']
].mean()

# Read data/baby.csv into the `baby` DataFrame and display it.
baby_path = Path('data') / 'baby.csv'
baby = pd.read_csv(baby_path)
baby

# Your code goes here.

# General call shape of `pivot_table`.
# NOTE(review): `index_col`, `columns_col`, `values_col`, and `func` are never
# defined in the visible code, and `df` is not defined before this point, so
# this reads as a syntax template rather than a runnable cell — confirm before
# executing.
df.pivot_table(index=index_col,
               columns=columns_col,
               values=values_col,
               aggfunc=func)

# Count the non-missing bill_length_mm values for every species (rows) and
# island (columns) combination.
penguins.pivot_table(
    values='bill_length_mm',  # Choice of column here doesn't actually matter!
    index='species', columns='island',
    aggfunc='count',
)

# Same table, with missing combinations filled in as 0.
penguins.pivot_table(
    values='bill_length_mm',
    index='species', columns='island',
    aggfunc='count', fill_value=0,
)

# Look at the similarity to the snippet above!
penguins.groupby(['species', 'island'])[['bill_length_mm']].count()

# Reload the penguins data, again dropping rows with missing values.
penguins = sns.load_dataset('penguins')
penguins = penguins.dropna()
penguins

# Count of non-missing body_mass_g values per species (rows) and sex (columns).
counts = penguins.pivot_table(
    values='body_mass_g',
    index='species', columns='sex',
    aggfunc='count', fill_value=0,
)
counts

# Divide every cell by the grand total of all cells.
joint = counts.div(counts.sum().sum())
joint

joint

# Recall, joint.sum(axis=0) sums across the rows,
# which computes the sum of the **columns**.
joint.sum(axis=0)

# axis=1 sums across the columns instead, giving one total per row.
joint.sum(axis=1)

counts

counts

# Column totals of the raw counts.
counts.sum(axis=0)

# Scale each column by its own column total.
counts.div(counts.sum(axis=0))

# Your code goes here.

# Units taken and grade points earned by Lisa in each quarter.
lisa = pd.DataFrame(
    {
        'Units': [20, 18, 5],
        'Grade Points Earned': [46, 54, 20],
    },
    index=['Fall', 'Winter', 'Spring'],
)
# Naming the column index makes "Lisa" show up in the top left of the DataFrame.
lisa.columns.name = 'Lisa'

# The same two measurements for Bart.
bart = pd.DataFrame(
    {
        'Units': [5, 5, 22],
        'Grade Points Earned': [10, 13.5, 81.4],
    },
    index=['Fall', 'Winter', 'Spring'],
)
bart.columns.name = 'Bart'

# Helper function to show lisa and bart side-by-side to save screen space
dfs_side_by_side(lisa, bart)

# Per-quarter GPA for each student: grade points earned divided by units.
quarterly_gpas = pd.DataFrame({
    "Lisa's Quarter GPA": lisa['Grade Points Earned'].div(lisa['Units']),
    "Bart's Quarter GPA": bart['Grade Points Earned'].div(bart['Units']),
})

quarterly_gpas

dfs_side_by_side(lisa, bart)

# Your code goes here.

# Attach each student's units and order the columns so that every GPA column
# is immediately followed by that student's units.
quarterly_gpas.assign(
    Lisa_Units=lisa['Units'],
    Bart_Units=bart['Units'],
)[["Lisa's Quarter GPA", 'Lisa_Units', "Bart's Quarter GPA", 'Bart_Units']]

# Show the slide deck, then an embedded video.
show_paradox_slides()

IFrame(
    'https://www.youtube-nocookie.com/embed/zeuW1Z2EtLs?si=l2Dl7P-5RCq3ODpo',
    width=800,
    height=450,
)

# Read data/baby.csv into `baby`.
baby_path = Path('data', 'baby.csv')
baby = pd.read_csv(baby_path)
baby

# Read data/nyt_names.csv into `nyt`.
nyt_path = Path('data', 'nyt_names.csv')
nyt = pd.read_csv(nyt_path)
nyt

# Three rows of `nyt`, picked by position and renumbered from 0.
nyt_small = nyt.iloc[[11, 12, 14]].reset_index(drop=True)

# Rows of `baby` from 2020 whose Name is one of the three names below,
# renumbered from 0.
names_to_keep = ['Julius', 'Karen', 'Noah']
baby_small = baby.query(
    "Year == 2020 and Name in @names_to_keep"
).reset_index(drop=True)

dfs_side_by_side(baby_small, nyt_small)

%%pt
# No `how` is given, so this is merge's default inner join: only rows where
# baby_small['Name'] equals nyt_small['nyt_name'] appear in the result.
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name')

%%pt
# Same merge as the previous cell, repeated.
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name')

%%pt
# Note the NaNs!
# how='left' keeps every row of baby_small; rows with no match in nyt_small
# get NaN in the columns that come from nyt_small.
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='left')

%%pt
# how='right' keeps every row of nyt_small instead.
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='right')

%%pt
# how='outer' keeps the rows of both DataFrames, matched or not.
baby_small.merge(nyt_small, left_on='Name', right_on='nyt_name', how='outer')

df1 = pd.DataFrame({'a': [1, 2, 3]}, index=['hello', 'dsc80', 'students'])
df2 = pd.DataFrame({'b': [10, 20, 30]}, index=['dsc80', 'is', 'awesome'])
dfs_side_by_side(df1, df2)

# Adding two Series aligns them on their index labels first; a label that is
# present in only one of the two produces NaN in the result.
df1['a'] + df2['b']

# Run this cell to set up the next example.

# One row per (professor, school) pairing; a name may appear more than once.
profs = pd.DataFrame(
    data=[
        ('Sam', 'UCB', 5),
        ('Sam', 'UCSD', 5),
        ('Janine', 'UCSD', 8),
        ('Marina', 'UIC', 7),
        ('Justin', 'OSU', 5),
        ('Soohyun', 'UCSD', 2),
        ('Suraj', 'UCB', 2),
    ],
    columns=['Name', 'School', 'Years'],
)

# One row per school: abbreviation and full name.
schools = pd.DataFrame(
    data=[
        ('UCSD', 'University of California San Diego'),
        ('UCLA', 'University of California, Los Angeles'),
        ('UCB', 'University of California, Berkeley'),
        ('UIC', 'University of Illinois Chicago'),
    ],
    columns=['Abr', 'Full'],
)

# One row per (university, department); a university may appear several times.
programs = pd.DataFrame(
    data=[
        ('UCSD', 'Math', 205),
        ('UCSD', 'HDSI', 54),
        ('UCSD', 'COGS', 281),
        ('UCB', 'CS', 439),
        ('OSU', 'Math', 304),
        ('OSU', 'CS', 193),
    ],
    columns=['uni', 'dept', 'grad_students'],
)

dfs_side_by_side(profs, schools)

%%pt
# Left join of profs onto schools by school abbreviation. Every profs row is
# kept; 'OSU' does not appear in schools['Abr'], so that row gets NaN in the
# columns that come from schools.
profs.merge(schools, left_on='School', right_on='Abr', how='left')

dfs_side_by_side(profs, programs)

%%pt
# Inner join (the default). Values repeat in both profs['School'] and
# programs['uni'], so one profs row can pair with several programs rows.
profs.merge(programs, left_on='School', right_on='uni')

# Same merge, stored so that its number of rows can be checked.
df = profs.merge(programs, left_on='School', right_on='uni')
# NOTE(review): `____` is a fill-in-the-blank placeholder and is not defined
# anywhere in the visible code; this line raises NameError until it is
# replaced with an actual expression.
df.shape[0] == (____).sum()

dfs_side_by_side(profs, programs)

# Your code goes here.

# Match each row of `baby` to its row in `nyt` by name, then total Count for
# every (category, Year) pair. as_index=False keeps the two group keys as
# regular columns.
cate_counts = (
    baby
    .merge(nyt, left_on='Name', right_on='nyt_name')
    .groupby(['category', 'Year'], as_index=False)['Count']
    .sum()
)
cate_counts

# We'll talk about plotting code soon!
import plotly.express as px
# One line chart of Count over Year per category, laid out three panels per row.
fig = px.line(
    cate_counts,
    x='Year',
    y='Count',
    facet_col='category',
    facet_col_wrap=3,
    facet_row_spacing=0.15,
    width=600,
    height=400,
)
# Give every panel its own y-axis range and hide the y tick labels.
fig.update_yaxes(matches=None, showticklabels=False)

%%timeit
# Time calling number_of_vowels on every value of baby['Name'] via Series.apply.
# NOTE(review): `number_of_vowels` is not defined in the visible code —
# presumably it comes from an earlier cell or from dsc80_utils; confirm.
baby['Name'].apply(number_of_vowels)

%%timeit
# Time the same per-name calls written as an explicit Python loop that
# collects the results in a list.
res = []
for name in baby['Name']:
    res.append(number_of_vowels(name))

Sex	F	M
Year
2018	1698373	1813377
2019	1675139	1790682
2020	1612393	1721588
2021	1635800	1743913
2022	1628730	1733166

sex	Female	Male
species
Adelie	73	73
Chinstrap	34	34
Gentoo	58	61

Phone Type	Stars for Dirty Birds	Stars for The Loft
Android	4.24	4.0
iPhone	2.99	2.79
All	3.32	3.37

Lecture 4 – Simpson's Paradox, Joining, and Transforming¶

DSC 80, Winter 2025¶

Announcements 📣¶

Agenda¶

Grouping with multiple columns¶

Grouping and indexes¶

Pivot tables using the pivot_table method¶

Pivot tables: an extension of grouping¶

pivot_table¶

Distributions¶

Example: Palmer Penguins¶

Joint distribution¶

Marginal probabilities¶

Conditional probabilities¶

Conditional probabilities¶

Question 🤔 (Answer at dsc80.com/q)

Simpson's paradox¶

Example: Grades¶

Quarter-specific vs. overall GPAs¶

Question 🤔 (Answer at dsc80.com/q)

What happened?¶

Simpson's paradox¶

Example: How Berkeley was almost sued for gender discrimination (1973)¶

What happened?¶

Example: Restaurant reviews and phone types¶

Rule of thumb 👍¶

Takeaways¶

Really?¶

Further reading¶

Merging¶

Example: Name categories¶

Loading in the data¶

Merging¶

Example merge¶

The merge method¶

Join types: inner joins¶

Different join types¶

Different join types handle mismatches differently¶

Notes on the merge method¶

Lots of pandas operations do an implicit outer join!¶

Many-to-one & many-to-many joins¶

One-to-one joins¶

Many-to-one joins¶

Many-to-many joins¶

Question 🤔 (Answer at dsc80.com/q)

Returning to our original question¶

Questions? 🤔

Other data representations¶

Representations of tabular data¶

DataFrames vs. spreadsheets¶

DataFrames vs. matrices¶

DataFrames vs. relations¶

Summary¶

Next time¶

Pivot tables using the `pivot_table` method¶

`pivot_table`¶

The `merge` method¶

Notes on the `merge` method¶

Lots of `pandas` operations do an implicit outer join!¶