import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_inline.backend_inline import set_matplotlib_formats
from IPython.display import display, IFrame
# Pandas Tutor setup
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}
set_matplotlib_formats("svg")
sns.set_context("poster")
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)
pd.set_option("display.max_rows", 8)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
def show_paradox_slides():
    src = 'https://docs.google.com/presentation/d/e/2PACX-1vSbFSaxaYZ0NcgrgqZLvjhkjX-5MQzAITWAsEFZHnix3j1c0qN8Vd1rogTAQP7F7Nf5r-JWExnGey7h/embed?start=false'
    width = 960
    height = 569
    display(IFrame(src, width, height))
📣 Announcements 📣¶
- Good job turning in Lab 1!
- Lab 2 out, due Monday.
- Project 1 checkpoint due tomorrow.
- Project 1 due next Wed.
Data granularity¶
Example: Baby Names¶
What is a single observation in the baby names data?
baby = pd.read_csv('data/baby.csv')
baby
Name | Sex | Count | Year | |
---|---|---|---|---|
0 | Liam | M | 20456 | 2022 |
1 | Noah | M | 18621 | 2022 |
2 | Olivia | F | 16573 | 2022 |
3 | Oliver | M | 15076 | 2022 |
... | ... | ... | ... | ... |
2085154 | Worthy | M | 5 | 1880 |
2085155 | Wright | M | 5 | 1880 |
2085156 | York | M | 5 | 1880 |
2085157 | Zachariah | M | 5 | 1880 |
2085158 rows × 4 columns
Example: CO2 readings¶
What is a single observation in this dataset of CO2 readings?
# Don't worry about this code; we'll cover it when we talk about data cleaning
co2 = pd.read_csv('data/co2_mm_mlo.txt',
                  header=None, skiprows=72, sep=r'\s+',
                  names=['Yr', 'Mo', 'DecDate', 'Avg', 'co2', 'Trend', 'days'],
                  usecols=['Yr', 'Mo', 'DecDate', 'co2'])
co2
Yr | Mo | DecDate | co2 | |
---|---|---|---|---|
0 | 1958 | 3 | 1958.21 | 315.71 |
1 | 1958 | 4 | 1958.29 | 317.45 |
2 | 1958 | 5 | 1958.38 | 317.50 |
3 | 1958 | 6 | 1958.46 | 317.10 |
... | ... | ... | ... | ... |
734 | 2019 | 5 | 2019.38 | 414.66 |
735 | 2019 | 6 | 2019.46 | 413.92 |
736 | 2019 | 7 | 2019.54 | 411.77 |
737 | 2019 | 8 | 2019.62 | 409.95 |
738 rows × 4 columns
Example: CO2 readings by month¶
# Don't worry about understanding this code for now
sns.lineplot(data=co2, x='DecDate', y='co2');
Collecting data¶
- If you can control how your dataset is created, you should opt for finer granularity, i.e. for more detail.
- You can easily remove detail, but it's difficult to add detail if it is not already present in the dataset.
- Tradeoff: obtaining fine-grained data can take more time/money.
Manipulating granularity¶
- We'll now explore how to change the level of granularity present in our dataset.
- While it may seem like we are "losing information," removing detail can help us understand bigger-picture trends in our data.
Example: Penguins¶
The dataset we'll work with for the rest of the lecture involves various measurements taken of three species of penguins in Antarctica.
import seaborn as sns
penguins = sns.load_dataset('penguins').dropna()
penguins
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
... | ... | ... | ... | ... | ... | ... | ... |
340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
333 rows × 7 columns
# What is the distribution of different 'species' in this dataset?
penguins['species'].value_counts()
Adelie       146
Gentoo       119
Chinstrap     68
Name: species, dtype: int64
# Overall, what is the distribution of different islands?
penguins['island'].value_counts()
Biscoe       163
Dream        123
Torgersen     47
Name: island, dtype: int64
Video: Palmer Penguins¶
IFrame('https://www.youtube-nocookie.com/embed/CCrNAHXUstU?si=-DntSyUNp5Kwitjm&start=11',
       width=560, height=315)
Aggregating: Basics¶
We know how to find the mean body mass for all the penguins:
penguins['body_mass_g'].mean()
4207.057057057057
💡 Pro-Tip: Using f-strings¶
Python f-strings give an easy way to print variables nicely:
mean_body_mass = penguins['body_mass_g'].mean()
print(f'Mean penguin mass: {mean_body_mass:.2f} grams')
Mean penguin mass: 4207.06 grams
Aggregating: Basics¶
But what about the mean for each type of penguin?
penguins['body_mass_g'].mean()
4207.057057057057
Naive approach: looping through unique values¶
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    species_only = penguins.loc[penguins['species'] == species]
    species_map.loc[species] = species_only['body_mass_g'].mean()

species_map
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
dtype: float64
- For each unique 'species', we make a pass through the entire dataset.
- The asymptotic runtime of this procedure is $\Theta(ns)$, where $n$ is the number of rows and $s$ is the number of unique species.
While there are other loop-based solutions that only involve a single pass over the DataFrame, we'd like to avoid Python loops entirely, as they're slow.
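For intuition, here's a minimal sketch of such a single-pass loop (the accumulator names are ours, not from the lecture): keep a running total and count per species, then divide once at the end.
# One pass over the data: accumulate (total, count) per species in a dict.
totals = {}
for species, mass in zip(penguins['species'], penguins['body_mass_g']):
    running_total, count = totals.get(species, (0.0, 0))
    totals[species] = (running_total + mass, count + 1)

# Divide once per species at the end.
{s: total / count for s, (total, count) in totals.items()}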
Grouping¶
# Before:
penguins['body_mass_g'].mean()
# After:
penguins.groupby('species')['body_mass_g'].mean()
species
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
Name: body_mass_g, dtype: float64
Somehow, the groupby method computes what we're looking for in just one line. How?
%%pt
penguins.groupby('species')['body_mass_g'].mean()
"Split-apply-combine" paradigm¶
The groupby method involves three steps: split, apply, and combine. This is the same terminology that the pandas documentation uses.
Split breaks up and "groups" the rows of a DataFrame according to the specified key. There is one "group" for every unique value of the key.
Apply uses a function (e.g. aggregation, transformation, filtration) within the individual groups.
Combine stitches the results of these operations into an output DataFrame.
The split-apply-combine pattern can be parallelized to work on multiple computers or threads, by sending computations for each group to different processors.
More examples¶
Before we dive into the internals, let's look at a few more examples.
penguins.head()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | Male |
penguins.shape
(333, 7)
Which 'species' has the highest median 'bill_length_mm'?
(penguins
    .groupby('species')
    .median()  # In newer pandas versions, you may need .median(numeric_only=True).
    ['bill_length_mm']
    .idxmax()
)
'Chinstrap'
What proportion of penguins of each 'species' live on 'Dream' island?
(penguins
    .assign(on_dream=penguins['island'] == 'Dream')
    .groupby('species')
    .mean()['on_dream']
)
species
Adelie       0.38
Chinstrap    1.00
Gentoo       0.00
Name: on_dream, dtype: float64
DataFrameGroupBy objects and aggregation¶
DataFrameGroupBy objects¶
We've just evaluated a few expressions of the following form.
penguins.groupby('species').mean()
bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|
species | ||||
Adelie | 38.82 | 18.35 | 190.10 | 3706.16 |
Chinstrap | 48.83 | 18.42 | 195.82 | 3733.09 |
Gentoo | 47.57 | 15.00 | 217.24 | 5092.44 |
There are two method calls in the expression above: .groupby('species') and .mean(). What happens if we remove the latter?
penguins.groupby('species')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb8c011e070>
Peeking under the hood¶
If df is a DataFrame, then df.groupby(key) returns a DataFrameGroupBy object.
This object represents the "split" in "split-apply-combine".
# Simplified table for demonstration:
penguins_small = penguins.iloc[[0, 1, 150, 151, 251, 300, 301], [0, 5, 6]]
# Creates one group for each unique value in the species column.
penguin_groups = penguins_small.groupby('species')
penguin_groups
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb8c00ed640>
%%pt
penguin_groups
DataFrameGroupBy objects have a groups attribute, which is a dictionary in which the keys are group names and the values are lists of row labels.
penguin_groups.groups
{'Adelie': [0, 1], 'Chinstrap': [156, 157], 'Gentoo': [258, 308, 309]}
DataFrameGroupBy objects also have a get_group(key) method, which returns a DataFrame with only the rows for the given key.
penguin_groups.get_group('Chinstrap')
species | body_mass_g | sex | |
---|---|---|---|
156 | Chinstrap | 3725.0 | Male |
157 | Chinstrap | 3950.0 | Female |
# Same as the above!
penguins_small.query('species == "Chinstrap"')
species | body_mass_g | sex | |
---|---|---|---|
156 | Chinstrap | 3725.0 | Male |
157 | Chinstrap | 3950.0 | Female |
We usually don't use these attributes and methods, but they're useful in understanding how groupby works under the hood.
Aggregation¶
- Once we create a DataFrameGroupBy object, we need to apply some function to each group, and combine the results.
- The most common operation we apply to each group is an aggregation.
- Aggregation refers to the process of reducing many values to one.
- To perform an aggregation, use an aggregation method on the DataFrameGroupBy object, e.g. .mean(), .max(), or .median().
Let's look at some examples.
penguins_small
species | body_mass_g | sex | |
---|---|---|---|
0 | Adelie | 3750.0 | Male |
1 | Adelie | 3800.0 | Female |
156 | Chinstrap | 3725.0 | Male |
157 | Chinstrap | 3950.0 | Female |
258 | Gentoo | 4350.0 | Female |
308 | Gentoo | 4875.0 | Female |
309 | Gentoo | 5550.0 | Male |
penguins_small.groupby('species').mean()
body_mass_g | |
---|---|
species | |
Adelie | 3775.0 |
Chinstrap | 3837.5 |
Gentoo | 4925.0 |
penguins_small.groupby('species').sum()
body_mass_g | |
---|---|
species | |
Adelie | 7550.0 |
Chinstrap | 7675.0 |
Gentoo | 14775.0 |
penguins_small.groupby('species').last()
body_mass_g | sex | |
---|---|---|
species | ||
Adelie | 3800.0 | Female |
Chinstrap | 3950.0 | Female |
Gentoo | 5550.0 | Male |
penguins_small.groupby('species').max()
body_mass_g | sex | |
---|---|---|
species | ||
Adelie | 3800.0 | Male |
Chinstrap | 3950.0 | Male |
Gentoo | 5550.0 | Male |
Column independence¶
Within each group, the aggregation method is applied to each column independently.
penguins_small.groupby('species').max()
body_mass_g | sex | |
---|---|---|
species | ||
Adelie | 3800.0 | Male |
Chinstrap | 3950.0 | Male |
Gentoo | 5550.0 | Male |
It is not telling us that there is a male 'Adelie'
penguin with a 'body_mass_g'
of 3800.0
!
# This penguin is Female!
penguins_small.loc[(penguins['species'] == 'Adelie') & (penguins['body_mass_g'] == 3800.0)]
species | body_mass_g | sex | |
---|---|---|---|
1 | Adelie | 3800.0 | Female |
Discussion Question¶
Find the species and weights of the heaviest Male
and Female
penguins.
# Fill in this cell
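One possible solution sketch (for reference only; this is just one of several reasonable approaches):
# Sort from heaviest to lightest, then take the first row within each 'sex'
# group — groupby preserves the within-group row order after sorting.
(penguins
    .sort_values('body_mass_g', ascending=False)
    .groupby('sex')
    .first()
    [['species', 'body_mass_g']]
)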
Column selection and performance implications¶
- By default, the aggregator will be applied to all columns that it can be applied to.
  - max and min are defined on strings, while median and mean are not.
- If we only care about one column, we can select that column before aggregating to save time.
  - DataFrameGroupBy objects support [] notation, just like DataFrames.
# Back to the big penguins dataset
penguins.groupby('species').mean()
bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|
species | ||||
Adelie | 38.82 | 18.35 | 190.10 | 3706.16 |
Chinstrap | 48.83 | 18.42 | 195.82 | 3733.09 |
Gentoo | 47.57 | 15.00 | 217.24 | 5092.44 |
# Works, but involves wasted effort since the other columns had to be aggregated for no reason.
penguins.groupby('species').mean()['bill_length_mm']
species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64
# This is a SeriesGroupBy object!
penguins.groupby('species')['bill_length_mm']
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fb8d46c14f0>
# Saves time!
penguins.groupby('species')['bill_length_mm'].mean()
species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64
To demonstrate that the former is slower than the latter, we can use %%timeit. For reference, we'll also include our earlier for-loop-based solution.
%%timeit
penguins.groupby('species').mean()['bill_length_mm']
359 µs ± 1.94 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%%timeit
penguins.groupby('species')['bill_length_mm'].mean()
137 µs ± 659 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%%timeit
species_map = pd.Series([], dtype=float)
for species in penguins['species'].unique():
    species_only = penguins.loc[penguins['species'] == species]
    species_map.loc[species] = species_only['body_mass_g'].mean()
species_map
981 µs ± 1.88 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Takeaways¶
It's important to understand what each piece of your code evaluates to – in the first two timed examples, the code is almost identical, but the performance is quite different.
# Slower
penguins.groupby('species').mean()['bill_length_mm']

# Faster
penguins.groupby('species')['bill_length_mm'].mean()
The groupby method is much quicker than for-looping over the DataFrame in Python. It can often produce results using just a single, fast pass over the data, updating the sum, mean, count, min, or other aggregate for each group along the way.
Beyond default aggregation methods¶
- There are many built-in aggregation methods.
- What if you want to apply different aggregation methods to different columns?
- What if the aggregation method you want to use doesn't already exist in pandas?
The aggregate method¶
- The DataFrameGroupBy object has a general aggregate method, which aggregates using one or more operations.
  - Remember, aggregation refers to the process of reducing many values to one.
- There are many ways of using aggregate; refer to the documentation for a comprehensive list.
- Example arguments:
  - A single function.
  - A list of functions.
  - A dictionary mapping column names to functions.
- Per the documentation, agg is an alias for aggregate.
Example¶
How many penguins are there of each 'species', and what is the mean 'body_mass_g' of each species?
(penguins
    .groupby('species')
    ['body_mass_g']
    .aggregate(['count', 'mean'])
)
count | mean | |
---|---|---|
species | ||
Adelie | 146 | 3706.16 |
Chinstrap | 68 | 3733.09 |
Gentoo | 119 | 5092.44 |
Note what happens when we don't select a column before aggregating.
(penguins
    .groupby('species')
    .aggregate(['count', 'mean'])
)
bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |||||
---|---|---|---|---|---|---|---|---|
count | mean | count | mean | count | mean | count | mean | |
species | ||||||||
Adelie | 146 | 38.82 | 146 | 18.35 | 146 | 190.10 | 146 | 3706.16 |
Chinstrap | 68 | 48.83 | 68 | 18.42 | 68 | 195.82 | 68 | 3733.09 |
Gentoo | 119 | 47.57 | 119 | 15.00 | 119 | 217.24 | 119 | 5092.44 |
Example¶
What is the maximum 'bill_length_mm' of each species, and on how many distinct 'island's is each 'species' found?
(penguins
    .groupby('species')
    .aggregate({'bill_length_mm': 'max', 'island': 'nunique'})
)
bill_length_mm | island | |
---|---|---|
species | ||
Adelie | 46.0 | 3 |
Chinstrap | 58.0 | 1 |
Gentoo | 59.6 | 1 |
Example¶
What is the interquartile range of the 'body_mass_g' of each 'species'?
# The function for .agg takes in a pd.Series as its argument and returns a scalar.
def iqr(series):
    return np.percentile(series, 75) - np.percentile(series, 25)

(penguins
    .groupby('species')
    ['body_mass_g']
    .agg(iqr)
)
species
Adelie       637.5
Chinstrap    462.5
Gentoo       800.0
Name: body_mass_g, dtype: float64
Other DataFrameGroupBy methods¶
Split-apply-combine, revisited¶
When we introduced the split-apply-combine pattern, the "apply" step involved aggregation – our final DataFrame had one row for each group.
Instead of aggregating during the apply step, we could instead perform a:
Transformation, in which we perform operations to every value within each group.
Filtration, in which we keep only the groups that satisfy some condition.
Transformations¶
- Suppose we want to convert the 'body_mass_g' column to z-scores (i.e. standard units):
def z_score(x):
    return (x - x.mean()) / x.std(ddof=0)

z_score(penguins['body_mass_g'])
0     -0.57
1     -0.51
2     -1.19
4     -0.94
       ...
340    0.80
341    1.92
342    1.23
343    1.48
Name: body_mass_g, Length: 333, dtype: float64
Transformations within groups¶
Now, what if we wanted the z-score within each group?
- To do so, we can use the transform method on a DataFrameGroupBy object. The transform method takes in a function, which itself takes in a Series and returns a new Series.
- A transformation produces a DataFrame or Series of the same size – it is not an aggregation!
z_mass = (penguins
    .groupby('species')
    ['body_mass_g']
    .transform(z_score))

z_mass
0      0.10
1      0.21
2     -1.00
4     -0.56
       ...
340   -0.49
341    1.32
342    0.22
343    0.62
Name: body_mass_g, Length: 333, dtype: float64
penguins.assign(z_mass=z_mass)
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | z_mass | |
---|---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male | 0.10 |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female | 0.21 |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female | -1.00 |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female | -0.56 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female | -0.49 |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male | 1.32 |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female | 0.22 |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male | 0.62 |
333 rows × 8 columns
Note that in the table above, penguin 340 has a larger 'body_mass_g' than penguin 0, but a lower 'z_mass'.
- Penguin 0 has an above-average 'body_mass_g' among 'Adelie' penguins.
- Penguin 340 has a below-average 'body_mass_g' among 'Gentoo' penguins. Remember from earlier that the average 'body_mass_g' of 'Gentoo' penguins is much higher than for other species.
Filtering Groups¶
- To keep only the groups that satisfy a particular condition, use the filter method on a DataFrameGroupBy object.
- The filter method takes in a function, which itself takes in a DataFrame/Series and returns a single Boolean. The result is a new DataFrame/Series with only the groups for which the filter function returned True.
For example, suppose we want only the 'species' whose average 'bill_length_mm' is above 39.
(penguins
    .groupby('species')
    .filter(lambda df: df['bill_length_mm'].mean() > 39)
)
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
152 | Chinstrap | Dream | 46.5 | 17.9 | 192.0 | 3500.0 | Female |
153 | Chinstrap | Dream | 50.0 | 19.5 | 196.0 | 3900.0 | Male |
154 | Chinstrap | Dream | 51.3 | 19.2 | 193.0 | 3650.0 | Male |
155 | Chinstrap | Dream | 45.4 | 18.7 | 188.0 | 3525.0 | Female |
... | ... | ... | ... | ... | ... | ... | ... |
340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
187 rows × 7 columns
No more 'Adelie's!
Or, as another example, suppose we only want 'species' with at least 100 penguins:
penguins.groupby('species').filter(lambda df: df.shape[0] > 100)
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
... | ... | ... | ... | ... | ... | ... | ... |
340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female |
341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
265 rows × 7 columns
No more 'Chinstrap's!
Grouping with multiple columns¶
When we group with multiple columns, one group is created for every unique combination of elements in the specified columns.
species_and_island = penguins.groupby(['species', 'island']).mean()
species_and_island
bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | ||
---|---|---|---|---|---|
species | island | ||||
Adelie | Biscoe | 38.98 | 18.37 | 188.80 | 3709.66 |
Dream | 38.52 | 18.24 | 189.93 | 3701.36 | |
Torgersen | 39.04 | 18.45 | 191.53 | 3708.51 | |
Chinstrap | Dream | 48.83 | 18.42 | 195.82 | 3733.09 |
Gentoo | Biscoe | 47.57 | 15.00 | 217.24 | 5092.44 |
Grouping and indexes¶
- The groupby method creates an index based on the specified columns.
- When grouping by multiple columns, the resulting DataFrame has a MultiIndex.
- Advice: When working with a MultiIndex, use reset_index or set as_index=False in groupby.
species_and_island
bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | ||
---|---|---|---|---|---|
species | island | ||||
Adelie | Biscoe | 38.98 | 18.37 | 188.80 | 3709.66 |
Dream | 38.52 | 18.24 | 189.93 | 3701.36 | |
Torgersen | 39.04 | 18.45 | 191.53 | 3708.51 | |
Chinstrap | Dream | 48.83 | 18.42 | 195.82 | 3733.09 |
Gentoo | Biscoe | 47.57 | 15.00 | 217.24 | 5092.44 |
species_and_island['body_mass_g']
species    island
Adelie     Biscoe       3709.66
           Dream        3701.36
           Torgersen    3708.51
Chinstrap  Dream        3733.09
Gentoo     Biscoe       5092.44
Name: body_mass_g, dtype: float64
species_and_island.loc['Adelie']
bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|
island | ||||
Biscoe | 38.98 | 18.37 | 188.80 | 3709.66 |
Dream | 38.52 | 18.24 | 189.93 | 3701.36 |
Torgersen | 39.04 | 18.45 | 191.53 | 3708.51 |
species_and_island.loc[('Adelie', 'Torgersen')]
bill_length_mm         39.04
bill_depth_mm          18.45
flipper_length_mm     191.53
body_mass_g          3708.51
Name: (Adelie, Torgersen), dtype: float64
species_and_island.reset_index()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|---|---|
0 | Adelie | Biscoe | 38.98 | 18.37 | 188.80 | 3709.66 |
1 | Adelie | Dream | 38.52 | 18.24 | 189.93 | 3701.36 |
2 | Adelie | Torgersen | 39.04 | 18.45 | 191.53 | 3708.51 |
3 | Chinstrap | Dream | 48.83 | 18.42 | 195.82 | 3733.09 |
4 | Gentoo | Biscoe | 47.57 | 15.00 | 217.24 | 5092.44 |
penguins.groupby(['species', 'island'], as_index=False).mean()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
---|---|---|---|---|---|---|
0 | Adelie | Biscoe | 38.98 | 18.37 | 188.80 | 3709.66 |
1 | Adelie | Dream | 38.52 | 18.24 | 189.93 | 3701.36 |
2 | Adelie | Torgersen | 39.04 | 18.45 | 191.53 | 3708.51 |
3 | Chinstrap | Dream | 48.83 | 18.42 | 195.82 | 3733.09 |
4 | Gentoo | Biscoe | 47.57 | 15.00 | 217.24 | 5092.44 |
Discussion: Checking your knowledge¶
Find the most popular male and female baby name for each year in the dataset. Exclude years where there were fewer than 1 million births recorded.
baby = pd.read_csv('data/baby.csv')
baby
Name | Sex | Count | Year | |
---|---|---|---|---|
0 | Liam | M | 20456 | 2022 |
1 | Noah | M | 18621 | 2022 |
2 | Olivia | F | 16573 | 2022 |
3 | Oliver | M | 15076 | 2022 |
... | ... | ... | ... | ... |
2085154 | Worthy | M | 5 | 1880 |
2085155 | Wright | M | 5 | 1880 |
2085156 | York | M | 5 | 1880 |
2085157 | Zachariah | M | 5 | 1880 |
2085158 rows × 4 columns
# Fill me in
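One possible solution sketch (for reference; it combines the filter and sorting ideas from earlier in the lecture):
# 1. Keep only years with at least 1 million recorded births.
# 2. Sort by 'Count' so the most popular name comes first in each group.
# 3. Take the first row within each (Year, Sex) group.
(baby
    .groupby('Year')
    .filter(lambda df: df['Count'].sum() >= 1_000_000)
    .sort_values('Count', ascending=False)
    .groupby(['Year', 'Sex'])
    .first()
)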
Pivot Tables: An extension of grouping¶
Pivot tables are a compact way to display tables for humans to read:
Sex | F | M |
---|---|---|
Year | ||
2018 | 1698373 | 1813377 |
2019 | 1675139 | 1790682 |
2020 | 1612393 | 1721588 |
2021 | 1635800 | 1743913 |
2022 | 1628730 | 1733166 |
- Notice that each value in the table is a sum over the counts, split by year and sex.
- You can think of pivot tables as grouping using two columns, then "pivoting" one of the group keys into columns.
pivot_table¶
The pivot_table DataFrame method aggregates a DataFrame using two columns. To use it:
df.pivot_table(index=index_col,
               columns=columns_col,
               values=values_col,
               aggfunc=func)
The resulting DataFrame will have:
- One row for every unique value in index_col.
- One column for every unique value in columns_col.
- Values determined by applying func on the values in values_col.
last_5_years = baby.query('Year >= 2018')

last_5_years.pivot_table(
    index='Year',
    columns='Sex',
    values='Count',
    aggfunc='sum',
)
Sex | F | M |
---|---|---|
Year | ||
2018 | 1698373 | 1813377 |
2019 | 1675139 | 1790682 |
2020 | 1612393 | 1721588 |
2021 | 1635800 | 1743913 |
2022 | 1628730 | 1733166 |
# Look at the similarity to the snippet above
(last_5_years
    .groupby(['Year', 'Sex'])
    ['Count']
    .sum()
    .unstack('Sex')
)
Sex | F | M |
---|---|---|
Year | ||
2018 | 1698373 | 1813377 |
2019 | 1675139 | 1790682 |
2020 | 1612393 | 1721588 |
2021 | 1635800 | 1743913 |
2022 | 1628730 | 1733166 |
Example¶
Find the number of penguins per island and species.
penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm',  # Choice of column here doesn't actually matter
    aggfunc='count',
)
island | Biscoe | Dream | Torgersen |
---|---|---|---|
species | |||
Adelie | 44.0 | 55.0 | 47.0 |
Chinstrap | NaN | 68.0 | NaN |
Gentoo | 119.0 | NaN | NaN |
Note that there is a NaN at the intersection of 'Biscoe' and 'Chinstrap', because there were no Chinstrap penguins on Biscoe Island.
We can either use the fillna method afterwards or the fill_value argument to fill in NaNs; both are shown below.
penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm',
    aggfunc='count',
    fill_value=0,
)
island | Biscoe | Dream | Torgersen |
---|---|---|---|
species | |||
Adelie | 44 | 55 | 47 |
Chinstrap | 0 | 68 | 0 |
Gentoo | 119 | 0 | 0 |
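For comparison, here's the fillna route mentioned above (a sketch; note that the counts come back as floats, because the NaNs forced those columns to floats before filling):
# Same table via fillna instead of fill_value.
(penguins
    .pivot_table(index='species', columns='island',
                 values='bill_length_mm', aggfunc='count')
    .fillna(0)
)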
Granularity, revisited¶
Take another look at the pivot table from the previous cell. Each row of the original penguins represented a single penguin, and each column represented features of the penguins.
What is the granularity of this table?
penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm',
    aggfunc='count',
    fill_value=0,
)
island | Biscoe | Dream | Torgersen |
---|---|---|---|
species | |||
Adelie | 44 | 55 | 47 |
Chinstrap | 0 | 68 | 0 |
Gentoo | 119 | 0 | 0 |
Reshaping¶
- pivot_table reshapes DataFrames from "long" to "wide"; the sketch after this list shows the reverse direction.
- Other DataFrame reshaping methods:
  - melt: Un-pivots a DataFrame. Very useful in data cleaning.
  - pivot: Like pivot_table, but doesn't do aggregation.
  - stack: Pivots multi-level columns to multi-indices.
  - unstack: Pivots multi-indices to columns.
- Google and the documentation are your friends!
Distributions¶
Joint distribution¶
When using aggfunc='count', a pivot table describes the joint distribution of two categorical variables. This is also called a contingency table.
counts = penguins.pivot_table(
    index='species',
    columns='sex',
    values='body_mass_g',
    aggfunc='count',
    fill_value=0,
)
counts
sex | Female | Male |
---|---|---|
species | ||
Adelie | 73 | 73 |
Chinstrap | 34 | 34 |
Gentoo | 58 | 61 |
We can normalize the DataFrame by dividing by the total number of penguins. The resulting numbers can be interpreted as probabilities that a randomly selected penguin from the dataset belongs to a given combination of species and sex.
joint = counts / counts.sum().sum()
joint
sex | Female | Male |
---|---|---|
species | ||
Adelie | 0.22 | 0.22 |
Chinstrap | 0.10 | 0.10 |
Gentoo | 0.17 | 0.18 |
Marginal probabilities¶
If we sum over one of the axes, we can compute marginal probabilities, i.e. unconditional probabilities.
joint
sex | Female | Male |
---|---|---|
species | ||
Adelie | 0.22 | 0.22 |
Chinstrap | 0.10 | 0.10 |
Gentoo | 0.17 | 0.18 |
# Recall, joint.sum(axis=0) sums across the rows, which computes the sum of the **columns**.
joint.sum(axis=0)
sex
Female    0.5
Male      0.5
dtype: float64
joint.sum(axis=1)
species
Adelie       0.44
Chinstrap    0.20
Gentoo       0.36
dtype: float64
For instance, the second Series tells us that a randomly selected penguin has a 0.36 chance of being of species 'Gentoo'.
Conditional probabilities¶
Using counts, how might we compute conditional probabilities like $$P(\text{species } = \text{"Adelie"} \mid \text{sex } = \text{"Female"})?$$
counts
sex | Female | Male |
---|---|---|
species | ||
Adelie | 73 | 73 |
Chinstrap | 34 | 34 |
Gentoo | 58 | 61 |
Answer: To find conditional probabilities of species given sex, divide by column sums. To find conditional probabilities of sex given species, divide by row sums.
counts
sex | Female | Male |
---|---|---|
species | ||
Adelie | 73 | 73 |
Chinstrap | 34 | 34 |
Gentoo | 58 | 61 |
counts.sum(axis=0)
sex
Female    165
Male      168
dtype: int64
The conditional distribution of species given sex is below. Note that in this new DataFrame, the 'Female' and 'Male' columns each sum to 1.
counts / counts.sum(axis=0)
sex | Female | Male |
---|---|---|
species | ||
Adelie | 0.44 | 0.43 |
Chinstrap | 0.21 | 0.20 |
Gentoo | 0.35 | 0.36 |
For instance, the above DataFrame tells us that the probability that a randomly selected penguin is of species 'Adelie' given that they are of sex 'Female' is 0.442424.
Exercise: Try to find the conditional distribution of sex given species.
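One way to check your work (a sketch): conditioning on species means dividing each row of counts by its row sum.
# .div with axis=0 aligns the row-sum Series with the row labels of counts,
# so each row of the result sums to 1.
counts.div(counts.sum(axis=1), axis=0)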
Simpson's paradox¶
Example: Grades¶
Two students, Lisa and Bart, just finished freshman year. They both took a different number of classes in Fall, Winter, and Spring.
Each quarter, Lisa had a higher GPA than Bart.
But Bart has a higher overall GPA.
How is this possible? 🤔
Run this cell to create DataFrames that contain each student's grades.
lisa = pd.DataFrame(
    [[20, 46], [18, 54], [5, 20]],
    columns=['Units', 'Grade Points Earned'],
    index=['Fall', 'Winter', 'Spring'],
)

bart = pd.DataFrame(
    [[5, 10], [5, 13.5], [22, 81.4]],
    columns=['Units', 'Grade Points Earned'],
    index=['Fall', 'Winter', 'Spring'],
)
Quarter-specific vs. overall GPAs¶
Note: The number of "grade points" earned for a course is
$$\text{number of units} \cdot \text{grade (out of 4)}$$For instance, an A- in a 4 unit course earns $3.7 \cdot 4 = 14.8$ grade points.
lisa
Units | Grade Points Earned | |
---|---|---|
Fall | 20 | 46 |
Winter | 18 | 54 |
Spring | 5 | 20 |
bart
Units | Grade Points Earned | |
---|---|---|
Fall | 5 | 10.0 |
Winter | 5 | 13.5 |
Spring | 22 | 81.4 |
Lisa had a higher GPA in all three quarters:
quarterly_gpas = pd.DataFrame({
    "Lisa's Quarter GPA": lisa['Grade Points Earned'] / lisa['Units'],
    "Bart's Quarter GPA": bart['Grade Points Earned'] / bart['Units'],
})
quarterly_gpas
Lisa's Quarter GPA | Bart's Quarter GPA | |
---|---|---|
Fall | 2.3 | 2.0 |
Winter | 3.0 | 2.7 |
Spring | 4.0 | 3.7 |
But Lisa's overall GPA was less than Bart's overall GPA:
tot = lisa.sum()
tot['Grade Points Earned'] / tot['Units']
2.7906976744186047
tot = bart.sum()
tot['Grade Points Earned'] / tot['Units']
3.278125
What happened?¶
(quarterly_gpas
    .assign(Lisa_units=lisa['Units'],
            Bart_units=bart['Units'])
    .iloc[:, [0, 2, 1, 3]]
)
Lisa's Quarter GPA | Lisa_units | Bart's Quarter GPA | Bart_units | |
---|---|---|---|---|
Fall | 2.3 | 20 | 2.0 | 5 |
Winter | 3.0 | 18 | 2.7 | 5 |
Spring | 4.0 | 5 | 3.7 | 22 |
When Lisa and Bart both performed poorly, Lisa took more units than Bart. This brought down 📉 Lisa's overall average.
When Lisa and Bart both performed well, Bart took more units than Lisa. This brought up 📈 Bart's overall average.
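To make the arithmetic concrete: each overall GPA is just a units-weighted average of the quarter GPAs. A quick sketch verifying this for Lisa:
# Weight each quarter GPA by the fraction of Lisa's total units taken that
# quarter; the weighted sum reproduces her overall GPA of ~2.79.
weights = lisa['Units'] / lisa['Units'].sum()
(quarterly_gpas["Lisa's Quarter GPA"] * weights).sum()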
Simpson's paradox¶
Simpson's paradox occurs when grouped data and ungrouped data show opposing trends.
- It is named after Edward H. Simpson, not Lisa or Bart Simpson.
It is purely arithmetic – it is a consequence of weighted averages.
It often happens because there is a hidden factor (i.e. a confounder) within the data that influences results.
Question: What is the "correct" way to summarize your data? What if you had to act on these results?
show_paradox_slides()
What happened?¶
The overall acceptance rate for women (30%) was lower than it was for men (45%).
However, most departments (A, B, D, F) had a higher acceptance rate for women.
Department A had a 62% acceptance rate for men and an 82% acceptance rate for women!
- 31% of men applied to Department A.
- 6% of women applied to Department A.
Department F had a 6% acceptance rate for men and a 7% acceptance rate for women!
- 14% of men applied to Department F.
- 19% of women applied to Department F.
Conclusion: Women tended to apply to departments with a lower acceptance rate; the data don't support the hypothesis that there was major gender discrimination against women.
Caution!¶
This doesn't mean that admissions are free from gender discrimination!
From Moss-Racusin et al., 2012, PNAS (cited 2600+ times):
In a randomized double-blind study (n = 127), science faculty from research-intensive universities rated the application materials of a student—who was randomly assigned either a male or female name—for a laboratory manager position. Faculty participants rated the male applicant as significantly more competent and hireable than the (identical) female applicant. These participants also selected a higher starting salary and offered more career mentoring to the male applicant. The gender of the faculty participants did not affect responses, such that female and male faculty were equally likely to exhibit bias against the female student.
But then...¶
From Williams and Ceci, 2015, PNAS:
Here we report five hiring experiments in which faculty evaluated hypothetical female and male applicants, using systematically varied profiles disguising identical scholarship, for assistant professorships in biology, engineering, economics, and psychology. Contrary to prevailing assumptions, men and women faculty members from all four fields preferred female applicants 2:1 over identically qualified males with matching lifestyles (single, married, divorced), with the exception of male economists, who showed no gender preference.
Do these conflict?¶
Not necessarily. One explanation, from Williams and Ceci:
Instead, past studies have used ratings of students’ hirability for a range of posts that do not include tenure-track jobs, such as managing laboratories or performing math assignments for a company. However, hiring tenure-track faculty differs from hiring lower-level staff: it entails selecting among highly accomplished candidates, all of whom have completed Ph.D.s and amassed publications and strong letters of support. Hiring bias may occur when applicants’ records are ambiguous, as was true in studies of hiring bias for lower-level staff posts, but such bias may not occur when records are clearly strong, as is the case with tenure-track hiring.
Do these conflict?¶
From Witteman et al., 2019, in The Lancet:
Thus, evidence of scientists favouring women comes exclusively from hypothetical scenarios, whereas evidence of scientists favouring men comes from hypothetical scenarios and real behaviour. This might reflect academics' growing awareness of the social desirability of achieving gender balance, while real academic behaviour might not yet put such ideals into action.
Example: Restaurant reviews and phone types¶
You are deciding whether to eat at Dirty Birds or The Loft.
Suppose Yelp shows ratings aggregated by phone type (Android vs. iPhone).
Phone Type | Stars for Dirty Birds | Stars for The Loft |
---|---|---|
Android | 4.24 | 4.0 |
iPhone | 2.99 | 2.79 |
All | 3.32 | 3.37 |
Question: Should you choose Dirty Birds or The Loft?
Answer: The type of phone you use likely has nothing to do with your taste in food – pick the restaurant that is rated higher overall.
Remember, Simpson's paradox is merely a property of weighted averages!
Takeaways¶
Be skeptical of...
- Aggregate statistics.
- People misusing statistics to "prove" that discrimination doesn't exist.
- Drawing conclusions from individual publications ($p$-hacking, publication bias, narrow focus, etc.).
- Everything!
We need to apply domain knowledge and human judgement calls to decide what to do when Simpson's paradox is present.
Really?¶
To handle Simpson's paradox with rigor, we need some ideas from causal inference which we don't have time to cover in DSC 80. This video has a good example of how to approach Simpson's paradox, using a minimal amount of causal inference, if you're curious (not required for DSC 80).
IFrame('https://www.youtube-nocookie.com/embed/zeuW1Z2EtLs?si=l2Dl7P-5RCq3ODpo',
       width=560, height=315)
Further reading¶
- Gender Bias in Admission Statistics?
- Contains a great visualization, but seems to be paywalled now.
- What is Simpson's Paradox?
Summary, next time¶
- Grouping allows us to change the level of granularity in a DataFrame.
- Grouping involves three steps – split, apply, and combine (or filter, or transform).
- pivot_table aggregates data based on two categorical columns, and reshapes the result to be "wide" instead of "long".
- It is a consequence of arithmetic.
- Next time: Data cleaning! 🧼