from dsc80_utils import *

# Pandas Tutor setup
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}

IFrame('https://www.youtube-nocookie.com/embed/CCrNAHXUstU?si=-DntSyUNp5Kwitjm&amp;start=11',
       width=560, height=315)

import seaborn as sns
penguins = sns.load_dataset('penguins').dropna()
penguins

penguins['species'].value_counts()

Adelie       146
Gentoo       119
Chinstrap     68
Name: species, dtype: int64

penguins['species'].value_counts(normalize=True)

Adelie       0.44
Gentoo       0.36
Chinstrap    0.20
Name: species, dtype: float64

penguins['island'].value_counts()

Biscoe       163
Dream        123
Torgersen     47
Name: island, dtype: int64

penguins['body_mass_g'].mean()

4207.057057057057

# ???

# Naive approach: compute the mean body mass of each species with an explicit loop.
# Start from an empty float Series; one entry per species is added below.
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    # Keep only the rows that belong to the current species.
    species_only = penguins.loc[penguins['species'] == species]
    # Assigning to a label that is not yet in the index adds a new entry.
    species_map.loc[species] = species_only['body_mass_g'].mean()
    
species_map

Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
dtype: float64

# Before:
penguins['body_mass_g'].mean()

4207.057057057057

# After:
# numeric_only=True skips the text columns (island, sex), which have no mean;
# without it, pandas >= 2.0 raises a TypeError here.
penguins.groupby('species').mean(numeric_only=True)['body_mass_g']

species
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
Name: body_mass_g, dtype: float64

%%pt

# numeric_only=True skips the text columns (island, sex), which have no mean.
penguins.groupby('species').mean(numeric_only=True)['body_mass_g']

penguins

penguins

# Your code goes here.
# Select bill_length_mm before aggregating: it is the only column needed, and
# the text columns (island, sex) have no median (a TypeError in pandas >= 2.0).
# idxmax() then returns the species label with the largest median.
penguins.groupby('species')['bill_length_mm'].median().idxmax()

'Chinstrap'

(
    penguins
    # Boolean column: True for penguins that live on Dream island.
    .assign(is_dream=penguins['island'] == 'Dream')
    .groupby('species')
    # Select is_dream before aggregating so the text columns (island, sex)
    # are never averaged; that would raise a TypeError in pandas >= 2.0.
    ['is_dream']
    # The mean of a Boolean column is the proportion of True values.
    .mean()
)

species
Adelie       0.38
Chinstrap    1.00
Gentoo       0.00
Name: is_dream, dtype: float64

# Mean of every numeric column per species; numeric_only=True skips the text
# columns (island, sex), which pandas >= 2.0 would otherwise fail on.
penguins.groupby('species').mean(numeric_only=True)

penguins.groupby('species')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x15ceb5ac0>

# Simplified DataFrame for demonstration.
# .iloc selects by integer position, not by index label: seven hand-picked rows,
# and columns 0, 5, 6 of penguins (species, body_mass_g, sex).
# Because penguins was built with .dropna(), row positions need not equal index labels.
penguins_small = penguins.iloc[[0, 150, 300, 1, 251, 151, 301], [0, 5, 6]]
penguins_small

# Creates one group for each unique value in the species column.
penguin_groups = penguins_small.groupby('species')
penguin_groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x15ceb58b0>

%%pt
penguin_groups

penguin_groups.groups

{'Adelie': [0, 1], 'Chinstrap': [156, 157], 'Gentoo': [308, 258, 309]}

penguin_groups.get_group('Chinstrap')

# Same as the above!
penguins_small.query('species == "Chinstrap"')

penguins_small

# numeric_only=True averages body_mass_g only; the text column sex has no mean
# and would raise a TypeError in pandas >= 2.0.
penguins_small.groupby('species').mean(numeric_only=True)

penguins_small.groupby('species').sum()

penguins_small

penguins_small.groupby('species').last()

penguins_small

penguins_small.groupby('species').max()

penguins_small.groupby('species').max()

# This penguin is Female!
penguins_small.loc[(penguins_small['species'] == 'Adelie') & (penguins_small['body_mass_g'] == 3800.0)]

# Your code goes here.
(
    penguins
    # Put the heaviest penguins at the top...
    .sort_values('body_mass_g', ascending=False)
    .groupby('sex')
    # ...so that the first row within each sex is that sex's heaviest penguin.
    .first()
)

# Back to the big penguins dataset!
# numeric_only=True skips the text columns (island, sex), which have no mean.
penguins.groupby('species').mean(numeric_only=True)

# Works, but involves wasted effort since the other columns had to be aggregated for no reason.
# Every numeric column is still averaged before one is picked out;
# numeric_only=True only skips the text columns, which have no mean.
penguins.groupby('species').mean(numeric_only=True)['bill_length_mm']

species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

# This is a SeriesGroupBy object!
penguins.groupby('species')['bill_length_mm']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x15d8853a0>

# Saves time!
penguins.groupby('species')['bill_length_mm'].mean()

species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

%%timeit
# Aggregate all numeric columns, then select one (the slower order).
# numeric_only=True skips the text columns, which have no mean.
penguins.groupby('species').mean(numeric_only=True)['bill_length_mm']

302 µs ± 3.96 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

%%timeit
penguins.groupby('species')['bill_length_mm'].mean()

116 µs ± 1.6 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

%%timeit
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    species_only = penguins.loc[penguins['species'] == species]
    species_map.loc[species] = species_only['body_mass_g'].mean()
    
species_map

884 µs ± 12.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

# Slower
# Aggregates every numeric column and then discards all but one.
# numeric_only=True skips the text columns, which have no mean.
penguins.groupby('species').mean(numeric_only=True)['bill_length_mm']

# Faster
penguins.groupby('species')['bill_length_mm'].mean()

(
    penguins
    .groupby('species')
    ['body_mass_g']
    .aggregate(['count', 'mean'])
)

(
    penguins
    .groupby('species')
    # Restrict to the numeric columns: 'mean' is undefined for the text columns
    # (island, sex) and raises a TypeError on them in pandas >= 2.0.
    [['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
    # A list of aggregations yields one sub-column per (column, aggregation) pair.
    .aggregate(['count', 'mean'])
)

(
    penguins
    .groupby('species')
    .agg({'bill_length_mm': 'max', 'island': 'unique'})
)

# Here, the argument to agg is a function,
# which takes in a pd.Series and returns a scalar.

def iqr(s):
    """Return the interquartile range of ``s``: its 75th percentile minus its 25th."""
    # Ask for both percentiles in a single call and unpack them.
    upper_quartile, lower_quartile = np.percentile(s, [75, 25])
    return upper_quartile - lower_quartile

(
    penguins
    .groupby('species')
    ['body_mass_g']
    .aggregate(iqr)
)

species
Adelie       637.5
Chinstrap    462.5
Gentoo       800.0
Name: body_mass_g, dtype: float64

def z_score(x):
    """Standardize ``x``: subtract its mean, then divide by its standard deviation."""
    deviations = x - x.mean()
    # ddof=0 makes the standard deviation divide by n instead of n - 1.
    spread = x.std(ddof=0)
    return deviations / spread

z_score(penguins['body_mass_g'])

0     -0.57
1     -0.51
2     -1.19
       ... 
341    1.92
342    1.23
343    1.48
Name: body_mass_g, Length: 333, dtype: float64

z_mass = (
    penguins
    .groupby('species')
    ['body_mass_g']
    .transform(z_score)
)
z_mass

0      0.10
1      0.21
2     -1.00
       ... 
341    1.32
342    0.22
343    0.62
Name: body_mass_g, Length: 333, dtype: float64

display_df(penguins.assign(z_mass=z_mass), rows=8)

penguins.groupby('species')['body_mass_g'].mean()

species
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
Name: body_mass_g, dtype: float64

(
    penguins
    .groupby('species')
    .filter(lambda df: df['bill_length_mm'].mean() > 39)
)

(
    penguins
    .groupby('species')
    .filter(lambda df: df.shape[0] >= 100)
)

# Grouping on two columns gives a result indexed by (species, island) pairs.
# numeric_only=True skips the remaining text column (sex), which has no mean.
species_and_island = penguins.groupby(['species', 'island']).mean(numeric_only=True)
species_and_island

species_and_island

species_and_island['body_mass_g']

species    island   
Adelie     Biscoe       3709.66
           Dream        3701.36
           Torgersen    3708.51
Chinstrap  Dream        3733.09
Gentoo     Biscoe       5092.44
Name: body_mass_g, dtype: float64

species_and_island.loc['Adelie']

species_and_island.loc[('Adelie', 'Torgersen')]

bill_length_mm         39.04
bill_depth_mm          18.45
flipper_length_mm     191.53
body_mass_g          3708.51
Name: (Adelie, Torgersen), dtype: float64

species_and_island.reset_index()

# as_index=False keeps species and island as regular columns instead of the index.
# numeric_only=True skips the remaining text column (sex), which has no mean.
penguins.groupby(['species', 'island'], as_index=False).mean(numeric_only=True)

baby_path = Path('data') / 'baby.csv'
baby = pd.read_csv(baby_path)
baby

# Your code goes here.
(
    baby
    .groupby('Year')
    # Keep only the years whose total Count exceeds one million.
    .filter(lambda df: df['Count'].sum() > 1_000_000)
    # Largest counts first, so the first row of each (Year, Sex) group
    # is that group's most common name.
    .sort_values('Count', ascending=False)
    .groupby(['Year', 'Sex'])
    .first()
)

# Verify your answer!
baby.query('Year == 1913 and Sex == "M"').sort_values('Count')

# General call pattern for pivot_table.
# NOTE(review): df, index_col, columns_col, values_col and func look like
# placeholders rather than names defined in this notebook -- confirm before running.
df.pivot_table(index=index_col,
               columns=columns_col,
               values=values_col,
               aggfunc=func)

last_5_years = baby.query('Year >= 2018')
last_5_years

last_5_years.pivot_table(
    index='Year',
    columns='Sex',
    values='Count',
    aggfunc='sum'
)

last_5_years.groupby(['Year', 'Sex'])[['Count']].sum()

penguins

penguins.pivot_table(
    index='species',
    columns='island',
    values='flipper_length_mm',
    aggfunc='count'
)

penguins.pivot_table(
    index='species',
    columns='island',
    values='flipper_length_mm',
    aggfunc='count',
).fillna(0)

penguins.pivot_table(
    index='species',
    columns='island',
    values='flipper_length_mm',
    aggfunc='count',
    fill_value=0
)

penguins.pivot_table(
    index='species',
    columns='island',
    values='flipper_length_mm',
    aggfunc='count',
    fill_value=0
)

counts = penguins.pivot_table(
    index='species', 
    columns='sex', 
    values='body_mass_g', 
    aggfunc='count', 
    fill_value=0
)
counts

# counts.sum() gives one total per column; the second .sum() adds those into the
# grand total. Dividing by it turns every count into a proportion of all penguins.
joint = counts / counts.sum().sum()
joint

joint

# Recall: joint.sum(axis=0) adds the values down the rows,
# which gives one total per **column**.
joint.sum(axis=0)

sex
Female    0.5
Male      0.5
dtype: float64

joint.sum(axis=1)

species
Adelie       0.44
Chinstrap    0.20
Gentoo       0.36
dtype: float64

counts

counts

counts.sum(axis=0)

sex
Female    165
Male      168
dtype: int64

# counts.sum(axis=0) holds one total per sex. Dividing aligns those totals with the
# columns of counts, so each column is divided by its own total and sums to 1.
counts / counts.sum(axis=0)

# Your code goes here.

	Name	Sex	Count	Year
0	Liam	M	20456	2022
1	Noah	M	18621	2022
2	Olivia	F	16573	2022
...	...	...	...	...
2085155	Wright	M	5	1880
2085156	York	M	5	1880
2085157	Zachariah	M	5	1880

		Name	Count
Year	Sex
1913	F	Mary	36642
1913	M	John	29329
1914	F	Mary	45346
...	...	...	...
2021	M	Liam	20365
2022	F	Olivia	16573
2022	M	Liam	20456

Sex	F	M
Year
2018	1698373	1813377
2019	1675139	1790682
2020	1612393	1721588
2021	1635800	1743913
2022	1628730	1733166

	Name	Sex	Count	Year
0	Liam	M	20456	2022
1	Noah	M	18621	2022
2	Olivia	F	16573	2022
...	...	...	...	...
159444	Zyrie	M	5	2018
159445	Zyron	M	5	2018
159446	Zzyzx	M	5	2018

Sex	F	M
Year
2018	1698373	1813377
2019	1675139	1790682
2020	1612393	1721588
2021	1635800	1743913
2022	1628730	1733166

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
...	...	...	...	...	...	...	...
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male

	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g
species
Adelie	38.82	18.35	190.10	3706.16
Chinstrap	48.83	18.42	195.82	3733.09
Gentoo	47.57	15.00	217.24	5092.44

	bill_length_mm		bill_depth_mm		flipper_length_mm		body_mass_g
	count	mean	count	mean	count	mean	count	mean
species
Adelie	146	38.82	146	18.35	146	190.10	146	3706.16
Chinstrap	68	48.83	68	18.42	68	195.82	68	3733.09
Gentoo	119	47.57	119	15.00	119	217.24	119	5092.44

	bill_length_mm	island
species
Adelie	46.0	[Torgersen, Biscoe, Dream]
Chinstrap	58.0	[Dream]
Gentoo	59.6	[Biscoe]

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
152	Chinstrap	Dream	46.5	17.9	192.0	3500.0	Female
153	Chinstrap	Dream	50.0	19.5	196.0	3900.0	Male
154	Chinstrap	Dream	51.3	19.2	193.0	3650.0	Male
...	...	...	...	...	...	...	...
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male

		bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g
species	island
Adelie	Biscoe	38.98	18.37	188.80	3709.66
	Dream	38.52	18.24	189.93	3701.36
	Torgersen	39.04	18.45	191.53	3708.51
Chinstrap	Dream	48.83	18.42	195.82	3733.09
Gentoo	Biscoe	47.57	15.00	217.24	5092.44

	Name	Sex	Count	Year
1980373	Zenus	M	5	1913
1980072	Engelbert	M	5	1913
1980073	Ephram	M	5	1913
...	...	...	...	...
1973409	James	M	20832	1913
1973408	William	M	23538	1913
1973407	John	M	29329	1913

island	Biscoe	Dream	Torgersen
species
Adelie	44.0	55.0	47.0
Chinstrap	NaN	68.0	NaN
Gentoo	119.0	NaN	NaN

Lecture 3 – Grouping and Pivoting¶

DSC 80, Winter 2024¶

Announcements 📣¶

Agenda¶

Question 🤔 (Answer at q.dsc80.com)

Data granularity and the groupby method¶

Example: Palmer Penguins¶

Granularity¶

Initial exploration¶

Aggregating¶

Naive approach: looping through unique values¶

Grouping¶

"Split-apply-combine" paradigm¶

More examples¶

Exercise

DataFrameGroupBy objects and aggregation¶

DataFrameGroupBy objects¶

Peeking under the hood¶

Aggregation¶

Column independence¶

Exercise

Column selection and performance implications¶

Takeaways¶

Beyond default aggregation methods¶

The aggregate method¶

Example¶

Example¶

Example¶

Question 🤔 (Answer at q.dsc80.com)

Other DataFrameGroupBy methods¶

Split-apply-combine, revisited¶

Transformations¶

Transformations within groups¶

Filtering groups¶

Grouping with multiple columns¶

Grouping and indexes¶

Exercise

Pivot tables using the pivot_table method¶

Pivot tables: an extension of grouping¶

pivot_table¶

Example¶

Granularity, revisited¶

Reshaping¶

Question 🤔 (Answer at q.dsc80.com)

Distributions¶

Joint distribution¶

Marginal probabilities¶

Conditional probabilities¶

Conditional probabilities¶

Exercise

Summary, next time¶

Summary¶

Next time¶

Data granularity and the `groupby` method¶

`DataFrameGroupBy` objects and aggregation¶

`DataFrameGroupBy` objects¶

The `aggregate` method¶

Other `DataFrameGroupBy` methods¶

Pivot tables using the `pivot_table` method¶

`pivot_table`¶